In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from wordcloud import WordCloud
import warnings
warnings.filterwarnings('ignore')

In [2]:
import os

# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

In [3]:
# Read the salary dataset CSV into the working DataFrame `df`.
# NOTE(review): this path points at /content/, whereas the previous cell lists
# files under /kaggle/input — confirm the location for the runtime in use.
df=pd.read_csv('/content/Salary_Dataset_with_Extra_Features.csv')

In [4]:
# Preview the first five rows of the dataset.
df.head(5)


Unnamed: 0,Rating,Company Name,Job Title,Salary,Salaries Reported,Location,Employment Status,Job Roles
0,3.8,Sasken,Android Developer,400000,3,Bangalore,Full Time,Android
1,4.5,Advanced Millennium Technologies,Android Developer,400000,3,Bangalore,Full Time,Android
2,4.0,Unacademy,Android Developer,1000000,3,Bangalore,Full Time,Android
3,3.8,SnapBizz Cloudtech,Android Developer,300000,3,Bangalore,Full Time,Android
4,4.4,Appoids Tech Solutions,Android Developer,600000,3,Bangalore,Full Time,Android


## Size of Data set

In [5]:
# Report the dataset dimensions (df.shape is (rows, columns)).
print(f'Rows: {df.shape[0]}, Columns: {df.shape[1]}')

Rows: 22770, Columns: 8


In [6]:
# Show the column labels of the dataset.
print(df.columns)


Index(['Rating', 'Company Name', 'Job Title', 'Salary', 'Salaries Reported',
       'Location', 'Employment Status', 'Job Roles'],
      dtype='object')


In [7]:
# Per-column dtype and non-null count, plus overall memory usage.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22770 entries, 0 to 22769
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Rating             22770 non-null  float64
 1   Company Name       22770 non-null  object 
 2   Job Title          22770 non-null  object 
 3   Salary             22770 non-null  int64  
 4   Salaries Reported  22770 non-null  int64  
 5   Location           22770 non-null  object 
 6   Employment Status  22770 non-null  object 
 7   Job Roles          22770 non-null  object 
dtypes: float64(1), int64(2), object(5)
memory usage: 1.4+ MB


In [8]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()


Unnamed: 0,Rating,Salary,Salaries Reported
count,22770.0,22770.0,22770.0
mean,3.918213,695387.2,1.855775
std,0.519675,884399.0,6.823668
min,1.0,2112.0,1.0
25%,3.7,300000.0,1.0
50%,3.9,500000.0,1.0
75%,4.2,900000.0,1.0
max,5.0,90000000.0,361.0


# Data Visualization

In [9]:
# Frequency of each value in the "Job Roles" column.
job_roles = df['Job Roles'].value_counts()

# Bar chart: one bar per job role, coloured per role and annotated with its count.
fig = px.bar(
    x=job_roles.index,
    y=job_roles.values,
    color=job_roles.index,
    color_discrete_sequence=px.colors.sequential.PuBuGn,
    text=job_roles.values,
    title='Job roles distribution',
    template='plotly_dark',
    width=750,
    height=500,
)

# The x axis shows the "Job Roles" column, so it is labelled as such
# (the dataset also has a separate "Job Title" column, which is not plotted here).
fig.update_layout(
    xaxis_title="Job Roles",
    yaxis_title="count",
    font=dict(size=17, family="Franklin Gothic"),
)

# Show the chart
fig.show()

In [10]:
# Count the occurrences of each value in the 'Employment Status' column, largest first.
employment_status = df["Employment Status"].value_counts().sort_values(ascending=False)

# Pie chart: counts as slice values, employment statuses as slice names, PuBu palette.
fig = px.pie(
    values=employment_status.values,
    names=employment_status.index,
    color_discrete_sequence=px.colors.sequential.PuBu,
    title='Employment status distribution',
    template='plotly_dark',
    width=750,
    height=500,
)

# Show label, percentage and raw count on each slice and outline the slices.
# The outline is configured once here (gray, width 2): previously a different
# colour/width was set and then immediately overwritten, so only these values
# ever reached the rendered figure.
fig.update_traces(
    textinfo='label+percent+value',
    textfont_size=14,
    marker=dict(line=dict(color='gray', width=2)),
)

# Larger font and font family for the whole figure.
fig.update_layout(font=dict(size=20, family="Franklin Gothic"))

# Show the chart
fig.show()

In [11]:
# Count how often each distinct value occurs in the Rating column.
Rating = df['Rating'].value_counts()

# Bar chart with the rating values on the x axis and their counts on the y axis.
# The colour input is numeric, so Plotly Express maps it onto a continuous colour
# axis; a discrete colour sequence would be ignored in that case, which is why the
# PuBuGn palette is passed as the continuous scale.
fig = px.bar(
    x=Rating.index,
    y=Rating.values,
    color=Rating.index,
    color_continuous_scale=px.colors.sequential.PuBuGn,
    text=Rating.values,
    title='Rating distribution',
    template='plotly_dark',
    width=750,
    height=500,
)

# Axis labels and font settings.
fig.update_layout(
    xaxis_title="Rating",
    yaxis_title="count",
    font=dict(size=17, family="Franklin Gothic"),
)

# Show the chart
fig.show()

In [12]:
# Mean salary per job role, rounded to the nearest integer, as a two-column
# frame with columns "Job Roles" and "Salary".
mean = df.groupby("Job Roles")["Salary"].mean().round(0).reset_index()

# Bar chart: job roles on the x axis, mean salary on the y axis, one colour per role.
fig = px.bar(
    x=mean["Job Roles"],
    y=mean["Salary"],
    color=mean["Job Roles"],
    color_discrete_sequence=px.colors.sequential.PuBuGn,
    text=mean["Salary"],
    title='Job roles and average salary',
    template='plotly_dark',
    width=750,
    height=500,
)

# The x axis shows the "Job Roles" column, so it is labelled as such
# (the dataset also has a separate "Job Title" column, which is not plotted here).
fig.update_layout(
    xaxis_title="Job Roles",
    yaxis_title="Rupee - ₹",
    font=dict(size=17, family="Franklin Gothic"),
)

# Show the chart
fig.show()

In [13]:
# Histogram of job roles, with bars split by employment status.
fig = px.histogram(
    df,
    x="Job Roles",
    color="Employment Status",
    title="<b>Count of Employment Status for every Job Role</b>",
    color_discrete_sequence=px.colors.qualitative.Vivid,
    width=750,
    height=500,
)

# Apply the dark template and enlarge the title in a single layout update.
fig.update_layout(template="plotly_dark", title_font_size=20)

# Show the chart
fig.show()

# Prepare the data for Training

In [14]:
# Libraries needed for splitting and scaling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Features: every column of the dataset except the target "Salary"
X = df.drop(columns="Salary")

# Target: "Salary" as a single-column frame, rescaled with MinMaxScaler.
# NOTE(review): the scaler is fit on all rows before the split below, so the
# salary range of the test rows influences the scaling of the training target.
scaler = MinMaxScaler()
y = scaler.fit_transform(df[['Salary']])
print(y)

# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=72
)

[[0.00442108]
 [0.00442108]
 [0.0110879 ]
 ...
 [0.00210992]
 [0.00330994]
 [0.00330994]]


# Feature selection and encoding categorical variables

In [15]:
# Feature selection: keep the rating plus the categorical descriptors
# ('Salaries Reported' is not part of the selected features).
feature_cols = ['Rating', 'Company Name', 'Job Title', 'Location', 'Employment Status', 'Job Roles']

# Categorical columns to one-hot encode; each becomes a set of "<column>_<value>" indicators.
labels_to_encode = ['Company Name', 'Job Title', 'Location', 'Employment Status', 'Job Roles']

X_train = pd.get_dummies(X_train[feature_cols], columns=labels_to_encode)
X_test = pd.get_dummies(X_test[feature_cols], columns=labels_to_encode)

# The two splits are encoded separately, so categories that occur in only one of
# them would produce different column sets. Align the test matrix to the training
# columns: categories unseen in training are dropped, categories missing from the
# test split become all-zero columns. Without this the fitted model cannot be
# applied to X_test.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Train Model

In [16]:
from sklearn.linear_model import LinearRegression


In [17]:
# Linear regression model created with default settings.
reg = LinearRegression()


In [18]:
# Fit the model on the one-hot-encoded training features and the scaled salary target.
reg.fit(X_train, y_train)


In [19]:
# Score the model on the training data, i.e. the same rows it was fit on.
# NOTE(review): the held-out X_test / y_test split is not scored in the cells shown here.
score =reg.score(X_train, y_train)

In [20]:
# Print the training score computed in the previous cell.
print(' Train R-squared:', score)


 Train R-squared: 0.884294058282438
