In [173]:
#importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import r2_score,mean_absolute_error as mae,mean_squared_error as mse
from sklearn.linear_model import LinearRegression

In [174]:
# Load the training and test datasets from CSV files (paths are relative to the working directory).
train=pd.read_csv('Data_file.csv')
test=pd.read_csv('Test_data_file.csv')

In [175]:
# Preview the first five rows of the training data.
train.head()

Unnamed: 0,College,Role,City type,Previous CTC,Previous job changes,Graduation marks,Exp (Months),CTC
0,Tier 1,Manager,Non-Metro,55523.0,3,66,19,71406.58
1,Tier 2,Executive,Metro,57081.0,1,84,18,68005.87
2,Tier 2,Executive,Metro,60347.0,2,52,28,76764.02
3,Tier 3,Executive,Metro,49010.0,2,81,33,82092.39
4,Tier 3,Executive,Metro,57879.0,4,74,32,73878.1


In [176]:
# Preview the first five rows of the test data.
test.head()

Unnamed: 0,College,Role,City type,College_T1,College_T2,Role_Manager,City_Metro,previous CTC,previous job changes,Graduation marks,Exp,Actual CTC
0,Tier 1,Manager,Non-Metro,1,0,1,0,55523,3,66,19,71406.57653
1,Tier 2,Executive,Metro,0,1,0,1,57081,1,84,18,68005.87063
2,Tier 2,Executive,Metro,0,1,0,1,60347,2,52,28,76764.02028
3,Tier 3,Executive,Metro,0,0,0,1,49010,2,81,33,82092.38688
4,Tier 3,Executive,Metro,0,0,0,1,57879,4,74,32,73878.09773


In [177]:
# Inspect each column's dtype and non-null count.
# 'Previous CTC' and 'CTC' are reported as object, so they are not yet numeric.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   College               1338 non-null   object
 1   Role                  1338 non-null   object
 2   City type             1338 non-null   object
 3   Previous CTC          1338 non-null   object
 4   Previous job changes  1338 non-null   int64 
 5   Graduation marks      1338 non-null   int64 
 6   Exp (Months)          1338 non-null   int64 
 7   CTC                   1338 non-null   object
dtypes: int64(3), object(5)
memory usage: 83.8+ KB


In [178]:
# Count missing values in each column.
train.isnull().sum()

College                 0
Role                    0
City type               0
Previous CTC            0
Previous job changes    0
Graduation marks        0
Exp (Months)            0
CTC                     0
dtype: int64

In [179]:
# Count fully duplicated rows.
train.duplicated().sum()

0

In [180]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric features.
train.describe()

Unnamed: 0,Previous job changes,Graduation marks,Exp (Months)
count,1338.0,1338.0,1338.0
mean,2.525411,59.890882,39.207025
std,1.123502,14.894696,14.04996
min,1.0,35.0,18.0
25%,2.0,47.0,27.0
50%,3.0,60.0,39.0
75%,4.0,73.0,51.0
max,4.0,85.0,64.0


In [181]:
# One-hot encode the categorical columns.
# dtype=int keeps the indicator columns as 0/1 integers on every pandas version
# (pandas >= 2.0 would otherwise produce booleans), matching the integer dummy
# columns that the test file already contains.
train = pd.get_dummies(train, columns=['College', 'Role', 'City type'], dtype=int)

In [182]:
# Display the encoded frame (the rendered output elides the middle rows).
train

Unnamed: 0,Previous CTC,Previous job changes,Graduation marks,Exp (Months),CTC,College_Tier 1,College_Tier 2,College_Tier 3,Role_Executive,Role_Manager,City type_Metro,City type_Non-Metro
0,55523.00,3,66,19,71406.58,1,0,0,0,1,0,1
1,57081.00,1,84,18,68005.87,0,1,0,1,0,1,0
2,60347.00,2,52,28,76764.02,0,1,0,1,0,1,0
3,49010.00,2,81,33,82092.39,0,0,1,1,0,1,0
4,57879.00,4,74,32,73878.10,0,0,1,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1333,59661.00,4,68,50,69712.40,0,0,1,1,0,1,0
1334,53714.00,1,67,18,69298.75,1,0,0,1,0,0,1
1335,61957.00,1,47,18,66397.77,0,1,0,1,0,0,1
1336,53203.00,3,69,21,64044.38,1,0,0,1,0,0,1


In [183]:
# Remove commas from the salary strings so the columns can later be cast to float.
# regex=False makes the replacement a literal one on every pandas version
# (the default for `regex` changed in pandas 2.0).
train['CTC'] = train['CTC'].str.replace(',', '', regex=False)
train['Previous CTC'] = train['Previous CTC'].str.replace(',', '', regex=False)

In [184]:
# Cast both salary columns to float in a single call.
train = train.astype({'CTC': 'float', 'Previous CTC': 'float'})

In [185]:
# Show the first two rows of train after the numeric conversion.
train.head(2)

Unnamed: 0,Previous CTC,Previous job changes,Graduation marks,Exp (Months),CTC,College_Tier 1,College_Tier 2,College_Tier 3,Role_Executive,Role_Manager,City type_Metro,City type_Non-Metro
0,55523.0,3,66,19,71406.58,1,0,0,0,1,0,1
1,57081.0,1,84,18,68005.87,0,1,0,1,0,1,0


In [186]:
# Show the first two rows of test for comparison with train.
test.head(2)

Unnamed: 0,College,Role,City type,College_T1,College_T2,Role_Manager,City_Metro,previous CTC,previous job changes,Graduation marks,Exp,Actual CTC
0,Tier 1,Manager,Non-Metro,1,0,1,0,55523,3,66,19,71406.57653
1,Tier 2,Executive,Metro,0,1,0,1,57081,1,84,18,68005.87063


In [187]:
# Drop one dummy per categorical feature (Tier 3, Non-Metro, Executive) so that
# train keeps only the indicator columns that the test data also provides.
drop_list = ['College_Tier 3', 'City type_Non-Metro', 'Role_Executive']
train = train.drop(columns=drop_list)

In [188]:
# Drop the raw categorical columns from test; it already carries their dummy-encoded versions.
drop_list2 = ['College', 'Role', 'City type']
test = test.drop(columns=drop_list2)

In [189]:
# Show only the column header of train (zero rows).
train.head(0)

Unnamed: 0,Previous CTC,Previous job changes,Graduation marks,Exp (Months),CTC,College_Tier 1,College_Tier 2,Role_Manager,City type_Metro


In [190]:
# Show only the column header of test (zero rows).
test.head(0)

Unnamed: 0,College_T1,College_T2,Role_Manager,City_Metro,previous CTC,previous job changes,Graduation marks,Exp,Actual CTC


In [191]:
# Reorder the train columns: dummy indicators first, then the numeric features, target last.
train = train.loc[:, [
    'College_Tier 1', 'College_Tier 2', 'Role_Manager', 'City type_Metro',
    'Previous CTC', 'Previous job changes', 'Graduation marks', 'Exp (Months)',
    'CTC',
]]

In [192]:
# Reorder the test columns: dummy indicators first, then the numeric features, target last.
test = test.loc[:, [
    'College_T1', 'College_T2', 'Role_Manager', 'City_Metro',
    'previous CTC', 'previous job changes', 'Graduation marks', 'Exp',
    'Actual CTC',
]]

In [193]:
# Show the reordered column header of train (zero rows).
train.head(0)

Unnamed: 0,College_Tier 1,College_Tier 2,Role_Manager,City type_Metro,Previous CTC,Previous job changes,Graduation marks,Exp (Months),CTC


In [194]:
# Show the reordered column header of test (zero rows).
test.head(0)

Unnamed: 0,College_T1,College_T2,Role_Manager,City_Metro,previous CTC,previous job changes,Graduation marks,Exp,Actual CTC


In [195]:
# Rename train's columns to the names used in test. An explicit old -> new mapping
# (rather than assigning a positional list) cannot mislabel a column if the
# column order ever changes upstream.
train = train.rename(columns={
    'College_Tier 1': 'College_T1',
    'College_Tier 2': 'College_T2',
    'City type_Metro': 'City_Metro',
    'Previous CTC': 'previous CTC',
    'Previous job changes': 'previous job changes',
    'Exp (Months)': 'Exp',
    'CTC': 'Actual CTC',
})
# Both frames go through the same scaler and model, so names and order must agree.
assert list(train.columns) == list(test.columns), "train/test columns differ"

In [196]:
# Show the renamed column header of train (zero rows).
train.head(0)

Unnamed: 0,College_T1,College_T2,Role_Manager,City_Metro,previous CTC,previous job changes,Graduation marks,Exp,Actual CTC


In [197]:
# Separate the training features from the target column.
X_train = train.drop(columns='Actual CTC')
Y_train = train.loc[:, ['Actual CTC']]  # list selector keeps the target as a one-column DataFrame

In [198]:
# Separate the test features from the target column used for evaluation.
X_test = test.drop(columns='Actual CTC')
Y_test = test.loc[:, ['Actual CTC']]  # list selector keeps the target as a one-column DataFrame

In [199]:
# Feature scaling (standardization): fit the scaler on the training features only,
# then apply the same fitted transform to the test features.
# StandardScaler is already imported in the notebook's import cell, so it is not re-imported here.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [200]:
# Display the scaled training features (now a NumPy array).
X_train

array([[ 1.0303559 , -0.61132367,  1.97058663, ...,  0.42257741,
         0.4103073 , -1.43876426],
       [-0.97053843,  1.63579466, -0.5074631 , ..., -1.35823699,
         1.61924306, -1.50996545],
       [-0.97053843,  1.63579466, -0.5074631 , ..., -0.46782979,
        -0.52997607, -0.79795355],
       ...,
       [-0.97053843,  1.63579466, -0.5074631 , ..., -1.35823699,
        -0.86579156, -1.50996545],
       [ 1.0303559 , -0.61132367, -0.5074631 , ...,  0.42257741,
         0.61179659, -1.29636188],
       [-0.97053843, -0.61132367,  1.97058663, ..., -1.35823699,
        -0.86579156,  1.55168573]])

In [201]:
# Display the scaled test features (now a NumPy array).
X_test

array([[ 1.0303559 , -0.61132367,  1.97058663, ...,  0.42257741,
         0.4103073 , -1.43876426],
       [-0.97053843,  1.63579466, -0.5074631 , ..., -1.35823699,
         1.61924306, -1.50996545],
       [-0.97053843,  1.63579466, -0.5074631 , ..., -0.46782979,
        -0.52997607, -0.79795355],
       ...,
       [-0.97053843,  1.63579466, -0.5074631 , ..., -1.35823699,
        -0.86579156, -1.50996545],
       [ 1.0303559 , -0.61132367, -0.5074631 , ...,  0.42257741,
         0.61179659, -1.29636188],
       [-0.97053843, -0.61132367,  1.97058663, ..., -1.35823699,
        -0.86579156,  1.55168573]])

#### Linear Regression

In [202]:
# Create a linear regression model and fit it on the scaled training data.
LR_model = LinearRegression()
LR_model.fit(X=X_train, y=Y_train)

LinearRegression()

In [203]:
# Predict the target for the scaled test features and display the predictions.
Y_pred = LR_model.predict(X=X_test)
Y_pred

array([[86123.93772412],
       [65789.54686289],
       [69941.69069909],
       ...,
       [64046.07494261],
       [66223.22214183],
       [90794.94080771]])

#### Evaluation

In [204]:
# Performance measures on the test set: R2 (as a percentage), MAE, MSE and RMSE.
# NOTE(review): the test preview rows and the scaled X_test displayed earlier match
# train exactly -- confirm the test file is a genuine hold-out set; otherwise these
# are in-sample metrics.
r2_percent = r2_score(Y_test, Y_pred) * 100
test_mse = mse(Y_test, Y_pred)  # computed once, reused for RMSE
print('R2 Score', r2_percent)
print('MAE', mae(Y_test, Y_pred))
print('MSE', test_mse)
print('RMSE', np.sqrt(test_mse))

R2 Score 60.81873160460819
MAE 6159.175550813233
MSE 61677937.04476763
RMSE 7853.530228169217
