Importing libraries. 

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plot
import seaborn as sb
from sklearn.metrics import mean_squared_error

Read CSV file to predict salary.

In [2]:
dataframe_emp = pd.read_csv('https://raw.githubusercontent.com/abhishah1608/DataSetRepository/Algorithm/details_emp.csv')
dataframe_emp.head()

Unnamed: 0,empId,name,gender,Job Title,MobileNo,email,Sin no,Working Year,Salary_USD,experience_level,employment_type,remote_ratio,employee_residence,Company_size,Category
0,1,John Smith,Male,Big Data Engineer,+1 817-649-0890,johnsmith34509@gmail.com,233 286 646,2020,109024,SE,FT,50,GB,M,C9
1,2,Jane Doe,Female,Big Data Engineer,+1 379-250-2876,janedoe96854@gmail.com,810 530 137,2020,114047,SE,FT,100,PL,S,C5
2,3,Bob Johnson,Male,Big Data Engineer,+1 844-440-0376,bobjohnson84252@gmail.com,852 020 065,2020,70000,EN,FT,100,US,L,C9
3,4,Susan Lee,Female,Big Data Engineer,+1 240-789-2655,susanlee93926@gmail.com,498 888 156,2021,60000,MI,FT,50,ES,M,C5
4,5,Chris Evans,Male,Big Data Engineer,+1 931-722-9414,chrisevans25804@gmail.com,118 785 074,2021,22611,MI,FT,0,IN,L,C9


Remove Unnecessary Columns.

In [3]:
dataframe_emp.drop('empId',axis= 1, inplace=True)
dataframe_emp.drop('name',axis= 1, inplace=True)
dataframe_emp.drop('MobileNo',axis= 1,inplace=True)
dataframe_emp.drop('email',axis= 1,inplace=True)
dataframe_emp.drop('Sin no',axis= 1, inplace=True)
dataframe_emp.drop('Category',axis= 1,inplace=True)
dataframe_emp.head()

Unnamed: 0,gender,Job Title,Working Year,Salary_USD,experience_level,employment_type,remote_ratio,employee_residence,Company_size
0,Male,Big Data Engineer,2020,109024,SE,FT,50,GB,M
1,Female,Big Data Engineer,2020,114047,SE,FT,100,PL,S
2,Male,Big Data Engineer,2020,70000,EN,FT,100,US,L
3,Female,Big Data Engineer,2021,60000,MI,FT,50,ES,M
4,Male,Big Data Engineer,2021,22611,MI,FT,0,IN,L


Use get_dummies to convert to Numerical Data.

In [4]:
final_dataframe = pd.get_dummies(data=dataframe_emp, columns=['gender','Job Title','experience_level','employment_type','remote_ratio','employee_residence','Company_size'], drop_first=True)
final_dataframe.head()

Unnamed: 0,Working Year,Salary_USD,gender_Male,Job Title_Data Analyst,Job Title_Data Architect,Job Title_Data Scientist,experience_level_EX,experience_level_MI,experience_level_SE,employment_type_PT,...,employee_residence_MX,employee_residence_NG,employee_residence_PK,employee_residence_PL,employee_residence_TR,employee_residence_UA,employee_residence_US,employee_residence_VN,Company_size_M,Company_size_S
0,2020,109024,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
1,2020,114047,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,1
2,2020,70000,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,2021,60000,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
4,2021,22611,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


Adding a Correlation.

In [5]:
corr_matrix = final_dataframe.corr()
# we need to predict for the Salary_USD, as Salary_USD is our target column so we need to check which columns are directly associated with Salary_USD in descending order 
# that affects the G3.
corrlist = corr_matrix["Salary_USD"].sort_values(ascending=False)
corrlist

Salary_USD                  1.000000
employee_residence_US       0.594509
experience_level_SE         0.496034
Job Title_Data Architect    0.343075
Working Year                0.318016
remote_ratio_100            0.279238
Company_size_M              0.227110
gender_Male                 0.094771
experience_level_EX         0.041690
employee_residence_IL       0.028245
employee_residence_PL       0.022053
employee_residence_CA       0.003782
employee_residence_BG      -0.020012
Job Title_Data Scientist   -0.037783
employee_residence_LU      -0.041354
Job Title_Data Analyst     -0.055922
employee_residence_CL      -0.069385
employee_residence_HU      -0.074702
employee_residence_IT      -0.092080
employee_residence_TR      -0.093931
employee_residence_MD      -0.096613
employee_residence_DE      -0.096997
employee_residence_UA      -0.102297
employee_residence_BR      -0.102913
employee_residence_PK      -0.108968
employee_residence_VN      -0.113910
employee_residence_GB      -0.113944
e

#  Split model into Train and test dataset.

In [6]:
from sklearn.model_selection import train_test_split
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.metrics import mean_squared_error as mse
from sklearn.linear_model import LinearRegression
y = final_dataframe['Salary_USD']
X = final_dataframe.drop('Salary_USD',axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=26) # 80% training and 20% test


## Establish a baseline RMSE for your dataset using a naive predictive model, so that it can be used as a performance matrices such that those model having better RMSE than Naive predictive model are performing better models.

In [7]:
mean_y_train = np.mean(y_train)

class NaiveModel:
    def predict(self, X):
        return np.full(X.shape[0], mean_y_train)

naive_model = NaiveModel()
y_pred_naive = naive_model.predict(X_train)

rmse_naive = mean_squared_error(y_train, y_pred_naive, squared=False)

print("RMSE for naive model:", rmse_naive)


RMSE for naive model: 57479.25869953082


## Linear Regression Model.

In [8]:
reg = LinearRegression()
reg.fit(X_train, y_train)

reg.score(X_train, y_train)

y_predict = reg.predict(X_test)


print(f'RMSE for linear regression : {np.sqrt(mse(y_test,y_predict))}')


RMSE for linear regression : 36868.085856084734


Split the model in train and test dataset.

Implement Decision Tree regressor.

In [9]:
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

dtree = DecisionTreeRegressor(max_depth=2,random_state=26)
dtree.fit(X_train, y_train)

In [10]:
pred_train_tree= dtree.predict(X_train)

pred_test_tree= dtree.predict(X_test)
print("RMSE for Decision Tree Regressor",np.sqrt(mean_squared_error(y_test,pred_test_tree))) 
print(r2_score(y_test, pred_test_tree))

RMSE for Decision Tree Regressor 32909.88901311962
0.5817524263078562


Implement Esembler learning for Voting Regressor.

In [11]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import VotingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

lr = LinearRegression()
rf = RandomForestRegressor(
    n_estimators=6, random_state=1,max_depth=3)
knn = KNeighborsRegressor()
xgbr = xgb.XGBRegressor(verbosity=0, random_state=26) 
vr = VotingRegressor([('linerar-Regression', lr), ('rf', rf), ('knn', knn),('xgb',xgbr)])


vr.fit(X_train, y_train.values.ravel())


print score for each model.

In [12]:
for name, v in vr.named_estimators_.items():
    print(name, "=", v.score(X_test, y_test))

linerar-Regression = 0.4750937120731761
rf = 0.6433023611077651
knn = 0.4527260613688161
xgb = 0.5216873676837956


print score of Voting Regressor.

In [13]:
vr.score(X_test, y_test)

0.5956530711066916

print Mean square error.

In [14]:
y_predictvr = vr.predict(X_test)
print("RMSE for Voting Regressor:",np.sqrt(mean_squared_error(y_test,y_predictvr))) 

RMSE for Voting Regressor: 32358.38045867697
