In [53]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import root_mean_squared_error

from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from joblib import dump

from dataprep import data_constructor
from dataprep import data_split_encode

In [29]:
df = pd.read_csv("../data/salary_cleaned.csv")
df.head()

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's Degree,Software Engineer,5.0,90000.0
1,28.0,Female,Master's Degree,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's Degree,Sales Associate,7.0,60000.0
4,52.0,Male,Master's Degree,Director,20.0,200000.0


In [30]:
scaler = StandardScaler()
encoder = OneHotEncoder(categories="auto", drop="first", handle_unknown="ignore")

In [31]:
X_train, X_test, y_train, y_test = data_constructor(
    data=df,
    label="Salary",
    test_size=0.30,
    seed=42,
    scaler=scaler,
    encoder=encoder,
    scaler_path="../models/scaler_fitted.joblib",
    encoder_path="../models/encoder_fitted.joblib"
)

Enocder has been saved to path: ..\models\encoder_fitted.joblib
Scaler has been saved to path: ..\models\scaler_fitted.joblib


In [32]:
X_train.head()

Unnamed: 0,Age,Years of Experience,Gender_Male,Gender_Other,Education Level_High School,Education Level_Master's Degree,Education Level_PhD,Job Title_Accountant,Job Title_Administrative Assistant,Job Title_Back end Developer,...,Job Title_Supply Chain Manager,Job Title_Technical Recruiter,Job Title_Technical Support Specialist,Job Title_Technical Writer,Job Title_Training Specialist,Job Title_UX Designer,Job Title_UX Researcher,Job Title_VP of Finance,Job Title_VP of Operations,Job Title_Web Developer
1494,-0.995424,-1.187835,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
997,-0.633275,-0.612951,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1251,-1.11614,-1.044114,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
733,2.022483,2.69263,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
609,-0.633275,-0.612951,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
X_test.head()

Unnamed: 0,Age,Years of Experience,Gender_Male,Gender_Other,Education Level_High School,Education Level_Master's Degree,Education Level_PhD,Job Title_Accountant,Job Title_Administrative Assistant,Job Title_Back end Developer,...,Job Title_Supply Chain Manager,Job Title_Technical Recruiter,Job Title_Technical Support Specialist,Job Title_Technical Writer,Job Title_Training Specialist,Job Title_UX Designer,Job Title_UX Researcher,Job Title_VP of Finance,Job Title_VP of Operations,Job Title_Web Developer
1192,-0.753992,-1.187835,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
426,-0.753992,-0.325509,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1473,-1.478289,-1.331556,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
765,-0.753992,-0.756672,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
485,-1.357573,-1.187835,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [34]:
y_train

1494     55000.0
997     102828.0
1251     40000.0
733     186963.0
609     105000.0
          ...   
1130     65000.0
1294     47000.0
860     136285.0
1459     50000.0
1126     70000.0
Name: Salary, Length: 1244, dtype: float64

In [35]:
y_test

1192     30000.0
426     180000.0
1473     25000.0
765     104702.0
485      60000.0
          ...   
81       80000.0
584     120000.0
380     180000.0
1504     50000.0
84       40000.0
Name: Salary, Length: 534, dtype: float64

### Training various algorithms in base condition to get the r2score

#### Training linear regression model

In [36]:
model_linear = LinearRegression()
model_linear.fit(X_train, y_train)
preds_linear = model_linear.predict(X_test)
r2_linear = r2_score(y_true=y_test, y_pred=preds_linear)
print(f"R2 score of linear regression model: {r2_linear}")

R2 score of linear regression model: -5.236852728971886e+22


#### Training kNeighborsRegressor model

In [37]:
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)
knn_preds = knn.predict(X_test)
r2_knn = r2_score(y_true=y_test, y_pred=knn_preds)
r2_knn

0.8604489641332803

Training SupposeVectorRegressor model

In [38]:
svr = SVR() # base model
svr.fit(X_train, y_train)
svr_preds = svr.predict(X_test)
r2_svr = r2_score(y_true=y_test, y_pred=svr_preds)
r2_svr

0.0020840675598708946

#### Training RandomForestRegressor model

In [39]:
rforest = RandomForestRegressor()
rforest.fit(X_train, y_train)
preds_rforest = rforest.predict(X_test)
r2_rforest = r2_score(y_true=y_test, y_pred=preds_rforest)
print(r2_rforest)

0.9107357889133025


**Conclusion:**  
The linear regression model is not fitting at all. Maybe it doesn't need standardization. Maybe it the numeric feature columns' units are not that far away from each other.

In [40]:
df2 = pd.read_csv("../data/salary_cleaned.csv")
df2.head()

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's Degree,Software Engineer,5.0,90000.0
1,28.0,Female,Master's Degree,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's Degree,Sales Associate,7.0,60000.0
4,52.0,Male,Master's Degree,Director,20.0,200000.0


In [41]:
encoder = OneHotEncoder(categories="auto", drop="first", handle_unknown="ignore")

In [42]:
X_train, X_test, y_train, y_test = data_split_encode(
    data=df2,
    label="Salary",
    test_size=0.30,
    seed=42,
    encoder=encoder,
    encoder_path="../models/encoder_fittedv2.joblib"
)

Encode is saved to path: ..\models\encoder_fittedv2.joblib


#### Let's train the linear regression model on it and get the r2score

In [43]:
model_lin = LinearRegression()
model_lin.fit(X_train, y_train)
pred_lin = model_lin.predict(X_test)
r2_lin = r2_score(y_true=y_test, y_pred=pred_lin)
print(f"R2 score of linear regression model: {r2_lin}")

R2 score of linear regression model: -3.504775715741681e+20


**Conclusion:**  
Linear regression doesn't seem to fit this data at all. I don't know the implementation of RandomForestRegressor yet. So let's implement knn regressor.

In [44]:
df = pd.read_csv("../data/salary_cleaned.csv")
df.head()

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's Degree,Software Engineer,5.0,90000.0
1,28.0,Female,Master's Degree,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's Degree,Sales Associate,7.0,60000.0
4,52.0,Male,Master's Degree,Director,20.0,200000.0


In [45]:
scaler = StandardScaler()
encoder = OneHotEncoder(categories="auto", drop="first", handle_unknown="ignore")

In [46]:
X_train, X_test, y_train, y_test = data_constructor(
    data=df,
    label="Salary",
    test_size=0.30,
    seed=42,
    scaler=scaler,
    encoder=encoder,
    scaler_path="../models/scaler_fitted.joblib",
    encoder_path="../models/encoder_fitted.joblib"
)

Enocder has been saved to path: ..\models\encoder_fitted.joblib
Scaler has been saved to path: ..\models\scaler_fitted.joblib


#### Setting up the GridSearch and training the model

In [47]:
knn_reg = KNeighborsRegressor()
knn_params = {
    "n_neighbors":[5, 7, 9, 11],
    "weights":["uniform", "distance"],
    "algorithm":["auto"],
    "metric":["minkowski", "euclidean", "manhattan"],
}

knn_tuned = GridSearchCV(
    estimator=knn_reg,
    param_grid=knn_params,
    scoring="neg_mean_absolute_error",
    n_jobs=4, verbose=1, cv=5
)

In [48]:
# Fitting the model
knn_tuned.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [49]:
knn_tuned.best_params_ # best combination of hyper params

{'algorithm': 'auto',
 'metric': 'manhattan',
 'n_neighbors': 5,
 'weights': 'distance'}

In [50]:
knn_tuned_preds = knn_tuned.predict(X_test)

In [52]:
knn_mae = mean_absolute_error(y_true=y_test, y_pred=knn_tuned_preds)
knn_rmse = root_mean_squared_error(y_true=y_test, y_pred=knn_tuned_preds)

print(f"Mean absolute error of knn regressor: {knn_mae}")
print(f"Root meean squared error of knn regressor: {knn_rmse}")

Mean absolute error of knn regressor: 11140.764282238744
Root meean squared error of knn regressor: 17647.460351285477


In [54]:
# Let's train a knn model, of same hyper params which will be trained on the entire dataset, and then after that we will save it
knn_final = KNeighborsRegressor(
    algorithm="auto",
    n_neighbors=5,
    weights="distance",
    metric="manhattan",
    n_jobs=4
)

In [55]:
# Fit the final knn model
knn_final.fit(X_train, y_train)

#### Saving this knn regressor model

In [56]:
dump(value=knn_final, filename="../models/knnreg.joblib")
print(f"Model has been saved!")

Model has been saved!
