In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [6]:
df = pd.read_csv(r'eda-cleaned-data.csv')
df.drop(['Unnamed: 0'],axis=1,inplace=True)
df.head()

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,...,age of company,python_yn,R_yn,aws_yn,excel_yn,spark_yn,role_simp,seniority,desc_length,num_competitors
0,Data Scientist,53-91,"Data Scientist\r\nLocation: Albuquerque, NM\r\...",3.8,Tecolote Research\r\n3.8,"Albuquerque, NM","Goleta, CA",501 to 1000 employees,1973,Company - Private,...,49,1,0,0,1,0,data scientist,na,337,-1
1,Healthcare Data Scientist,63-112,What You Will Do:\r\n\r\nI. General Summary\r\...,3.4,University of Maryland Medical System\r\n3.4,"Linthicum, MD","Baltimore, MD",10000+ employees,1984,Other Organization,...,38,1,0,0,0,0,data scientist,na,636,-1
2,Data Scientist,80-90,"KnowBe4, Inc. is a high growth information sec...",4.8,KnowBe4\r\n4.8,"Clearwater, FL","Clearwater, FL",501 to 1000 employees,2010,Company - Private,...,12,1,0,0,1,1,data scientist,na,460,-1
3,Data Scientist,56-97,*Organization and Job ID**\r\nJob ID: 310709\r...,3.8,PNNL\r\n3.8,"Richland, WA","Richland, WA",1001 to 5000 employees,1965,Government,...,57,1,0,0,0,0,data scientist,na,489,3
4,Data Scientist,86-143,Data Scientist\r\nAffinity Solutions / Marketi...,2.9,Affinity Solutions\r\n2.9,"New York, NY","New York, NY",51 to 200 employees,1998,Company - Private,...,24,1,0,0,1,0,data scientist,na,358,3


## Steps to do:
### 1.) Choose relevant columns for ml model
### 2.) Create dummy data for categorical variables
### 3.) Create train-test split (where train set will be used for validation of individual models and test set used for final evaluation of ensemble models)
### 4.) Model 1 - Multivariate linear regression
### 5.) Model 2 - Lasso Regression
### 6.) Model 3 - Random Forest Regressor (to compare linear model performance vs tree based model)
### 7.) Hyperpara tuningg using GridSearchCV
### 8.) Test Ensemble model performance on test set

# 1.) Choosing relevant data for models

In [7]:
df.columns

Index(['Job Title', 'Salary Estimate', 'Job Description', 'Rating',
       'Company Name', 'Location', 'Headquarters', 'Size', 'Founded',
       'Type of ownership', 'Industry', 'Sector', 'Revenue', 'Competitors',
       'Hourly', 'Employer Provided', 'min salary', 'max salary', 'avg salary',
       'Company Name Text', 'Job State', 'same state', 'age of company',
       'python_yn', 'R_yn', 'aws_yn', 'excel_yn', 'spark_yn', 'role_simp',
       'seniority', 'desc_length', 'num_competitors'],
      dtype='object')

In [9]:
df_model = df[['Rating','Size','Type of ownership','Industry','Sector','Revenue','num_competitors','Hourly','Employer Provided','Job State','same state','age of company','python_yn', 'R_yn', 'aws_yn', 'excel_yn', 'spark_yn', 'role_simp',
       'seniority', 'desc_length','avg salary']]

# 2.) Create one-hot encodings for Categorical data

In [10]:
df_dum = pd.get_dummies(df_model)
df_dum.head()

Unnamed: 0,Rating,num_competitors,Hourly,Employer Provided,same state,age of company,python_yn,R_yn,aws_yn,excel_yn,...,role_simp_analyst,role_simp_data engineer,role_simp_data scientist,role_simp_director,role_simp_manager,role_simp_mle,role_simp_na,seniority_junior,seniority_na,seniority_senior
0,3.8,-1,0,0,0,49,1,0,0,1,...,0,0,1,0,0,0,0,0,1,0
1,3.4,-1,0,0,0,38,1,0,0,0,...,0,0,1,0,0,0,0,0,1,0
2,4.8,-1,0,0,1,12,1,0,0,1,...,0,0,1,0,0,0,0,0,1,0
3,3.8,3,0,0,1,57,1,0,0,0,...,0,0,1,0,0,0,0,0,1,0
4,2.9,3,0,0,1,24,1,0,0,1,...,0,0,1,0,0,0,0,0,1,0


In [12]:
df_dum.columns

Index(['Rating', 'num_competitors', 'Hourly', 'Employer Provided',
       'same state', 'age of company', 'python_yn', 'R_yn', 'aws_yn',
       'excel_yn',
       ...
       'role_simp_analyst', 'role_simp_data engineer',
       'role_simp_data scientist', 'role_simp_director', 'role_simp_manager',
       'role_simp_mle', 'role_simp_na', 'seniority_junior', 'seniority_na',
       'seniority_senior'],
      dtype='object', length=179)

# 3.) Create train-test split

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X = df_dum.drop(['avg salary'],axis=1)
y = df_dum['avg salary'].values

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# 4.) Model1 - Linear Regression

In [33]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

In [34]:
lr = LinearRegression()
lr.fit(X_train, y_train)

In [35]:
np.mean(cross_val_score(lr,X_train,y_train,scoring='neg_mean_absolute_error',cv=3))

-3815124382.909467

# 5.) Model2 - Lasso Regression

In [44]:
from sklearn.linear_model import Lasso
error_min = 100000
for alpha in range(1,100):
    lasso_reg = Lasso(alpha=alpha/10)
    err = np.mean(cross_val_score(lasso_reg,X_train,y_train,scoring='neg_mean_absolute_error',cv=3))
    if abs(err) < error_min:
        error_min = err
        best_alpha = alpha
    

In [45]:
error_min

-19.392010763299535

In [46]:
best_alpha

1

# 6.) Model3 - RandomForest Regressor

In [51]:
'''better suited for our use case as we know multicollinearity(correlation between input variables) exists ex: sector and industry.
Secondly, categorical variables are betterhandled by tree based models'''

'better suited for our use case as we know multicollinearity(correlation between input variables) exists ex: sector and industry.\nSecondly, categorical variables are betterhandled by tree based models'

In [52]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()

In [53]:
cross_val_score(rf, X_train, y_train, scoring = 'neg_mean_absolute_error', cv=3)

array([-16.13598485, -15.08285354, -13.82878173])

In [54]:
'''Best model upto now'''
np.mean(cross_val_score(rf, X_train, y_train, scoring = 'neg_mean_absolute_error', cv=3))

-15.021388034319505

# 7.) Hyperparamater tuning using GridSearchCV

In [55]:
from sklearn.model_selection import GridSearchCV
parameters = {'n_estimators':range(10,300,10), 'criterion':['mse','mae'], 'max_features':['auto','sqrt','log2']}

In [56]:
gs = GridSearchCV(rf, parameters, scoring = 'neg_mean_absolute_error', cv=3)

In [57]:
gs.fit(X_train, y_train)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


In [58]:
gs.best_score_   #better than default random forest

-14.841805632795483

In [59]:
gs.best_estimator_

# 8.) Predicting on test set using ensemble of our lasso and optimized random forest

In [61]:
lm = Lasso(alpha=0.1).fit(X_train, y_train)

In [62]:
y_pred_lm = lm.predict(X_test)
y_pred_rf = gs.best_estimator_.predict(X_test)

In [63]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, y_pred_lm)

19.891027041040033

In [64]:
mean_absolute_error(y_test, y_pred_rf)

11.039029426948892

In [67]:
mean_absolute_error(y_test, (y_pred_lm+y_pred_rf)/2)

14.92618186475992