# Salary Predictions Based on Job Descriptions

In [1]:
import pandas as pd
import numpy as np
import random
import sys
sys.path.insert(0, './modules')
from data import DataProcessing
from feature_engineering import FeatureEngineering
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from xgboost import XGBRegressor
import gc

## Define parameters 

In [3]:
train_features_file = 'unzipped_data/data/train_features.csv'
target_file = 'unzipped_data/data/train_salaries.csv'
test_features_file = 'unzipped_data/data/test_features.csv'
cat_cols = ['jobType', 'degree', 'major', 'industry']
target = 'salary'
id_cols = ['jobId', 'companyId']
num_procs = 2
verbose_lvl = 5

## Create Data object

In [4]:
data = DataProcessing(train_features_file, target_file, test_features_file, target, id_cols)

## Create Feature object

In [5]:
new_features = FeatureEngineering(data)

## Feature Ingeneering for train/test set

In [6]:
train_df = new_features.merge_agg_cols(data.train_df, cat_cols)
train_df = new_features.update_dfs(train_df)
train_df.head()

Unnamed: 0,jobId,companyId,jobType,degree,major,industry,yearsExperience,milesFromMetropolis,salary,yearsExperience_bins,...,industry_STD,industry_MEAN,industry_MEDIAN,industry_SKEW,yearsExperience_bins_MIN,yearsExperience_bins_MAX,yearsExperience_bins_STD,yearsExperience_bins_MEAN,yearsExperience_bins_MEDIAN,yearsExperience_bins_SKEW
0,JOB1362684407687,COMP37,CFO,MASTERS,MATH,HEALTH,10,83,130,10+,...,36.865179,115.73554,114,0.321032,24,301,38.471103,126.159674,124,0.317313
1,JOB1362684407688,COMP19,CEO,HIGH_SCHOOL,NONE,WEB,3,73,101,3-5,...,38.165659,121.645362,119,0.347584,21,245,33.226156,99.051617,98,0.254207
2,JOB1362684407689,COMP52,VICE_PRESIDENT,DOCTORAL,PHYSICS,HEALTH,10,38,137,10+,...,36.865179,115.73554,114,0.321032,24,301,38.471103,126.159674,124,0.317313
3,JOB1362684407690,COMP38,MANAGER,DOCTORAL,CHEMISTRY,AUTO,8,17,142,5+,...,36.09159,109.435222,108,0.322708,21,261,34.11343,105.923943,105,0.254626
4,JOB1362684407691,COMP7,VICE_PRESIDENT,BACHELORS,PHYSICS,FINANCE,8,16,163,5+,...,38.31982,130.747659,128,0.344985,21,261,34.11343,105.923943,105,0.254626


In [7]:
test_df = new_features.merge_agg_cols(data.test_df, cat_cols)
test_df = new_features.update_dfs(test_df)
test_df.head()

Unnamed: 0,jobId,companyId,jobType,degree,major,industry,yearsExperience,milesFromMetropolis,yearsExperience_bins,"(salary, min)",...,industry_STD,industry_MEAN,industry_MEDIAN,industry_SKEW,yearsExperience_bins_MIN,yearsExperience_bins_MAX,yearsExperience_bins_STD,yearsExperience_bins_MEAN,yearsExperience_bins_MEDIAN,yearsExperience_bins_SKEW
0,JOB1362685407687,COMP33,MANAGER,HIGH_SCHOOL,NONE,HEALTH,22,73,10+,49,...,36.865179,115.73554,114,0.321032,24,301,38.471103,126.159674,124,0.317313
1,JOB1362685407688,COMP13,JUNIOR,NONE,NONE,AUTO,20,47,10+,31,...,36.09159,109.435222,108,0.322708,24,301,38.471103,126.159674,124,0.317313
2,JOB1362685407689,COMP10,CTO,MASTERS,BIOLOGY,HEALTH,17,9,10+,88,...,36.865179,115.73554,114,0.321032,24,301,38.471103,126.159674,124,0.317313
3,JOB1362685407690,COMP21,MANAGER,HIGH_SCHOOL,NONE,OIL,14,96,10+,62,...,38.528558,130.953863,128,0.358658,24,301,38.471103,126.159674,124,0.317313
4,JOB1362685407691,COMP36,JUNIOR,DOCTORAL,BIOLOGY,OIL,10,44,10+,69,...,38.528558,130.953863,128,0.358658,24,301,38.471103,126.159674,124,0.317313


In [8]:
train_df = new_features.get_dummies(train_df)
test_df = new_features.get_dummies(test_df)

## Creating train/test set for model and prediction

In [9]:
target_df = train_df.salary
feats = [f for f in train_df.columns if f not in ['salary', 'jobId', 'companyId']]
train = train_df[feats]
test = test_df[feats]

In [10]:
# trigger a manual garbage collection to clean up memory
del train_df, test_df
gc.collect()

259

## Define a cross validation strategy

In [11]:
def cross_validate(model, df, target_df, num_procs=2):
    kfolds = KFold(n_splits=5, shuffle=True)
    mean_mse = -1.0 * np.mean(cross_val_score(model, df, target_df, cv=kfolds, n_jobs=num_procs,
                                             scoring='neg_mean_squared_error'))
    return mean_mse  

## Base models

In [12]:
rfr = RandomForestRegressor(n_estimators=60, n_jobs=num_procs, max_depth=15, min_samples_split=80, \
                            max_features=8, verbose=verbose_lvl)

In [38]:
lightgbm = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, min_data_in_leaf =6)

In [14]:
xgboost = XGBRegressor(learning_rate=0.5,n_estimators=460,
                                     max_depth=3,
                                     gamma=0, subsample=0.7,
                                     colsample_bytree=0.7,
                                     objective='reg:linear')

## Base models scores

In [40]:
score = cross_validate(lightgbm, train, target_df)
print("lightgbm: {:.4f}\n".format(score))

lightgbm: 354.8683



In [16]:
score = cross_validate(xgboost, train, target_df)
print("xgboost: {:.4f}\n".format(score))

xgboost: 356.6496



In [25]:
score = cross_validate(rfr, train, target_df)
print("random_forest: {:.4f}\n".format(score))

random_forest: 372.2179



## Fit and score test data with best model (lgb_model)

In [17]:
lgb_model = lightgbm.fit(train, target_df)
lgb_model.predict(test)

array([111.65676773,  92.36309047, 179.76026234, ...,  56.49144468,
       161.66009947, 111.49867017])

## Hyperparameters with Random Search

For random (or grid search) the domain is called a hyperparameter grid and uses discrete values for the hyperparameters.

First, let's look at all of the hyperparamters that need to be tuned.

In [11]:
lgb.LGBMRegressor()

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

Based on the default values, can be constructed the following hyperparameter grid.

In [12]:
param_grid = {
    'class_weight': [None, 'balanced'],
    'boosting_type': ['gbdt', 'goss', 'dart'],
    'num_leaves': list(range(5, 50)),
    'learning_rate': list(np.logspace(np.log(0.005), np.log(0.2), base = np.exp(1), num = 1000)),
    'subsample_for_bin': list(range(20000, 300000, 20000)),
    'min_child_samples': list(range(20, 500, 5)),
    'reg_alpha': list(np.linspace(0, 1)),
    'reg_lambda': list(np.linspace(0, 1)),
    'colsample_bytree': list(np.linspace(0.6, 1, 10))
}

How can be sampled a set of hyperparameters from the grid using a dictionary comprehension.

In [13]:
params = {key: random.sample(value, 1)[0] for key, value in param_grid.items()}
params

{'class_weight': 'balanced',
 'boosting_type': 'gbdt',
 'num_leaves': 9,
 'learning_rate': 0.02126210003400329,
 'subsample_for_bin': 140000,
 'min_child_samples': 440,
 'reg_alpha': 0.18367346938775508,
 'reg_lambda': 0.36734693877551017,
 'colsample_bytree': 0.9111111111111111}

## Cross Validation with Early Stopping in LightGBM
The scikit-learn cross validation api does not include the option for early stopping. Therefore, it is possible to use the LightGBM cross validation function with 100 early stopping rounds. To use this function, it needed to be created a dataset from the features and labels.

In [14]:
train_set = lgb.Dataset(train, label=target_df)

The number of boosting rounds is very high, but it will not actually trained this many estimators because it is using early stopping to stop training when the validation score has not improved for 100 estimators.

In [15]:
cv_results = lgb.cv(params, train_set, num_boost_round = 10000, nfold = 10, metrics = 'mse', 
           early_stopping_rounds = 100, verbose_eval = 50, seed = 50)

[50]	cv_agg's l2: 611.755 + 2.09828
[100]	cv_agg's l2: 426.543 + 1.45515
[150]	cv_agg's l2: 377.265 + 1.08619
[200]	cv_agg's l2: 362.152 + 1.00083
[250]	cv_agg's l2: 357.247 + 1.00672
[300]	cv_agg's l2: 355.581 + 1.02677
[350]	cv_agg's l2: 354.989 + 1.05055
[400]	cv_agg's l2: 354.742 + 1.06201
[450]	cv_agg's l2: 354.615 + 1.0655
[500]	cv_agg's l2: 354.547 + 1.06749
[550]	cv_agg's l2: 354.499 + 1.06894
[600]	cv_agg's l2: 354.46 + 1.07097
[650]	cv_agg's l2: 354.424 + 1.07296
[700]	cv_agg's l2: 354.391 + 1.07467
[750]	cv_agg's l2: 354.368 + 1.08025
[800]	cv_agg's l2: 354.343 + 1.08144
[850]	cv_agg's l2: 354.324 + 1.08012
[900]	cv_agg's l2: 354.307 + 1.07894
[950]	cv_agg's l2: 354.289 + 1.08114
[1000]	cv_agg's l2: 354.274 + 1.08374
[1050]	cv_agg's l2: 354.261 + 1.08508
[1100]	cv_agg's l2: 354.251 + 1.08707
[1150]	cv_agg's l2: 354.24 + 1.08705
[1200]	cv_agg's l2: 354.231 + 1.09
[1250]	cv_agg's l2: 354.221 + 1.08866
[1300]	cv_agg's l2: 354.213 + 1.0895
[1350]	cv_agg's l2: 354.206 + 1.08806
[

In [17]:
optimum_boost_rounds = np.argmin(cv_results['l2-mean'])
print('Optimum boost rounds = {}'.format(optimum_boost_rounds))
print('Best LGBM CV result = {}'.format(np.min(cv_results['l2-mean']))) 

Optimum boost rounds = 2115
Best LGBM CV result = 354.16849782213546
