In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import math
import pandas as pd

In [3]:
train_features_df = pd.read_csv("train_features.csv")
train_labels_df = pd.read_csv("train_salaries.csv")
test_features_df = pd.read_csv("test_features.csv")

In [4]:
train_features_df.shape, train_labels_df.shape, test_features_df.shape

((1000000, 8), (1000000, 2), (1000000, 8))

In [5]:
train_df = pd.merge(train_features_df, train_labels_df, on='jobId')

train_df.shape

(1000000, 9)

In [6]:
train_df.columns

Index(['jobId', 'companyId', 'jobType', 'degree', 'major', 'industry',
       'yearsExperience', 'milesFromMetropolis', 'salary'],
      dtype='object')

In [7]:
test_features_df.columns

Index(['jobId', 'companyId', 'jobType', 'degree', 'major', 'industry',
       'yearsExperience', 'milesFromMetropolis'],
      dtype='object')

### Concatenate test and train dataframes to apply same operations/transformations on them

In [8]:
train_df["Type"] = "TRAIN"

test_features_df["salary"] = 0
test_features_df["Type"] = "TEST"

assert train_df.shape == test_features_df.shape

In [9]:
train_df.columns, test_features_df.columns

(Index(['jobId', 'companyId', 'jobType', 'degree', 'major', 'industry',
        'yearsExperience', 'milesFromMetropolis', 'salary', 'Type'],
       dtype='object'),
 Index(['jobId', 'companyId', 'jobType', 'degree', 'major', 'industry',
        'yearsExperience', 'milesFromMetropolis', 'salary', 'Type'],
       dtype='object'))

In [10]:
concat_df = pd.concat([train_df, test_features_df])

concat_df.shape

(2000000, 10)

### Create one-hot vectors for categorical variables

In [11]:
categorical_variables = ['jobType', 'degree', 'major', 'industry']

expanded_df = pd.get_dummies(concat_df, columns=categorical_variables)

In [12]:
expanded_df.head()

Unnamed: 0,jobId,companyId,yearsExperience,milesFromMetropolis,salary,Type,jobType_CEO,jobType_CFO,jobType_CTO,jobType_JANITOR,...,major_MATH,major_NONE,major_PHYSICS,industry_AUTO,industry_EDUCATION,industry_FINANCE,industry_HEALTH,industry_OIL,industry_SERVICE,industry_WEB
0,JOB1362684407687,COMP37,10,83,130,TRAIN,0,1,0,0,...,1,0,0,0,0,0,1,0,0,0
1,JOB1362684407688,COMP19,3,73,101,TRAIN,1,0,0,0,...,0,1,0,0,0,0,0,0,0,1
2,JOB1362684407689,COMP52,10,38,137,TRAIN,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
3,JOB1362684407690,COMP38,8,17,142,TRAIN,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,JOB1362684407691,COMP7,8,16,163,TRAIN,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0


In [13]:
expanded_df.columns

Index(['jobId', 'companyId', 'yearsExperience', 'milesFromMetropolis',
       'salary', 'Type', 'jobType_CEO', 'jobType_CFO', 'jobType_CTO',
       'jobType_JANITOR', 'jobType_JUNIOR', 'jobType_MANAGER',
       'jobType_SENIOR', 'jobType_VICE_PRESIDENT', 'degree_BACHELORS',
       'degree_DOCTORAL', 'degree_HIGH_SCHOOL', 'degree_MASTERS',
       'degree_NONE', 'major_BIOLOGY', 'major_BUSINESS', 'major_CHEMISTRY',
       'major_COMPSCI', 'major_ENGINEERING', 'major_LITERATURE', 'major_MATH',
       'major_NONE', 'major_PHYSICS', 'industry_AUTO', 'industry_EDUCATION',
       'industry_FINANCE', 'industry_HEALTH', 'industry_OIL',
       'industry_SERVICE', 'industry_WEB'],
      dtype='object')

In [14]:
transformed_train_df = expanded_df[expanded_df['Type'] == 'TRAIN']
transformed_test_df = expanded_df[expanded_df['Type'] == 'TEST']

assert transformed_train_df.shape == transformed_test_df.shape

### Prepare data

In [15]:
target = transformed_train_df['salary']

transformed_train_df.drop(['jobId', 'companyId', 'salary', 'Type',], axis=1, inplace=True)

transformed_test_df.drop(['jobId', 'companyId', 'salary', 'Type',], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [16]:
target.shape, transformed_train_df.shape

((1000000,), (1000000, 31))

In [17]:
assert transformed_test_df.shape == transformed_train_df.shape

# Model Fitting

In [18]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [19]:
X_train, X_test, y_train, y_test = train_test_split(transformed_train_df, target, test_size=0.10, random_state=42)

In [20]:
X_train.shape, X_test.shape

((900000, 31), (100000, 31))

In [21]:
def rmse(model):
    y_preds = model.predict(X_train)
    return math.sqrt(mean_squared_error(y_train, y_preds))

## skleran's liner models

### Ridge Regression

In [22]:
from sklearn.linear_model import Ridge

In [23]:
ridge_regression = Ridge()
ridge_regression.fit(X_train, y_train)

rmse(ridge_regression)

19.607111489864252

### SGD Regresson

In [24]:
from sklearn.linear_model import SGDRegressor

In [25]:
sgd_regression = SGDRegressor(alpha=0.01, l1_ratio=.8, random_state=42)
sgd_regression.fit(X_train, y_train)

rmse(sgd_regression)

21.60568612433875

### ElasticNet Model

In [26]:
from sklearn.linear_model import ElasticNet

In [27]:
elasticnet_model = ElasticNet(alpha=0.01, l1_ratio=.8, random_state=42)
elasticnet_model.fit(X_train, y_train)

rmse(elasticnet_model)

19.6127677060746

## sklearn's ensemble methods

### RandomForestRegressor

In [28]:
from sklearn.ensemble import RandomForestRegressor

In [29]:
random_forest_model = RandomForestRegressor(n_estimators=100, min_samples_leaf=8, 
                                            n_jobs=-1, random_state=42)
random_forest_model.fit(X_train, y_train)

rmse(random_forest_model)

16.969353686785638

### GradientBoostingRegressor

In [30]:
from sklearn.ensemble import GradientBoostingRegressor

In [31]:
gradient_boosting_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, 
                                                    min_samples_leaf=8, random_state=42, alpha=0.01)

gradient_boosting_model.fit(X_train, y_train)
rmse(gradient_boosting_model)

19.36082115058263

## XGBRegressor

In [32]:
import xgboost as xgb

In [33]:
model_xgb = xgb.XGBRegressor(learning_rate=0.1, booster="gbtree", n_estimators=100,
                             reg_alpha=0.5, reg_lambda=0.6, nthread=-1)

model_xgb.fit(X_train, y_train)
rmse(model_xgb)

18.841844966259064

## LGBMRegressor

In [34]:
import lightgbm as lgb

In [35]:
lgbm_regressor = lgb.LGBMRegressor(objective='regression', learning_rate=0.1, n_estimators=100,
                                  reg_alpha=0.5, reg_lambda=0.6, random_state=42)

lgbm_regressor.fit(X_train, y_train)
rmse(lgbm_regressor)

18.88309735742432

# Computing RMSE on X_test

In [36]:
ridge_predictions = ridge_regression.predict(X_test)
sgd_predictions = sgd_regression.predict(X_test)
enet_predictions = elasticnet_model.predict(X_test)
random_forest_predictions = random_forest_model.predict(X_test)
gboost_predictions = gradient_boosting_model.predict(X_test)
xgb_predictions = model_xgb.predict(X_test)
lgb_predictions = lgbm_regressor.predict(X_test)

In [37]:
def rmse_test(y_preds):
    return math.sqrt(mean_squared_error(y_test, y_preds))

models = ["RidgeRegressor", "SGDRegressor", "ElasticNet", "RandomForestRegressor", 
          "GradientBoostingRegressor", "XGBRegressor", "LGBMRegressor"]

errors = [rmse_test(ridge_predictions), rmse_test(sgd_predictions), rmse_test(enet_predictions), 
          rmse_test(random_forest_predictions), rmse_test(gboost_predictions), rmse_test(xgb_predictions), 
          rmse_test(lgb_predictions)]

test_errors = pd.DataFrame(errors, index=models)

test_errors.transpose()

Unnamed: 0,RidgeRegressor,SGDRegressor,ElasticNet,RandomForestRegressor,GradientBoostingRegressor,XGBRegressor,LGBMRegressor
0,19.613244,21.595582,19.617787,19.342647,19.377137,18.916082,18.931015


# Predictions on X_test

In [38]:
predict_df = pd.DataFrame({'Actual':y_test, 'RidgeRegressor':ridge_predictions, 'SGDRegressor': sgd_predictions, 
                           'ElasticNet': enet_predictions, "RandomForestRegressor": random_forest_predictions,
                          'XGBRegressor': xgb_predictions, 'LGBMRegressor': lgb_predictions})

In [39]:
predict_df.head(n=10)

Unnamed: 0,Actual,RidgeRegressor,SGDRegressor,ElasticNet,RandomForestRegressor,XGBRegressor,LGBMRegressor
987231,174,142.228033,135.351615,141.73269,133.538177,142.688477,139.682132
79954,58,72.680789,64.553539,73.29963,67.589279,71.585991,67.029861
567130,168,176.299473,168.984249,175.676802,185.614351,183.475952,183.257872
500891,85,110.078304,95.739447,109.460393,122.551715,116.186844,116.387901
55399,145,127.89356,117.632057,128.391945,125.585016,125.269218,121.301348
135049,135,125.345044,116.396483,125.248924,124.282562,126.116043,126.010782
733378,153,123.805822,115.255575,123.286253,123.510449,117.049805,124.201465
732057,82,78.012637,66.004862,77.824963,88.957241,88.854866,88.736054
51333,120,121.577242,112.323047,121.054454,117.691702,115.250198,116.326451
731479,147,134.433864,117.97726,133.784978,130.729104,128.22467,132.828906
