### Chosen models
- DecisionTree with 2 features, 5 splits and outliers
- GradientBoostingRegressor with no outliers, 3 splits and 9 features
- Lasso with 5 features, 2 splits and outliers

In [1]:
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import GradientBoostingRegressor

### Load data
> Note: Best features are in order according to importance

In [2]:
from scipy import stats
import pandas as pd
import numpy as np

df = pd.read_csv('../data/5_selected_Kbest/selected_Kbest.csv').sort_values('job_id').drop('job_id', axis=1)

def outliers_index(df, threshold=3):
    z_scores = np.abs(stats.zscore(df))
    threshold = 3
    outliers = df[z_scores > threshold]
    
    print(f'{len(outliers[outliers.notnull().any(axis=1)])} rows contain at least one outlier')
    print('Outlier ratio:', f'{len(outliers[outliers.notnull().any(axis=1)])/len(df):.2%}')
    return outliers.notnull().any(axis=1)
outliers = outliers_index(df)


31 rows contain at least one outlier
Outlier ratio: 20.67%


### Decision Tree
2 features, 5 splits and outliers
> best params: {'max_depth': 1, 'max_leaf_nodes': None, 'min_samples_leaf': 5, 'min_samples_split': 5, 'min_weight_fraction_leaf': 0.1, 'splitter': 'best'}

In [3]:
random_state = 42

dt_parameters={"splitter": ["best","random"],
               "max_depth": [1,3,5,7,9,11,12],
               "min_samples_leaf": [5,7,10,15,20],
               "min_samples_split": [5,10,15,20,30,40,50],
               "min_weight_fraction_leaf": [0.1,0.2,0.3,0.5],
               "max_leaf_nodes": [None,10,20,30,40]}

dt_tss = TimeSeriesSplit(n_splits=5)

dt_scaler = StandardScaler()
dt_X = dt_scaler.fit_transform(df.iloc[:, :2])
dt_scaled = dt_scaler.fit_transform(df)
dt_X = dt_scaled[:,:2]
dt_y = dt_scaled[:,-1]

dt_tuning_model = GridSearchCV(estimator=DecisionTreeRegressor(random_state=random_state),
                               param_grid=dt_parameters,
                               scoring='neg_mean_squared_error',
                               cv=dt_tss, verbose=1, n_jobs=-1)

dt_tuning_model.fit(dt_X, dt_y)
dt_tuning_model.best_params_

Fitting 5 folds for each of 9800 candidates, totalling 49000 fits


{'max_depth': 1,
 'max_leaf_nodes': None,
 'min_samples_leaf': 5,
 'min_samples_split': 5,
 'min_weight_fraction_leaf': 0.1,
 'splitter': 'best'}

### Lasso
5 features, 2 splits and outliers

> best params: {'alpha': 0.07, 'max_iter': 100, 'tol': 0.05}

In [4]:
lasso_parameters={"alpha": [0.005, 0.02, 0.03, 0.05, 0.06, 0.07, 0.1, 0.2, 0.3, 0.5, 0.6, 0.7],
                  "max_iter": [100, 200, 500, 1000],
                  "tol": [1e-2, 5e-2, 1e-3, 5e-3, 1e-4]
                  }

lasso_tss = TimeSeriesSplit(n_splits=2)
lasso_scaler = StandardScaler()
lasso_scaled = lasso_scaler.fit_transform(df)
lasso_X = lasso_scaled[:,:5]
lasso_y = lasso_scaled[:,-1]
lasso_tuning_model = GridSearchCV(estimator=Lasso(random_state=random_state),
                                  param_grid=lasso_parameters,
                                  scoring='neg_mean_squared_error',
                                  cv=lasso_tss, verbose=1,  n_jobs=-1)
lasso_tuning_model.fit(lasso_X, lasso_y)
lasso_tuning_model.best_params_

Fitting 2 folds for each of 240 candidates, totalling 480 fits


{'alpha': 0.2, 'max_iter': 100, 'tol': 0.01}

### GradientBoostedTree
no outliers, 3 splits and 9 features

In [5]:
gbt_parameters={"alpha": [0.005, 0.03, 0.06, 0.07, 0.1, 0.2,],
                "learning_rate" : [1e-3,1e-2, 1e-1],
                "n_estimators": [100, 200, 500, 1000],
                "min_samples_leaf": [7,10, 20, 30],
                "min_samples_split": [10,15,20,30,40],
                "min_weight_fraction_leaf": [0.1,0.2,0.3,0.5,],
                "tol": [1e-2, 1e-3, 1e-4],
                "max_leaf_nodes": [None,10,20,30],
                "ccp_alpha": [2e-3,1e-2,25e-3],
                  }

gbt_tss = TimeSeriesSplit(n_splits=3)

gbt_scaler = StandardScaler()
gbt_scaled = gbt_scaler.fit_transform(df[~outliers])
gbt_X = gbt_scaled[:,:9]
gbt_y = gbt_scaled[:, -1]

gbt_tuning_model = GridSearchCV(estimator=GradientBoostingRegressor(random_state=random_state),
                                param_grid=gbt_parameters,
                                scoring='neg_mean_squared_error',
                                cv=gbt_tss, verbose=1, n_jobs=-1)

gbt_tuning_model.fit(gbt_X, gbt_y)
gbt_tuning_model.best_params_

Fitting 3 folds for each of 207360 candidates, totalling 622080 fits
