### Chosen models
- DecisionTree with 2 features, 5 splits and outliers
- GradientBoostingRegressor with no outliers, 3 splits and 9 features
- Lasso with 5 features, 2 splits and outliers

In [13]:
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor

### Load data
> Note: Best features are in order according to importance

In [14]:
from scipy import stats
import pandas as pd
import numpy as np

df = pd.read_csv('../data/5_selected_Kbest/selected_Kbest.csv').sort_values('job_id').drop('job_id', axis=1)

def outliers_index(df, threshold=3):
    z_scores = np.abs(stats.zscore(df))
    threshold = 3
    outliers = df[z_scores > threshold]
    
    print(f'{len(outliers[outliers.notnull().any(axis=1)])} rows contain at least one outlier')
    print('Outlier ratio:', f'{len(outliers[outliers.notnull().any(axis=1)])/len(df):.2%}')
    return outliers.notnull().any(axis=1)
outliers = outliers_index(df)

df_no_out = df[~outliers]

31 rows contain at least one outlier
Outlier ratio: 20.67%


### hold some rows for testing

In [15]:
# train set
df = df.iloc[:-int(len(df)*0.2), :]
df_no_out = df_no_out.iloc[:-int(len(df_no_out)*0.2), :]
print('train set len with outliers:', len(df))
print('train len without outliers:', len(df_no_out))

# test set
test = df.iloc[-int(len(df)*0.2):, :]
test_no_out = df_no_out.iloc[-int(len(df_no_out)*0.2):, :]
print('\ntest set len with outliers:', len(test))
print('test len without outliers:', len(test_no_out))

test.to_csv('../data/7_model_specific_data_sets/test_with_outliers.csv', index=False)
test_no_out.to_csv('../data/7_model_specific_data_sets/test_without_outliers.csv', index=False)

train set len with outliers: 120
train len without outliers: 96

test set len with outliers: 24
test len without outliers: 19


### LGBM

In [16]:

random_state = 42

lgbm_parameters={"reg_alpha": [0.005, 0.03, 0.07, 0.1, 0.2,],
                 "reg_lambda": [0.005, 0.03, 0.07, 0.1, 0.2,],
                 "learning_rate" : [1e-3,1e-2, 1e-1],
                 "n_estimators": [ 100, 200, 500],
                 "num_leaves": [8, 16, 32],
                 "max_depth": [2,3,-1],
                
                  }

lgbm_tss = TimeSeriesSplit(n_splits=3)

lgbm_scaler = StandardScaler()


lgbm_scaled = lgbm_scaler.fit_transform(df)

lgbm_X = lgbm_scaled[:,:15]
lgbm_y = lgbm_scaled[:, -1:]

# lgbm_tuning_model = GridSearchCV(estimator=LGBMRegressor(random_state=random_state),
#                                  param_grid=lgbm_parameters,
#                                  scoring='neg_mean_squared_error',
#                                  cv=lgbm_tss, verbose=3, n_jobs=-1, error_score='raise')

# lgbm_tuning_model.fit(lgbm_X, lgbm_y)
# lgbm_tuning_model.best_params_



### Decision Tree
2 features, 5 splits and outliers
> best params: {'max_depth': 1, 'max_leaf_nodes': None, 'min_samples_leaf': 5, 'min_samples_split': 5, 'min_weight_fraction_leaf': 0.1, 'splitter': 'best'}

In [18]:
random_state = 42

dt_parameters={"splitter": ["best","random"],
               "max_depth": [1,3,5,7,9,11,12],
               "min_samples_leaf": [5,7,10,15,20],
               "min_samples_split": [5,10,15,20,30,40,50],
               "min_weight_fraction_leaf": [0.1,0.2,0.3,0.5],
               "max_leaf_nodes": [None,10,20,30,40]}

dt_tss = TimeSeriesSplit(n_splits=5)

dt_scaler = StandardScaler()

dt_X = dt_scaler.fit_transform(df.iloc[:, :2])
dt_scaled = dt_scaler.fit_transform(df)

dt_X = dt_scaled[:,:2]
dt_y = dt_scaled[:,-1]

# dt_tuning_model = GridSearchCV(estimator=DecisionTreeRegressor(random_state=random_state),
#                                param_grid=dt_parameters,
#                                scoring='neg_mean_squared_error',
#                                cv=dt_tss, verbose=1, n_jobs=-1)

# dt_tuning_model.fit(dt_X, dt_y)
# dt_tuning_model.best_params_

### Lasso
5 features, 2 splits and outliers

> best params: {'alpha': 0.07, 'max_iter': 100, 'tol': 0.05}

In [19]:
lasso_parameters={"alpha": [0.005, 0.02, 0.03, 0.05, 0.06, 0.07, 0.1, 0.2, 0.3, 0.5, 0.6, 0.7],
                  "max_iter": [100, 200, 500, 1000],
                  "tol": [1e-2, 5e-2, 1e-3, 5e-3, 1e-4]
                  }

lasso_tss = TimeSeriesSplit(n_splits=2)

lasso_scaler = StandardScaler()
lasso_scaled = lasso_scaler.fit_transform(df)

lasso_X = lasso_scaled[:,:5]
lasso_y = lasso_scaled[:,-1]

# lasso_tuning_model = GridSearchCV(estimator=Lasso(random_state=random_state),
#                                   param_grid=lasso_parameters,
#                                   scoring='neg_mean_squared_error',
#                                   cv=lasso_tss, verbose=1,  n_jobs=-1)
# lasso_tuning_model.fit(lasso_X, lasso_y)
# lasso_tuning_model.best_params_

### GradientBoostedTree
no outliers, 3 splits and 9 features - Training 2h 

>{'alpha': 0.005,
> 'ccp_alpha': 0.025,
> 'learning_rate': 0.1,
> 'max_leaf_nodes': None,
> 'min_samples_leaf': 10,
> 'min_samples_split': 30,
> 'min_weight_fraction_leaf': 0.1,
> 'n_estimators': 500,
> 'tol': 0.01}

In [20]:

gbt_parameters={"alpha": [0.005, 0.03, 0.07, 0.1,],
                "learning_rate" : [1e-3,1e-2, 1e-1],
                "n_estimators": [ 100, 200],
                "min_samples_leaf": [10, 30],
                
                "min_weight_fraction_leaf": [0.1,0.2,0.5,],
                "tol": [1e-2, 1e-3, 1e-4],
                "max_leaf_nodes": [None,4,8,16],
                "ccp_alpha": [2e-3,1e-2,25e-3],
                  }

gbt_tss = TimeSeriesSplit(n_splits=3)

gbt_scaler = StandardScaler()
gbt_scaled = gbt_scaler.fit_transform(df_no_out)

gbt_X = gbt_scaled[:,:9]
gbt_y = gbt_scaled[:, -1]

# gbt_tuning_model = GridSearchCV(estimator=GradientBoostingRegressor(random_state=random_state),
#                                 param_grid=gbt_parameters,
#                                 scoring='neg_mean_squared_error',
#                                 cv=gbt_tss, verbose=1, n_jobs=-1)

# gbt_tuning_model.fit(gbt_X, gbt_y)
# gbt_tuning_model.best_params_

### Train with best parameters

In [21]:
random_state = 42
l = {'alpha': 0.2, 'max_iter': 100, 'tol': 0.01}
d = {'max_depth': 1,
     'max_leaf_nodes': None,
     'min_samples_leaf': 5,
     'min_samples_split': 50,
     'min_weight_fraction_leaf': 0.5,
     'splitter': 'best'}
g = {'alpha': 0.005,
     'ccp_alpha': 0.025,
     'learning_rate': 0.1,
     'max_leaf_nodes': None,
     'min_samples_leaf': 10,
     'min_samples_split': 30,
     'min_weight_fraction_leaf': 0.1,
     'n_estimators': 500,
     'tol': 0.01}
gbt = GradientBoostingRegressor(**g, random_state=random_state).fit(gbt_X, gbt_y)
lasso = Lasso(**l, random_state=random_state).fit(lasso_X, lasso_y)
dt = DecisionTreeRegressor(**d, random_state=random_state).fit(dt_X, dt_y)

In [None]:
lg = {'learning_rate': 0.1,
 'max_depth': 3,
 'n_estimators': 500,
 'num_leaves': 16,
 'reg_alpha': 0.2,
 'reg_lambda': 0.005}
lgbm = LGBMRegressor(**lg, random_state=random_state).fit(lgbm_X, lgbm_y)

In [None]:
import pickle
def train_with_n_features(estimator, n_features, df):
    TimeSeriesSplit(n_splits=3)

    scaler = StandardScaler()
    scaled = scaler.fit_transform(df)
    
    X = scaled[:,:n_features]
    y = scaled[:, -1]
    estimator.fit(X, y)
    if estimator.__class__.__name__ == "LGBMRegressor":
        estimator.booster_.save_model(f'../models/n_features_test/{estimator.__class__.__name__}_{n_features}.text')
    else:
        with open(f'../models/n_features_test/{estimator.__class__.__name__}_{n_features}.pkl','wb') as file:
            pickle.dump(estimator, file)
    
    # save scalers
    with open(f'../models/scalers/{estimator.__class__.__name__}_{n_features}_scaler.pkl','wb') as file:
        pickle.dump(scaler, file)

for i in [2,4,5,8,10,12,15]:
    for model in [gbt, lasso, dt, lgbm]:
        train_with_n_features(model, i, df)


### Save models and datasets 

In [24]:

import pickle
_m = {'gbt':[gbt, gbt_scaler], 'lasso':[lasso, lasso_scaler], 'dt':[dt, dt_scaler, dt_scaler], 'lgbm':[lgbm, lgbm_scaler]}

for i in _m:
    #save models
    with open(f'../models/{i}.pkl','wb') as file:
        pickle.dump(_m[i][0], file)
    
    # save scalers
    with open(f'../models/scalers/{i}_scaler.pkl','wb') as file:
        pickle.dump(_m[i][1], file)
lgbm.booster_.save_model('../models/lgbm.txt')

# datasets
lgbm_scaled = pd.DataFrame(lgbm_scaled, columns=df.columns)[list(df.columns[:9]) + ['color']]
lgbm_scaled.to_csv('../data/7_model_specific_data_sets/gbt_scaled.csv', index=False)

gbt_scaled = pd.DataFrame(gbt_scaled, columns=df.columns)[list(df.columns[:9]) + ['color']]
gbt_scaled.to_csv('../data/7_model_specific_data_sets/gbt_scaled.csv', index=False)

lasso_scaled = pd.DataFrame(lasso_scaled, columns=df.columns)[list(df.columns[:5]) + ['color']]
lasso_scaled.to_csv('../data/7_model_specific_data_sets/lasso_scaled.csv', index=False)

dt_scaled = pd.DataFrame(dt_scaled, columns=df.columns)[list(df.columns[:2]) + ['color']]
dt_scaled.to_csv('../data/7_model_specific_data_sets/dt_scaled.csv', index=False)



