### Import engineered features

In [16]:
import pandas as pd
import numpy as np
df = pd.read_csv('../data/4_engineered/engineered_features.csv')



### Baseline model

In [17]:
import random
from typing import Literal
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor
from sklearn.metrics import r2_score, mean_absolute_error as mae, mean_squared_error as mse
def baseline_dummy_metrics(df:pd.DataFrame, samples:int, strategy:Literal['median','mean']):
    '''estimate baseline values for dataset using sklearn dummy regressor
    samples is the number of fits to be done

    returns
    -------
    df with r2_score, mean absolute error, mean squared error and std deviations respectively
    average for every fit 
    '''
    values = pd.DataFrame()
    # scale features using zscore
    X = df.drop('color', axis=1)
    X = stats.zscore(X)
    y = df['color']
    y = stats.zscore(y)
    for i in range(samples):
        
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=random.randint(1,1000))

        dummy_regr = DummyRegressor(strategy=strategy)
        dummy_regr.fit(X_train, y_train)
        
        y_pred = dummy_regr.predict(X_test)
        values = pd.concat([values, pd.DataFrame.from_records({'r2_score':[r2_score(y_test,y_pred)], 'mae':[mae(y_test,y_pred)], 'mse':[mse(y_test,y_pred)]})], ignore_index=True)
    _dict = {}
    for col in values.columns:
        _dict[col] = values[col].mean()
        _dict[f'{col}_std'] = values[col].std()
    return pd.DataFrame.from_records([_dict])

baseline = baseline_dummy_metrics(df, 5000, 'median')
print('Baseline values for metrics')
baseline

Baseline values for metrics


Unnamed: 0,mae,mae_std,mse,mse_std,r2_score,r2_score_std
0,0.510548,0.115872,1.193168,0.361584,-0.205402,0.066656


### Find outliers

In [18]:
from scipy import stats

def outliers_index(df, threshold=3):
#apply the z-score method and get abs 
    z_scores = np.abs(stats.zscore(df))
    threshold = 3
    outliers = df[z_scores > threshold]
    
    print(f'{len(outliers[outliers.notnull().any(axis=1)])} rows contain at least one outlier')
    print('Outlier ratio:', f'{len(outliers[outliers.notnull().any(axis=1)])/len(df):.2%}')
    return outliers.notnull().any(axis=1)
outliers = outliers_index(df)
df_no_out = df[~outliers]

31 rows contain at least one outlier
Outlier ratio: 20.67%


In [19]:
# train set
df = df.iloc[:-int(len(df)*0.2), :]
df_no_out = df_no_out.iloc[:-int(len(df_no_out)*0.2), :]
print('train set len with outliers:', len(df))
print('train len without outliers:', len(df_no_out))

# test set
test = df.iloc[-int(len(df)*0.2):, :]
test_no_out = df_no_out.iloc[-int(len(df_no_out)*0.2):, :]
print('\ntest set len with outliers:', len(test))
print('test len without outliers:', len(test_no_out))

test.to_csv('../data/7_model_specific_data_sets/test_with_outliers.csv', index=False)
test_no_out.to_csv('../data/7_model_specific_data_sets/test_without_outliers.csv', index=False)

train set len with outliers: 120
train len without outliers: 96

test set len with outliers: 24
test len without outliers: 19


### Selecting Kbest features


In [20]:
from sklearn.feature_selection import SelectKBest, f_regression
def select_k_best(df, name=None):
    X = df.drop(['job_id','color'], axis=1)
    y = df.color
    selector = SelectKBest(f_regression, k=len(X.columns))
    selector.set_output(transform='pandas')
    X_new = selector.fit_transform(X, y)
    selected = pd.concat([X_new, y.to_frame('color')], axis=1)
    
    #save keeping job_id to order in time properly
    df[['job_id']].join(selected).to_csv(f'../data/5_selected_Kbest/selected_Kbest_{name}.csv', index=False)
    return selected

Kbest = select_k_best(df, '')
Kbest_no_out = select_k_best(df, 'no_out')

### Model selection

In [None]:
from tqdm import tqdm
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.base import clone
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score

random_state = 42
models = []

models.append(LinearRegression())
models.append(Pipeline(steps=[('polyFeature',PolynomialFeatures() ),('regressor', LinearRegression())]))
models.append(Ridge(random_state=random_state))
models.append(Lasso(random_state=random_state))
models.append(ElasticNet(random_state=random_state))
models.append(SVR())
models.append(GradientBoostingRegressor(random_state=random_state))
models.append(DecisionTreeRegressor(random_state=random_state))
models.append(RandomForestRegressor(random_state=random_state))
models.append(LGBMRegressor(random_state=random_state))


scorers = dict()
scorers['r2_score'] = make_scorer(r2_score, greater_is_better=True)
scorers['mae'] = make_scorer(mae, greater_is_better=False)
scorers['mse'] = make_scorer(r2_score, greater_is_better=False)

def search(estimator, range_time_split, range_features,scorers, outliers,  X, y):
    df = pd.DataFrame()
    print(estimator.__class__.__name__)
    estimator1 = clone(estimator)
    # without outliers
    X = StandardScaler().fit_transform(X)
    y = StandardScaler().fit_transform(y.to_frame()).flatten()
    
    
    for n_splits in range_time_split:
        tss = TimeSeriesSplit(n_splits = n_splits)

        for n_features in range_features:
            for scorer in scorers:
                
                # with outliers
                scores = cross_val_score(estimator1, X[:, :n_features], y, cv=tss, scoring=scorers[scorer])
                _dict = {'model':[estimator.__class__.__name__], 'n_features':[n_features], 'n_splits':[n_splits], 'scorer':[scorer],'outliers':outliers, 'scores':[scores]}
                new =  pd.DataFrame(_dict)
                df = pd.concat([df, new], ignore_index=True)
                
    return df
outliers = outliers_index(df)
models_df = pd.concat([search(m, range(2,6), range(2, 16), scorers, 'yes',  Kbest.iloc[:,:-1], Kbest.color) for m in tqdm(models)])
models_no_out_df = pd.concat([search(m, range(2,6), range(2, 16), scorers, 'no',  Kbest_no_out.iloc[:,:-1], Kbest_no_out.color) for m in tqdm(models)])


### Save results

In [26]:
a = pd.concat([models_df, models_no_out_df], ignore_index=True)
a.to_csv('../data/6_model_selection/models.csv', index=False)
baseline


Unnamed: 0,mae,mae_std,mse,mse_std,r2_score,r2_score_std
0,0.510548,0.115872,1.193168,0.361584,-0.205402,0.066656


In [31]:
b = a.assign(mean_scores=a.scores.apply(np.mean)).reset_index(drop=True)
b = b.pivot(index=['model','n_features', 'outliers', 'n_splits',], columns='scorer', values=['mean_scores']).reset_index().sort_values(('mean_scores', 'r2_score')).reset_index(drop=True)
b.columns = ['model', 'n_features', 'outliers', 'n_splits', 'mae', 'mse','r2_score']
b = b.assign(**{i:b[i].abs() for i in ['mae', 'mse', ]})
b.to_csv('../data/6_model_selection/pivoted_models.csv')
pd.options.display.float_format = '{:.4f}'.format



#### Best model by mae

DecisionTree with 2 features, 5 splits and outliers


In [32]:
b.sort_values('mae').head(3)

Unnamed: 0,model,n_features,outliers,n_splits,mae,mse,r2_score
609,DecisionTreeRegressor,2,no,4,0.6645,2.0237,-2.0237
608,DecisionTreeRegressor,2,yes,4,0.6645,2.0237,-2.0237
595,DecisionTreeRegressor,3,yes,5,0.6656,3.0723,-3.0723


#### Best model by r2_score 
GradientBoostingRegressor with no outliers, 3 splits and 9 features

In [36]:
b.sort_values('r2_score', ascending=False).head(10)

Unnamed: 0,model,n_features,outliers,n_splits,mae,mse,r2_score
1119,LGBMRegressor,15,no,3,0.7966,0.1016,-0.1016
1118,LGBMRegressor,15,yes,3,0.7966,0.1016,-0.1016
1117,LGBMRegressor,12,yes,3,0.8,0.1063,-0.1063
1116,LGBMRegressor,12,no,3,0.8,0.1063,-0.1063
1115,LGBMRegressor,11,yes,3,0.8018,0.1149,-0.1149
1114,LGBMRegressor,11,no,3,0.8018,0.1149,-0.1149
1113,LGBMRegressor,13,yes,3,0.8017,0.116,-0.116
1112,LGBMRegressor,14,no,3,0.8017,0.116,-0.116
1111,LGBMRegressor,14,yes,3,0.8017,0.116,-0.116
1110,LGBMRegressor,13,no,3,0.8017,0.116,-0.116


#### Best model by mse
Since GradientBoostingRegressor with no outliers, 3 splits and 9 features was also the best in this metric, Lasso with 5 features, 2 splits and outliers will be chosen

In [34]:
b.sort_values('mse',).head(3)

Unnamed: 0,model,n_features,outliers,n_splits,mae,mse,r2_score
1119,LGBMRegressor,15,no,3,0.7966,0.1016,-0.1016
1118,LGBMRegressor,15,yes,3,0.7966,0.1016,-0.1016
1116,LGBMRegressor,12,no,3,0.8,0.1063,-0.1063
