In [None]:
import pandas as pd
import numpy as np
# Load the engineered feature table; later cells use its 'color' column as the regression target.
df = pd.read_csv('../data/4_engineered/engineered_features.csv')

In [None]:
import random
from typing import Literal
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor
from sklearn.metrics import r2_score, mean_absolute_error as mae, mean_squared_error as mse
def baseline_dummy_metrics(df:pd.DataFrame, samples:int, strategy:Literal['median','mean'], random_state=None):
    '''Estimate baseline metric values for the dataset with sklearn's DummyRegressor.

    The 'color' column of ``df`` is the target and every other column is a
    feature. For each of ``samples`` random train/test splits (33% held out)
    a DummyRegressor is fitted on the training part and scored on the test part.

    Parameters
    ----------
    df : pd.DataFrame
        Data containing a 'color' column.
    samples : int
        Number of splits/fits to evaluate.
    strategy : {'median', 'mean'}
        Passed to ``DummyRegressor(strategy=...)``.
    random_state : int or None, default None
        Seed for the generator that draws the per-split seeds. With None the
        seeds are drawn from the global ``random`` module, exactly as before,
        so results are only reproducible if the caller seeded that module.

    Returns
    -------
    pd.DataFrame
        One row with the columns r2_score, r2_score_std, mae, mae_std, mse and
        mse_std: the mean and standard deviation of each metric over all fits.
    '''
    # Target and features do not change between fits, so build them once.
    y = df['color']
    X = df.drop('color', axis=1)

    # Both the module and a Random instance expose randint(); using the module
    # when no seed is given keeps the previous behaviour untouched.
    seed_source = random if random_state is None else random.Random(random_state)

    # Collect one record per fit and build the frame once at the end instead of
    # re-copying a growing DataFrame with pd.concat on every iteration.
    records = []
    for _ in range(samples):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.33, random_state=seed_source.randint(1, 1000)
        )

        dummy_regr = DummyRegressor(strategy=strategy)
        dummy_regr.fit(X_train, y_train)

        y_pred = dummy_regr.predict(X_test)
        records.append({
            'r2_score': r2_score(y_test, y_pred),
            'mae': mae(y_test, y_pred),
            'mse': mse(y_test, y_pred),
        })
    values = pd.DataFrame(records)

    # Interleave each metric's mean with its standard deviation.
    summary = {}
    for col in values.columns:
        summary[col] = values[col].mean()
        summary[f'{col}_std'] = values[col].std()
    return pd.DataFrame.from_records([summary])

# Repeat the dummy-regressor fit 10000 times with the 'median' strategy to get
# reference values (mean and std over the fits) for each metric.
baseline = baseline_dummy_metrics(df, 10000, 'median')
print('Baseline values for metrics')
baseline

### Selecting the K best features


In [None]:
from sklearn.feature_selection import SelectKBest, f_regression
# Feature matrix: every column except the regression target 'color'.
X = df.drop('color', axis=1)
y = df.color
# SelectKBest only masks columns and never reorders them, so keeping all
# features returns X unchanged. The ranking has to be applied explicitly from
# the fitted scores_.
selector = SelectKBest(f_regression, k='all')
selector.set_output(transform='pandas')
X_scored = selector.fit_transform(X, y)
# Order the columns from highest to lowest F-score so that a positional slice
# X_new.iloc[:, :n] holds the n best-scoring features. NaN scores sort last and
# the stable sort keeps tied features in their original order.
feature_rank = pd.Series(selector.scores_, index=X_scored.columns).sort_values(ascending=False, kind='stable')
X_new = X_scored[feature_rank.index]


In [None]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from tqdm import tqdm
# Shared seed so both tree-based estimators give the same results on every run.
random_state = 42
decision_tree = DecisionTreeRegressor(random_state=random_state)
random_forest = RandomForestRegressor(random_state=random_state)

# Scorers for cross-validation, keyed by metric name. greater_is_better=False
# makes sklearn negate the metric, so the 'mae' and 'mse' scores are negative
# numbers where a higher (closer to zero) value is better.
scorers = {
    'r2_score': make_scorer(r2_score, greater_is_better=True),
    'mae': make_scorer(mae, greater_is_better=False),
    # Must wrap the mean squared error; wrapping r2_score here would report
    # negated R^2 under the 'mse' label.
    'mse': make_scorer(mse, greater_is_better=False),
}

def search(estimator, range_time_split, range_features,scorers, X, y):
    '''Cross-validate ``estimator`` over a grid of split counts and feature counts.

    For every ``n_splits`` in ``range_time_split`` a ``TimeSeriesSplit`` is
    built, and for every ``n_features`` in ``range_features`` the estimator is
    cross-validated on the first ``n_features`` columns of ``X``.

    Parameters
    ----------
    estimator : sklearn estimator to evaluate.
    range_time_split : iterable of int, numbers of time-series splits to try.
    range_features : iterable of int, numbers of leading columns of X to try.
    scorers : dict mapping a metric name to a scorer.
    X : pd.DataFrame of features (sliced positionally with ``.iloc``).
    y : target values.

    Returns
    -------
    pd.DataFrame
        One row per (n_splits, n_features, scorer) with the columns
        model, n_features, n_splits, scorer and scores, where ``scores`` holds
        the array of per-fold scores. Empty when there is nothing to evaluate.
    '''
    if not scorers:
        # No metric requested: the per-scorer loop would produce no rows.
        return pd.DataFrame()

    # Imported here because the enclosing cell only imports cross_val_score.
    from sklearn.model_selection import cross_validate

    # Materialise once: this sequence is iterated again for every n_splits, so
    # a one-shot iterable would be exhausted after the first outer iteration.
    feature_counts = list(range_features)
    model_name = estimator.__class__.__name__

    # Gather plain records and build the frame once instead of concatenating
    # a growing DataFrame inside the loops.
    records = []
    for n_splits in tqdm(range_time_split):
        tss = TimeSeriesSplit(n_splits=n_splits)
        for n_features in tqdm(feature_counts, leave=False):
            # One cross-validation run evaluates every scorer on the same
            # fitted models, rather than refitting once per scorer.
            cv_results = cross_validate(estimator, X.iloc[:, :n_features], y, cv=tss, scoring=scorers)
            for scorer in scorers:
                records.append({
                    'model': model_name,
                    'n_features': n_features,
                    'n_splits': n_splits,
                    'scorer': scorer,
                    'scores': cv_results[f'test_{scorer}'],
                })
    return pd.DataFrame(records)

# Run the search for both estimators (2-5 time-series splits, first 2-15 columns
# of X_new) and stack the per-estimator result tables into one frame.
a = pd.concat([search(est, range(2,6), range(2, 16), scorers, X_new,y) for est in [decision_tree, random_forest]])


In [None]:
# Fit a decision tree on the last five rows and the first ten columns of X_new.
# X_new is a DataFrame (the selector is set to pandas output), so positional
# row/column slicing has to go through .iloc; X_new[-5:, :10] raises.
dt = DecisionTreeRegressor(random_state=random_state)
dt.fit(X_new.iloc[-5:, :10], y.iloc[-5:])

In [None]:
# Collapse each array of per-fold scores to its mean, then spread the scorers
# into columns: one row per (model, n_features, n_splits), stored in ascending
# r2_score order.
b = (
    a.assign(mean_scores=a.scores.apply(np.mean))
    .reset_index(drop=True)
    # A scalar `values` gives flat columns named after the scorers themselves,
    # so no positional renaming is needed and a changed scorer set cannot be
    # mislabelled.
    .pivot(index=['model', 'n_features', 'n_splits'], columns='scorer', values='mean_scores')
    .rename_axis(columns=None)
    .reset_index()
    .sort_values('r2_score')
    .reset_index(drop=True)
)
# Display the configurations from best to worst mean r2_score.
b.sort_values('r2_score', ascending=False)
