### Load selected rows

In [None]:
import pandas as pd
import numpy as np
# Load the previously selected rows; the path is relative to the current working directory.
df = pd.read_csv('../data/selected/selected.csv')
# Summary statistics of the loaded frame, as a quick sanity check.
df.describe()

In [None]:
import random
from typing import Literal
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor
from sklearn.metrics import r2_score, mean_absolute_error as mae, mean_squared_error as mse
def baseline_dummy_metrics(df:pd.DataFrame, samples:int, strategy:Literal['median','mean']):
    '''Estimate baseline metric values for the dataset with sklearn's DummyRegressor.

    The ``'color'`` column of ``df`` is used as the target and every other
    column as a feature. For each of ``samples`` repetitions the data is split
    randomly (33% held out for testing), a ``DummyRegressor`` with the given
    ``strategy`` is fitted on the train part and scored on the test part.

    parameters
    ----------
    df : frame that contains a ``'color'`` column
    samples : number of random splits / fits to be done
    strategy : strategy passed to ``DummyRegressor``

    returns
    -------
    single-row df with the columns ``r2_score``, ``r2_score_std``, ``mae``,
    ``mae_std``, ``mse`` and ``mse_std``: the mean and the standard deviation
    (pandas ``Series.std``) of every metric over all fits.
    If ``samples`` is 0 the returned frame has no columns.
    '''
    # Target and features do not depend on the split, so build them only once.
    y = df['color']
    X = df.drop('color', axis=1)

    # Collect plain dicts and build the frame once at the end; growing a
    # DataFrame with pd.concat inside the loop copies all previous rows on
    # every iteration.
    records = []
    for _ in range(samples):
        # Draw the seed from the whole range accepted by numpy's RandomState.
        # A small range such as 1..1000 would make most splits exact repeats
        # of earlier ones once `samples` gets large.
        seed = random.randrange(2**32)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=seed)

        dummy_regr = DummyRegressor(strategy=strategy)
        dummy_regr.fit(X_train, y_train)

        y_pred = dummy_regr.predict(X_test)
        records.append({
            'r2_score': r2_score(y_test, y_pred),
            'mae': mae(y_test, y_pred),
            'mse': mse(y_test, y_pred),
        })
    values = pd.DataFrame.from_records(records)

    # Flatten into one row: <metric>, <metric>_std for every metric.
    summary = {}
    for col in values.columns:
        summary[col] = values[col].mean()
        summary[f'{col}_std'] = values[col].std()
    return pd.DataFrame.from_records([summary])

# Baseline to compare real models against: 10000 dummy fits that use the 'median' strategy.
baseline = baseline_dummy_metrics(df, 10000, 'median')
print('Baseline values for metrics')
# Bare expression so the resulting frame is shown as the cell output.
baseline

### Split data into features and target

In [None]:

# Target vector: the 'color' column.
y = df['color']
# Feature matrix: every remaining column.
X = df.drop(columns='color')

### Selecting columns by correlation with target



In [None]:
def correlation_contest(df:pd.DataFrame, agg:Literal['sum', 'mean'], target:str='color')->pd.DataFrame:
    methods = ['pearson', 'spearman', 'kendall']
    corr = pd.concat([df.corr(method)[target].abs().to_frame(method) for method in methods], axis=1)
    corr = (corr.assign(**{agg:getattr(corr, agg)(axis=1)})
                .reset_index(names='columns')
                .sort_values(agg, ascending=False)
                .reset_index(drop=True)
                )
    return corr.query(f'columns != "{target}"').style.background_gradient(cmap='Greens', vmin=0, vmax=1, subset=methods + [agg])

# Rank every column against the default target ('color') by the summed absolute correlations.
correlation_contest(df, 'sum')

In [None]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

# Evaluate a decision tree with a 2-split TimeSeriesSplit cross-validation.
regressor = DecisionTreeRegressor(random_state=10)
tss = TimeSeriesSplit(n_splits=2)

# With greater_is_better=False make_scorer flips the sign of the metric, so
# 'mae' and 'mse' are reported as negative values (closer to 0 is better).
scorers = dict()
scorers['r2_score'] = make_scorer(r2_score, greater_is_better=True)
scorers['mae'] = make_scorer(mae, greater_is_better=False)
# The 'mse' scorer must wrap the mean squared error, not r2_score.
scorers['mse'] = make_scorer(mse, greater_is_better=False)

# Print the mean score and the per-fold scores for every metric.
for s in scorers:
    cvs = cross_val_score(regressor, X, y, cv=tss, scoring=scorers[s])
    print(s, 'Mean:', cvs.mean())
    print(cvs)