### Load selected rows

In [None]:
import pandas as pd
import numpy as np
import sys
sys.path.append('..')
# Read the rows chosen in the selection step and show their summary statistics.
selected_path = '../data/3_selected/selected.csv'
df = pd.read_csv(selected_path)
df.describe()


### Load features from dropped columns (product, date_time)
- day of week has moderate positive correlation (pearson)
- day of month has high correlation with 1st malt color (pearson, spearman)
- last_was_hnk and shift have some correlation to feature (spearman)


In [None]:
from src.load import dropped_columns_features
from src.corr import plot_correlation_bars
import seaborn as sns
import matplotlib.pyplot as plt


# NOTE: merging in the features derived from the dropped columns is currently disabled.
# df = df.merge(dropped_columns_features(), on='job_id').drop('job_id',axis=1)

# Put the target first so it is the first row/column of every heatmap.
cols = ['color'] + [i for i in df.columns if i != 'color']

method = ['pearson', 'spearman']
# The loop variable keeps the name `m`: the feature-transform cell further down
# reuses the value it holds once this loop has finished.
for m in method:
    corr = df[cols].corr(m)
    plt.figure(figsize=(16, 8))
    ax = sns.heatmap(
        corr,
        vmin=-1,
        vmax=1,
        center=0,
        annot=corr.values,
        fmt='.2f',
        cmap='bwr',
    )
    ax.set_title(f'{m.title()} Correlation Heatmap')
    plot_correlation_bars(df, 'color', m)


### New features to check variable interaction
Taking the log, the square, or the square root of the variables did not yield any improvements.

Proceeded to combine columns pairwise using functions, then picked some of the combinations with the best variations, taking into account whether their columns were already used in another picked combination.


In [None]:
from typing import Callable
from itertools import combinations
import math

def multiply(a, b):
    """Return the product of ``a`` and ``b``."""
    return a * b

def divide(a, b):
    """Return ``a`` divided by ``b + 1``.

    The denominator is shifted by one, so ``b == 0`` returns ``a`` unchanged
    and only ``b == -1`` results in a division by zero.
    """
    shifted = b + 1
    return a / shifted
    
def foo(a, b):
    """Return the product of ``a`` and ``b`` divided by their sum."""
    product = a * b
    total = a + b
    return product / total

def square(x):
    """Return ``x`` multiplied by itself."""
    return x * x


def create_feature(df:pd.DataFrame, funcs:list[Callable]):
    """Return a copy of ``df`` extended with element-wise transforms of every column.

    For each column ``c`` of ``df`` and each function ``f`` in ``funcs`` a new
    column named ``f'{f.__name__}_{c}'`` is added, holding ``f`` applied to
    every value of ``c``. ``df`` itself is not modified.

    Args:
        df: Source frame; every one of its columns is transformed.
        funcs: Single-argument callables applied value by value. Each must
            expose ``__name__``, which becomes the prefix of the new column.

    Returns:
        A new DataFrame with the original columns followed by the transformed
        ones, ordered column by column and, within a column, in ``funcs`` order.
    """
    def _apply_or_zero(func, value):
        # Values outside a function's domain (e.g. math.log of a non-positive
        # number) or of an unsupported type fall back to 0. Anything else,
        # such as KeyboardInterrupt or a bug inside `func`, is left to
        # propagate instead of being silently turned into a 0.
        try:
            return func(value)
        except (ValueError, TypeError, ArithmeticError):
            return 0

    # Collect all new columns first and attach them in one go; the dict keeps
    # insertion order, so the resulting column order matches the nested loop.
    new_columns = {
        f'{f.__name__}_{c}': df[c].apply(lambda x, f=f: _apply_or_zero(f, x))
        for c in df.columns
        for f in funcs
    }
    return df.copy().assign(**new_columns)

def create_feature2(df:pd.DataFrame, funcs:list[Callable]):
    """Return a copy of ``df`` extended with pairwise combinations of its columns.

    Every unordered pair of columns other than ``'color'`` is combined with
    every function in ``funcs``. The result of ``f(first, second)``, evaluated
    row by row, is stored in a column named
    ``f'{f.__name__}__{first}__{second}'``. ``df`` itself is not modified.

    Args:
        df: Source frame; the ``'color'`` column is never used as an input.
        funcs: Two-argument callables exposing ``__name__``.

    Returns:
        A new DataFrame with the original columns followed by the combined
        ones, ordered pair by pair and, within a pair, in ``funcs`` order.
    """
    feature_names = [name for name in df.columns if name != 'color']
    engineered = {}
    for first, second in combinations(feature_names, 2):
        for func in funcs:
            label = f'{func.__name__}__{first}__{second}'
            # Evaluated immediately, so the lambda sees the current loop values.
            engineered[label] = df.apply(
                lambda row: func(row[first], row[second]), axis=1
            )
    return df.copy().assign(**engineered)

# Add log / square / square-root versions of every column and plot how each
# one correlates with the target.
transforms = [math.log, square, math.sqrt]
a = create_feature(df, funcs=transforms)

# Leave out only the transformed copies of the target itself (log_color,
# square_color, sqrt_color). A substring test on '_color' would also discard
# real features such as '2nd_malt_color' together with all of their transforms.
target_transforms = {f'{f.__name__}_color' for f in transforms}

# `m` is the loop variable left over from the correlation-heatmap cell, i.e.
# the last correlation method that cell iterated over.
plot_correlation_bars(a[[i for i in a.columns if i not in target_transforms]], 'color', m)


### Try to combine features to see if correlations improve
Roast amount combinations were filtered out: roast amount already had the best correlation, so there is no need to increase complexity for it.

In [None]:
from IPython.display import clear_output
# Build one new column per (function, column pair) from the selected features.
b = create_feature2(df, funcs=[multiply, divide, foo])
# Wipe whatever the cell printed while the features were being built
# (presumably runtime warnings from the row-wise arithmetic — TODO confirm).
clear_output()

In [None]:


def new_features_corr(df:pd.DataFrame, method:str):
    """Rank the columns of ``df`` by the strength of their correlation with ``'color'``.

    Args:
        df: Frame containing a ``'color'`` column plus the candidate features.
        method: Correlation method passed to ``DataFrame.corr``.

    Returns:
        A DataFrame with a fresh integer index and two columns: ``'columns'``
        (the column names of ``df``) and ``method`` (the absolute correlation
        with ``'color'``), sorted from strongest to weakest.
    """
    target_corr = df.corr(method)['color']
    strength = target_corr.abs().to_frame(method)
    ranked = strength.sort_values(method, ascending=False)
    return ranked.reset_index(names='columns')
# Absolute correlation of every column of `b` with the target, per method.
spearman_nf = new_features_corr(b, 'spearman')
pearson_nf = new_features_corr(b, 'pearson')
nf_corr = spearman_nf.merge(pearson_nf, on='columns')
# Lookup table (columns, spearman, pearson) used to fetch the scores of the
# two parent columns of each engineered feature.
original_corr = nf_corr.iloc[:, :3]


def _larger(left, right):
    """Element-wise ``left if left > right else right`` for two aligned Series."""
    return left.where(left > right, right)


# Engineered features are named '<function>__<first>__<second>'.
engineered = nf_corr.query('columns.str.contains("__")')
name_parts = engineered['columns'].str.split("__")
engineered = engineered.assign(
    function=name_parts.str[0],
    first_feature=name_parts.str[1],
    second_feature=name_parts.str[2],
)

# Attach the scores of both parent columns next to each engineered feature.
with_first = engineered.merge(
    original_corr, right_on='columns', left_on='first_feature', suffixes=('', '_first')
)
with_parents = with_first.merge(
    original_corr, right_on='columns', left_on='second_feature', suffixes=('', '_second')
)

# Best score (over both methods) of each parent column.
nf_corr = with_parents.assign(
    first_best=_larger(with_parents['spearman_first'], with_parents['pearson_first']),
    second_best=_larger(with_parents['spearman_second'], with_parents['pearson_second']),
)

# Best parent score versus best score of the engineered feature itself.
nf_corr = nf_corr.assign(
    previous_best=_larger(nf_corr['first_best'], nf_corr['second_best']),
    best=_larger(nf_corr['spearman'], nf_corr['pearson']),
).rename(columns={'spearman': 'new_spearman', 'pearson': 'new_pearson'})

# delta > 0 means the combination correlates better than either of its parents.
nf_corr = nf_corr.assign(delta=nf_corr['best'] - nf_corr['previous_best'])
nf_corr = nf_corr.sort_values('delta', ascending=False)


# Top 30 engineered features by improvement over their best parent column,
# leaving out every combination that involves roast_amount.
# This expression is not the last statement of its cell, so its value would be
# discarded without ever being rendered; display it explicitly.
from IPython.display import display

display(
    nf_corr[['columns', 'best', 'previous_best', 'delta', 'function',
             'first_feature', 'first_best', 'second_feature', 'second_best']]
    .query('~columns.str.contains("roast_amount")')
    .head(30)
    .reset_index(drop=True)
)
# Engineered features retained for the next step.
keep = [
    'divide__total_cold_wort__woc_time',
    'multiply__1st_malt_amount_kg__extract',
    'multiply__2nd_malt_amount_kg__ph',
    'divide__woc_time__whp_transfer_time',
    'multiply__whp_rest_time__2nd_malt_color',
    'multiply__mt_time__wk_time',
]

# b is selected df + new engineered features
selected_and_engineered = [*df.columns, *keep]
b = b[selected_and_engineered]
b.to_csv('../data/4_engineered/engineered_features.csv', index=False)