### Load dataset and rename columns
> Note: **Only AMST** products are kept in `df`; HNK products were filtered out.

In [None]:
import pandas as pd
import numpy as np
from re import sub

def column_name_to_snake_case(s):
    """Convert a raw column header to snake_case.

    Separator characters (``/``, ``(``, ``)``, ``-`` and spaces) are replaced
    by underscores, runs of underscores are collapsed into one, the result is
    lower-cased and a single trailing underscore is removed.

    Parameters
    ----------
    s : str
        Original column name, e.g. ``"Date/Time"``.

    Returns
    -------
    str
        The snake_case name, e.g. ``"date_time"``. An empty input yields ``""``.
    """
    s = sub(r"[/() \-]", '_', s)
    # Collapse runs of any length; matching only "__"/"___" left longer runs
    # partially collapsed (e.g. "a - (b)" became "a__b").
    s = sub(r"_+", '_', s).lower()
    # endswith() is safe on the empty string, unlike indexing s[-1].
    return s[:-1] if s.endswith('_') else s

# Load the raw use-case CSV, parsing the timestamp column while reading.
# Forward slashes keep the relative path valid on Windows, Linux and macOS;
# the backslash form only resolved on Windows.
df = pd.read_csv('../data/raw/Heineken - Data Science Use Case.csv', parse_dates=['Date/Time'])
df = (df.drop(columns=df.columns[0])  # first CSV column is discarded (presumably a saved row index — TODO confirm)
        .rename(columns={col:column_name_to_snake_case(col) for col in df.columns[1:]})
        .query('product == "AMST"')   # keep AMST rows only
     )
print(len(df))

### Rows with null values

In [None]:
# Keep every row that has at least one missing value in any column.
has_missing = df.isnull().any(axis=1)
nulls = df.loc[has_missing]
print('len nulls:', len(nulls))


#### Analyzing target (color) null values
Rows with a null `color` (the target) have no null values in any other column.
> They will be used later to predict missing values

In [None]:
# Rows whose target (color) is missing. A boolean mask is used instead of a
# method call inside query(), which depends on the expression engine in use.
target_nulls = nulls[nulls['color'].isnull()]
# Per-column null counts with the target removed; all zeros means no other
# column is missing in these rows.
print(target_nulls.drop('color', axis=1).isnull().sum(axis=0))

# Column dtypes, kept so the file can be re-read with explicit dtypes later.
dtypes = {col:target_nulls[col].dtype for col in target_nulls.columns}

# Forward slashes keep the relative path valid on every OS, not only Windows.
target_nulls.to_csv('../data/test/test_data.csv', index=False)



#### Check non_target nulls
Null values in the other columns account for only a small percentage of the total row count.

We will use nearest-neighbour interpolation (`interpolate(method='nearest')`) to estimate the missing values.

In [None]:
# Rows with missing values in which the target itself is present.
rows_missing_features = nulls[nulls['color'].notnull()]
print('non_target nulls len:', len(rows_missing_features), '\n')

# Null count per column, restricted to the columns that actually have nulls,
# together with the share of the full dataset they represent.
null_counts = rows_missing_features.drop(columns='color').isnull().sum(axis=0)
non_target = null_counts[null_counts > 0].to_frame('null_count')
total_rows = len(df)
non_target['len_df'] = total_rows
non_target['ratio'] = non_target['null_count'] / total_rows

print(non_target)

# Fill the gaps in roast amount and pH by nearest-neighbour interpolation.
df['roast_amount_kg'] = df['roast_amount_kg'].interpolate(method='nearest')
df['ph'] = df['ph'].interpolate(method='nearest')

print('\n\nnull values after interpolation')
remaining_nulls = df.isnull().sum()
remaining_nulls[remaining_nulls > 0]

### Features

In [None]:
import seaborn as sns

# Complete-case numeric columns, minus job_id and the target (color).
# Dropping columns cannot introduce NaNs, so one dropna() is sufficient.
numeric_complete = df.select_dtypes(include='number').dropna()
features = numeric_complete.drop(columns=['job_id', 'color'])
feature_cols = features.columns
features.describe()


### Check normality of features
All features were found to be normally distributed. 

`roast_color` is constant, so it will be dropped.

In [None]:
from scipy.stats import shapiro, kstest

ALPHA = 0.05  # significance level below which normality is rejected


def _ks_normal_pvalue(values):
    """Kolmogorov-Smirnov p-value against a normal fitted to ``values``.

    ``kstest(x, 'norm')`` on its own compares against the *standard* normal
    N(0, 1), so unstandardised data is rejected regardless of its shape.
    The sample mean and standard deviation are therefore passed as the
    distribution parameters. Because they are estimated from the same
    sample, the resulting p-value is conservative.

    Returns NaN when the sample standard deviation is zero or undefined
    (e.g. a constant column), since no normal distribution can be fitted.
    """
    values = np.asarray(values, dtype=float)
    std = values.std(ddof=1)
    if not std > 0:
        return np.nan
    return kstest(values, 'norm', args=(values.mean(), std)).pvalue


print('len features:', len(features))
shapiro_p_value = [shapiro(features[col].values).pvalue for col in feature_cols]
kstest_p_value = [_ks_normal_pvalue(features[col].values) for col in feature_cols]

normality_tests = pd.DataFrame(
    {'columns': feature_cols, 'shapiro_p_value': shapiro_p_value, 'kstest_p_value': kstest_p_value})

# A column fails the normality check when either test rejects the null
# hypothesis of normality (p < ALPHA). A p-value above ALPHA means normality
# was NOT rejected, so it must not be flagged as a failure. NaN p-values
# (untestable columns) compare False and are reported as 'no'.
rejected = (normality_tests['shapiro_p_value'] < ALPHA) | (normality_tests['kstest_p_value'] < ALPHA)
normality_tests['failed'] = np.where(rejected, 'yes', 'no')
normality_tests

> note: roast color is constant, so it will be dropped

In [None]:
# roast_color is constant, so it is removed from the feature set.
# errors='ignore' keeps this cell re-runnable (a second run would otherwise
# raise KeyError), and feature_cols is refreshed so later cells that select
# columns through it no longer include the dropped column.
features = features.drop(columns='roast_color', errors='ignore')
feature_cols = features.columns

#### Check normality of target


In [None]:
# Test only the observed targets: df still holds rows whose color is missing,
# and NaNs would invalidate both statistics.
color_observed = df['color'].dropna()
# kstest(..., 'norm') compares against the standard normal N(0, 1), so the
# sample is standardised first. The parameters are estimated from the data,
# which makes the p-value conservative.
color_standardized = (color_observed - color_observed.mean()) / color_observed.std()

shapiro_p_value = [shapiro(color_observed.values).pvalue]
kstest_p_value = [kstest(color_standardized.values, 'norm').pvalue]
target_normality = pd.DataFrame(
    {'columns': ['color'], 'shapiro_p_value': shapiro_p_value, 'kstest_p_value': kstest_p_value})
print(target_normality)
df[['color']].hist()


In [None]:
import matplotlib.pyplot as plt

df['base_amount_kg'] = df['1st_malt_amount_kg'] + df['2nd_malt_amount_kg']

plt.rcParams["figure.figsize"] = (20,5)

# Build the plotting frame separately instead of scaling df in place and
# undoing it afterwards: if the plot raised, or the cell was re-run half-way,
# df would be left with a doubled roast_amount_kg. This also keeps the
# display-only 'color (target)' column out of df.
plot_df = (df.sort_values('date_time')
             .reset_index()
             .assign(**{
                 # Multiplied by 2 just to offset the lines a little bit.
                 'roast_amount_kg': lambda d: d['roast_amount_kg'] * 2,
                 'color (target)': lambda d: d['color'],
             }))
ax = plot_df.plot(x='index', y=['ph', 'roast_amount_kg', 'color (target)'])
# Title is Portuguese for "Null values of the dataset".
ax.set(title='Valores nulos do dataset')

In [None]:
# Box plot of the target column to show its spread and extreme values.
target_frame = df[['color']]
ax = target_frame.boxplot()
ax.set(title='Target box plot')

### Outlier Detection

In [None]:
from scipy import stats

# Absolute z-score of every feature value.
z_scores = np.abs(stats.zscore(features))

# Values whose absolute z-score exceeds this threshold are flagged.
threshold = 3
# The boolean mask keeps flagged values and turns everything else into NaN.
outliers = features[z_scores > threshold]
# Rows in which at least one value was flagged.
outlier_rows = outliers[outliers.notnull().any(axis=1)]
print(f'{len(outlier_rows)} rows contain at least one outlier')
outlier_rows

## Correlation plots

In [None]:
from typing import Literal
import matplotlib.pyplot as plt
def plot_correlation_heatmap(df:pd.DataFrame, corr_method:Literal['pearson', 'spearman', 'kendall'], target:str='color'):
    """Plot a correlation heatmap and a bar chart of |correlation| with the target.

    Rows containing any NaN are dropped before the correlation matrix is
    computed. Two figures are created: an annotated heatmap of the full
    matrix, and a bar chart of each column's absolute correlation with
    ``target``, sorted in descending order and labelled with the value
    rounded to three decimals.

    Parameters
    ----------
    df : pd.DataFrame
        Numeric columns to correlate; must contain ``target``.
    corr_method : {'pearson', 'spearman', 'kendall'}
        Correlation method forwarded to ``DataFrame.corr``.
    target : str, default 'color'
        Column whose correlations are shown in the bar chart.

    Returns
    -------
    tuple
        ``(ax, ax2)``: the heatmap axes and the bar-chart axes.
    """
    corr_df = df.dropna().corr(corr_method)

    # Explicit figures/axes: sizes are set per figure instead of through the
    # global rcParams, so calling this function does not change later plots.
    _, ax = plt.subplots(figsize=(10, 10))
    sns.heatmap(corr_df,
                vmin=-1, vmax=1, center=0,
                cmap='bwr', annot=corr_df.values, annot_kws={'fontsize':8},
                ax=ax)
    ax.set(title=f'{corr_method.title()} Correlation plot')

    # Absolute correlation with the target; the target's own row is removed.
    corr_abs = (corr_df[[target]].abs()
                              .sort_values(target, ascending=False)
                              .drop(index=target)
                              .rename_axis('columns')
                              .reset_index())
    _, ax2 = plt.subplots(figsize=(20, 5))
    corr_abs.plot.bar(x='columns', y=target, ax=ax2)
    ax2.set(title=f'Absolute {corr_method.title()} Correlation by Feature')

    # Write the rounded value slightly above each bar.
    for position, value in enumerate(corr_abs[target].values):
        label = float('%.3f' % value)
        ax2.text(position - 0.2, label + 0.005, label)

    return ax, ax2

# Pearson correlation between the target and every feature column.
plot_correlation_heatmap(df[['color', *feature_cols]], 'pearson')

##### insights:
total_cold_wort is highly correlated with:

    - extract
    - woc_time

wk_time and temperature are highly correlated with each other

1st and 2nd malt amount are highly correlated with each other

In [None]:
# Spearman (rank) correlation between the target and every feature column.
plot_correlation_heatmap(df[['color', *feature_cols]], 'spearman')

##### Insights
total_cold_wort has many strong correlations with:

    - woc_time
    - wk_time
    - whp_time
  
1st malt amount has good correlation with:

    - wk_temperature
    
2nd malt amount has good correlation with:

    - whp_rest_time

In [None]:
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split

# Feature matrix and target aligned on the rows kept in `features`; those are
# complete numeric rows, so the target is never missing here.
X = features
y = df.loc[features.index, 'color']

# train_test_split needs the arrays to split and returns
# (X_train, X_test, y_train, y_test) in that order. The seed makes the split
# reproducible across runs.
# TODO: replace with a time-series-aware split, as noted below.
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Baseline model that always predicts the mean of the training target.
dummy_regr = DummyRegressor(strategy="mean")

**TODO:** use a time-series-aware train/test split (time_series_train_test_split).

**TODO:** train both with and without outliers.