In [2]:
import pandas as pd
from scipy.stats import pearsonr

In [7]:
'Correlation for all datasets'

# Feature table; its 'Sales' column uses -1 as a sentinel (the per-company
# cells label dropping those rows as "Discard missing values").
complete_df = pd.read_csv("../csv/processed/all_features.csv")

# Method 1 input: keep only rows whose Sales is not the -1 sentinel.
# .copy() makes this an independent frame rather than a slice of complete_df.
discard_df = complete_df[complete_df['Sales'] != -1].copy()

# Method 2 input: same features, Sales replaced by the column from
# train_avg_sales.csv. assign() returns a new frame, so complete_df is no
# longer overwritten as a side effect. Values are aligned on the row index,
# exactly as the previous column assignment did.
average_df = complete_df.assign(
    Sales=pd.read_csv("../csv/processed/train_avg_sales.csv")['Sales']
)

# Method 3 input: same features, Sales replaced by the column from
# predicted_sales_rft.csv.
predict_df = complete_df.assign(
    Sales=pd.read_csv("../csv/processed/predicted_sales_rft.csv")['Sales']
)

def calc_corr(df, target='Sales', exclude=('Date', 'Company')):
    """Pearson correlation of every remaining column with a target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the target column and the candidate columns.
    target : str, default 'Sales'
        Column every other column is correlated against.
    exclude : iterable of str or str, default ('Date', 'Company')
        Columns to leave out of the comparison. Names that are not present
        in ``df`` are simply ignored. A single name may be given as a string.

    Returns
    -------
    pandas.DataFrame
        One row per compared column, in ``df.columns`` order, with the
        columns 'Variable' (column name) and 'Correlation Coefficient'
        (the statistic returned by ``scipy.stats.pearsonr``).

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df``.
    """
    # A bare string would otherwise be unpacked character by character.
    if isinstance(exclude, str):
        exclude = (exclude,)

    skipped = {target, *exclude}
    cols = [col for col in df.columns if col not in skipped]

    # Look the target up once instead of once per compared column.
    target_values = df[target]

    corr_coef = []
    for col in cols:
        # pearsonr returns (statistic, p-value); only the statistic is kept.
        coef, _ = pearsonr(df[col], target_values)
        corr_coef.append(coef)

    return pd.DataFrame({'Variable': cols, 'Correlation Coefficient': corr_coef})

# Correlations over all companies together, computed on the frame from
# which rows with Sales == -1 were removed.
corr = calc_corr(discard_df)

# Bare last expression: the notebook renders the resulting table.
corr

Unnamed: 0,Variable,Correlation Coefficient
0,is_holiday,-0.009367
1,weekday,0.131567
2,avg_weekday_sales,0.836449
3,avg_month_sales,0.815152
4,Precipitation,-0.002773
5,Temperature,-0.002653


# Correlation with Sales: Company 0

In [11]:
'Method 1 - Discard missing values'
# Company 0 rows of the frame from which Sales == -1 rows were dropped.
disc_0_df = discard_df.loc[discard_df['Company'].eq(0)]
corr_disc_0 = calc_corr(df=disc_0_df)

'Method 2 - Calculate average values'
# Company 0 rows of the frame whose Sales came from train_avg_sales.csv.
avg_0_df = average_df.loc[average_df['Company'].eq(0)]
corr_avg_0 = calc_corr(df=avg_0_df)

'Method 3 - Predict values with rft'
# Company 0 rows of the frame whose Sales came from predicted_sales_rft.csv.
pred_0_df = predict_df.loc[predict_df['Company'].eq(0)]
corr_pred_0 = calc_corr(df=pred_0_df)

# The three tables in method order, separated by one blank line each.
print(corr_disc_0, corr_avg_0, corr_pred_0, sep='\n\n')


            Variable  Correlation Coefficient
0         is_holiday                -0.041732
1            weekday                 0.373323
2  avg_weekday_sales                 0.751410
3    avg_month_sales                 0.228162
4      Precipitation                 0.004561
5        Temperature                 0.157914

            Variable  Correlation Coefficient
0         is_holiday                -0.048704
1            weekday                 0.288732
2  avg_weekday_sales                 0.768281
3    avg_month_sales                 0.225354
4      Precipitation                -0.007869
5        Temperature                 0.164310

            Variable  Correlation Coefficient
0         is_holiday                -0.039812
1            weekday                 0.283388
2  avg_weekday_sales                 0.764844
3    avg_month_sales                 0.226531
4      Precipitation                -0.009744
5        Temperature                 0.168646


# Correlation with Sales: Company 1

In [12]:
'Method 1 - Discard missing values'
# Company 1 rows of the frame from which Sales == -1 rows were dropped.
disc_1_df = discard_df.loc[discard_df['Company'].eq(1)]
corr_disc_1 = calc_corr(df=disc_1_df)

'Method 2 - Calculate average values'
# Company 1 rows of the frame whose Sales came from train_avg_sales.csv.
avg_1_df = average_df.loc[average_df['Company'].eq(1)]
corr_avg_1 = calc_corr(df=avg_1_df)

'Method 3 - Predict values with rft'
# Company 1 rows of the frame whose Sales came from predicted_sales_rft.csv.
pred_1_df = predict_df.loc[predict_df['Company'].eq(1)]
corr_pred_1 = calc_corr(df=pred_1_df)

# The three tables in method order, separated by one blank line each.
print(corr_disc_1, corr_avg_1, corr_pred_1, sep='\n\n')

            Variable  Correlation Coefficient
0         is_holiday                 0.015308
1            weekday                 0.358504
2  avg_weekday_sales                 0.616566
3    avg_month_sales                 0.301033
4      Precipitation                -0.022446
5        Temperature                -0.134457

            Variable  Correlation Coefficient
0         is_holiday                 0.010989
1            weekday                 0.374925
2  avg_weekday_sales                 0.620083
3    avg_month_sales                 0.294791
4      Precipitation                -0.022180
5        Temperature                -0.139480

            Variable  Correlation Coefficient
0         is_holiday                 0.007985
1            weekday                 0.380158
2  avg_weekday_sales                 0.634771
3    avg_month_sales                 0.291651
4      Precipitation                -0.014123
5        Temperature                -0.100106


# Correlation with Sales: Company 2

In [13]:
'Method 1 - Discard missing values'
# Company 2 rows of the frame from which Sales == -1 rows were dropped.
disc_2_df = discard_df.loc[discard_df['Company'].eq(2)]
corr_disc_2 = calc_corr(df=disc_2_df)

'Method 2 - Calculate average values'
# Company 2 rows of the frame whose Sales came from train_avg_sales.csv.
avg_2_df = average_df.loc[average_df['Company'].eq(2)]
corr_avg_2 = calc_corr(df=avg_2_df)

'Method 3 - Predict values with rft'
# Company 2 rows of the frame whose Sales came from predicted_sales_rft.csv.
pred_2_df = predict_df.loc[predict_df['Company'].eq(2)]
corr_pred_2 = calc_corr(df=pred_2_df)

# The three tables in method order, separated by one blank line each.
print(corr_disc_2, corr_avg_2, corr_pred_2, sep='\n\n')

            Variable  Correlation Coefficient
0         is_holiday                -0.068028
1            weekday                 0.232455
2  avg_weekday_sales                 0.414711
3    avg_month_sales                 0.300759
4      Precipitation                -0.005616
5        Temperature                -0.055812

            Variable  Correlation Coefficient
0         is_holiday                -0.068028
1            weekday                 0.232455
2  avg_weekday_sales                 0.414711
3    avg_month_sales                 0.300759
4      Precipitation                -0.005616
5        Temperature                -0.055812

            Variable  Correlation Coefficient
0         is_holiday                -0.068028
1            weekday                 0.232455
2  avg_weekday_sales                 0.414711
3    avg_month_sales                 0.300759
4      Precipitation                -0.005616
5        Temperature                -0.055812
