## Execute Regression Analysis for Weekly_Sales

In [1]:
import pandas as pd
from pandas import DataFrame
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

from sklearn.model_selection import cross_val_score

#### Define functions

In [32]:
def mape(actual: DataFrame, predicted: DataFrame) -> float:
    mask = actual != 0
    return (np.fabs(actual - predicted) / actual)[mask].mean()

def run_cross_validation(_model, x, y) -> None:
    print('\nExecuting Cross Validation =>')
    r2 = cross_val_score(_model, x, y, cv=10, scoring='r2')
    mse = cross_val_score(_model, x, y, cv=10, scoring='neg_mean_squared_error')
    print("R2: %0.2f (+/- %0.2f)" % (r2.mean(), r2.std() * 2))
    print("MSE: %0.2f (+/- %0.2f)" % (mse.mean(), mse.std() * 2))

def run_regr_for_model(_model, _x_train: DataFrame, _x_test: DataFrame, _y_train: DataFrame, _y_test: DataFrame) -> None:
    _model = _model.fit(_x_train, _y_train)
    _y_pred = _model.predict(_x_test)
    print('MSE: %.2f' %mean_squared_error(_y_test, _y_pred))
    print('MAPE: %.2f' %mape(_y_test, _y_pred), '%')
    print('R2: %.2f' %r2_score(_y_test, _y_pred))

def run_regr_models(x: DataFrame, y: DataFrame) -> None:
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)
    print('\n', x_train.shape, x_test.shape, y_train.shape, y_test.shape)
    print('\nExecuting Linear Regression =>')
    lr = LinearRegression()
    run_regr_for_model(lr, x_train, x_test, y_train, y_test)
    run_cross_validation(lr, x, y)

    print('\nExecuting KNN Regression =>')
    knn = KNeighborsRegressor()
    run_regr_for_model(knn, x_train, x_test, y_train, y_test)
    run_cross_validation(knn, x, y)

def get_final_model(x: DataFrame, y: DataFrame):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)
    lr = LinearRegression()
    _model = lr.fit(x_train, y_train)
    return _model

def run_final_model(_model, _x_test: DataFrame, _y_test: DataFrame) -> None:
    _y_pred = _model.predict(_x_test)
    print('MSE: %.2f' %mean_squared_error(_y_test, _y_pred))
    print('MAPE: %.2f' %mape(_y_test, _y_pred), '%')
    print('R2: %.2f' %r2_score(_y_test, _y_pred))

#### Read training dataset from pickle file

In [34]:
master: DataFrame = pd.read_pickle('./data/master.pickle')
print(master.shape)

(344667, 119)


#### Select features for the regression analysis

In [35]:
missing_depts = [15, 53, 57, 61, 62, 63, 64, 66, 68, 69, 70, 73, 75, 76, 84, 86, 88, 89]
depts = ['Dept_' + str(i) for i in range(1, 99) if i not in missing_depts]
months = ['Month_1', 'Month_2', 'Month_3', 'Month_4', 'Month_5', 'Month_6', 'Month_7', 'Month_8', 'Month_9', 
          'Month_10', 'Month_11', 'Month_12']
markdown = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']

In [36]:
features_0 = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Size', 'IsHoliday_False',  'IsHoliday_True', 
              'HasMarkDown_False', 'HasMarkDown_True']
features_1 = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Size', 'IsHoliday_False', 'IsHoliday_True', 
              'HasMarkDown_False', 'HasMarkDown_True'] + depts
features_2 = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Size', 'IsHoliday_False', 'IsHoliday_True', 
              'HasMarkDown_False', 'HasMarkDown_True'] + depts + months

target = ['Weekly_Sales']

#### Running Regression models

In [39]:
run_regr_models(master[features_0], master[target])


 (275733, 9) (68934, 9) (275733, 1) (68934, 1)

Executing Linear Regression =>
MSE: 484722369.69
MAPE: 334.54 %

Executing KNN Regression =>


In [41]:
run_regr_models(master[features_1], master[target])


 (275733, 89) (68934, 89) (275733, 1) (68934, 1)

Executing Linear Regression =>


MemoryError: Unable to allocate array with shape (89, 275733) and data type float64

In [None]:
run_regr_models(master[features_2], master[target])

#### Running finalized model on test set

In [None]:
test: DataFrame = pd.read_pickle('./data/test.pickle')
print(test.shape)

In [None]:
model = get_final_model(master[features_1], master[target])

In [None]:
run_final_model(model, test[features_1], test[target])
