# Task 2 - Prediction of store sales

In [1]:
import warnings
warnings.filterwarnings("ignore")
#Data Manipulation and Treatment
import numpy as np
import pandas as pd
from datetime import datetime
#Plotting and Visualizations
import matplotlib.pyplot as plt
%matplotlib inline 
import seaborn as sns
from scipy import stats
import itertools
#dvc
import dvc.api
import mlflow

In [14]:
#Scikit-Learn for Modeling
from sklearn import model_selection
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, accuracy_score, precision_recall_curve, f1_score, mean_squared_error, r2_score, mean_absolute_error, auc

In [3]:
#utils 
import os
import sys
sys.path.append(os.path.abspath(os.path.join('../scripts')))
import plot
import load_save_file
from f_logger import F_Logger
from df_info import Df_Info

In [4]:
# Load the merged (store + train) dataset through the project's loader helper.
# NOTE(review): 'MST_v2' is presumably a data-version tag resolved by
# load_save_file.get_data - confirm against that helper.
df_train_store = load_save_file.get_data('MST_v2', 'data/CleanStoreTrain.csv')

# Wrap the frame in the project's Df_Info helper for later inspection.
info = Df_Info(df_train_store, deep=True)

In [5]:
# Preview the first rows of the merged frame to sanity-check the columns.
df_train_store.head() 

Unnamed: 0,Store,DayOfWeek,Sales,Customers,Open,Promo,DaysAfterHoliday,DaysToHoliday,SchoolHoliday,CompetitionDistance,...,is_Assortment_c,is_StoreType_a,is_StoreType_b,is_StoreType_c,is_StoreType_d,is_PromoInteval_0,"is_PromoInteval_Feb,May,Aug,Nov","is_PromoInteval_Jan,Apr,Jul,Oct","is_PromoInteval_Mar,Jun,Sept,Dec",CompetitionOpenSince
0,1,5,5263,555,1,1,0,54837,1,1270.0,...,0,0,0,1,0,1,0,0,0,82.0
1,2,5,6064,625,1,1,0,54836,1,570.0,...,0,1,0,0,0,0,0,1,0,92.0
2,3,5,8314,821,1,1,0,54835,1,14130.0,...,0,1,0,0,0,0,0,1,0,103.0
3,4,5,13995,1498,1,1,0,54834,1,620.0,...,1,0,0,1,0,1,0,0,0,70.0
4,5,5,4822,559,1,1,0,54833,1,29910.0,...,0,1,0,0,0,1,0,0,0,3.0


## Training using Random Forest Regressor

In [6]:
def rmspe(y, yhat):
    """Root Mean Square Percentage Error between targets and predictions.

    RMSPE = sqrt(mean(((y - yhat) / y) ** 2))

    The previous body omitted the division by ``y`` and therefore returned
    the plain RMSE despite the function's name.

    Parameters
    ----------
    y : array-like
        True values. Entries equal to zero are skipped because the relative
        error is undefined for them.
    yhat : array-like
        Predicted values, compared with ``y`` element by element.

    Returns
    -------
    float
        The RMSPE over the non-zero targets, or ``nan`` when there is no
        non-zero target to evaluate.

    Notes
    -----
    NOTE(review): the model in this notebook is trained on ``np.log(Sales)``;
    a percentage error is normally meaningful on the original scale, so
    callers probably want to pass ``np.exp(...)`` of the log values - confirm.
    """
    y_true, y_pred = np.broadcast_arrays(
        np.asarray(y, dtype=float), np.asarray(yhat, dtype=float)
    )

    # Relative error cannot be computed where the true value is zero.
    valid = y_true != 0
    if not valid.any():
        return float("nan")

    pct_error = (y_true[valid] - y_pred[valid]) / y_true[valid]
    return np.sqrt(np.mean(pct_error ** 2))

In [16]:
def calculate_metrics(y_test, y_preds):
    """Summarise regression quality of ``y_preds`` against ``y_test``.

    Parameters
    ----------
    y_test : array-like
        Ground-truth target values.
    y_preds : array-like
        Predicted target values.

    Returns
    -------
    dict
        ``'RMSE Score'`` (square root of the mean squared error),
        ``'R2_Squared'`` (coefficient of determination) and
        ``'MAE Score'`` (mean absolute error).
    """
    return {
        'RMSE Score': np.sqrt(mean_squared_error(y_test, y_preds)),
        'R2_Squared': r2_score(y_test, y_preds),
        'MAE Score': mean_absolute_error(y_test, y_preds),
    }

In [8]:
# Model inputs: every column except the target and the two customer-derived columns.
# NOTE(review): 'Customers' and 'SalesperCustomer' are presumably excluded because
# they are not known at prediction time - confirm.
features = df_train_store.drop(columns=['Customers', 'Sales', 'SalesperCustomer'])

# The model is trained on log(Sales), so every downstream score is in log space.
targets = np.log(df_train_store['Sales'])

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
(X_train, X_train_test, y_train, y_train_test) = model_selection.train_test_split(
    features,
    targets,
    test_size=0.20,
    random_state=15,
)
print("Training and testing split was successful.")

Training and testing split was successful.


In [23]:
# Enable MLflow autologging before the estimator is fitted.
mlflow.autolog()

# Seed the forest so that re-running the notebook reproduces the same model and
# scores; the train/test split is seeded with the same value (15).
rf = RandomForestRegressor(random_state=15)

with mlflow.start_run() as run:
    rf.fit(X_train, y_train)

    # `score` is R^2 for regressors. Targets are log(Sales), so all values
    # below are measured in log space.
    train_score = rf.score(X_train, y_train)
    test_score = rf.score(X_train_test, y_train_test)
    test_metrics = calculate_metrics(y_train_test, rf.predict(X_train_test))

    # Log the train score next to the test score so the train/test gap
    # (over-fitting) is visible in the tracked run.
    mlflow.log_metric("Train Score", train_score)
    mlflow.log_metric("Test Score", test_score)
    mlflow.log_metrics(test_metrics)

2021/07/31 10:18:06 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2021/07/31 10:18:06 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.


In [24]:
# R^2 of the fitted forest on the training split (value of rf.score(X_train, y_train)).
train_score

0.9886554624337542

In [27]:
# R^2 of the fitted forest on the held-out split; compare with train_score to gauge over-fitting.
test_score

0.9203019182722776

In [25]:
# Held-out RMSE / R^2 / MAE from calculate_metrics; targets are log(Sales), so RMSE and MAE are in log units.
test_metrics

{'RMSE Score': 0.1203822352475731,
 'R2_Squared': 0.9203019182722776,
 'MAE Score': 0.08367487020049236}

In [26]:
# Pair every training column with the importance the fitted forest assigned to it
# and show the table with the most important features first.
# NOTE(review): this rebinds `features`, which earlier held the model input matrix;
# re-running the train/test split cell after this one would split this table instead.
# Consider a dedicated name once no later cell relies on `features`.
features = pd.DataFrame({
    "Feature": X_train.columns,
    "Importance": rf.feature_importances_,
})
features.sort_values(by='Importance', ascending=False)

Unnamed: 0,Feature,Importance
7,CompetitionDistance,0.221529
0,Store,0.179984
3,Promo,0.163595
28,CompetitionOpenSince,0.076296
1,DayOfWeek,0.056513
5,DaysToHoliday,0.039632
10,Promo2SinceYear,0.038805
11,Month,0.032404
13,Day,0.031499
9,Promo2SinceWeek,0.030449
