In [2]:
#Import the required packages
import numpy as np
import pandas as pd
import math as m
import sklearn.metrics as metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import *

In [3]:
#Read the two CSV inputs: `df` holds the trend data (later cells use its
#'Week' and 'Top N' columns) and `pred_data` holds the fitted predictions.
df, pred_data = (
    pd.read_csv(csv_path)
    for csv_path in ('data/trends1.csv', 'data/predictions.csv')
)

In [4]:
#Preview the first 5 rows of the predictions data (5 is also the default)
pred_data.head(5)

Unnamed: 0,Week,date,fit top 1,fit top 2,fit top 3,fit2 top 3,fit3 top 3,fit2 top 2,fit3 top 2,fit2 top 1,fit3 top 1
0,0,2012-10-07,6.46402,14.918372,12.8527,17.560586,16.691433,21.967562,19.348604,6.676299,6.814028
1,1,2012-10-14,6.405044,14.879459,13.460605,17.584433,16.864402,20.428943,18.149665,6.553469,6.67846
2,2,2012-10-21,5.59488,12.766397,12.500809,16.287381,15.67293,17.533925,15.621429,5.685904,5.801919
3,3,2012-10-28,4.556118,9.883558,11.040882,14.667064,14.128378,14.478612,13.020325,4.598339,4.706192
4,4,2012-11-04,4.203193,8.345088,10.553082,14.029395,13.566908,12.894532,11.970428,4.200978,4.29765


In [5]:
#Preview the last 5 rows of the predictions data (5 is also the default)
pred_data.tail(5)

Unnamed: 0,Week,date,fit top 1,fit top 2,fit top 3,fit2 top 3,fit3 top 3,fit2 top 2,fit3 top 2,fit2 top 1,fit3 top 1
621,621,2018-09-27,16.588328,98.782168,36.344867,36.316258,36.037766,73.258258,64.549945,17.507205,17.533665
622,622,2018-09-28,16.655365,98.884051,36.503322,36.415425,36.139121,73.103604,64.319097,17.567572,17.594394
623,623,2018-09-29,16.724295,99.019655,36.689969,36.538825,36.265359,72.967858,64.104769,17.62976,17.656862
624,624,2018-09-30,16.794364,99.184409,36.900044,36.682254,36.412168,72.847916,63.903832,17.693017,17.720324
625,625,2018-10-01,16.864604,99.372937,37.128318,36.841104,36.574825,72.740112,63.712625,17.756372,17.783823


In [6]:
#Summarise the predictions data: index range, column names, non-null counts,
#dtypes and memory usage
pred_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 626 entries, 0 to 625
Data columns (total 11 columns):
Week          626 non-null int64
date          626 non-null object
fit top 1     626 non-null float64
fit top 2     626 non-null float64
fit top 3     626 non-null float64
fit2 top 3    626 non-null float64
fit3 top 3    626 non-null float64
fit2 top 2    626 non-null float64
fit3 top 2    626 non-null float64
fit2 top 1    626 non-null float64
fit3 top 1    626 non-null float64
dtypes: float64(9), int64(1), object(1)
memory usage: 53.9+ KB


In [7]:
#Flag, per column, whether the predictions data holds at least one missing value
pd.isnull(pred_data).any()

Week          False
date          False
fit top 1     False
fit top 2     False
fit top 3     False
fit2 top 3    False
fit3 top 3    False
fit2 top 2    False
fit3 top 2    False
fit2 top 1    False
fit3 top 1    False
dtype: bool

In [8]:
#Descriptive statistics (count, mean, std, min, quartiles, max) for the
#numeric columns of the predictions data
pred_data.describe()

Unnamed: 0,Week,fit top 1,fit top 2,fit top 3,fit2 top 3,fit3 top 3,fit2 top 2,fit3 top 2,fit2 top 1,fit3 top 1
count,626.0,626.0,626.0,626.0,626.0,626.0,626.0,626.0,626.0,626.0
mean,312.5,15.792292,69.791001,32.249881,32.004939,31.895551,58.889299,55.834099,16.015668,16.034296
std,180.854914,5.939152,32.946941,8.808045,8.910247,8.934187,26.329259,25.351693,6.097955,6.113293
min,0.0,3.409713,6.166742,8.740487,11.13382,11.350117,9.809581,10.881956,4.037399,4.09691
25%,156.25,14.987423,32.714805,26.747984,26.553807,26.327755,27.137705,27.292373,16.076901,16.079688
50%,312.5,17.607248,80.605119,34.027927,32.933962,32.636063,63.857137,58.780825,18.039448,18.027717
75%,468.75,19.762297,98.681039,39.416604,39.507346,39.543204,80.517375,78.962714,19.66021,19.672679
max,625.0,25.552484,109.182507,44.181066,44.638694,44.827019,91.2941,92.93268,27.682097,28.089302


In [9]:
#Separate the prediction data into 2 parts for calculating RMSE:
#  pred   - every row before the split point
#  future - the trailing FUTURE_ROWS rows
#Number of trailing rows set aside as `future` (previously a bare 365).
FUTURE_ROWS = 365
split_point = len(pred_data) - FUTURE_ROWS
if split_point < 0:
    #A negative split point would not fail: the slices below would count from
    #the end of the frame and silently return the wrong two parts.
    raise ValueError(
        'predictions data has %d rows, fewer than the %d rows reserved as future'
        % (len(pred_data), FUTURE_ROWS)
    )
#.iloc makes it explicit that the split is by row position, not by label.
pred, future = pred_data.iloc[:split_point], pred_data.iloc[split_point:]
print('Current Prediction %d, Future Prediction %d' % (len(pred), len(future)))

Current Prediction 261, Future Prediction 365


In [13]:
#Fit a degree-1 polynomial (slope, intercept) of each top against the week
z1, z2, z3 = (
    np.polyfit(df['Week'], df[top_column], 1)
    for top_column in ('Top 1', 'Top 2', 'Top 3')
)

#Evaluate each fitted line at every week: the linear trendline (actual) values
T1, T2, T3 = (
    coefficients[0]*df['Week'] + coefficients[1]
    for coefficients in (z1, z2, z3)
)

#Short aliases for the prediction columns: T<top>F<fit>
T1F1, T1F2, T1F3 = (pred[column] for column in ('fit top 1', 'fit2 top 1', 'fit3 top 1'))
T2F1, T2F2, T2F3 = (pred[column] for column in ('fit top 2', 'fit2 top 2', 'fit3 top 2'))
T3F1, T3F2, T3F3 = (pred[column] for column in ('fit top 3', 'fit2 top 3', 'fit3 top 3'))

#Print the number of rows in each trendline and prediction series; the
#metrics computed from them need every pair to have the same length
print('T1 : %d, T1F1 : %d, T1F2 : %d, T1F3 : %d' % tuple(map(len, (T1, T1F1, T1F2, T1F3))))
print('T2 : %d, T2F1 : %d, T2F2 : %d, T2F3 : %d' % tuple(map(len, (T2, T2F1, T2F2, T2F3))))
print('T3 : %d, T3F1 : %d, T3F2 : %d, T3F3 : %d' % tuple(map(len, (T3, T3F1, T3F2, T3F3))))

T1 : 261, T1F1 : 261, T1F2 : 261, T1F3 : 261
T2 : 261, T2F1 : 261, T2F2 : 261, T2F3 : 261
T3 : 261, T3F1 : 261, T3F2 : 261, T3F3 : 261


In [14]:
#Calculate and display the metrics between the given predictions and actual linear trendline values for all three tops
def _error_metrics(actual, predicted):
    """Return [MAE, MSE, RMSE] of `predicted` measured against `actual`.

    The list order matches the row labels stored under the 'Metrics' key below.
    """
    #Compute the MSE once and reuse it for the RMSE.
    mse = metrics.mean_squared_error(actual, predicted)
    return [metrics.mean_absolute_error(actual, predicted), mse, np.sqrt(mse)]

#One table column per (top, fit) pair, filled in the order it should display.
d = {'Metrics' : ['Mean Absolute Error', 'Mean Squared Error', 'Root Mean Squared Error']}
for column, actual, predicted in (
        ('T1 Fit1', T1, T1F1), ('T1 Fit2', T1, T1F2), ('T1 Fit3', T1, T1F3),
        ('T2 Fit1', T2, T2F1), ('T2 Fit2', T2, T2F2), ('T2 Fit3', T2, T2F3),
        ('T3 Fit1', T3, T3F1), ('T3 Fit2', T3, T3F2), ('T3 Fit3', T3, T3F3)):
    d[column] = _error_metrics(actual, predicted)
metricval = pd.DataFrame(data=d)
#set_index returns a new frame, which is only displayed; `metricval` itself
#keeps 'Metrics' as an ordinary column.
metricval.set_index('Metrics')

Unnamed: 0_level_0,T1 Fit1,T1 Fit2,T1 Fit3,T2 Fit1,T2 Fit2,T2 Fit3,T3 Fit1,T3 Fit2,T3 Fit3
Metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Mean Absolute Error,3.420986,3.705306,3.706811,9.610813,11.236923,11.640792,2.805818,3.09426,3.14014
Mean Squared Error,16.112638,20.365568,20.630943,126.71258,189.549101,201.306484,11.189274,13.605637,14.207091
Root Mean Squared Error,4.014055,4.512823,4.54213,11.256668,13.767683,14.188252,3.345037,3.688582,3.76923


In [18]:
def r_sqd(week, pred1, pred2, pred3):
    """Print the R-squared of a linear regression of each prediction on week.

    For every prediction series, `linregress(week, series)` is fitted and the
    square of its correlation coefficient is printed with six decimals under
    the labels "Fit 1", "Fit 2" and "Fit 3".

    Parameters
    ----------
    week : array-like
        Independent variable shared by the three regressions.
    pred1, pred2, pred3 : array-like
        Dependent variables, one regression each.

    Returns
    -------
    None
        The values are only printed.
    """
    fits = (("Fit 1 : %.6f", pred1),
            ("Fit 2 : %.6f", pred2),
            ("Fit 3 : %.6f", pred3))

    #Run every regression before printing anything, so a failing fit does not
    #leave a partial report behind.
    report_lines = []
    for template, series in fits:
        slope, intercept, r_value, p_value, std_err = linregress(week, series)
        report_lines.append(template % (r_value ** 2))

    #print(...) with a single pre-formatted string behaves the same on
    #Python 2 and Python 3; the bare `print "..."` statement is a SyntaxError
    #on Python 3.
    for line in report_lines:
        print(line)

#Print the R-squared value (coefficient of determination) for all three tops.
#r_sqd regresses each prediction column on the week number, so each value
#describes how linear that prediction is in week.
fit_columns_by_top = (
    ("Top 1", ('fit top 1', 'fit2 top 1', 'fit3 top 1')),
    ("Top 2", ('fit top 2', 'fit2 top 2', 'fit3 top 2')),
    ("Top 3", ('fit top 3', 'fit2 top 3', 'fit3 top 3')),
)
for position, (heading, fit_columns) in enumerate(fit_columns_by_top):
    #Separator goes between sections only, never before the first one
    if position > 0:
        print("----------------------------")
    print(heading)
    r_sqd(df['Week'], *[pred[column] for column in fit_columns])

Top 1
Fit 1 : 0.765994
Fit 2 : 0.718849
Fit 3 : 0.716124
----------------------------
Top 2
Fit 1 : 0.794184
Fit 2 : 0.720415
Fit 3 : 0.708068
----------------------------
Top 3
Fit 1 : 0.825671
Fit 2 : 0.792317
Fit 3 : 0.784597
