In [1]:
# used for manipulating directory paths
import os
import pandas as pd
import numpy as np

# Plotting library
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # needed to plot 3-D surfaces

# tells matplotlib to embed plots within the notebook
%matplotlib inline

In [2]:
df = pd.read_csv('train.csv')
msk = np.random.rand(len(df)) <= 0.8

train = df[msk]
test = df[~msk]
train.head()
len(train)

337396

In [20]:
# getting the correlation matrix
train.apply(lambda x: x.factorize()[0]).corr()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
Store,1.0,0.014862,0.004034,0.1494,-0.001042
Dept,0.014862,1.0,0.00201,0.046698,0.000812
Date,0.004034,0.00201,1.0,0.003156,-0.042377
Weekly_Sales,0.1494,0.046698,0.003156,1.0,0.005764
IsHoliday,-0.001042,0.000812,-0.042377,0.005764,1.0


In [3]:
test.head()
len(test)

84174

In [4]:
train=train.replace('-','',regex=True).astype(int)
train = train.replace('FALSE', '0', regex=True).astype(int)

test=test.replace('-','',regex=True).astype(int)
test = test.replace('FALSE', '0', regex=True).astype(int)

In [6]:
x1 = train["Store"]
x2 = train["Dept"]
x3 = train["IsHoliday"]
y = train["Weekly_Sales"]
m_train = y.size
print(x1.shape)
print(x2.shape)
print(x3.shape)


(337396,)
(337396,)
(337396,)


In [7]:
# Convert our variables datatype from series to array :

x1 = np.array(x1)
x2 = np.array(x2)
x3 = np.array(x3)

x1 = x1
x2 = np.power(x2,2)
x3 = np.power(x3,3)

y = np.array(y)

In [8]:
# Create a “ones” matrix :
x_bias = np.ones((m_train,1))

# Reshape our data so that we can perform operations like addition and multiplication with x_bias

x1_new = np.reshape(x1,(m_train,1))
x2_new = np.reshape(x2,(m_train,1))
x3_new = np.reshape(x3,(m_train,1))


#Create a major matrix with all the columns:

x_new = np.append(x_bias,x1_new,axis=1)
x_new = np.append(x_new,x2_new,axis=1)
x_new = np.append(x_new,x3_new,axis=1)


x_new

array([[1.000e+00, 1.000e+00, 1.000e+00, 0.000e+00],
       [1.000e+00, 1.000e+00, 1.000e+00, 1.000e+00],
       [1.000e+00, 1.000e+00, 1.000e+00, 0.000e+00],
       ...,
       [1.000e+00, 4.500e+01, 9.604e+03, 0.000e+00],
       [1.000e+00, 4.500e+01, 9.604e+03, 0.000e+00],
       [1.000e+00, 4.500e+01, 9.604e+03, 0.000e+00]])

In [10]:
#Find transpose of a matrix :

x_new_transpose = np.transpose(x_new)

#Perform multiplication:

x_new_transpose_dot_x_new = x_new_transpose.dot(x_new)

#AFter regularization
lambd = -18
IdentityMatrix= np.zeros((4, 4))
#x_new_transpose_dot_x_new = x_new_transpose_dot_x_new + (lambd*IdentityMatrix)

# find the inverse

temp_1 = np.linalg.inv(x_new_transpose_dot_x_new + (lambd*IdentityMatrix))

#perform multiplication

temp_2 = x_new_transpose.dot(y)

#Finding coefficients :

theta = temp_1.dot(temp_2)

theta

array([ 1.50100341e+04, -1.64992277e+02,  1.58022873e+00,  1.13328705e+03])

In [11]:
x1_test = test["Store"]
x2_test = test["Dept"]
x3_test = test["IsHoliday"]
y_test = test["Weekly_Sales"]
m_test=x1_test.size
print(x1_test.shape)
print(x2_test.shape)
print(x3_test.shape)

(84174,)
(84174,)
(84174,)


In [12]:
# Convert our variables datatype from series to array :

x1_test = np.array(x1_test)
x2_test = np.array(x2_test)
x3_test = np.array(x3_test)

x1_test = x1_test
x2_test = np.power(x2_test,2)
x3_test = np.power(x3_test,3)

y_test = np.array(y_test)

In [18]:
# Create a “ones” matrix :
x_bias = np.ones((m_test,1))

# Reshape our data so that we can perform operations like addition and multiplication with x_bias

x1_new_test = np.reshape(x1_test,(m_test,1))
x2_new_test = np.reshape(x2_test,(m_test,1))
x3_new_test = np.reshape(x3_test,(m_test,1))


#Create a major matrix with all the columns:

x_new_test = np.append(x_bias,x1_new_test,axis=1)
x_new_test = np.append(x_new_test,x2_new_test,axis=1)
x_new_test = np.append(x_new_test,x3_new_test,axis=1)


0.0


In [14]:
y_predict = np.zeros(m_test)
for i in range(m_test):
    sum = 0
    for n in range(3):
        sum += theta[n] * x_new_test[i][n]
    y_predict[i] = sum
print(y_predict)


print(y_test)

[14846.62202989 14846.62202989 14846.62202989 ... 22761.89835457
 22761.89835457 22761.89835457]
[42960 14773 16637 ...   605   467   760]


In [19]:
#evaluation
def WMAE(X_test, y_predict, y_test):
    sum = 0
    W = 0
    for i in range(m_test):
        if X_test[i][3] == 0:
            sum += abs(y_predict[i]-y_test[i])
            W += 1
        else:
            sum += 5 * abs(y_predict[i]-y_test[i])
            W += 5
    return sum / W

print(WMAE(x_new_test,y_predict,y_test))

14994.01896558244
