# Took Walmart Sales data from Kaggle for day 1 project

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("Walmart_Sales.csv")

In [3]:
df.head()

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,05-02-2010,1643690.9,0,42.31,2.572,211.096358,8.106
1,1,12-02-2010,1641957.44,1,38.51,2.548,211.24217,8.106
2,1,19-02-2010,1611968.17,0,39.93,2.514,211.289143,8.106
3,1,26-02-2010,1409727.59,0,46.63,2.561,211.319643,8.106
4,1,05-03-2010,1554806.68,0,46.5,2.625,211.350143,8.106


In [4]:
df.shape

(6435, 8)

In [5]:
df.isnull().any()

Store           False
Date            False
Weekly_Sales    False
Holiday_Flag    False
Temperature     False
Fuel_Price      False
CPI             False
Unemployment    False
dtype: bool

# Separating the columns into X and Y

In [6]:
y = df['Weekly_Sales']

In [23]:
X = df.drop(columns=['Weekly_Sales','Date'])

In [24]:
y.shape

(6435,)

In [25]:
X.shape

(6435, 6)

# Splitting the dataset into Train and Test data

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size= 0.2, random_state= 45)

In [28]:
X_train.shape

(5148, 6)

In [29]:
y_train.shape

(5148,)

In [30]:
X_test.shape

(1287, 6)

## Using Support Vector Regression Model 
Usually SVM is used for classification problem as it create a hyperplane that seperates the classifiers when they are plotted against features

In [31]:
from sklearn import svm

In [32]:
svm_model = svm.SVR()

In [33]:
svm_model.fit(X_train,y_train)

SVR()

In [34]:
svm_prediction=svm_model.predict(X_test)

In [35]:
from sklearn.metrics import mean_absolute_error

In [36]:
svm_mae = mean_absolute_error(y_test,svm_prediction)

In [37]:
svm_mae

454635.73960689927

# Very high mean absolute error. Will try Decision Tree Regressor Model 

A regression tree is basically a decision tree that is used for the task of regression which can be used to predict continuous valued outputs instead of discrete outputs. It selects the region on the basis of variance reduction condition. 

In [39]:
from sklearn.tree import DecisionTreeRegressor

In [40]:
dt_model=DecisionTreeRegressor()

In [41]:
dt_model.fit(X_train,y_train)

DecisionTreeRegressor()

In [42]:
dt_prediction= dt_model.predict(X_test)

In [43]:
dt_mae= mean_absolute_error(y_test,dt_prediction)

In [44]:
dt_mae

96374.34406371406

We will do Standardisation & Normalisation to features through StandardScaler

In [45]:
from sklearn.preprocessing import StandardScaler

In [46]:
std_scaler = StandardScaler()

In [48]:
scaled_X_train = pd.DataFrame(std_scaler.fit_transform(X_train),columns=X_train.columns, index=X_train.index)

In [49]:
scaled_X_test = pd.DataFrame(std_scaler.fit_transform(X_test),columns=X_test.columns, index=X_test.index)

In [50]:
scaled_X_test

Unnamed: 0,Store,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
1509,-0.898927,-0.269258,1.590550,0.631544,1.157172,-0.213764
3124,-0.070889,-0.269258,0.490199,0.983441,-0.788782,-0.158498
4606,0.757148,-0.269258,1.618750,-0.580300,-1.194113,0.810780
625,-1.350583,3.713907,-1.232184,-0.723258,1.018764,-0.709562
1410,-0.974203,-0.269258,1.497272,1.744418,-1.074405,-0.312073
...,...,...,...,...,...,...
4093,0.456043,-0.269258,-0.090617,0.587557,-0.930777,0.737447
3829,0.305491,-0.269258,-0.199079,1.546476,-0.801003,0.021116
5441,1.208804,-0.269258,-0.333031,-1.361070,0.924889,0.310731
647,-1.350583,-0.269258,1.500526,0.492985,1.080309,-0.765360


In [51]:
svm_model.fit(scaled_X_train,y_train)

SVR()

In [54]:
new_svm_prediction = svm_model.predict(scaled_X_test)

In [55]:
mean_absolute_error(y_test,new_svm_prediction)

454618.02293538436

# Linear Regression

In [59]:
from sklearn.linear_model import LinearRegression

In [60]:
lr_model= LinearRegression()

In [61]:
lr_model.fit(scaled_X_train, y_train)

LinearRegression()

In [62]:
lr_prediction = lr_model.predict(scaled_X_test)

In [63]:
lr_mae = mean_absolute_error(y_test, lr_prediction)

In [64]:
lr_mae

421941.01230603934