In [37]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from datetime import datetime as dt

import warnings
# Silence every warning category for the rest of the session. Note that this
# also hides deprecation notices raised by the libraries used below.
warnings.filterwarnings("ignore")

#### Loading the Walmart Sales Dataset

In [38]:
# Load the raw dataset; the relative path is resolved against the notebook's
# current working directory.
sales = pd.read_csv("Walmart_Sales.csv")

In [39]:
# Preview the first rows (head() defaults to five) to confirm the columns loaded as expected.
sales.head()

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,05-02-2010,1643690.9,0,42.31,2.572,211.096358,8.106
1,1,12-02-2010,1641957.44,1,38.51,2.548,211.24217,8.106
2,1,19-02-2010,1611968.17,0,39.93,2.514,211.289143,8.106
3,1,26-02-2010,1409727.59,0,46.63,2.561,211.319643,8.106
4,1,05-03-2010,1554806.68,0,46.5,2.625,211.350143,8.106


#### We first check whether there are any null or NA values that need to be filled in.
#### Looks like we do not need to worry about any null values in our dataset.

In [40]:
# Per-column count of missing entries.
sales.isna().sum()

Store           0
Date            0
Weekly_Sales    0
Holiday_Flag    0
Temperature     0
Fuel_Price      0
CPI             0
Unemployment    0
dtype: int64

#### Let us take a look at the datatypes of attributes we are dealing with.

In [11]:
# Inspect the dtype inferred for each column; in the output shown, Date is
# `object` (plain strings), not a datetime type.
sales.dtypes

Store             int64
Date             object
Weekly_Sales    float64
Holiday_Flag      int64
Temperature     float64
Fuel_Price      float64
CPI             float64
Unemployment    float64
dtype: object

#### Let us use the describe() method to get a statistical summary of the data.<br>No abnormally large or small (out-of-range) values stand out.

In [41]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
sales.describe()

Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
count,6435.0,6435.0,6435.0,6435.0,6435.0,6435.0,6435.0
mean,23.0,1046965.0,0.06993,60.663782,3.358607,171.578394,7.999151
std,12.988182,564366.6,0.255049,18.444933,0.45902,39.356712,1.875885
min,1.0,209986.2,0.0,-2.06,2.472,126.064,3.879
25%,12.0,553350.1,0.0,47.46,2.933,131.735,6.891
50%,23.0,960746.0,0.0,62.67,3.445,182.616521,7.874
75%,34.0,1420159.0,0.0,74.94,3.735,212.743293,8.622
max,45.0,3818686.0,1.0,100.14,4.468,227.232807,14.313


### Preliminary Data Modifications

#### We will parse the Date attribute (DD-MM-YYYY strings) and split it into separate Day, Month and Year columns.

In [43]:
# Work on a copy so the raw `sales` frame loaded from the CSV stays untouched.
sales_df = sales.copy()
dates = np.array(sales.Date)

# Parse all "DD-MM-YYYY" strings in a single vectorised call rather than a
# per-row strptime loop. The explicit format rules out day/month ambiguity and
# still raises on any value that does not match it.
parsed_dates = pd.to_datetime(dates, format="%d-%m-%Y")
day = parsed_dates.day.tolist()
month = parsed_dates.month.tolist()
year = parsed_dates.year.tolist()

# Place the calendar components directly after "Store" (column positions 1-3).
sales_df.insert(1, "Day", day)
sales_df.insert(2, "Month", month)
sales_df.insert(3, "Year", year)

# Display the frame without the raw "Date" string. The columns are named by
# keyword because pandas 2.0 no longer accepts `axis` as a positional argument
# to drop(). The result is only displayed, not assigned, so `sales_df` itself
# still carries its "Date" column after this cell.
sales_df.drop(columns=["Date"])

Unnamed: 0,Store,Day,Month,Year,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,5,2,2010,1643690.90,0,42.31,2.572,211.096358,8.106
1,1,12,2,2010,1641957.44,1,38.51,2.548,211.242170,8.106
2,1,19,2,2010,1611968.17,0,39.93,2.514,211.289143,8.106
3,1,26,2,2010,1409727.59,0,46.63,2.561,211.319643,8.106
4,1,5,3,2010,1554806.68,0,46.50,2.625,211.350143,8.106
...,...,...,...,...,...,...,...,...,...,...
6430,45,28,9,2012,713173.95,0,64.88,3.997,192.013558,8.684
6431,45,5,10,2012,733455.07,0,64.89,3.985,192.170412,8.667
6432,45,12,10,2012,734464.36,0,54.47,4.000,192.327265,8.667
6433,45,19,10,2012,718125.53,0,56.47,3.969,192.330854,8.667


list