## Exploratory Data Analysis

## Setup

In [1]:
import pandas as pd 
import numpy as np 

import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.express as px

## Import Data

In [2]:
# Auxiliary tables: holiday calendar, oil prices, store metadata, store traffic.
holidays = pd.read_csv("../../datasets/holidays_events.csv")
oil = pd.read_csv("../../datasets/oil.csv")
stores = pd.read_csv("../../datasets/stores.csv")
transactions = pd.read_csv("../../datasets/transactions.csv")

# Modelling tables.
train = pd.read_csv("../../datasets/train.csv")
test = pd.read_csv("../../datasets/test.csv")

# Every table except `stores` has its "date" column converted from text to
# datetime64 so that date-based merges and group-bys line up.
for dated_frame in (holidays, oil, transactions, train, test):
    dated_frame["date"] = pd.to_datetime(dated_frame["date"])

### Oil 

In [3]:
# Preview the first rows of the oil table; note the very first price
# (2013-01-01) is already missing.
oil.head()

Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2


In [4]:
# Check dtypes and non-null counts: `dcoilwtico` has fewer non-null values
# than there are rows, i.e. some prices are missing.
oil.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1218 entries, 0 to 1217
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   date        1218 non-null   datetime64[ns]
 1   dcoilwtico  1175 non-null   float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 19.2 KB


In [5]:
# Every calendar day in the study window. Its length, compared with the row
# count of `oil`, shows how many days have no oil record at all.
pd.date_range("2013-01-01", "2017-08-15", freq="D")

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06', '2013-01-07', '2013-01-08',
               '2013-01-09', '2013-01-10',
               ...
               '2017-08-06', '2017-08-07', '2017-08-08', '2017-08-09',
               '2017-08-10', '2017-08-11', '2017-08-12', '2017-08-13',
               '2017-08-14', '2017-08-15'],
              dtype='datetime64[ns]', length=1688, freq='D')

#### Interpolation

In [6]:
# Lay the oil prices onto a gap-free daily calendar (days without an oil
# record become NaN after the left merge), then fill the gaps by
# interpolation into a separate column so the raw series stays available.
# NOTE(review): interpolate() with its defaults only fills forward, so the
# missing price on the first day (2013-01-01) stays NaN -- confirm that is
# acceptable downstream.
daily_calendar = pd.DataFrame(
    {"date": pd.date_range(start="2013-01-01", end="2017-08-15")}
)
oil_interpolated = (
    daily_calendar
    .merge(oil, on="date", how="left")
    .assign(dcoilwtico_interpolated=lambda frame: frame["dcoilwtico"].interpolate())
)

In [7]:
# Overlay the interpolated series on the raw one to eyeball how the gaps were
# filled. A title and explicit labels are set so the figure stands on its own
# (wide-form plotly express otherwise labels the axis "value" and the legend
# "variable").
px.line(
    oil_interpolated,
    x="date",
    y=["dcoilwtico_interpolated", "dcoilwtico"],
    title="Daily oil price: raw vs. interpolated",
    labels={"date": "Date", "value": "Oil price (dcoilwtico)", "variable": "Series"},
)

In [8]:
# Sum transactions over all stores for each day, then attach that daily total
# to the oil calendar. The left join keeps every calendar day, including days
# with no transaction record.
daily_transactions = transactions.groupby("date")["transactions"].sum()
temp = oil_interpolated.merge(daily_transactions, how="left", on="date")

In [9]:
# Pairwise correlation of every column with the daily transaction total.
temp.corr().loc[:, "transactions"]

date                       0.258037
dcoilwtico                -0.303674
dcoilwtico_interpolated   -0.244957
transactions               1.000000
Name: transactions, dtype: float64

In [10]:
# Total sales per calendar day, summed over every row of the training set.
daily_sales = train.groupby("date")["sales"].sum()
daily_sales

date
2013-01-01      2511.618999
2013-01-02    496092.417944
2013-01-03    361461.231124
2013-01-04    354459.677093
2013-01-05    477350.121229
                  ...      
2017-08-11    826373.722022
2017-08-12    792630.535079
2017-08-13    865639.677471
2017-08-14    760922.406081
2017-08-15    762661.935939
Name: sales, Length: 1684, dtype: float64

In [11]:
# Add total daily sales to the oil/transactions table and show the full
# correlation matrix. The join is the default inner join, so only days present
# in both tables are kept.
sales_per_day = train.groupby("date")["sales"].sum()
temp.merge(sales_per_day, on="date", how="inner").corr()

Unnamed: 0,date,dcoilwtico,dcoilwtico_interpolated,transactions,sales
date,1.0,-0.839544,-0.838776,0.258037,0.718394
dcoilwtico,-0.839544,1.0,1.0,-0.303674,-0.705002
dcoilwtico_interpolated,-0.838776,1.0,1.0,-0.244957,-0.627968
transactions,0.258037,-0.303674,-0.244957,1.0,0.676509
sales,0.718394,-0.705002,-0.627968,0.676509,1.0


#### Results of hypothesis testing:
##### Hypotheses Tested:

1. __Interpolation of oil prices and testing the correlation with target/transactions__ 

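    After interpolation, oil prices show a moderately strong negative correlation with total daily sales (r ≈ -0.63, versus r ≈ -0.71 for the raw, non-interpolated prices), but only a weak negative correlation with total daily transactions (r ≈ -0.24). Note that oil price and sales are both strongly correlated with the date itself (r ≈ -0.84 and r ≈ 0.72 respectively), so part of this relationship may simply reflect opposite long-term trends.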
