# 0.0 Configuration and Data Loading

## 0.1 Imports

In [13]:
import pandas as pd
import math
import inflection
import numpy as np
import seaborn as sns
import datetime

from scipy                             import stats as ss
from matplotlib                        import pyplot as plt
from matplotlib.gridspec               import GridSpec
from IPython.display                   import Image
from IPython.core.display              import HTML
from boruta                            import BorutaPy

from sklearn.preprocessing             import RobustScaler, MinMaxScaler, LabelEncoder
from sklearn.ensemble                  import RandomForestRegressor

## 0.2 Helper Functions

In [14]:
def jupyter_settings():
    
    %matplotlib inline
    
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 24
    display( HTML( '<style>.container { width:100% !important; }</style>') )
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option( 'display.expand_frame_repr', False )
    sns.set()

In [15]:
# Apply the plotting and pandas display configuration defined in jupyter_settings().
jupyter_settings()

## 0.3 Loading Data

In [16]:
# Load the two raw CSV files. low_memory=False makes pandas parse each file
# in a single pass rather than in chunks, avoiding mixed-type column inference.
df_sales_raw = pd.read_csv( 'data/train.csv', low_memory=False )
df_store_raw = pd.read_csv( 'data/store.csv', low_memory=False )

# Left join on 'Store': every row of df_sales_raw is kept and gains the
# columns of the matching df_store_raw row.
df_raw = df_sales_raw.merge( df_store_raw, how='left', on='Store' )

# 1.0 Data Description

## 1.1 Rename Columns

In [17]:
# Work on a copy so that changes made to df1 do not modify df_raw.
df1 = df_raw.copy()

In [18]:
# Show the current column labels.
df1.columns

Index(['Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo',
       'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment',
       'CompetitionDistance', 'CompetitionOpenSinceMonth',
       'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek',
       'Promo2SinceYear', 'PromoInterval'],
      dtype='object')

In [19]:
# Original CamelCase column labels, listed in the same order as df1.columns.
old_cols = [
    'Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo',
    'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment',
    'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear',
    'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval',
]


def snakecase( x ):
    """Return `x` converted to snake_case by inflection.underscore."""
    return inflection.underscore( x )


# snake_case counterpart of every label, order preserved.
new_cols = [snakecase( label ) for label in old_cols]

In [20]:
# Replace the column labels with new_cols. The assignment is positional, so
# new_cols must list the labels in the same order as the existing columns.
df1.columns = new_cols

In [21]:
# Show the column labels again to check the rename.
df1.columns

Index(['store', 'day_of_week', 'date', 'sales', 'customers', 'open', 'promo',
       'state_holiday', 'school_holiday', 'store_type', 'assortment',
       'competition_distance', 'competition_open_since_month',
       'competition_open_since_year', 'promo2', 'promo2_since_week',
       'promo2_since_year', 'promo_interval'],
      dtype='object')

## 1.2 Data Dimensions

In [22]:
# Report the frame's dimensions: shape[0] is the row count, shape[1] the column count.
print( 'Number of rows: {}'.format( df1.shape[0] ) )
print( 'Number of cols: {}'.format( df1.shape[1] ) )

Name of rows: 1017209
Name of cols: 18


## 1.3 Data Types

In [23]:
# Parse the 'date' column into pandas datetimes, overwriting the column in df1,
# then list the dtype of every column to check the result.
df1['date'] = pd.to_datetime( df1['date'] )
df1.dtypes

store                                    int64
day_of_week                              int64
date                            datetime64[ns]
sales                                    int64
customers                                int64
open                                     int64
promo                                    int64
state_holiday                           object
school_holiday                           int64
store_type                              object
assortment                              object
competition_distance                   float64
competition_open_since_month           float64
competition_open_since_year            float64
promo2                                   int64
promo2_since_week                      float64
promo2_since_year                      float64
promo_interval                          object
dtype: object

## 1.4 Check NA

In [24]:
# Count the missing values in each column.
df1.isna().sum()

store                                0
day_of_week                          0
date                                 0
sales                                0
customers                            0
open                                 0
promo                                0
state_holiday                        0
school_holiday                       0
store_type                           0
assortment                           0
competition_distance              2642
competition_open_since_month    323348
competition_open_since_year     323348
promo2                               0
promo2_since_week               508031
promo2_since_year               508031
promo_interval                  508031
dtype: int64