## Data wrangling - 'test' dataset

In [22]:
import pandas as pd
from pandas import DataFrame

#### Read dataset from file

In [23]:
# Load the raw 'test' dataset from its CSV file into a DataFrame.
test_csv_path = './data/test.csv'
test: DataFrame = pd.read_csv(test_csv_path)

#### Split Store_Dept to separate attributes

In [24]:
# Break the combined 'Store_Dept' key apart on '_' into one column per piece,
# then copy the first piece to 'Store' and the second to 'Dept'.
store_dept = test['Store_Dept'].str.split(pat='_', expand=True)
for position, column in enumerate(('Store', 'Dept')):
    test[column] = store_dept[position]

#### Split Month & Year from Date

In [25]:
# Parse the 'Date' strings (month/day/two-digit year) into datetime values.
test['Date'] = pd.to_datetime(test['Date'], format='%m/%d/%y')
# Derive the calendar parts with the vectorized .dt accessor rather than
# mapping a Python lambda over every row.
test['Year'] = test['Date'].dt.year
test['Month'] = test['Date'].dt.month

#### Determine whether the row has any of the markdowns populated

In [26]:
# A row has "no markdowns" when all five MarkDown columns are missing;
# 'HasMarkDown' is the logical negation of that condition.
markdown_columns = [f'MarkDown{i}' for i in range(1, 6)]
no_markdowns = test[markdown_columns].isna().all(axis=1)
test['HasMarkDown'] = ~no_markdowns

#### Clearing NA values

In [27]:
# Replace missing values with 0 in all five markdown columns in one assignment.
cols = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
test[cols] = test[cols].fillna(0)

#### Datatype conversion

In [28]:
# NOTE: a disabled variant of this cell (written against a frame named
# `master`) mapped HasMarkDown / IsHoliday to 'Y'/'N' strings first; here the
# values are converted to categoricals as they are.

# Convert the discrete attributes to the pandas 'category' dtype, then drop
# the combined 'Store_Dept' key, which has already been split into
# 'Store' and 'Dept'.
cols = ['Store', 'Dept', 'Type', 'IsHoliday', 'HasMarkDown', 'Year', 'Month']
test = (
    test
    .astype({name: 'category' for name in cols})
    .drop(columns='Store_Dept')
)

# Print the column dtypes and non-null counts.
test.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76903 entries, 0 to 76902
Data columns (total 19 columns):
Weekly_Sales    76903 non-null float64
Date            76903 non-null datetime64[ns]
IsHoliday       76903 non-null category
Temperature     76903 non-null float64
Fuel_Price      76903 non-null float64
MarkDown1       76903 non-null float64
MarkDown2       76903 non-null float64
MarkDown3       76903 non-null float64
MarkDown4       76903 non-null float64
MarkDown5       76903 non-null float64
CPI             76903 non-null float64
Unemployment    76903 non-null float64
Type            76903 non-null category
Size            76903 non-null int64
Store           76903 non-null category
Dept            76903 non-null category
Year            76903 non-null category
Month           76903 non-null category
HasMarkDown     76903 non-null category
dtypes: category(7), datetime64[ns](1), float64(10), int64(1)
memory usage: 7.6 MB


#### Dummy coding

In [29]:
# One-hot encode the categorical attributes in a single call. Each indicator
# column is named '<attribute>_<category>'; the new columns are appended after
# the existing ones, grouped in the order the attributes are listed here.
dummy_columns = ['Type', 'IsHoliday', 'HasMarkDown', 'Month', 'Dept']
dummies = pd.get_dummies(test[dummy_columns], columns=dummy_columns)
test = pd.concat([test, dummies], axis=1)

# Guarantee that a 'HasMarkDown_False' indicator exists. When every row has a
# markdown, get_dummies only emits 'HasMarkDown_True', so a zero-filled column
# is added. The membership check prevents overwriting a real indicator column
# when the data does contain rows without markdowns.
# NOTE(review): presumably needed so the column set matches a companion
# dataset - confirm.
if 'HasMarkDown_False' not in test.columns:
    test['HasMarkDown_False'] = 0

# Print the dtypes of the widened frame.
test.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76903 entries, 0 to 76902
Data columns (total 113 columns):
Weekly_Sales         float64
Date                 datetime64[ns]
IsHoliday            category
Temperature          float64
Fuel_Price           float64
MarkDown1            float64
MarkDown2            float64
MarkDown3            float64
MarkDown4            float64
MarkDown5            float64
CPI                  float64
Unemployment         float64
Type                 category
Size                 int64
Store                category
Dept                 category
Year                 category
Month                category
HasMarkDown          category
Type_A               uint8
Type_B               uint8
Type_C               uint8
IsHoliday_False      uint8
IsHoliday_True       uint8
HasMarkDown_True     uint8
Month_5              uint8
Month_6              uint8
Month_7              uint8
Month_8              uint8
Month_9              uint8
Month_10             uint8
Dep

In [30]:
# Persist the wrangled frame as a pickle file and report completion.
pickle_path = './data/test.pickle'
test.to_pickle(pickle_path)
print('DataFrame stored in pickle file...')


DataFrame stored in pickle file...
