In [1]:
# Importing Important Libraries

import pandas as pd
import pickle
from downcast import reduce
from tqdm import tqdm # for creating progress bars
from sklearn.preprocessing import LabelEncoder

import os
os.chdir('C:\\Users\\91958\\Desktop\\Datasets\\Wallmart Dataset')

import warnings
warnings.filterwarnings("ignore")
import gc
gc.collect()

0

In [2]:
# loading our saved pickle file from disk

with open('final_df.pkl','rb') as f:
    final_df = pickle.load(f)

In [3]:
# checking the data
final_df.head(2)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,quantity sold,date,wm_yr_wk,...,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,2011,no_event,no_event,no_event,no_event,0,0,0,
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,2011,no_event,no_event,no_event,no_event,0,0,0,


In [4]:
# function to split day number from feature d

def d_split(input):
    """
    This function is used to extract day number from d feature e.g.,: d_21 => 21
    """
    return input.split('_')[1]


In [5]:
# mapping split function in our d column

final_df['day']=final_df['d'].map(d_split).astype('int16')
# final_df['day'] = final_df['day'].astype('int8')

#### Reducing the datasize due to limited resource available

In [6]:
# considering data starting from day 1000

final_df_ = final_df[final_df['day']>=1400]

# tried to consider day >1000 but was getting memory issue as memory size it took was 6.9GB

In [7]:
final_df_.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,quantity sold,date,wm_yr_wk,...,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,day
42655510,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1400,0,2014-11-28,11443,...,2014,no_event,no_event,no_event,no_event,0,0,0,8.257812,1400
42655511,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1400,0,2014-11-28,11443,...,2014,no_event,no_event,no_event,no_event,0,0,0,3.970703,1400
42655512,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1400,0,2014-11-28,11443,...,2014,no_event,no_event,no_event,no_event,0,0,0,2.970703,1400
42655513,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1400,3,2014-11-28,11443,...,2014,no_event,no_event,no_event,no_event,0,0,0,4.640625,1400
42655514,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1400,0,2014-11-28,11443,...,2014,no_event,no_event,no_event,no_event,0,0,0,2.880859,1400


###### Key Observations:
- Column d has now all int values
- We can observe some null values in sell_price column

In [8]:
# Handling missing value for sell_price feature - mean imputation

final_df_['sell_price'].fillna(final_df_.groupby(['store_id','item_id'])['sell_price'].transform('mean'),
                              inplace=True)

In [9]:
final_df_.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16525580 entries, 42655510 to 59181089
Data columns (total 23 columns):
 #   Column         Dtype         
---  ------         -----         
 0   id             category      
 1   item_id        category      
 2   dept_id        category      
 3   cat_id         category      
 4   store_id       category      
 5   state_id       category      
 6   d              object        
 7   quantity sold  int16         
 8   date           datetime64[ns]
 9   wm_yr_wk       int16         
 10  weekday        category      
 11  wday           int8          
 12  month          int8          
 13  year           int16         
 14  event_name_1   category      
 15  event_type_1   category      
 16  event_name_2   category      
 17  event_type_2   category      
 18  snap_CA        int8          
 19  snap_TX        int8          
 20  snap_WI        int8          
 21  sell_price     float16       
 22  day            int16         
dty

In [10]:
set(final_df_['year'])

{2014, 2015, 2016}

###### Key Observations:
- We are considering 3 years of past data now

### 1. Time Based Features

- While doing EDA we found that there is a trend in sales
    - Sales are high on weekend i.e, saturday and sunday 
    - Sales are high on Feb, March, April and May
    - On Christmas sales goes to Zero
    

##### a. Creating new features based on our EDA

In [11]:
# feature 1 name : if_weekend

weekend = ['saturday','sunday']
def if_weekend(val):
    """ This function
    returns 1 : for saturday and sunday
    returns 0 : otherwise
    """
    if val in weekend:
        return 1
    else:
        return 0

    
# feature 2 name: if_month_seasonality
months = [2,3,4,5]
def if_month_season(val):
    """ This function
    returns 1 : for February, March, April and May month
    returns 0 : otherwise
    """
    if val in months:
        return 1
    else:
        return 0


# feature 3 name: if_christmas
def if_christmas(val):
    if val == 'Christmas':
        return 1
    else:
        return 0
    
    

In [12]:
final_df_['if_weekend'] = final_df_['weekday'].map(if_weekend)
final_df_['if_month_season'] = final_df_['month'].map(if_month_season)
final_df_['if_christmas'] = final_df_['event_name_2'].map(if_christmas)

In [13]:
final_df_.head(2)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,quantity sold,date,wm_yr_wk,...,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,day,if_weekend,if_month_season,if_christmas
42655510,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1400,0,2014-11-28,11443,...,no_event,no_event,0,0,0,8.257812,1400,0,0,0
42655511,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1400,0,2014-11-28,11443,...,no_event,no_event,0,0,0,3.970703,1400,0,0,0


### 2. Label Encoding 

In [14]:
label_encode = LabelEncoder()

# list with all the categorical features
category =['id','item_id','dept_id','cat_id','store_id','state_id','event_name_1','event_type_1','event_name_2','event_type_2']
for cat in tqdm(category):
    final_df_[cat+'_label'] = label_encode.fit_transform(final_df_[cat])

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:26<00:00,  2.64s/it]


In [15]:
final_df_.tail()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,quantity sold,date,wm_yr_wk,...,id_label,item_id_label,dept_id_label,cat_id_label,store_id_label,state_id_label,event_name_1_label,event_type_1_label,event_name_2_label,event_type_2_label
59181085,FOODS_3_823_WI_3_evaluation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,d_1941,1,2016-05-22,11617,...,14329,1432,2,0,9,2,30,4,0,0
59181086,FOODS_3_824_WI_3_evaluation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,d_1941,0,2016-05-22,11617,...,14339,1433,2,0,9,2,30,4,0,0
59181087,FOODS_3_825_WI_3_evaluation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,d_1941,2,2016-05-22,11617,...,14349,1434,2,0,9,2,30,4,0,0
59181088,FOODS_3_826_WI_3_evaluation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,d_1941,0,2016-05-22,11617,...,14359,1435,2,0,9,2,30,4,0,0
59181089,FOODS_3_827_WI_3_evaluation,FOODS_3_827,FOODS_3,FOODS,WI_3,WI,d_1941,1,2016-05-22,11617,...,14369,1436,2,0,9,2,30,4,0,0


In [16]:
# downcasting the dataframe to reduce memory usage

final_df_ = reduce(final_df_)

In [17]:
final = final_df_.reset_index(drop=True)

In [18]:
final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16525580 entries, 0 to 16525579
Data columns (total 36 columns):
 #   Column              Dtype         
---  ------              -----         
 0   id                  category      
 1   item_id             category      
 2   dept_id             category      
 3   cat_id              category      
 4   store_id            category      
 5   state_id            category      
 6   d                   category      
 7   quantity sold       int16         
 8   date                datetime64[ns]
 9   wm_yr_wk            int16         
 10  weekday             category      
 11  wday                int8          
 12  month               int8          
 13  year                int16         
 14  event_name_1        category      
 15  event_type_1        category      
 16  event_name_2        category      
 17  event_type_2        category      
 18  snap_CA             int8          
 19  snap_TX             int8          
 20  

In [19]:
# final.to_pickle("final_Encoded.pkl")

### 3. Lag Features

In [53]:
# https://www.analyticsvidhya.com/blog/2019/12/6-powerful-feature-engineering-techniques-time-series/
# Feature engineering ideas for time series problem 

- in EDA i found that we cant found periodicity in month or in year but we can find the periodicity in weeks.in that concern to capture periodic info "lag" is one technique to capture periodicity.
- not forget we are going to take lags of target variable. which is "quantity sold"
- lets gather the data into single dataFrame.

######  Features Name: [ 'lag_1', 'lag_7', 'lag_14' , 'lag_21' , 'lag_28' , 'lag_35' , 'lag_42' , 'lag_49' , 'lag_56' ]

In [54]:
lags = [1,7,14,21,28,35,42,49,56]
for i in tqdm(lags):
    final_df_['lag_'+str(i)] = final_df_.groupby(['id'])['quantity sold'].shift(i)


100%|██████████| 9/9 [00:04<00:00,  2.15it/s]


In [55]:
final_df_.head(10)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,quantity sold,date,wm_yr_wk,...,if_christmas,lag_1,lag_7,lag_14,lag_21,lag_28,lag_35,lag_42,lag_49,lag_56
42655510,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1400,0,2014-11-28,11443,...,0,,,,,,,,,
42655511,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1400,0,2014-11-28,11443,...,0,,,,,,,,,
42655512,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1400,0,2014-11-28,11443,...,0,,,,,,,,,
42655513,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1400,3,2014-11-28,11443,...,0,,,,,,,,,
42655514,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1400,0,2014-11-28,11443,...,0,,,,,,,,,
42655515,HOBBIES_1_006_CA_1_evaluation,HOBBIES_1_006,HOBBIES_1,HOBBIES,CA_1,CA,d_1400,1,2014-11-28,11443,...,0,,,,,,,,,
42655516,HOBBIES_1_007_CA_1_evaluation,HOBBIES_1_007,HOBBIES_1,HOBBIES,CA_1,CA,d_1400,0,2014-11-28,11443,...,0,,,,,,,,,
42655517,HOBBIES_1_008_CA_1_evaluation,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,d_1400,5,2014-11-28,11443,...,0,,,,,,,,,
42655518,HOBBIES_1_009_CA_1_evaluation,HOBBIES_1_009,HOBBIES_1,HOBBIES,CA_1,CA,d_1400,0,2014-11-28,11443,...,0,,,,,,,,,
42655519,HOBBIES_1_010_CA_1_evaluation,HOBBIES_1_010,HOBBIES_1,HOBBIES,CA_1,CA,d_1400,1,2014-11-28,11443,...,0,,,,,,,,,


###### Key Observations:
- New features have been generated 
- For these new features we can see lots of Null values

In [56]:
# filling null values with 0 

for col in final_df_.columns:
    if 'lag_' in col:
        final_df_[col]=final_df_[col].fillna(0)

In [57]:
final_df_.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,quantity sold,date,wm_yr_wk,...,if_christmas,lag_1,lag_7,lag_14,lag_21,lag_28,lag_35,lag_42,lag_49,lag_56
42655510,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1400,0,2014-11-28,11443,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42655511,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1400,0,2014-11-28,11443,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42655512,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1400,0,2014-11-28,11443,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42655513,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1400,3,2014-11-28,11443,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42655514,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1400,0,2014-11-28,11443,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Rolling Window Feature

- Rolling is a very useful operation for time series data.

- Rolling means creating a rolling window with a specified size & perform calculations on data in this window which of course rolls through data

- Here we have computing Rolling-Mean on 'quantity sold' column.

- The maximum Window size taken is 42

##### Features Name: [ ' rolling mean 7',' rolling mean 14 ',' rolling mean 28 ',' rolling mean 35 ', ' rolling mean 42 ']

In [58]:
# https://pandas.pydata.org/docs/reference/api/pandas.Series.rolling.html
# #https://stackoverflow.com/questions/13996302/python-rolling-functions-for-groupby-object
##https://www.geeksforgeeks.org/python-pandas-dataframe-transform/

window = [7,14,28,35,42]

for i in tqdm(window):
    func = lambda x: x.rolling(i).median()
    final_df_['rolling median_'+str(i)] = final_df_.groupby(['id'])['quantity sold'].transform(func)


100%|██████████| 5/5 [01:50<00:00, 22.05s/it]


### Expanding Window Feature
- Advance version of rolling window
- Takes all the past values into consideration

In [59]:
func = lambda x: x.expanding().median()
final_df_['expanding median'] = final_df_.groupby(['id'])['quantity sold'].transform(func)

In [60]:
final_df_.head(10)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,quantity sold,date,wm_yr_wk,...,lag_35,lag_42,lag_49,lag_56,rolling median_7,rolling median_14,rolling median_28,rolling median_35,rolling median_42,expanding median
42655510,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1400,0,2014-11-28,11443,...,0.0,0.0,0.0,0.0,,,,,,0.0
42655511,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1400,0,2014-11-28,11443,...,0.0,0.0,0.0,0.0,,,,,,0.0
42655512,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1400,0,2014-11-28,11443,...,0.0,0.0,0.0,0.0,,,,,,0.0
42655513,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1400,3,2014-11-28,11443,...,0.0,0.0,0.0,0.0,,,,,,3.0
42655514,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1400,0,2014-11-28,11443,...,0.0,0.0,0.0,0.0,,,,,,0.0
42655515,HOBBIES_1_006_CA_1_evaluation,HOBBIES_1_006,HOBBIES_1,HOBBIES,CA_1,CA,d_1400,1,2014-11-28,11443,...,0.0,0.0,0.0,0.0,,,,,,1.0
42655516,HOBBIES_1_007_CA_1_evaluation,HOBBIES_1_007,HOBBIES_1,HOBBIES,CA_1,CA,d_1400,0,2014-11-28,11443,...,0.0,0.0,0.0,0.0,,,,,,0.0
42655517,HOBBIES_1_008_CA_1_evaluation,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,d_1400,5,2014-11-28,11443,...,0.0,0.0,0.0,0.0,,,,,,5.0
42655518,HOBBIES_1_009_CA_1_evaluation,HOBBIES_1_009,HOBBIES_1,HOBBIES,CA_1,CA,d_1400,0,2014-11-28,11443,...,0.0,0.0,0.0,0.0,,,,,,0.0
42655519,HOBBIES_1_010_CA_1_evaluation,HOBBIES_1_010,HOBBIES_1,HOBBIES,CA_1,CA,d_1400,1,2014-11-28,11443,...,0.0,0.0,0.0,0.0,,,,,,1.0


###### Key Observation:
- We can see lots of null values for rolling median, because of 0 sales for some days and we are taking median of it


In [61]:
# removing null values for rolling window and Expanding feature by replacing it with 0

cat = ['rolling median_7','rolling median_14','rolling median_28','rolling median_35','rolling median_42','expanding median']
final_df_[cat] = final_df_[cat].fillna(0)

In [62]:
final_df_.tail(12)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,quantity sold,date,wm_yr_wk,...,lag_35,lag_42,lag_49,lag_56,rolling median_7,rolling median_14,rolling median_28,rolling median_35,rolling median_42,expanding median
59181078,FOODS_3_816_WI_3_evaluation,FOODS_3_816,FOODS_3,FOODS,WI_3,WI,d_1941,0,2016-05-22,11617,...,0.0,0.0,11.0,6.0,0.0,0.0,0.0,0.0,0.0,5.0
59181079,FOODS_3_817_WI_3_evaluation,FOODS_3_817,FOODS_3,FOODS,WI_3,WI,d_1941,1,2016-05-22,11617,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
59181080,FOODS_3_818_WI_3_evaluation,FOODS_3_818,FOODS_3,FOODS,WI_3,WI,d_1941,0,2016-05-22,11617,...,2.0,4.0,0.0,0.0,2.0,3.0,2.0,2.0,2.0,1.0
59181081,FOODS_3_819_WI_3_evaluation,FOODS_3_819,FOODS_3,FOODS,WI_3,WI,d_1941,1,2016-05-22,11617,...,0.0,3.0,3.0,1.0,4.0,1.5,2.0,2.0,2.0,1.0
59181082,FOODS_3_820_WI_3_evaluation,FOODS_3_820,FOODS_3,FOODS,WI_3,WI,d_1941,1,2016-05-22,11617,...,0.0,2.0,3.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0
59181083,FOODS_3_821_WI_3_evaluation,FOODS_3_821,FOODS_3,FOODS,WI_3,WI,d_1941,4,2016-05-22,11617,...,1.0,4.0,1.0,1.0,1.0,1.0,1.0,0.0,0.5,1.0
59181084,FOODS_3_822_WI_3_evaluation,FOODS_3_822,FOODS_3,FOODS,WI_3,WI,d_1941,4,2016-05-22,11617,...,2.0,4.0,3.0,5.0,2.0,2.0,2.0,2.0,2.0,2.0
59181085,FOODS_3_823_WI_3_evaluation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,d_1941,1,2016-05-22,11617,...,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0
59181086,FOODS_3_824_WI_3_evaluation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,d_1941,0,2016-05-22,11617,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
59181087,FOODS_3_825_WI_3_evaluation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,d_1941,2,2016-05-22,11617,...,0.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,0.5,0.0


### DataFrame Cleaning
- As we have already encoded our categorical features, so we can drop old one
- weekday as a feature can be removed as wday is already present
- Date as a feature can also be removed as we have month and year column also present


In [38]:
# features to be dropped

unused_columns =['id','item_id','dept_id','cat_id','store_id','state_id','d','event_name_1','event_type_1','event_name_2','event_type_2','date','weekday']
final_df_ = final_df_.drop(unused_columns, axis=1)

In [39]:
final_df_.columns

Index(['quantity sold', 'wm_yr_wk', 'wday', 'month', 'year', 'snap_CA',
       'snap_TX', 'snap_WI', 'sell_price', 'day', 'if_weekend',
       'if_month_season', 'if_christmas', 'id_label', 'item_id_label',
       'dept_id_label', 'cat_id_label', 'store_id_label', 'state_id_label',
       'event_name_1_label', 'event_type_1_label', 'event_name_2_label',
       'event_type_2_label'],
      dtype='object')

In [40]:
final_df_.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 16525580 entries, 42655510 to 59181089
Data columns (total 23 columns):
 #   Column              Dtype  
---  ------              -----  
 0   quantity sold       int16  
 1   wm_yr_wk            int16  
 2   wday                int8   
 3   month               int8   
 4   year                int16  
 5   snap_CA             int8   
 6   snap_TX             int8   
 7   snap_WI             int8   
 8   sell_price          float16
 9   day                 int16  
 10  if_weekend          int64  
 11  if_month_season     int64  
 12  if_christmas        int64  
 13  id_label            int32  
 14  item_id_label       int32  
 15  dept_id_label       int32  
 16  cat_id_label        int32  
 17  store_id_label      int32  
 18  state_id_label      int32  
 19  event_name_1_label  int32  
 20  event_type_1_label  int32  
 21  event_name_2_label  int32  
 22  event_type_2_label  int32  
dtypes: float16(1), int16(4), int32(10), int64(3), in

In [70]:
# downcasting the dataframe to reduce memory usage

final = reduce(final)

In [73]:
# final = final_df_.reset_index(drop=True)

In [74]:
final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16525580 entries, 0 to 16525579
Data columns (total 20 columns):
 #   Column              Dtype  
---  ------              -----  
 0   quantity sold       int16  
 1   wm_yr_wk            int16  
 2   wday                int8   
 3   month               int8   
 4   year                int16  
 5   snap_CA             int8   
 6   snap_TX             int8   
 7   snap_WI             int8   
 8   sell_price          float16
 9   day                 int16  
 10  id_label            int16  
 11  item_id_label       int16  
 12  dept_id_label       int8   
 13  cat_id_label        int8   
 14  store_id_label      int8   
 15  state_id_label      int8   
 16  event_name_1_label  int8   
 17  event_type_1_label  int8   
 18  event_name_2_label  int8   
 19  event_type_2_label  int8   
dtypes: float16(1), int16(6), int8(13)
memory usage: 425.5 MB


###### Key Observations:
- Memory usage has been reduce from 3.2 GB to 425.5 MB, approx 80 %

In [None]:
# saving the data to disk
# final.to_csv('final.csv')

In [None]:
# final.to_pickle('final.pkl')