In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import plotly.graph_objects as go
from IPython.display import Image
import random

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Mount Google Drive (Colab only) so the CSV inputs under /content/gdrive can be read.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
# Load the wide sales table: one row per item/store, one `d_<n>` column per day.
data = pd.read_csv(r"/content/gdrive/MyDrive/PredictiveAnalyticsData/sales_train_evaluation.csv")

# Append 28 placeholder day columns (d_1942 .. d_1969) holding zero sales;
# these are the days that still have to be predicted.
for placeholder_col in (f"d_{day_number}" for day_number in range(1942, 1942 + 28)):
    data[placeholder_col] = 0

data.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1960,d_1961,d_1962,d_1963,d_1964,d_1965,d_1966,d_1967,d_1968,d_1969
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Load the calendar table and replace every missing value with the 'no_event' marker.
calender = pd.read_csv(
    r"/content/gdrive/MyDrive/PredictiveAnalyticsData/calendar.csv"
).fillna('no_event')

calender.head(4)

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,no_event,no_event,no_event,no_event,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,no_event,no_event,no_event,no_event,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,no_event,no_event,no_event,no_event,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,no_event,no_event,no_event,no_event,1,1,0


### Add holiday and weekend features

In [None]:
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

# US federal holiday dates between the first and last date of the calendar table.
# Careful with the names: `calendar` is the USFederalHolidayCalendar alias imported above,
# while `calender` is the DataFrame loaded from calendar.csv.
holidays=calendar().holidays(start=calender.date.min(), end=calender.date.max())

def is_holiday(x):
    """Return 1 when `x` is found in the module-level `holidays` collection, otherwise 0."""
    return 1 if x in holidays else 0


# Weekday names that count as weekend days.
weekends = ['Saturday', 'Sunday']


def if_weekends(x):
    """Return 1 if the weekday name `x` is listed in `weekends`, otherwise 0."""
    return int(x in weekends)

In [None]:
# Derive the holiday and weekend indicator columns, then discard the raw
# `weekday` / `date` columns they were computed from.
calender['holiday'] = calender['date'].map(is_holiday)
calender['if_weekend'] = calender['weekday'].map(if_weekends)
calender.drop(columns=['weekday', 'date'], inplace=True)
calender.head()

Unnamed: 0,wm_yr_wk,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,holiday,if_weekend
0,11101,1,1,2011,d_1,no_event,no_event,no_event,no_event,0,0,0,0,1
1,11101,2,1,2011,d_2,no_event,no_event,no_event,no_event,0,0,0,0,1
2,11101,3,1,2011,d_3,no_event,no_event,no_event,no_event,0,0,0,0,0
3,11101,4,2,2011,d_4,no_event,no_event,no_event,no_event,1,1,0,0,0
4,11101,5,2,2011,d_5,no_event,no_event,no_event,no_event,1,0,1,0,0


In [None]:
# Load the sell prices: one row per store / item / week (`wm_yr_wk`).
sell_price=pd.read_csv(r"/content/gdrive/MyDrive/PredictiveAnalyticsData/sell_prices.csv")
sell_price.head()

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,CA_1,HOBBIES_1_001,11325,9.58
1,CA_1,HOBBIES_1_001,11326,9.58
2,CA_1,HOBBIES_1_001,11327,8.26
3,CA_1,HOBBIES_1_001,11328,8.26
4,CA_1,HOBBIES_1_001,11329,8.26


In [None]:
# Scratch copy of the price table, used by the next cells to inspect column dtypes.
df = sell_price.copy()

In [None]:
# Column names and their dtypes, as two parallel lists.
cols = list(df.columns)
types = list(df.dtypes)

In [None]:
# Print every column name next to its dtype.
for column_name, column_type in zip(cols, types):
    print(f"col name = {column_name} and type = {column_type}")

col name = store_id and type = object
col name = item_id and type = object
col name = wm_yr_wk and type = int64
col name = sell_price and type = float64


In [None]:
# downcast the dataframes to reduce memory usage.
def downcast(df):
    """Shrink every column of `df` to the narrowest dtype whose range holds its values.

    The frame is modified IN PLACE and the same object is returned, so
    `new = downcast(old)` leaves `new` and `old` referring to one DataFrame.

    Rules per column:
      * integer columns  -> int8 / int16 / int32 / int64 (first whose range fits)
      * float columns    -> float16 / float32 / float64 (first whose range fits)
      * object / string columns -> 'category', except a column named 'date',
        which is parsed as datetime with format '%Y-%m-%d'
      * any other dtype is left untouched

    The range checks are inclusive: a column whose extremes equal a dtype's
    limits (e.g. -128 and 127 for int8) is stored in that dtype.

    NOTE: float columns are chosen by RANGE only. float16 keeps roughly three
    significant decimal digits, so values such as prices are rounded when they
    are downcast to it.

    Prints the memory footprint (MB, index excluded) before and after.
    """
    def _memory_mb(frame):
        # Deep memory usage of the columns only, in MB rounded to 2 decimals.
        return round(frame.memory_usage(index=False, deep=True).sum() / (1024 * 1024), 2)

    print(f"Memory occupied before downcast: {str(_memory_mb(df))} MB")
    # df.dtypes is a snapshot, so reassigning columns inside the loop is safe.
    for col, dtype in df.dtypes.items():
        # Only plain NumPy dtypes take part in numeric downcasting; extension
        # dtypes (nullable ints, intervals, categoricals, ...) are skipped.
        is_numpy = isinstance(dtype, np.dtype)
        if is_numpy and dtype.kind in "iu":
            lowest, highest = df[col].min(), df[col].max()
            for candidate in (np.int8, np.int16, np.int32):
                limits = np.iinfo(candidate)
                if lowest >= limits.min and highest <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower fits (or the column is empty): keep 64 bits.
                df[col] = df[col].astype(np.int64)
        elif is_numpy and dtype.kind == "f":
            lowest, highest = df[col].min(), df[col].max()
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if lowest >= limits.min and highest <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Also reached for all-NaN columns, where every comparison is False.
                df[col] = df[col].astype(np.float64)
        elif (is_numpy and dtype.kind == "O") or isinstance(dtype, pd.StringDtype):
            if col == 'date':
                df[col] = pd.to_datetime(df[col], format='%Y-%m-%d')
            else:
                df[col] = df[col].astype('category')
    print(f"Memory occupied after downcast: {str(_memory_mb(df))} MB\n")
    return df

# Downcast all three tables. downcast() modifies its argument in place and returns it,
# so `prices` below is the same DataFrame object as `sell_price`, not a copy.
data= downcast(data)
prices = downcast(sell_price)
calender = downcast(calender)

Memory occupied before downcast: 469.8 MB
Memory occupied after downcast: 99.55 MB

Memory occupied before downcast: 957.52 MB
Memory occupied after downcast: 45.94 MB

Memory occupied before downcast: 0.74 MB
Memory occupied after downcast: 0.22 MB



Melt the sales table and merge it with the calendar and sell-price dataframes

In [None]:
# Reshape the wide sales table into long format: one row per item and day,
# i.e. a daily time series (day 1, day 2, ...) for every item.
def convert_single_sales_feature(input):
    """Melt all non-identifier columns of `input` into a `d` (column label) / `sales` (value) pair."""
    identifier_columns = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    return input.melt(id_vars=identifier_columns, var_name='d', value_name='sales')

# merge with the calendar table
def merge_with_cal(input):
    """Left-join the module-level `calender` table onto `input` using the `d` column."""
    return input.merge(calender, how='left', on='d')


# merge with the price table
def merge_with_price(input):
    """Left-join the module-level `prices` table onto `input` by store, item and week (`wm_yr_wk`)."""
    join_keys = ['store_id', 'item_id', 'wm_yr_wk']
    return input.merge(prices, how='left', on=join_keys)

In [None]:
# Build the long-format table: melt the sales, then left-join the calendar and the prices.
final = merge_with_price(merge_with_cal(convert_single_sales_feature(data)))
# final

In [None]:
final['d'][0].split('_')[1]

'1'

In [None]:
def convert(input):
    """Return the second "_"-separated field of `input` (e.g. "d_12" -> "12")."""
    fields = input.split("_")
    return fields[1]

# Strip the "d_" prefix from the day label and store the day number as int16
# (the largest day number is below 2000, well inside the int16 range).
final['d'] = final['d'].apply(convert).astype('int16')

In [None]:
max(final['d'])
# final[final['d']>1500].head()

1969

In [None]:
# Total days for which sales data is available: 1941 (d_1 to d_1941)
# Training data: Day 1-1913
# Testing data: Day 1914-1941
# Prediction data (zero-filled placeholders): Day 1942-1969


Processing "snap_..." columns.

In [None]:
# Collapse the three per-state SNAP flags into one `snap` column:
# every row takes the flag that belongs to its own state.
for state in ('CA', 'TX', 'WI'):
    rows_in_state = final['state_id'] == state
    final.loc[rows_in_state, 'snap'] = final.loc[rows_in_state, f'snap_{state}']

# The per-state columns are redundant once `snap` is populated.
final.drop(columns=['snap_CA', 'snap_TX', 'snap_WI'], inplace=True)
final['snap'] = final['snap'].astype('int8')

In [None]:
def get_week_num(x):
    """Return the last two characters of `str(x)` as an int (e.g. 11101 -> 1)."""
    trailing_digits = str(x)[-2:]
    return int(trailing_digits)

# Week number taken from the trailing two digits of `wm_yr_wk`, stored as int8.
final['week_num'] = final['wm_yr_wk'].apply(get_week_num).astype('int8')

In [None]:
# Keep only the columns used from here on, in a fixed order
# (this drops `wm_yr_wk`, which has been replaced by `week_num`).
kept_columns = [
    'id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
    'd', 'sales',
    'week_num', 'wday', 'month', 'year',
    'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
    'holiday', 'if_weekend',
    'sell_price', 'snap',
]
final = final[kept_columns]

In [None]:
final['week_num'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53], dtype=int8)

In [None]:
final.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,week_num,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,holiday,if_weekend,sell_price,snap
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,1,0,1,1,1,2011,no_event,no_event,no_event,no_event,0,1,,0
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,1,0,1,1,1,2011,no_event,no_event,no_event,no_event,0,1,,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,1,0,1,1,1,2011,no_event,no_event,no_event,no_event,0,1,,0
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,1,0,1,1,1,2011,no_event,no_event,no_event,no_event,0,1,,0
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,1,0,1,1,1,2011,no_event,no_event,no_event,no_event,0,1,,0


In [None]:
# Fill the missing sell prices of an item with the mean sell price of that item in the
# same store, averaged over all rows where a price is present.
# The result is assigned back to the column on purpose: calling
# `final['sell_price'].fillna(..., inplace=True)` operates on an intermediate Series and
# does not update `final` when pandas Copy-on-Write is active.
# observed=True only affects categorical group keys and does not change a transform result.
store_item_mean_price = (
    final.groupby(['store_id', 'item_id'], observed=True)['sell_price'].transform('mean')
)
final['sell_price'] = final['sell_price'].fillna(store_item_mean_price)
final.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,week_num,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,holiday,if_weekend,sell_price,snap
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,1,0,1,1,1,2011,no_event,no_event,no_event,no_event,0,1,8.283427,0
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,1,0,1,1,1,2011,no_event,no_event,no_event,no_event,0,1,3.970703,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,1,0,1,1,1,2011,no_event,no_event,no_event,no_event,0,1,2.970703,0
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,1,0,1,1,1,2011,no_event,no_event,no_event,no_event,0,1,4.528493,0
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,1,0,1,1,1,2011,no_event,no_event,no_event,no_event,0,1,2.941555,0


In [None]:
# Replace every categorical column with its integer category codes.
categorical_columns = [
    name for name in final.columns if str(final[name].dtype) == 'category'
]
for name in categorical_columns:
    final[name] = final[name].cat.codes

In [None]:
final.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,week_num,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,holiday,if_weekend,sell_price,snap
0,14370,1437,3,1,0,0,1,0,1,1,1,2011,30,4,4,2,0,1,8.283427,0
1,14380,1438,3,1,0,0,1,0,1,1,1,2011,30,4,4,2,0,1,3.970703,0
2,14390,1439,3,1,0,0,1,0,1,1,1,2011,30,4,4,2,0,1,2.970703,0
3,14400,1440,3,1,0,0,1,0,1,1,1,2011,30,4,4,2,0,1,4.528493,0
4,14410,1441,3,1,0,0,1,0,1,1,1,2011,30,4,4,2,0,1,2.941555,0


In [None]:
# final.to_csv("/content/gdrive/MyDrive/PredictiveAnalyticsData/output_data/final_feature_engineered.csv", index=True)

In [None]:
# Save the table as it stands here, before the time-series features below are added.
final.to_pickle("/content/gdrive/MyDrive/PredictiveAnalyticsData/output_data/final_feature_engineered.pkl")

### Adding Time Series Features

##### 1. Adding lag features.

In [None]:
# Lagged sales: within each (item, store) group, the `sales` value `lag` rows earlier,
# stored as float16 (rows without enough history get NaN).
# NOTE(review): assumes rows inside a group are ordered by day - confirm. Lags shorter
# than the horizon reach into the zero-filled placeholder days for the prediction rows.
lags = [7, 14, 21, 28, 35, 42, 49, 56]
for lag in lags:
    lagged_sales = final.groupby(['item_id', 'store_id'], as_index=False)['sales'].shift(lag)
    final[f'sold_lag_{lag}'] = lagged_sales.astype(np.float16)


##### 2. Adding rolling mean

In [None]:
# Rolling mean of `sales` per (item, store) group over 7-, 14- and 28-row windows,
# stored as float16.
# NOTE(review): each window ends at the current row, so the feature includes that row's
# own `sales` value - confirm this is intended when `sales` is the prediction target.
for window in (7, 14, 28):
    final[f'roll_mean_{window}'] = (
        final.groupby(['item_id', 'store_id'])['sales']
        .transform(lambda x, size=window: x.rolling(window=size).mean())
        .astype(np.float16)
    )

##### 3. Adding features based on sold units

In [None]:
def mean_sold(input,target):
    """Add column `target` to the global `final`: the mean `sales` of each `input` group, as float16."""
    group_mean_sales = final.groupby(input)['sales'].transform('mean')
    final[target] = group_mean_sales.astype(np.float16)

# Average units sold at several grouping levels (group key(s) -> new column name).
# NOTE(review): the means are taken over every row currently in `final`, which appears
# to include the zero-filled placeholder days - confirm this is intended.
sold_average_features = [
    ('item_id', 'item_sold_avg'),
    ('store_id', 'avg_sold_storewise'),
    ('state_id', 'avg_sold_statewise'),
    ('cat_id', 'avg_sold_categorywise'),
    (['item_id', 'store_id'], 'avg_sold_store_item'),
    (['item_id', 'state_id'], 'avg_sold_state_item'),
]
for group_keys, feature_name in sold_average_features:
    mean_sold(group_keys, feature_name)

# columns added above: 'item_sold_avg','avg_sold_storewise','avg_sold_statewise','avg_sold_categorywise','avg_sold_store_item','avg_sold_state_item'

In [None]:
def mean_price(input,target):
    """Add column `target` to the global `final`: the mean `sell_price` of each `input` group, as float16."""
    group_mean_price = final.groupby(input)['sell_price'].transform('mean')
    final[target] = group_mean_price.astype(np.float16)

# Average sell price at several grouping levels (group key(s) -> new column name).
price_average_features = [
    ('item_id', 'item_price_avg'),
    ('store_id', 'avg_pric_storewis'),
    ('state_id', 'avg_pri_statewis'),
    ('cat_id', 'avg_price_catwis'),
    (['item_id', 'store_id'], 'avg_price_store_item'),
    (['item_id', 'state_id'], 'avg_price_state_item'),
]
for group_keys, feature_name in price_average_features:
    mean_price(group_keys, feature_name)

# aggregate columns added so far: 'item_sold_avg','avg_sold_storewise','avg_sold_statewise','avg_sold_categorywise','avg_sold_store_item','avg_sold_state_item','item_price_avg','avg_pric_storewis','avg_pri_statewis','avg_price_catwis','avg_price_store_item','avg_price_state_item'


In [None]:
final.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,week_num,wday,...,avg_sold_statewise,avg_sold_categorywise,avg_sold_store_item,avg_sold_state_item,item_price_avg,avg_pric_storewis,avg_pri_statewis,avg_price_catwis,avg_price_store_item,avg_price_state_item
0,14370,1437,3,1,0,0,1,0,1,1,...,1.21582,0.561035,0.321533,0.335938,8.296875,4.460938,4.453125,5.554688,8.28125,8.273438
1,14380,1438,3,1,0,0,1,0,1,1,...,1.21582,0.561035,0.253906,0.204346,3.96875,4.460938,4.453125,5.554688,3.970703,3.964844
2,14390,1439,3,1,0,0,1,0,1,1,...,1.21582,0.561035,0.156982,0.098938,2.966797,4.460938,4.453125,5.554688,2.970703,2.970703
3,14400,1440,3,1,0,0,1,0,1,1,...,1.21582,0.561035,1.694336,2.935547,4.503906,4.460938,4.453125,5.554688,4.527344,4.527344
4,14410,1441,3,1,0,0,1,0,1,1,...,1.21582,0.561035,0.958984,0.96875,2.882812,4.460938,4.453125,5.554688,2.941406,2.935547


### Splitting into train, test, and prediction dataset.

In [None]:
# Total days for which sales data is available: 1941
# Training data: Day 1-1913
# Testing data: Day 1914-1941
# Prediction data (zero-filled placeholders): Day 1942-1969


In [None]:
# Split by day number: training (<= 1913), testing (1914-1941), prediction (1942-1970).
# `between` is inclusive on both ends, which matches the original > / <= bounds for integer days.
day_number = final['d']
train_data = final[day_number <= 1913]
testing_data = final[day_number.between(1914, 1941)]
prediction_data = final[day_number.between(1942, 1970)]

In [None]:
train_data

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,week_num,wday,...,avg_sold_statewise,avg_sold_categorywise,avg_sold_store_item,avg_sold_state_item,item_price_avg,avg_pric_storewis,avg_pri_statewis,avg_price_catwis,avg_price_store_item,avg_price_state_item
0,14370,1437,3,1,0,0,1,0,1,1,...,1.215820,0.561035,0.321533,0.335938,8.296875,4.460938,4.453125,5.554688,8.281250,8.273438
1,14380,1438,3,1,0,0,1,0,1,1,...,1.215820,0.561035,0.253906,0.204346,3.968750,4.460938,4.453125,5.554688,3.970703,3.964844
2,14390,1439,3,1,0,0,1,0,1,1,...,1.215820,0.561035,0.156982,0.098938,2.966797,4.460938,4.453125,5.554688,2.970703,2.970703
3,14400,1440,3,1,0,0,1,0,1,1,...,1.215820,0.561035,1.694336,2.935547,4.503906,4.460938,4.453125,5.554688,4.527344,4.527344
4,14410,1441,3,1,0,0,1,0,1,1,...,1.215820,0.561035,0.958984,0.968750,2.882812,4.460938,4.453125,5.554688,2.941406,2.935547
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58327365,14329,1432,2,0,9,2,1913,1,13,2,...,1.027344,1.623047,0.528320,0.492432,2.794922,4.468750,4.476562,3.294922,2.800781,2.808594
58327366,14339,1433,2,0,9,2,1913,0,13,2,...,1.027344,1.623047,0.369629,0.363525,2.521484,4.468750,4.476562,3.294922,2.507812,2.589844
58327367,14349,1434,2,0,9,2,1913,0,13,2,...,1.027344,1.623047,0.880859,0.630371,4.109375,4.468750,4.476562,3.294922,4.117188,4.117188
58327368,14359,1435,2,0,9,2,1913,3,13,2,...,1.027344,1.623047,0.375244,0.583008,1.280273,4.468750,4.476562,3.294922,1.280273,1.280273


In [None]:
testing_data

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,week_num,wday,...,avg_sold_statewise,avg_sold_categorywise,avg_sold_store_item,avg_sold_state_item,item_price_avg,avg_pric_storewis,avg_pri_statewis,avg_price_catwis,avg_price_store_item,avg_price_state_item
58327370,14370,1437,3,1,0,0,1914,0,13,3,...,1.215820,0.561035,0.321533,0.335938,8.296875,4.460938,4.453125,5.554688,8.281250,8.273438
58327371,14380,1438,3,1,0,0,1914,0,13,3,...,1.215820,0.561035,0.253906,0.204346,3.968750,4.460938,4.453125,5.554688,3.970703,3.964844
58327372,14390,1439,3,1,0,0,1914,0,13,3,...,1.215820,0.561035,0.156982,0.098938,2.966797,4.460938,4.453125,5.554688,2.970703,2.970703
58327373,14400,1440,3,1,0,0,1914,0,13,3,...,1.215820,0.561035,1.694336,2.935547,4.503906,4.460938,4.453125,5.554688,4.527344,4.527344
58327374,14410,1441,3,1,0,0,1914,1,13,3,...,1.215820,0.561035,0.958984,0.968750,2.882812,4.460938,4.453125,5.554688,2.941406,2.935547
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59181085,14329,1432,2,0,9,2,1941,1,17,2,...,1.027344,1.623047,0.528320,0.492432,2.794922,4.468750,4.476562,3.294922,2.800781,2.808594
59181086,14339,1433,2,0,9,2,1941,0,17,2,...,1.027344,1.623047,0.369629,0.363525,2.521484,4.468750,4.476562,3.294922,2.507812,2.589844
59181087,14349,1434,2,0,9,2,1941,2,17,2,...,1.027344,1.623047,0.880859,0.630371,4.109375,4.468750,4.476562,3.294922,4.117188,4.117188
59181088,14359,1435,2,0,9,2,1941,0,17,2,...,1.027344,1.623047,0.375244,0.583008,1.280273,4.468750,4.476562,3.294922,1.280273,1.280273


In [None]:
prediction_data

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,week_num,wday,...,avg_sold_statewise,avg_sold_categorywise,avg_sold_store_item,avg_sold_state_item,item_price_avg,avg_pric_storewis,avg_pri_statewis,avg_price_catwis,avg_price_store_item,avg_price_state_item
59181090,14370,1437,3,1,0,0,1942,0,17,3,...,1.215820,0.561035,0.321533,0.335938,8.296875,4.460938,4.453125,5.554688,8.281250,8.273438
59181091,14380,1438,3,1,0,0,1942,0,17,3,...,1.215820,0.561035,0.253906,0.204346,3.968750,4.460938,4.453125,5.554688,3.970703,3.964844
59181092,14390,1439,3,1,0,0,1942,0,17,3,...,1.215820,0.561035,0.156982,0.098938,2.966797,4.460938,4.453125,5.554688,2.970703,2.970703
59181093,14400,1440,3,1,0,0,1942,0,17,3,...,1.215820,0.561035,1.694336,2.935547,4.503906,4.460938,4.453125,5.554688,4.527344,4.527344
59181094,14410,1441,3,1,0,0,1942,0,17,3,...,1.215820,0.561035,0.958984,0.968750,2.882812,4.460938,4.453125,5.554688,2.941406,2.935547
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60034805,14329,1432,2,0,9,2,1969,0,21,2,...,1.027344,1.623047,0.528320,0.492432,2.794922,4.468750,4.476562,3.294922,2.800781,2.808594
60034806,14339,1433,2,0,9,2,1969,0,21,2,...,1.027344,1.623047,0.369629,0.363525,2.521484,4.468750,4.476562,3.294922,2.507812,2.589844
60034807,14349,1434,2,0,9,2,1969,0,21,2,...,1.027344,1.623047,0.880859,0.630371,4.109375,4.468750,4.476562,3.294922,4.117188,4.117188
60034808,14359,1435,2,0,9,2,1969,0,21,2,...,1.027344,1.623047,0.375244,0.583008,1.280273,4.468750,4.476562,3.294922,1.280273,1.280273


In [None]:
# Save the three splits (training, testing, prediction) as separate pickle files on Drive.
train_data.to_pickle("/content/gdrive/MyDrive/PredictiveAnalyticsData/output_data/final_train.pkl")
testing_data.to_pickle("/content/gdrive/MyDrive/PredictiveAnalyticsData/output_data/final_test.pkl")
prediction_data.to_pickle("/content/gdrive/MyDrive/PredictiveAnalyticsData/output_data/final_prediction_data.pkl")

In [None]:
len(list(train_data['d'].unique()))

1913

In [None]:
len(list(testing_data['d'].unique()))

28

In [None]:
len(list(prediction_data['d'].unique()))

28