## Deep Learning

We have seen that machine learning using sci-kit learn is taking a lot of time. Hence, we shall go for deep learning using tensorflow.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn.preprocessing import *

In [2]:
df_X = pd.read_csv("train_X_preprocessed.csv")
df_y = pd.read_csv("train_y_preprocessed.csv")
df_test = pd.read_csv("test_preprocessed.csv")

In [3]:
df_X.head()

Unnamed: 0,date_year,date_month,date_day,store_nbr_0,store_nbr_1,store_nbr_2,store_nbr_3,store_nbr_4,store_nbr_5,family_0,...,type_0,type_1,type_2,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,dcoilwtico,holiday?
0,2013,1,1,0,0,0,0,0,0,0,...,0,1,1,0,1,1,0,0,93.366,1.0
1,2013,1,1,0,0,0,0,0,0,0,...,0,1,1,0,1,1,0,0,93.366,1.0
2,2013,1,1,0,0,0,0,0,0,0,...,0,1,1,0,1,1,0,0,93.366,1.0
3,2013,1,1,0,0,0,0,0,0,0,...,0,1,1,0,1,1,0,0,93.366,1.0
4,2013,1,1,0,0,0,0,0,0,0,...,0,1,1,0,1,1,0,0,93.366,1.0


In [4]:
df_y.head()

Unnamed: 0,sales
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


In [5]:
df_test.head()

Unnamed: 0,date_year,date_month,date_day,store_nbr_0,store_nbr_1,store_nbr_2,store_nbr_3,store_nbr_4,store_nbr_5,family_0,...,type_0,type_1,type_2,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,dcoilwtico,holiday?
0,2017,8,16,0,0,0,0,0,0,0,...,0,1,1,0,1,1,0,0,46.8,0.0
1,2017,8,16,0,0,0,0,0,0,0,...,0,1,1,0,1,1,0,0,46.8,0.0
2,2017,8,16,0,0,0,0,0,0,0,...,0,1,1,0,1,1,0,0,46.8,0.0
3,2017,8,16,0,0,0,0,0,0,0,...,0,1,1,0,1,1,0,0,46.8,0.0
4,2017,8,16,0,0,0,0,0,0,0,...,0,1,1,0,1,1,0,0,46.8,0.0


In [6]:
print("Number of rows in X_train : ",len(df_X))
print("Number of rows in y_train : ",len(df_y))
print("Number of rows in X_test  : ",len(df_test))

Number of rows in X_train :  1048575
Number of rows in y_train :  1048575
Number of rows in X_test  :  28512


In [7]:
df_X.dtypes

date_year        int64
date_month       int64
date_day         int64
store_nbr_0      int64
store_nbr_1      int64
store_nbr_2      int64
store_nbr_3      int64
store_nbr_4      int64
store_nbr_5      int64
family_0         int64
family_1         int64
family_2         int64
family_3         int64
family_4         int64
family_5         int64
onpromotion      int64
city_0           int64
city_1           int64
city_2           int64
city_3           int64
city_4           int64
state_0          int64
state_1          int64
state_2          int64
state_3          int64
type_0           int64
type_1           int64
type_2           int64
cluster_0        int64
cluster_1        int64
cluster_2        int64
cluster_3        int64
cluster_4        int64
dcoilwtico     float64
holiday?       float64
dtype: object

In [8]:
df_test.dtypes

date_year        int64
date_month       int64
date_day         int64
store_nbr_0      int64
store_nbr_1      int64
store_nbr_2      int64
store_nbr_3      int64
store_nbr_4      int64
store_nbr_5      int64
family_0         int64
family_1         int64
family_2         int64
family_3         int64
family_4         int64
family_5         int64
onpromotion      int64
city_0           int64
city_1           int64
city_2           int64
city_3           int64
city_4           int64
state_0          int64
state_1          int64
state_2          int64
state_3          int64
type_0           int64
type_1           int64
type_2           int64
cluster_0        int64
cluster_1        int64
cluster_2        int64
cluster_3        int64
cluster_4        int64
dcoilwtico     float64
holiday?       float64
dtype: object

In [9]:
df_y.dtypes

sales    float64
dtype: object

### Performing feature engineering in both training and testing datasets

In [10]:
df_combined = pd.concat([df_X,df_test])
df_combined.head()

Unnamed: 0,date_year,date_month,date_day,store_nbr_0,store_nbr_1,store_nbr_2,store_nbr_3,store_nbr_4,store_nbr_5,family_0,...,type_0,type_1,type_2,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,dcoilwtico,holiday?
0,2013,1,1,0,0,0,0,0,0,0,...,0,1,1,0,1,1,0,0,93.366,1.0
1,2013,1,1,0,0,0,0,0,0,0,...,0,1,1,0,1,1,0,0,93.366,1.0
2,2013,1,1,0,0,0,0,0,0,0,...,0,1,1,0,1,1,0,0,93.366,1.0
3,2013,1,1,0,0,0,0,0,0,0,...,0,1,1,0,1,1,0,0,93.366,1.0
4,2013,1,1,0,0,0,0,0,0,0,...,0,1,1,0,1,1,0,0,93.366,1.0


In [11]:
print("Length of combined dataframe : ",len(df_combined))

Length of combined dataframe :  1077087


In [12]:
ss = StandardScaler()
df_combined_1 = ss.fit_transform(df_combined)

df_input = df_combined_1[0:len(df_X)]

df_X_train_1 = df_input[0:int(3*len(df_X)/4)]
df_X_valid_1 = df_input[int(3*len(df_X)/4):]
df_X_test_1 = df_combined_1[len(df_X):]

df_X_train_2 = pd.DataFrame(df_X_train_1,columns=df_X.columns.values)
df_X_valid_2 = pd.DataFrame(df_X_valid_1,columns=df_X.columns.values)
df_X_test_2 = pd.DataFrame(df_X_test_1,columns=df_test.columns.values)

In [13]:
df_X_train_2.head()

Unnamed: 0,date_year,date_month,date_day,store_nbr_0,store_nbr_1,store_nbr_2,store_nbr_3,store_nbr_4,store_nbr_5,family_0,...,type_0,type_1,type_2,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,dcoilwtico,holiday?
0,-0.633647,-1.475632,-1.668899,-0.828663,-0.829347,-0.894526,-0.963501,-0.96362,-0.999969,-0.176777,...,-0.282861,0.797573,1.03769,-0.137311,1.037626,1.037563,-0.928497,-0.963501,-0.474854,2.597893
1,-0.633647,-1.475632,-1.668899,-0.828663,-0.829347,-0.894526,-0.963501,-0.96362,-0.999969,-0.176777,...,-0.282861,0.797573,1.03769,-0.137311,1.037626,1.037563,-0.928497,-0.963501,-0.474854,2.597893
2,-0.633647,-1.475632,-1.668899,-0.828663,-0.829347,-0.894526,-0.963501,-0.96362,-0.999969,-0.176777,...,-0.282861,0.797573,1.03769,-0.137311,1.037626,1.037563,-0.928497,-0.963501,-0.474854,2.597893
3,-0.633647,-1.475632,-1.668899,-0.828663,-0.829347,-0.894526,-0.963501,-0.96362,-0.999969,-0.176777,...,-0.282861,0.797573,1.03769,-0.137311,1.037626,1.037563,-0.928497,-0.963501,-0.474854,2.597893
4,-0.633647,-1.475632,-1.668899,-0.828663,-0.829347,-0.894526,-0.963501,-0.96362,-0.999969,-0.176777,...,-0.282861,0.797573,1.03769,-0.137311,1.037626,1.037563,-0.928497,-0.963501,-0.474854,2.597893


In [14]:
df_X_valid_2.head()

Unnamed: 0,date_year,date_month,date_day,store_nbr_0,store_nbr_1,store_nbr_2,store_nbr_3,store_nbr_4,store_nbr_5,family_0,...,type_0,type_1,type_2,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,dcoilwtico,holiday?
0,0.694225,-0.848626,0.37395,-0.828663,1.205768,1.117911,-0.963501,-0.96362,-0.999969,-0.176777,...,-0.282861,0.797573,1.03769,-0.137311,-0.963738,-0.963797,-0.928497,-0.963501,0.309765,-0.384927
1,0.694225,-0.848626,0.37395,-0.828663,1.205768,1.117911,-0.963501,-0.96362,-0.999969,-0.176777,...,-0.282861,0.797573,1.03769,-0.137311,-0.963738,-0.963797,-0.928497,-0.963501,0.309765,-0.384927
2,0.694225,-0.848626,0.37395,-0.828663,1.205768,1.117911,-0.963501,-0.96362,-0.999969,-0.176777,...,-0.282861,0.797573,1.03769,-0.137311,-0.963738,-0.963797,-0.928497,-0.963501,0.309765,-0.384927
3,0.694225,-0.848626,0.37395,-0.828663,1.205768,1.117911,-0.963501,-0.96362,-0.999969,-0.176777,...,-0.282861,0.797573,1.03769,-0.137311,-0.963738,-0.963797,-0.928497,-0.963501,0.309765,-0.384927
4,0.694225,-0.848626,0.37395,-0.828663,1.205768,1.117911,-0.963501,-0.96362,-0.999969,-0.176777,...,-0.282861,0.797573,1.03769,-0.137311,-0.963738,-0.963797,-0.928497,-0.963501,0.309765,-0.384927


In [15]:
df_X_test_2.head()

Unnamed: 0,date_year,date_month,date_day,store_nbr_0,store_nbr_1,store_nbr_2,store_nbr_3,store_nbr_4,store_nbr_5,family_0,...,type_0,type_1,type_2,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,dcoilwtico,holiday?
0,4.677841,0.718889,0.033475,-0.828663,-0.829347,-0.894526,-0.963501,-0.96362,-0.999969,-0.176777,...,-0.282861,0.797573,1.03769,-0.137311,1.037626,1.037563,-0.928497,-0.963501,-5.449875,-0.384927
1,4.677841,0.718889,0.033475,-0.828663,-0.829347,-0.894526,-0.963501,-0.96362,-0.999969,-0.176777,...,-0.282861,0.797573,1.03769,-0.137311,1.037626,1.037563,-0.928497,-0.963501,-5.449875,-0.384927
2,4.677841,0.718889,0.033475,-0.828663,-0.829347,-0.894526,-0.963501,-0.96362,-0.999969,-0.176777,...,-0.282861,0.797573,1.03769,-0.137311,1.037626,1.037563,-0.928497,-0.963501,-5.449875,-0.384927
3,4.677841,0.718889,0.033475,-0.828663,-0.829347,-0.894526,-0.963501,-0.96362,-0.999969,-0.176777,...,-0.282861,0.797573,1.03769,-0.137311,1.037626,1.037563,-0.928497,-0.963501,-5.449875,-0.384927
4,4.677841,0.718889,0.033475,-0.828663,-0.829347,-0.894526,-0.963501,-0.96362,-0.999969,-0.176777,...,-0.282861,0.797573,1.03769,-0.137311,1.037626,1.037563,-0.928497,-0.963501,-5.449875,-0.384927


In [16]:
df_y_train_2 = df_y[0:int(3*len(df_y)/4)]
df_y_valid_2 = df_y[int(3*len(df_y)/4):]

In [17]:
df_y_train_2.head()

Unnamed: 0,sales
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


In [18]:
df_y_valid_2.head()

Unnamed: 0,sales
786431,588.0
786432,116.357
786433,119.0
786434,116.0
786435,1872.0


In [19]:
print("Number of rows in X training set   : ",len(df_X_train_2))
print("Number of rows in X validation set : ",len(df_X_valid_2))
print("Number of rows in X testing set    : ",len(df_X_test_2))

Number of rows in X training set   :  786431
Number of rows in X validation set :  262144
Number of rows in X testing set    :  28512


In [20]:
print("Number of rows in y training set   : ",len(df_y_train_2))
print("Number of rows in y validation set : ",len(df_y_valid_2))

Number of rows in y training set   :  786431
Number of rows in y validation set :  262144


In [21]:
X_train = df_X_train_2.to_numpy()
y_train = df_y_train_2.to_numpy()

X_valid = df_X_valid_2.to_numpy()
y_valid = df_y_valid_2.to_numpy()

X_test = df_X_test_2.to_numpy()

### Performing deep learning using tensorflow

In [None]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(1,activation="linear"),
])
optimize = tf.keras.optimizers.legacy.SGD(learning_rate=0.0005)
early_stopping = tf.keras.callbacks.EarlyStopping()