In [1]:
!pip install keras



In [2]:
from __future__ import print_function
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import plotly.plotly as py
from plotly.graph_objs import *
#import helper
from cStringIO import StringIO

Using TensorFlow backend.


In [3]:

%%storage read --object "gs://fresh-waters-176302/train.csv" --variable text

In [4]:
def create_features() :
    data = pd.read_csv(StringIO(text)) 
    data = data.dropna()
    # Transforming weekday into numbers.
    data = data.replace("Monday", 1)
    data = data.replace("Tuesday", 2)
    data = data.replace("Wednesday", 3)
    data = data.replace("Thursday", 4)
    data = data.replace("Friday", 5)
    data = data.replace("Saturday", 6)
    data = data.replace("Sunday", 7)
    descriptionDummies = pd.get_dummies(data.DepartmentDescription)
    descriptionDummies = descriptionDummies.apply(lambda x: x*data["ScanCount"])
    data.loc[data.ScanCount < 0, 'Return'] = 1
    data.loc[data.Return != 1, 'Return'] = 0
    data = data[["TripType", "VisitNumber", "Weekday", "ScanCount", "Return"]]
    dataPart1 = data.groupby("VisitNumber").agg({'Weekday': np.max, "TripType": np.max, 'ScanCount': np.sum, 'Return': np.max})
    dataPart2 = pd.concat([data.VisitNumber,descriptionDummies],axis=1)
    dataPart2 = dataPart2.groupby("VisitNumber").agg('sum')
    data = pd.concat([dataPart1, dataPart2], axis=1)
    return data

In [5]:
data = create_features()

In [6]:
data.head()

Unnamed: 0_level_0,TripType,Return,Weekday,ScanCount,1-HR PHOTO,ACCESSORIES,AUTOMOTIVE,BAKERY,BATH AND SHOWER,BEAUTY,...,SEAFOOD,SEASONAL,SERVICE DELI,SHEER HOSIERY,SHOES,SLEEPWEAR/FOUNDATIONS,SPORTING GOODS,SWIMWEAR/OUTERWEAR,TOYS,WIRELESS
VisitNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,999,1.0,5,-1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,30,0.0,5,2,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
8,26,1.0,5,27,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,8,0.0,5,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,8,0.0,5,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [93]:
from sklearn.cross_validation import train_test_split
Y = data.iloc[:,0]
X = data.iloc[:,1:]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
from sklearn import preprocessing
X_train = preprocessing.scale(X_train)
X_test = preprocessing.scale(X_test) 
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')
dim = X_train.shape[1]
print (dim, 'Dimension')
num_classes = 38

print(Y_train.shape)
print(X_train.shape)

Y_train = Y_train.map({3:0, 4:1, 5:2, 6:3, 7:4, 8:5, 9:6, 12:7, 14:8, 15:9,  
                                         18:10, 19:11, 20:12, 21:13, 22:14, 23:15, 24:16, 25:17, 26:18,  
                                         27:19, 28:20, 29:21, 30:22, 31:23, 32:24, 33:25, 34:26, 35:27, 
                                         36:28, 37:29, 38:30, 39:31, 40:32, 41:33, 42:34, 43:35, 44:36, 999:37})

Y_test = Y_test.map({3:0, 4:1, 5:2, 6:3, 7:4, 8:5, 9:6, 12:7, 14:8, 15:9,  
                                         18:10, 19:11, 20:12, 21:13, 22:14, 23:15, 24:16, 25:17, 26:18,  
                                         27:19, 28:20, 29:21, 30:22, 31:23, 32:24, 33:25, 34:26, 35:27, 
                                         36:28, 37:29, 38:30, 39:31, 40:32, 41:33, 42:34, 43:35, 44:36, 999:37})

Y_train = keras.utils.to_categorical(Y_train, num_classes)
Y_test = keras.utils.to_categorical(Y_test, num_classes)

print(Y_train.shape)
X_train

65972 train samples
28275 test samples
71 Dimension
(65972,)
(65972, 71)
(65972, 38)


array([[ -0.36201102,  -0.65048047,   0.52862159, ...,  -0.04292013,
         -0.13490314,  -0.09200242],
       [ -0.36201102,   0.33577598,  -0.44633301, ...,  -0.04292013,
         -0.13490314,  -0.09200242],
       [ -0.36201102,  -0.65048047,   0.23613521, ...,  -0.04292013,
         -0.13490314,  -0.09200242],
       ..., 
       [ -0.36201102,  -0.15735224,  -0.44633301, ...,  -0.04292013,
         -0.13490314,   5.27931069],
       [ -0.36201102,   0.33577598,  -0.54382847, ...,  -0.04292013,
         -0.13490314,  10.6506238 ],
       [ -0.36201102,   0.8289042 ,  -0.64132393, ...,  -0.04292013,
         -0.13490314,  -0.09200242]])

In [69]:
model = Sequential()
model.add(Dense(dim, input_dim= dim, activation='relu'))
model.add(Dense(100, activation='relu'))
model.add(Dense(100, activation='relu'))
model.add(Dense(num_classes, activation='sigmoid'))
# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Fit the model
model.fit(X_train, Y_train, epochs=20, batch_size=30000)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fb1bf298dd0>

In [80]:
scores = model.evaluate(X_test, Y_test)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

acc: 97.41%


In [144]:
%%storage read --object "gs://fresh-waters-176302/test.csv" --variable text


In [122]:
def create_test_features() :
    data = pd.read_csv(StringIO(text)) 
    data = data.dropna()
    # Transforming weekday into numbers.
    data = data.replace("Monday", 1)
    data = data.replace("Tuesday", 2)
    data = data.replace("Wednesday", 3)
    data = data.replace("Thursday", 4)
    data = data.replace("Friday", 5)
    data = data.replace("Saturday", 6)
    data = data.replace("Sunday", 7)
    descriptionDummies = pd.get_dummies(data.DepartmentDescription)
    descriptionDummies = descriptionDummies.apply(lambda x: x*data["ScanCount"])
    data.loc[data.ScanCount < 0, 'Return'] = 1
    data.loc[data.Return != 1, 'Return'] = 0
    data = data[["VisitNumber", "Weekday", "ScanCount", "Return"]]
    dataPart1 = data.groupby("VisitNumber").agg({'Weekday': np.max, 'ScanCount': np.sum, 'Return': np.max})
    dataPart2 = pd.concat([data.VisitNumber,descriptionDummies],axis=1)
    dataPart2 = dataPart2.groupby("VisitNumber").agg('sum')
    data = pd.concat([dataPart1, dataPart2], axis=1)
    return data

In [145]:
testdata = pd.read_csv(StringIO(text)) 
print(testdata.shape)
testdata = create_test_features()
#testdata.rename(columns ={'X' : ''}, inplace=False)
testdata['HEALTH AND BEAUTY AIDS'] = 0
testdata
print(testdata.shape)

(653646, 6)
(94288, 71)


In [130]:
list1 = data.columns.tolist()
list2 = testdata.columns.tolist()
list(set(list1) - set(list2))

['TripType']

In [None]:
testdata = testdata.astype('float32')
testdata = preprocessing.scale(testdata)
predicted = model.predict(testdata)
print(predicted.shape)
rounded = predicted.round()
numpy.savetxt("upload.csv", rounded, delimiter=",")

#predicted_final = predicted.map({0:3, 1:4, 2:5, 3:6, 4:7, 5:8, 6:9, 7:12, 8:14, 9:15,  
                                         #10:18, 11:19, 12:20, 13:21, 14:22, 15:23, 16:24, 17:25, 18:26,  
                                         #19:27, 20:28, 21:29, 22:30, 23:31, 24:32, 25:33, 26:34, 27:35, 
                                         #28:36, 29:37, 30:38, 31:39, 32:40, 33:41, 34:42, 35:43, 36:44, 37:999})