In [586]:
from __future__ import absolute_import, division, print_function, unicode_literals
import functools

import pandas as pd
import numpy as np
import tensorflow as tf

from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

In [587]:
csv_file = "/Users/alexanderchen/Google Drive (alex.chen.h@yale.edu)/Projects/Hack Harvard 2019/customers.csv"

In [588]:
df = pd.read_csv(csv_file)

In [589]:
df.head()

Unnamed: 0,CreditCard,Age,Student,AccountBalance,CS,GEXP1,DIN1,TRANS1,ENT1,GROC1,...,GROC2,HOTEL2,AIR2,GEXP3,DIN3,TRANS3,ENT3,GROC3,HOTEL3,AIR3
0,QuickSilverOne,1,1,u100,Average,4350.426187,2780.014774,3534.051017,2813.531871,2433.390448,...,3983.29,721.56,2169.69,8241.17,1917.72,197043.53,3100.2,5554.97,1062.73,3726.34
1,JourneyStudent,2,1,o10000,Average,4230.872548,2721.123376,3525.241001,2816.375447,2970.424212,...,4452.8,842.02,3040.12,10820.87,2342.69,241154.71,1980.95,4392.14,697.27,4353.28
2,QuickSilverOne,5,0,o10000,Average,4836.533691,3137.231798,4205.282185,3229.076565,2484.837704,...,3278.51,499.47,1830.31,10175.12,2522.14,4613.07,2775.48,2884.93,730.12,1941.41
3,JourneyStudent,2,1,01000u10000,Average,4288.000046,2736.386884,3663.947996,2002.497257,3034.466426,...,3233.94,531.86,1921.96,6671.59,4187.69,223120.5,3131.77,5307.49,724.3,3091.07
4,QuickSilverOne,1,1,o100u1000,Average,4380.721651,2825.678997,3241.745528,2778.379019,2898.353526,...,4575.31,778.42,2659.15,7983.73,2927.02,232661.83,5890.42,5336.52,689.72,4020.44


In [590]:
df.dtypes

CreditCard         object
Age                 int64
Student             int64
AccountBalance     object
CS                 object
GEXP1             float64
DIN1              float64
TRANS1            float64
ENT1              float64
GROC1             float64
HOTEL1            float64
AIR1              float64
GEXP2             float64
DIN2              float64
TRANS2            float64
ENT2              float64
GROC2             float64
HOTEL2            float64
AIR2              float64
GEXP3             float64
DIN3              float64
TRANS3            float64
ENT3              float64
GROC3             float64
HOTEL3            float64
AIR3              float64
dtype: object

In [591]:
df['CreditCard'] = pd.Categorical(df['CreditCard'])
df['CreditCard'] = df.CreditCard.cat.codes

In [592]:
df.head()

Unnamed: 0,CreditCard,Age,Student,AccountBalance,CS,GEXP1,DIN1,TRANS1,ENT1,GROC1,...,GROC2,HOTEL2,AIR2,GEXP3,DIN3,TRANS3,ENT3,GROC3,HOTEL3,AIR3
0,2,1,1,u100,Average,4350.426187,2780.014774,3534.051017,2813.531871,2433.390448,...,3983.29,721.56,2169.69,8241.17,1917.72,197043.53,3100.2,5554.97,1062.73,3726.34
1,1,2,1,o10000,Average,4230.872548,2721.123376,3525.241001,2816.375447,2970.424212,...,4452.8,842.02,3040.12,10820.87,2342.69,241154.71,1980.95,4392.14,697.27,4353.28
2,2,5,0,o10000,Average,4836.533691,3137.231798,4205.282185,3229.076565,2484.837704,...,3278.51,499.47,1830.31,10175.12,2522.14,4613.07,2775.48,2884.93,730.12,1941.41
3,1,2,1,01000u10000,Average,4288.000046,2736.386884,3663.947996,2002.497257,3034.466426,...,3233.94,531.86,1921.96,6671.59,4187.69,223120.5,3131.77,5307.49,724.3,3091.07
4,2,1,1,o100u1000,Average,4380.721651,2825.678997,3241.745528,2778.379019,2898.353526,...,4575.31,778.42,2659.15,7983.73,2927.02,232661.83,5890.42,5336.52,689.72,4020.44


In [593]:
train, test = train_test_split(df, test_size=0.2)
train, val = train_test_split(train, test_size=0.2)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

64000 train examples
16000 validation examples
20000 test examples


In [594]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  dataframe = dataframe.copy()
  labels = dataframe.pop('CreditCard')
  ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  return ds

# Build the model

In [595]:
feature_columns = []

# numeric cols
for header in ['GEXP1', 'DIN1', 'TRANS1', 'ENT1', 'GROC1', 'HOTEL1', 'AIR1',
               'GEXP2', 'DIN2', 'TRANS2', 'ENT2', 'GROC2', 'HOTEL2', 'AIR2',
               'GEXP3', 'DIN3', 'TRANS3', 'ENT3', 'GROC3', 'HOTEL3', 'AIR3']:
  feature_columns.append(feature_column.numeric_column(header))

# indicator cols
AccountBalance = feature_column.categorical_column_with_vocabulary_list(
      'AccountBalance', ['u100', 'o100u1000', 'o1000u10000', 'o10000'])
AB_one_hot = feature_column.indicator_column(AccountBalance)
feature_columns.append(AB_one_hot)

CS = feature_column.categorical_column_with_vocabulary_list(
      'CS', ['Average', 'Excellent'])
CS_one_hot = feature_column.indicator_column(CS)
feature_columns.append(CS_one_hot)

Student = feature_column.categorical_column_with_vocabulary_list(
      'Student', [0, 1])
Student_one_hot = feature_column.indicator_column(Student)
feature_columns.append(Student_one_hot)

In [596]:
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

In [597]:
batch_size = 10
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=True, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=True, batch_size=batch_size)

In [605]:
model = tf.keras.Sequential([
  feature_layer,
  layers.Dense(256, activation='relu'),
  layers.Dense(256, activation='relu'),
  layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='Adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(train_ds,
          validation_data=val_ds,
          epochs=2)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x1a6be07278>

In [606]:
# Check accuracy
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)

Accuracy 0.14775


In [607]:
# Infer labels on a batch
predictions = model.predict(test_ds)