In [157]:
# Imports
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn import preprocessing

In [158]:
# Import Data
# Read both competition CSVs in one pass: train first, then test.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('../Data/train.csv', '../Data/test.csv')
)

# Report (rows, columns) of each frame as a quick sanity check.
print('train size is {}'.format(train_data.shape))
print('test size is {}'.format(test_data.shape))

train size is (595212, 59)
test size is (892816, 58)


In [159]:
# Resample
from sklearn.utils import resample
from sklearn.utils import shuffle

# Split the training rows by class label.
train_0 = train_data[train_data['target']==0]
train_1 = train_data[train_data['target']==1]

# Keep every positive row and undersample the negatives to twice that count
# (capped at the number of negatives actually available).
sample_1 = len(train_1)
n_negative = min(2 * sample_1, len(train_0))
# replace=False draws distinct rows. Sampling with replacement would duplicate
# some negatives, and a duplicated row can end up on both sides of the later
# train/validation split, which inflates the validation score.
resampled_0 = resample(train_0, replace=False, n_samples=n_negative, random_state=42)

# Recombine both classes and shuffle so the classes are interleaved.
resampled_train_data = pd.concat([train_1, resampled_0])
resampled_train_data = shuffle(resampled_train_data, random_state=0)


In [160]:
# Check Data
# print(train_data['target'])

In [161]:
# Data Analysis
# Keep the id columns before they are dropped; test_id is needed later to
# index the prediction file.
train_id = resampled_train_data['id']
test_id = test_data['id']
# check to see if there is any overlap between the train and test ids.
# Compare the id values themselves: building a set from a DataFrame yields
# its column labels, which can never match an id.
if set(train_id) & set(test_id):
    print('id Overlap')
# remove id from both train and test set
resampled_train_data.drop('id',axis=1,inplace=True)
test_data.drop('id',axis=1,inplace=True)

# Separate the label from the features.
response = resampled_train_data['target']

resampled_train_data.drop('target',axis=1,inplace=True)
# -1 values are left untouched here; the categorical ones are recoded later
# by data_transform.
print('train size: {}'.format(resampled_train_data.shape))
print('test size: {}'.format(test_data.shape))
print('Count of positive: {}'.format((response==1).sum()))
print('Ratio of positive: {}'.format((response==1).sum()/response.shape[0]))

train size: (65082, 57)
test size: (892816, 57)
Count of positive: 21694
Ratio of positive: 0.3333333333333333


In [162]:
# One hot data
from sklearn.preprocessing import OneHotEncoder
# def oneHotEncode(data, enc, cat_features):
#     cat_features = []
#     enc = OneHotEncoder(categorical_features = cat_features)
#     enc.fit(data)
#     data = enc.transform(data)
#     return data

def data_transform(data):
    """Recode the -1 marker as 100 in every column whose name ends with 'cat'.

    The frame is modified in place and the same object is returned; columns
    with any other name are left untouched.
    """
    cat_columns = [name for name in data.columns if name.endswith('cat')]
    for name in cat_columns:
        recoded = data[name].replace(-1, 100)
        data[name] = recoded
    return data

def get_cat_features(data):
    """Return the positional indices of the columns whose name ends with 'cat'."""
    return [
        position
        for position, name in enumerate(data.columns)
        if name.endswith('cat')
    ]
                                      
# def oneHotEncode(data, enc):
#     data = enc.transform(data)
#     return data
    
# Column positions of the '*cat' features; both frames are assumed to share
# the same column order because the same positions are applied to each.
cat_features = get_cat_features(resampled_train_data)
# Recode -1 in the categorical columns (see data_transform).
resampled_train_data = data_transform(resampled_train_data)
test_data = data_transform(test_data)

# NOTE(review): `categorical_features` and `sparse` are legacy OneHotEncoder
# arguments -- confirm they exist in the installed scikit-learn version.
enc = OneHotEncoder(categorical_features = cat_features, sparse = False)
# Fit on train and test together. Fitting on the test frame alone leaves any
# category code that occurs only in the training rows outside what the
# encoder learned, yet the same encoder is applied to the training data.
enc.fit(pd.concat([resampled_train_data, test_data]))

test_data = enc.transform(test_data)

In [163]:
# Apply the encoder fitted in the previous cell to the training features.
# NOTE(review): this rebinds the name to the encoder output, so the cell is
# presumably not safe to run twice without re-running the cells above.
resampled_train_data = enc.transform(resampled_train_data)

In [164]:
# Categorize Data
# resampled_train_data = oneHotEncode(resampled_train_data, enc)
# test_data = oneHotEncode(test_data, enc)

In [165]:
# Shapes of the encoded train and test matrices, one per line.
print(*(encoded.shape for encoded in (resampled_train_data, test_data)), sep='\n')
# First encoded training row, to eyeball the one-hot layout.
print(resampled_train_data[0])

(65082, 227)
(892816, 227)
[ 1.          0.          0.          0.          0.          1.          0.
  0.          1.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          1.
  0.          0.          1.          0.          0.          0.          1.
  0.          0.          1.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          1.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          1.          0.          0.          0.
  0.          0.          0.          0.          1.          0.          0.
  1.          0.          0.          1.          0.          0.          0.
  0.          1.          0.          0.          0.          0.          0.
  0.          0.          0.          0.         

In [166]:
# Normalize Data
from sklearn.preprocessing import normalize

# Run normalize() with its default settings on train first, then test.
norm_resampled_train_data, norm_test_data = (
    normalize(matrix) for matrix in (resampled_train_data, test_data)
)

# Shapes after normalization, one per line.
print(*(scaled.shape for scaled in (norm_resampled_train_data, norm_test_data)), sep='\n')

(65082, 227)
(892816, 227)


In [167]:
# train_np = np.array(norm_resampled_train_data)
# print(norm_resampled_train_data.shape)

In [168]:
# PCA
from sklearn.decomposition import PCA

# Seed the decomposition like the other stochastic steps in this notebook
# (resample, shuffle, train_test_split) so that a re-run reproduces the same
# components when a randomized SVD solver is selected.
pca = PCA(n_components = 100, random_state = 42)
# Learn the projection from the training matrix only.
pca.fit(norm_resampled_train_data)

# Project both matrices with the train-fitted components.
# NOTE(review): the inputs are overwritten, so re-running this cell alone
# would feed already-reduced data back into PCA.
norm_resampled_train_data = pca.transform(norm_resampled_train_data)
norm_test_data = pca.transform(norm_test_data)

print(norm_resampled_train_data.shape)
print(norm_test_data.shape)

(65082, 100)
(892816, 100)


In [169]:
from sklearn.model_selection import train_test_split
# Materialize the PCA-reduced features and the labels as NumPy arrays.
train_np = np.array(norm_resampled_train_data)
response_np = np.array(response)
# Hold out 10% of the resampled rows for validation; the seed fixes the split.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_np, response_np, test_size=0.1, random_state=18)
# Test features as an array. No normalization happens here; it was applied
# in the earlier normalize cell, before PCA.
X_test = np.array(norm_test_data)

In [170]:
# Shapes of the train/validation features and labels, one per line.
print(*(part.shape for part in (X_train, y_train, X_valid, y_valid)), sep='\n')

(58573, 100)
(58573,)
(6509, 100)
(6509,)


In [17]:
from keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D
from keras.layers import Dropout, Flatten, Dense
from keras.models import Sequential

Using TensorFlow backend.


In [180]:
# Model Definition
model = Sequential()

# Fully connected binary classifier: 32 -> 64 -> 128 ReLU units followed by a
# single sigmoid output.
# The input width is taken from the training matrix rather than hard-coded,
# so it stays in sync with the number of PCA components chosen upstream.
model.add(Dense(32, activation='relu', input_dim=X_train.shape[1]))
# model.add(Dropout(0.2))
model.add(Dense(64, activation = 'relu'))
# model.add(Dropout(0.1))
model.add(Dense(128, activation = 'relu'))
# model.add(Dropout(0.1))
model.add(Dense(1, activation = 'sigmoid'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_35 (Dense)             (None, 32)                3232      
_________________________________________________________________
dense_36 (Dense)             (None, 64)                2112      
_________________________________________________________________
dense_37 (Dense)             (None, 128)               8320      
_________________________________________________________________
dense_38 (Dense)             (None, 1)                 129       
Total params: 13,793
Trainable params: 13,793
Non-trainable params: 0
_________________________________________________________________


In [181]:
# Compile Model
# Binary cross-entropy loss for the single sigmoid output, optimized with
# RMSprop; accuracy is reported alongside the loss.
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

In [182]:
# Train Model
import os
from keras.callbacks import ModelCheckpoint
epochs = 300

checkpoint_path = 'saved_models/weights.test_run'
# The checkpoint callback writes into this directory but does not create it;
# make sure it exists so the first save does not fail on a fresh checkout.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

# save_best_only=True: the file is only overwritten when the monitored
# quantity improves, so it holds the best epoch rather than the last one.
checkpointer = ModelCheckpoint(filepath=checkpoint_path,
                               verbose=1, save_best_only=True)
model.fit(X_train, y_train,
          validation_data=(X_valid, y_valid),
          epochs=epochs, callbacks=[checkpointer], batch_size = 20000, verbose=1)

Train on 58573 samples, validate on 6509 samples
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/30

<keras.callbacks.History at 0x7feba8ce79e8>

In [135]:
# Evaluate
# Restore the checkpoint written by the training cell's ModelCheckpoint
# callback (same path), then score every test row.
# NOTE(review): this cell's execution count is lower than the training
# cell's, so the saved outputs may come from an earlier run -- confirm with
# a fresh Restart & Run All.
model.load_weights('saved_models/weights.test_run')
prediction = model.predict(X_test, batch_size = 90000)
# model.evaluate(resampled_train_data.values, response.values, batch_size = 90000)

In [302]:
# sum(prediction == 1)

In [136]:
# Build the output table: one 'target' column of model outputs, indexed by
# the test ids saved before the 'id' column was dropped.
df = pd.DataFrame(prediction, index = test_id, columns=['target'])
# Written to the current working directory; the input test set is read from
# '../Data/test.csv', a different path with the same file name.
df.to_csv('test.csv')

In [137]:
# Sanity check: the row count should equal the number of test rows.
print(df.shape)

(892816, 1)
