In [1]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.regularizers import l2

Using Theano backend.


In [2]:
# Read the merged table and drop the UserId, DOB and State columns, which are
# not used anywhere further down in this notebook.
merged = pd.read_csv("merged_full.csv")
# The axis is passed by keyword: the positional form `drop(labels, 1)` is
# rejected by newer pandas releases.
merged = merged.drop(["UserId", "DOB", "State"], axis=1)

In [9]:
# Keep only columns that have more than 5 profiles that visited.
# Only numeric columns are summed: totalling the string-valued "Gender" column
# and comparing the result with 5 only works through Python 2's mixed-type
# ordering. Non-numeric columns such as "Gender" are always kept.
visits_per_column = merged.sum(axis=0, numeric_only=True)
# `~(x > 5)` rather than `x <= 5` so a NaN total is treated as "not popular".
rare_columns = visits_per_column.index[~(visits_per_column > 5)]
merged = merged.drop(rare_columns, axis=1)

In [11]:
# Keep only rows that have visited more than 5 URLs.
# numeric_only=True keeps the string-valued "Gender" column out of the row
# totals explicitly instead of relying on pandas silently skipping it.
visits_per_row = merged.sum(axis=1, numeric_only=True)
merged = merged.loc[visits_per_row > 5, :]

In [12]:
# Check number of observations per gender. The set is unbalanced!
# By always predicting "M" we would already reach ~77% accuracy in the run
# recorded below (10267 male vs 3087 female profiles).
males = merged[merged["Gender"] == "M"]
females = merged[merged["Gender"] == "F"]
# Single-argument print(...) with %-formatting prints the same text on
# Python 2 and Python 3.
print("%d %d" % (males.shape[0], females.shape[0]))

10267 3087


In [30]:
# Make training set balanced: draw as many male rows as there are female rows.
# A fixed random_state makes the subsample identical on every re-run, so the
# downstream training results are reproducible.
males_samp = males.sample(n=females.shape[0], axis=0, random_state=0)

In [31]:
# Build the training set by stacking the sampled male rows on top of the
# female rows; the index is renumbered from 0.
balanced_parts = [males_samp, females]
train = pd.concat(balanced_parts, ignore_index=True)

In [32]:
# Shuffle training set (sampling 100% of the rows without replacement) and
# renumber the index. The fixed random_state keeps the row order, and with it
# the validation split used during fitting, stable across re-runs.
train = train.sample(frac=1, random_state=0).reset_index(drop=True)

In [135]:
# Disabled alternative: train on every labelled row ("M" or "F") without balancing the classes.
#train = merged[merged["Gender"].isin(["M", "F"])]

In [33]:
# Make testing set: the rows whose "Gender" value is a single space.
unlabelled_mask = merged["Gender"] == " "
test = merged[unlabelled_mask]

In [34]:
# Split the training frame into features (every column except "Gender") and
# the response ("Gender").
# The axis is passed by keyword: the positional form `drop(label, 1)` is
# rejected by newer pandas releases.
train_X = train.drop("Gender", axis=1)
train_Y = train["Gender"]

In [35]:
# Convert the features to a plain NumPy array and the labels to 0/1.
# `.values` is used instead of `as_matrix()`, which newer pandas releases no
# longer provide; both return the same array.
train_x = train_X.values
# Encode "M" as 1 and everything else as 0.
train_y = train_Y.apply(lambda x: 1 if x == "M" else 0)

In [36]:
# Drop 'Gender' information from testing set (it's what we want to predict).
# The axis is passed by keyword: the positional form `drop(label, 1)` is
# rejected by newer pandas releases.
test_X = test.drop("Gender", axis=1)

In [37]:
# Convert the test features to a plain NumPy array.
# `.values` is used instead of `as_matrix()`, which newer pandas releases no
# longer provide; both return the same array.
test_x = test_X.values

In [38]:
# Number of feature columns; used as the input dimension of the network.
num_dim = len(train_X.columns)

In [39]:
# 3 hidden layers NN model
def get_3nn_model(lr=0.001, M1=300, w1=0.2, M2=240, w2=0.2, M3=200, w3=0.2, w=0.1):
    """Build and compile a binary classifier with three hidden ReLU layers.

    The input width is read from the notebook-level variable ``num_dim``.

    Args:
        lr: Learning rate for the Adam optimizer.
        M1, M2, M3: Number of units in the first, second and third hidden layer.
        w1, w2, w3: Dropout rate applied after the first, second and third
            hidden layer.
        w: L2 regularization factor on the first hidden layer's kernel.

    Returns:
        A compiled ``Sequential`` model with a single sigmoid output, using
        binary cross-entropy loss and reporting accuracy.
    """
    # Imported here so this cell works without touching the import cell.
    from keras.optimizers import Adam

    model = Sequential()
    # Only the first layer needs the input dimension; later layers infer their
    # input size from the layer before them.
    model.add(Dense(M1, input_dim=num_dim, kernel_initializer='normal', activation='relu', kernel_regularizer=l2(w)))
    model.add(Dropout(w1))
    model.add(Dense(M2, activation='relu'))
    model.add(Dropout(w2))
    model.add(Dense(M3, activation='relu'))
    model.add(Dropout(w3))
    model.add(Dense(1, activation='sigmoid'))
    # Give the learning rate to the optimizer directly instead of calling
    # `model.optimizer.lr.set_value(lr)`, which only exists on Theano shared
    # variables and fails on other backends.
    model.compile(optimizer=Adam(lr=lr),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [40]:
# Create model
# Uses the default hyper-parameters declared in get_3nn_model's signature.
model = get_3nn_model()

In [41]:
# Get summary information about the model (layers, output shapes, parameter counts)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 300)               26100     
_________________________________________________________________
dropout_4 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 240)               72240     
_________________________________________________________________
dropout_5 (Dropout)          (None, 240)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 200)               48200     
_________________________________________________________________
dropout_6 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 201       
Total para

In [46]:
# Short trial run to see how the model performs: 10 epochs with mini-batches
# of 10 rows, holding out 20% of the training rows for validation.
model.fit(
    train_x,
    train_y,
    batch_size=10,
    epochs=10,
    validation_split=0.2,
    verbose=2
)

Train on 4939 samples, validate on 1235 samples
Epoch 1/10
6s - loss: 0.6954 - acc: 0.5758 - val_loss: 0.6864 - val_acc: 0.5992
Epoch 2/10
6s - loss: 0.6892 - acc: 0.5801 - val_loss: 0.6981 - val_acc: 0.5522
Epoch 3/10
7s - loss: 0.6934 - acc: 0.5827 - val_loss: 0.6899 - val_acc: 0.5757
Epoch 4/10
6s - loss: 0.6947 - acc: 0.5821 - val_loss: 0.6930 - val_acc: 0.5789
Epoch 5/10
7s - loss: 0.6941 - acc: 0.5785 - val_loss: 0.6943 - val_acc: 0.5595
Epoch 6/10
6s - loss: 0.6869 - acc: 0.5874 - val_loss: 0.6936 - val_acc: 0.5684
Epoch 7/10
6s - loss: 0.6887 - acc: 0.5880 - val_loss: 0.6906 - val_acc: 0.5749
Epoch 8/10
6s - loss: 0.6867 - acc: 0.5799 - val_loss: 0.6920 - val_acc: 0.5684
Epoch 9/10
7s - loss: 0.6886 - acc: 0.5791 - val_loss: 0.6999 - val_acc: 0.5789
Epoch 10/10
7s - loss: 0.6864 - acc: 0.5870 - val_loss: 0.6974 - val_acc: 0.5611


<keras.callbacks.History at 0x12b42b1d0>

In [43]:
# Create predictions
# One output of the model's final sigmoid layer per row of test_x, i.e. per
# profile whose "Gender" value is " ".
predictions = model.predict(test_x)

In [44]:
# Convert predictions back to "M" and "F" ("M" was encoded as 1 for training).
# The 0.5 threshold is applied explicitly: round() breaks ties differently on
# Python 2 (half away from zero) and Python 3 (half to even), so an output of
# exactly 0.5 would otherwise be labelled differently depending on the
# interpreter.
rounded = ["M" if x[0] >= 0.5 else "F" for x in predictions]

In [45]:
# Report how many unlabelled profiles were predicted as each gender.
# Single-argument print(...) with %-formatting prints the same text on
# Python 2 and Python 3; list.count replaces the hand-rolled sum over booleans.
print("'M': %d" % rounded.count("M"))
print("'F': %d" % rounded.count("F"))

'M': 2810
'F': 3012
