Predict whether a user opens the email or ignores it

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import keras
import keras.backend as K
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model, Sequential
from keras.layers import Dense
from keras.optimizers import Adam, SGD, RMSprop
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

In [None]:
# Load the training data; `open_flag` (plotted further down) is the prediction target.
data = pd.read_csv('train.csv')
# Print column dtypes and non-null counts.
data.info()

In [None]:
# Load the per-user attributes; they are joined onto the email rows via `user_id` later on.
users_info = pd.read_csv('users.csv')
# Print column dtypes and non-null counts.
users_info.info()

In [None]:
# Missing values per column: train.csv has none, users.csv has many.
train_null_counts = data.isnull().sum()
users_null_counts = users_info.isnull().sum()
print(train_null_counts, "", users_null_counts, sep="\n")

In [None]:
# Number of rows per distinct `grass_date` value. At this point the column still
# holds the raw send date; preprocess_data() turns it into a day of the week later.
data.grass_date.value_counts()

In [None]:
# Value distribution of each "days since last ..." column, index sorted in descending order.
recency_columns = ('last_open_day', 'last_login_day', 'last_checkout_day')
for position, column in enumerate(recency_columns):
    if position > 0:
        print()  # blank line between consecutive tables
    print(data[column].value_counts().sort_index(ascending=False))

In [None]:
# Class sizes of the target: the classes are imbalanced, so MCC (Matthews
# correlation coefficient) is used for evaluation instead of plain accuracy.
data.open_flag.value_counts().plot(kind='bar');

In [None]:
# Number of users whose age is missing.
users_info.age.isnull().sum()

In [None]:
def preprocess_data(raw_data):
    """
    Clean the email-level data (the train.csv / test.csv layout).

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Must contain the columns ``last_open_day``, ``last_login_day``,
        ``last_checkout_day`` and ``grass_date`` (send date as a string).

    Returns
    -------
    pandas.DataFrame
        A copy of ``raw_data`` (the input frame is not modified) in which

        * the three ``last_*_day`` columns are integers, with the
          "Never ..." placeholder encoded as -1;
        * ``grass_date`` holds the day of the week (0 = Monday ... 6 = Sunday).
    """
    processed_data = raw_data.copy()

    # Placeholder text that marks "this event never happened", per column.
    never_labels = {
        'last_open_day': "Never open",
        'last_login_day': "Never login",
        'last_checkout_day': "Never checkout",
    }
    for column, never_label in never_labels.items():
        # The label is bound as a default argument so each lambda keeps its own value.
        processed_data[column] = processed_data[column].apply(
            lambda x, never_label=never_label: -1 if x == never_label else int(x)
        )

    # Keep only the first 10 characters (the date part) and parse the whole
    # column in a single vectorised call instead of one to_datetime() per row.
    send_dates = pd.to_datetime(processed_data['grass_date'].str[:10])
    processed_data['grass_date'] = send_dates.dt.dayofweek

    return processed_data

def preprocess_users(raw_users_data):
    """
    Clean the user-level data (the users.csv layout).

    Parameters
    ----------
    raw_users_data : pandas.DataFrame
        Must contain the numeric columns ``attr_1``, ``attr_2``, ``attr_3``
        and ``age`` (missing values allowed) and the column ``domain``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``raw_users_data`` (the input frame is not modified) in which

        * missing values of the four numeric columns are encoded as -1 and
          the columns are converted to integers;
        * ``domain`` is replaced by one-hot ``domain_<value>`` columns.
    """
    processed_users = raw_users_data.copy()

    # Encode missing values as -1 with vectorised fillna instead of a
    # per-row np.isnan lambda; astype truncates like int() did.
    for column in ('attr_1', 'attr_2', 'attr_3', 'age'):
        processed_users[column] = processed_users[column].fillna(-1).astype('int64')

    # One-hot encode the email domains.
    processed_users = pd.get_dummies(processed_users, columns=['domain'])

    return processed_users

In [None]:
# Clean both raw tables; each helper works on a copy and returns a new frame.
processed_data = preprocess_data(data)
processed_user = preprocess_users(users_info)

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in print() appended a stray "None" line to each report.
processed_data.info()
print()
processed_user.info()

In [None]:
# Attach the user attributes to every email row. The left join keeps all email
# rows, including any whose user_id has no match in the users table.
merged_data = pd.merge(left=processed_data, right=processed_user, how='left', on='user_id')
# info() prints the report itself (and returns None), so no print() wrapper.
merged_data.info()

Perhaps PCA could be used to reduce the number of features.

In [None]:
# row_id is dropped before modelling. Rebinding the name (instead of an
# in-place drop) together with errors='ignore' keeps this cell re-runnable:
# the in-place version raised KeyError on a second run.
merged_data = merged_data.drop(columns=['row_id'], errors='ignore')
merged_data.info()

In [None]:
# Pairwise correlation matrix over the columns of the merged table.
merged_data_corr = merged_data.corr()
# Correlation coefficient of each column with the target open_flag.
merged_data_corr['open_flag']


It seems that our target `open_flag` is noticeably correlated with only a few of the features in our data.

In [None]:
# 'RdBu_r', 'BrBG', 'coolwarm' are good diverging colormaps.
# Styler.set_precision() was deprecated in pandas 1.3 and removed in 2.0; an
# explicit format string shows two decimals on old and new pandas alike.
merged_data_corr.style.background_gradient(cmap='RdBu_r').format("{:.2f}")

In [None]:
def email_clfr(nb_features, nb_hidden_layers, nb_hidden_units, learning_rate):
    """
    Build and compile a fully connected binary classifier.

    The network is a stack of ReLU ``Dense`` layers followed by a single
    sigmoid output unit. It is compiled with the Adam optimizer, binary
    cross-entropy loss, and the metrics accuracy and ``matthews_correlation``
    (which must be defined before this function is called).

    Parameters
    ----------
    nb_features : int
        Number of input features (width of the first layer's input).
    nb_hidden_layers : int
        Number of hidden layers; must equal ``len(nb_hidden_units)``.
    nb_hidden_units : list of int
        Units of each hidden layer, in order.
    learning_rate : float
        Learning rate passed to Adam.

    Returns
    -------
    The compiled model, or ``None`` (after printing a message) when
    ``nb_hidden_layers`` does not match ``len(nb_hidden_units)``.
    """
    if nb_hidden_layers != len(nb_hidden_units):
        print("List size of hidden_units must equal to hidden_layers")
        return None

    model = Sequential()
    for layer_index, units in enumerate(nb_hidden_units):
        if layer_index == 0:
            # Only the first layer needs to be told the input width.
            model.add(Dense(units=units, input_shape=(nb_features,), activation='relu'))
        else:
            model.add(Dense(units=units, activation='relu'))

    model.add(Dense(units=1, activation='sigmoid'))

    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy', matthews_correlation])

    # summary() prints the table itself and returns None; wrapping it in
    # print() emitted an extra "None" line.
    model.summary()
    return model

In [None]:
# custom metric for evaluation of imbalanced dataset
def matthews_correlation(y_true, y_pred):
    """
    Matthews correlation coefficient built from Keras backend operations.

    Both inputs are clipped to [0, 1] and rounded to hard 0/1 labels, the four
    confusion-matrix counts are summed over the tensors passed in, and the
    coefficient is returned. ``K.epsilon()`` is added to the denominator so a
    zero denominator does not cause a division by zero.
    """
    predicted_pos = K.round(K.clip(y_pred, 0, 1))
    predicted_neg = 1 - predicted_pos

    actual_pos = K.round(K.clip(y_true, 0, 1))
    actual_neg = 1 - actual_pos

    # Confusion-matrix counts.
    true_pos = K.sum(actual_pos * predicted_pos)
    true_neg = K.sum(actual_neg * predicted_neg)
    false_pos = K.sum(actual_neg * predicted_pos)
    false_neg = K.sum(actual_pos * predicted_neg)

    covariance = (true_pos * true_neg - false_pos * false_neg)
    scale = K.sqrt(
        (true_pos + false_pos) * (true_pos + false_neg) * (true_neg + false_pos) * (true_neg + false_neg)
    )

    return covariance / (scale + K.epsilon())

In [None]:
# Select the target by name rather than by the magic position 16, which breaks
# silently as soon as a column is added, removed or reordered upstream. The
# double brackets keep y a one-column DataFrame, as before.
y = merged_data[['open_flag']]
# Everything except the target (the contradictory axis=0 argument is dropped).
X = merged_data.drop(columns=['open_flag'])

In [None]:
# Columns fed to the network; the order fixes the column order of the model input.
selected_features = [
    'country_code',
    'last_open_day',
    'open_count_last_10_days',
    'open_count_last_30_days',
    'open_count_last_60_days',
    'attr_1',
    'attr_2',
    'attr_3',
    'domain_@163.com',
    'domain_@gmail.com',
    'domain_@hotmail.com',
    'domain_@icloud.com',
    'domain_@live.com',
    'domain_@outlook.com',
    'domain_@qq.com',
    'domain_@rocketmail.com',
    'domain_@yahoo.com',
    'domain_@ymail.com',
    'domain_other',
]
X = X[selected_features]
print(len(X.columns))

In [None]:
# Scaler that linearly rescales every feature to the range [0, 1].
scaler = MinMaxScaler(feature_range = (0,1))

In [None]:
# Fit the scaler on the full feature matrix and scale it in one step.
# NOTE(review): the scaler is fitted before the train/test split, so the
# held-out rows influence the min/max used for scaling — consider fitting on
# the training part only.
X_scaled = scaler.fit_transform(X)
print(X_scaled.shape)

In [None]:
# Hold out 30% of the rows; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.30, random_state=42)

In [None]:
# Sanity check: row counts and feature width of both splits.
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print()
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

In [None]:
# Early stopping and checkpointing both watch the validation MCC. Early
# stopping previously monitored the *training* metric, which keeps improving
# while the model overfits and so never signals when to stop.
# NOTE(review): patience=2000 epochs is very permissive — confirm it is intended.
es = EarlyStopping(monitor='val_matthews_correlation', mode='max', verbose=1, patience=2000)
# Keep only the weights of the epoch with the best validation MCC.
mc = ModelCheckpoint('best_model.h5', monitor='val_matthews_correlation', mode='max', verbose=1, save_best_only=True)

In [None]:
# Derive the input width and the layer count from the data and the unit list,
# so they cannot drift out of sync with the selected features (previously a
# hard-coded 19 and 5).
hidden_units = [10, 10, 10, 10, 10]
classifier = email_clfr(nb_features=X_train.shape[1], nb_hidden_layers=len(hidden_units), nb_hidden_units=hidden_units, learning_rate=0.01)

In [None]:
# Train with the held-out split as validation data; the callbacks may stop
# training early and save the best checkpoint to best_model.h5.
# NOTE(review): the same split is reported as "Test" further down; because the
# checkpoint is selected on it, those scores are likely optimistic.
history = classifier.fit(X_train, y_train, validation_data=(X_test, y_test), epochs = 20000, batch_size = 1024, callbacks = [es,mc])

In [None]:
# Reload the checkpoint written during training; the custom metric has to be
# supplied by name so the saved model can be deserialised.
saved_model = load_model('best_model.h5', custom_objects={'matthews_correlation': matthews_correlation})

# evaluate() returns the loss followed by the compiled metrics (accuracy, MCC).
_, train_acc, train_mcc = saved_model.evaluate(X_train, y_train, verbose=0)
_, test_acc, test_mcc = saved_model.evaluate(X_test, y_test, verbose=0)

print(f"Train Acc: {train_acc}")
print(f"Train MCC: {train_mcc}")
print()
print(f"Test Acc: {test_acc}")
print(f"Test MCC: {test_mcc}")

In [None]:
# Run the competition test set through the same cleaning and user join as the
# training data, then drop the columns that are not used as features.
test_data = pd.read_csv('test.csv')
processed_test = preprocess_data(test_data)

merged_test_data = (
    processed_test
    .merge(processed_user, how='left', on='user_id')
    .drop(columns=['row_id', 'age'])
)

In [None]:
# Same feature columns, in the same order, as used for training.
test_feature_columns = [
    'country_code',
    'last_open_day',
    'open_count_last_10_days',
    'open_count_last_30_days',
    'open_count_last_60_days',
    'attr_1',
    'attr_2',
    'attr_3',
    'domain_@163.com',
    'domain_@gmail.com',
    'domain_@hotmail.com',
    'domain_@icloud.com',
    'domain_@live.com',
    'domain_@outlook.com',
    'domain_@qq.com',
    'domain_@rocketmail.com',
    'domain_@yahoo.com',
    'domain_@ymail.com',
    'domain_other',
]
merged_test_data = merged_test_data[test_feature_columns]
merged_test_data

In [None]:
# Reuse the scaling learned from the training features. Calling fit_transform()
# here refitted the scaler on the test data, so the test features were mapped
# with different min/max values than the ones the model was trained on.
scaled_test = scaler.transform(merged_test_data)

Prediction

In [None]:
# Take the single output column of the network and threshold it at 0.5 to get
# hard 0/1 predictions.
open_probabilities = saved_model.predict(scaled_test)[:, 0]
open_flags = (open_probabilities > .5).astype(int)

In [None]:
# Submission table: the row_id column of test.csv paired with the predicted open_flag.
submission = pd.DataFrame({"row_id": test_data['row_id'],
                           "open_flag": open_flags})

# Write without the DataFrame index so the file has exactly the two columns.
submission.to_csv("submission_15.csv", index=False)

In [None]:
# NOTE(review): `classifier` is the in-memory model as training left it, not
# the best checkpoint reloaded as `saved_model` — confirm which one should be archived.
# save weights only
classifier.save_weights('classifier_15.h5')

# save architecture only (JSON description of the layers, no weights)
model_architecture = classifier.to_json()
with open('classifier_15.json','w') as json_file:
    json_file.write(model_architecture)

In [None]:
# # Load model architecture and weights
# from keras.models import model_from_json
# with open('Final_Model_architecture.json','r') as json_file:
#     architecture = json_file.read()
    
# model = model_from_json(architecture)
# model.load_weights('Final_Model_weights.h5')