In [2]:
from __future__ import division, print_function
from collections import defaultdict, Counter
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.metrics import log_loss
from sklearn.cross_validation import train_test_split
import argparse
import functools
# Our features
import feature_engineering
import Preprocessing
# XGBoost library
import xgboost as xgb
from xgboost import XGBClassifier
# Keras library
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, merge
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
# Garbage Collection
import gc

### Feature engineering 

In [51]:
# Base feature set built by the project's feature_engineering module.
# The helper returns two objects: one for the training data, one for the test data.
initial_train, initial_test = feature_engineering.create_preprocessed_features()

In [50]:
# Second feature set (train part, test part) from the project's feature_engineering module.
# Presumably TF-IDF-derived, judging by the helper's name -- TODO confirm in feature_engineering.
tfidf_train, tfidf_test = feature_engineering.create_tfidf_features()

In [5]:
# Third feature set (train part, test part) from the project's feature_engineering module.
# Presumably PageRank-derived, judging by the helper's name -- TODO confirm in feature_engineering.
pr_train, pr_test = feature_engineering.create_pagerank_features()

In [6]:
# Put the three feature sets side by side (column-wise) for each split.
train_parts = [initial_train, tfidf_train, pr_train]
test_parts = [initial_test, tfidf_test, pr_test]

X_train = pd.concat(train_parts, axis=1)
X_test = pd.concat(test_parts, axis=1)

In [None]:
# Persist both assembled feature frames so the LSTM pre-processing can reload them.
# They are written before the label and id/text columns are dropped further down.
for frame, csv_path in ((X_train, 'X_train.csv'), (X_test, 'X_test.csv')):
    frame.to_csv(csv_path)

In [7]:
# Separate the target column from the feature matrix.
y_train = X_train['is_duplicate']
X_train = X_train.drop(['is_duplicate'], axis=1)

In [8]:
# Hold out 10% of the labelled rows for validation; the fixed seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    random_state=4242
)

In [9]:
# Sanity check: shapes of the four pieces produced by the split.
tuple(part.shape for part in (X_train, y_train, X_valid, y_valid))

((72090, 59), (72090,), (8010, 59), (8010,))

In [10]:
# Identifier and raw-text columns are removed from every split before modelling.
ID_AND_TEXT_COLUMNS = ['id', 'id1', 'id2', "question1", "question2"]

X_train = X_train.drop(ID_AND_TEXT_COLUMNS, axis=1)
X_valid = X_valid.drop(ID_AND_TEXT_COLUMNS, axis=1)
X_test = X_test.drop(ID_AND_TEXT_COLUMNS, axis=1)

### XGBoost 

#### Simple model

In [16]:
# Finetuned parameters for the single XGBoost model
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': 0.02,
    'n_jobs': 5,
    'max_depth': 6,
    'subsample': 0.6,
    'base_score': 0.2,
}

# Wrap the train / validation splits in XGBoost's own data container.
d_train = xgb.DMatrix(X_train, label=y_train)
d_valid = xgb.DMatrix(X_valid, label=y_valid)

# Both sets are evaluated during training; they are reported as 'train' and 'valid'.
watchlist = [
    (d_train, 'train'),
    (d_valid, 'valid'),
]

In [18]:
# Train for up to 2500 rounds, stopping once the last watchlist entry ('valid')
# has not improved for 50 consecutive rounds; progress is printed every 200 rounds.
bst = xgb.train(params, d_train, 2500, watchlist, early_stopping_rounds=50, verbose_eval=200)

# Score the validation set with the best iteration only. Without ntree_limit the
# prediction would also use the trees added after the best round, so the reported
# log-loss would not match the model that is later used for the test predictions.
print(log_loss(y_valid, bst.predict(d_valid, ntree_limit=bst.best_ntree_limit)))

In [None]:
# Predict the test set, limiting the booster to the trees of its best iteration.
d_test = xgb.DMatrix(X_test)
p_test = bst.predict(
    d_test,
    ntree_limit=bst.best_ntree_limit
)

#### Multi XGBoost for averaging
We train several XGBoost models, each with randomly drawn hyperparameters (learning rate, tree depth, row subsampling, column subsampling), and save their test predictions so that they can later be averaged together with the predictions of an LSTM (see below).

In [47]:
# The data containers and the watchlist do not depend on the sampled
# hyperparameters, so they are built once instead of on every iteration.
d_train = xgb.DMatrix(X_train, label=y_train)
d_valid = xgb.DMatrix(X_valid, label=y_valid)
d_test = xgb.DMatrix(X_test)
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

# Candidate values each model draws its hyperparameters from.
et = [.02,.025,.01,.015]
depth = [4,5,6,7]
sub = [.5,.6,.7,.4]
col = [1,.7]

# One column of test predictions per model.
df_tot = pd.DataFrame()
for i in range(9):
    # Re-seed per model so every run draws the same hyperparameters again.
    # The draws below must stay in this order (eta, depth, subsample, colsample)
    # to reproduce the same configurations.
    np.random.seed(i+1)
    params = {}
    params['objective'] = 'binary:logistic'
    params['eval_metric'] = 'logloss'
    params['eta'] = np.random.choice(et)
    params['n_jobs'] = 5
    params['max_depth'] = np.random.choice(depth)
    params['subsample'] = np.random.choice(sub)
    params['base_score'] = 0.2
    params['colsample_bytree'] = np.random.choice(col)

    # verbose_eval=2500 equals the round cap, which keeps the evaluation log to a minimum.
    bst = xgb.train(params, d_train, 2500, watchlist, early_stopping_rounds=50, verbose_eval=2500)

    # Predict with the best iteration found by early stopping.
    p_test = bst.predict(d_test, ntree_limit=bst.best_ntree_limit)
    df_tot['fold' + str(i+1)] = p_test

In [12]:
# Write the per-model XGBoost test predictions (one column per model) without the
# row index; this file is read back in the averaging step.
df_tot.to_csv('xgb_final.csv', index=False)

### LSTM

In [49]:
# Inputs for the neural network come from the project's Preprocessing module
# (its internals are not visible from this notebook).
data_1, data_2, test_data_1, test_data_2, test_ids, labels, word_index = Preprocessing.preprocessing()
leaks, test_leaks = Preprocessing.load_leaky()

data = Preprocessing.input_nn_data(
    data_1, data_2, test_data_1, test_data_2,
    test_ids, labels, leaks, test_leaks
)

# Unpack the returned mapping into the individual train / validation / test pieces.
(data_1_train, data_2_train, leaks_train, labels_train,
 data_1_val, data_2_val, leaks_val, labels_val,
 weight_val, test) = tuple(
    data[key] for key in (
        "data_1_train", "data_2_train", "leaks_train", "labels_train",
        "data_1_val", "data_2_val", "leaks_val", "labels_val",
        "weight_val", "test",
    )
)

#### Word Embedding
The pre-trained GloVe word vectors can be downloaded from https://nlp.stanford.edu/projects/glove/

In [20]:
# Load the word-vector index through the project's Preprocessing module, then build
# the embedding matrix for the vocabulary in word_index. The matrix is later passed
# as the fixed weights of the Keras Embedding layer.
embedding_index = Preprocessing.Glove_Indexing()
embedding_matrix = Preprocessing.Words_Embedding(word_index, embedding_index)

Indexing word vectors.
Preparing embedding matrix
Null word embeddings: 1


In [48]:
EMBEDDING_DIM = 300
# Length of one input sequence. It has to agree with the model's Input layers,
# which are declared with shape (40,). The previous value, 144180, does not match
# them (it equals twice the 72090 training rows shown above).
MAX_SEQUENCE_LENGTH = 40
# words to vector in glovespace
# The vocabulary size is read from the matrix itself: Keras requires the weights to
# have shape (input_dim, output_dim), so a hard-coded count breaks whenever the
# vocabulary changes. The layer is frozen (trainable=False).
embedding_layer = Embedding(embedding_matrix.shape[0],
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
_ = gc.collect()

In [30]:
# Activation used by the dense layers of the network.
act = 'relu'
# NOTE: this rebinds test_ids (previously returned by Preprocessing.preprocessing()).
test_ids = test.test_id
# Seed for this run; it drives the random hyperparameter draws below and also
# names the checkpoint file ('fold<i>.h5').
i = 5
np.random.seed(i)
print ('fold = ' + str(i))
print ('--> Initializing LSTM')

# Hyperparameters are drawn at random, so a different seed yields a different model.
# The order of the draws matters: reordering them changes the values for a given seed.
# Unit counts: integers in [175, 275) and [100, 150).
num_lstm = np.random.randint(175, 275)
num_dense = np.random.randint(100, 150)
# Dropout rates: uniform in [0.15, 0.40).
rate_drop_lstm = 0.15 + np.random.rand() * 0.25
rate_drop_dense = 0.15 + np.random.rand() * 0.25
lstm_drp = 0.15 + np.random.rand() * 0.25
leak_drp = 0.15 + np.random.rand() * 0.25
gc.collect()

#----------------------------------------------------------------------------------
# Model structure: two question branches and one branch for the extra "leak"
# features, merged and fed to a dense head with a sigmoid output.

# A single LSTM layer object is applied to both questions, so both branches
# share the same weights. The same rate is used for input and recurrent dropout.
lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)

# Branch 1: integer sequence of length 40 -> shared embedding -> shared LSTM -> dropout.
sequence_1_input = Input(shape=(40,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = lstm_layer(embedded_sequences_1)
x1 = Dropout(lstm_drp)(x1)

# Branch 2: same layers applied to the second sequence.
sequence_2_input = Input(shape=(40,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
y1 = lstm_layer(embedded_sequences_2)
y1 = Dropout(lstm_drp)(y1)

# Leak-feature branch: its width is taken from the training array; it goes through
# a dense layer with half of num_dense units, followed by dropout.
leaks_input = Input(shape=(leaks_train.shape[1],))
leaks_dense = Dense(int(num_dense/2), activation=act)(leaks_input)
leaks_dense = Dropout(leak_drp)(leaks_dense)

# Concatenate the three branches (legacy functional `merge` from keras.layers).
merged = merge([x1, y1, leaks_dense],'concat')
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

# Dense head ending in a single sigmoid unit.
merged = Dense(num_dense, activation=act)(merged)
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)
preds = Dense(1, activation='sigmoid')(merged)

# No class re-weighting during training. NOTE: weight_val is overwritten here with
# all-ones, replacing the value taken from data["weight_val"] earlier in the notebook.
class_weight = None
weight_val = np.ones(len(labels_val))

print ('--> LSTM Model Created')

#----------------------------------------------------------------------------------
# Assemble the three-input model and compile it.
model = Model([sequence_1_input, sequence_2_input, leaks_input], preds)
model.compile(
    optimizer='nadam',
    loss='binary_crossentropy',
    metrics=['acc']
)

# Callbacks: stop after 3 epochs without val_loss improvement, and keep only the
# best weights on disk under a seed-specific file name.
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
bst_model_path = 'fold' + str(i) + '.h5'
model_checkpoint = ModelCheckpoint(
    bst_model_path,
    save_best_only=True,
    save_weights_only=True
)
print('--> LSTM Model Compiled')

#----------------------------------------------------------------------------------
gc.collect()

# Inputs follow the order of the model's Input layers: question 1, question 2, leaks.
train_inputs = [data_1_train, data_2_train, leaks_train]
validation_set = ([data_1_val, data_2_val, leaks_val], labels_val, weight_val)

# Up to 50 epochs; the callbacks stop training early and checkpoint the best weights.
hist = model.fit(
    train_inputs,
    labels_train,
    validation_data=validation_set,
    epochs=50,
    batch_size=2048,
    shuffle=True,
    verbose=2,
    class_weight=class_weight,
    callbacks=[early_stopping, model_checkpoint]
)

#----------------------------------------------------------------------------------
print('--> Testing')
# Restore the best weights saved by the checkpoint callback.
model.load_weights('fold' + str(i) + '.h5')

# Average the predictions over both question orders. The second pass must swap the
# two questions: running the same inputs twice and halving the sum returns exactly
# the first prediction, so the previous averaging had no effect.
predictions = model.predict([test_data_1, test_data_2, test_leaks], batch_size=2000, verbose=0)
predictions += model.predict([test_data_2, test_data_1, test_leaks], batch_size=2000, verbose=0)
predictions /= 2

#----------------------------------------------------------------------------------
# Flatten the predictions to one value per row and store them in a single
# 'score' column; the row index is written as the first CSV column.
flat_predictions = predictions.reshape(len(predictions))
score = pd.DataFrame({"score": flat_predictions})
score.to_csv("lstm_final.csv")

### Averaging

In [46]:
# Load the saved LSTM scores (first CSV column is the row index) and the XGBoost predictions.
sub1 = pd.read_csv('lstm_final.csv', index_col=0)
sub2 = pd.read_csv('xgb_final.csv')

# The LSTM score is added as two columns, so it counts twice in the row-wise mean.
for lstm_column in ("Lstm", "Lstm2"):
    sub2[lstm_column] = sub1["score"]

# Row-wise mean over all model columns, with the row index exposed as the 'Id' column.
dup = pd.DataFrame(sub2.mean(axis = 1)).reset_index()
dup.columns = ['Id', 'Score']
dup.to_csv('submission.csv', index=False)