In [1]:
import re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Bidirectional
from keras.layers.core import Lambda
from keras.layers.merge import concatenate, add, multiply
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers.noise import GaussianNoise
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

Using TensorFlow backend.


In [2]:
np.random.seed(0)
WNL = WordNetLemmatizer() # Used to lemmatize words
STOP_WORDS = set(stopwords.words('english'))
MAX_SEQUENCE_LENGTH = 30 # The word sequence length of each processed question sentence
MIN_WORD_OCCURRENCE = 100 # The words/phrases that occurr lower than this number of times will be replaced with the following REPLACE_WORD
REPLACE_WORD = "memento" # The word use to replace the words/phrases that occurr lower than specific times
EMBEDDING_DIM = 300 # The dimension of each word embedding vector
NUM_FOLDS = 10 # The number of folds in the k-folds cross-valid trainig
BATCH_SIZE = 1025 # The used batch size in the training
EMBEDDING_FILE = "glove.840B.300d.txt" # The filename of the GloVe Word Embeddings file

In [3]:
# Lemmatizing words with length greater than 4.
def cutter(word):
    if len(word) < 4:
        return word
    return WNL.lemmatize(WNL.lemmatize(word, "n"), "v")

In [4]:
# Cleaning texts
def preprocess(string):
    
    string = string.lower()
    
    string.replace(",000,000", "m")
    string.replace(",000", "k")
    string.replace("′", "'")
    string.replace("’", "'")
    string.replace("won't", "will not")
    string.replace("cannot", "can not")
    string.replace("can't", "can not")
    string.replace("n't", " not")
    string.replace("what's", "what is")
    string.replace("that's", "that is")
    string.replace("it's", "it is")
    string.replace("'ve", " have")
    string.replace("i'm", "i am")
    string.replace("'re", " are")
    string.replace("he's", "he is")
    string.replace("she's", "she is")
    string.replace("'s", " own")
    string.replace("%", " percent ")
    string.replace("₹", " rupee ")
    string.replace("$", " dollar ")
    string.replace("€", " euro ")
    string.replace("'ll", " will")
    string.replace("'d", " would")
    string.replace("=", " equal ")
    string.replace("+", " plus ")
        
    string = re.sub(r"e-mail", "email", string)
  
    string = re.sub(r" usa ", " america ", string)

    string = re.sub(r"the us", "america", string)
    string = re.sub(r" uk ", " england ", string)
    string = re.sub(r"c#", "c sharp", string)
    
    string = re.sub(r" cs ", " computer science ", string) 
    
    string = re.sub('[“”\(\'…\)\!\^\"\.;:,\-\?？\{\}\[\]\\/\*@]', ' ', string)
    string = re.sub(r"([0-9]+)000000", r"\1m", string)
    string = re.sub(r"([0-9]+)000", r"\1k", string)
    
    string = ' '.join([cutter(w) for w in string.split()])
    
    return string

In [5]:
# Generate and return a map of the high frequency words' embeddings from the file of GloVe Word Embeddings
def get_embedding():
    embeddings_index = {}
    f = open(EMBEDDING_FILE, encoding='utf-8')
    for line in f:
        values = line.split()
        word = values[0]
        if len(values) == EMBEDDING_DIM + 1 and word in top_words:
            coefs = np.asarray(values[1:], dtype="float32")
            embeddings_index[word] = coefs
    f.close()
    return embeddings_index

In [6]:
# return if one word is numeric by checking if it contains any digit character
def is_numeric(s):
    return any(i.isdigit() for i in s)

In [7]:
# Get a question sentence as an input and output one list and two sets
# The ouput new_q list is the input sentence's word list with all the low frequency words/phrases replaced with "memento"
# The ouput surplus_q set is the non-numeric word set of the input sentence
# The ouput numbers_q set is the numeric word set of the input sentence
def prepare(q):
    new_q = []
    surplus_q = []
    numbers_q = []
    new_memento = True
    for w in q.split()[::-1]:
        if w in top_words:
            new_q = [w] + new_q
            new_memento = True
        elif w not in STOP_WORDS:
            if new_memento:
                new_q = ["memento"] + new_q
                new_memento = False
            if is_numeric(w):
                numbers_q = [w] + numbers_q
            else:
                surplus_q = [w] + surplus_q
        else:
            new_memento = True
        if len(new_q) == MAX_SEQUENCE_LENGTH:
            break
    new_q = " ".join(new_q)
    return new_q, set(surplus_q), set(numbers_q)

In [8]:
# Extract the following four additional features from the data set:
# The length of the intersection set of two questions' non-numeric word sets
# The length of the union set of two questions' non-numeric word sets
# The length of the intersection set of two questions' numeric word sets
# The length of the union set of two questions' numeric word sets
def extract_features(df):
    q1s = np.array([""] * len(df), dtype=object)
    q2s = np.array([""] * len(df), dtype=object)
    features = np.zeros((len(df), 4))

    for i, (q1, q2) in enumerate(list(zip(df["question1"], df["question2"]))):
        q1s[i], surplus1, numbers1 = prepare(q1)
        q2s[i], surplus2, numbers2 = prepare(q2)
        features[i, 0] = len(surplus1.intersection(surplus2))
        features[i, 1] = len(surplus1.union(surplus2))
        features[i, 2] = len(numbers1.intersection(numbers2))
        features[i, 3] = len(numbers1.union(numbers2))

    return q1s, q2s, features

In [9]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

# Clean texts for all the question sentences in the training data
train["question1"] = train["question1"].fillna("").apply(preprocess)
train["question2"] = train["question2"].fillna("").apply(preprocess)

In [10]:
# Create the vocabulary of words that occurr more than specific times
print("Creating the vocabulary of words occurred more than", MIN_WORD_OCCURRENCE)
all_questions = pd.Series(train["question1"].tolist() + train["question2"].tolist()).unique()
vectorizer = CountVectorizer(lowercase=False, token_pattern="\S+", min_df=MIN_WORD_OCCURRENCE)
vectorizer.fit(all_questions)
top_words = set(vectorizer.vocabulary_.keys())
top_words.add(REPLACE_WORD)

# Generate the map of high frequency words' embeddings 
embeddings_index = get_embedding()

print("Words are not found in the embedding:", top_words - embeddings_index.keys())
top_words = embeddings_index.keys()

Creating the vocabulary of words occurred more than 100
Words are not found in the embedding: {'brexit', 'americaes', '$100', 'don’t', 'what’s', '100%', 'americaa', 'redmi', 'oneplus', 'americae', 'iisc', 'demonetisation', 'kvpy', 'quorans', 'i’m', '\\frac', '$1', 'paytm'}


In [11]:
print("Train questions are being prepared for LSTM...")
# Process the training data and extract some additional features named "q features" for the training data
q1s_train, q2s_train, train_q_features = extract_features(train)

tokenizer = Tokenizer(filters="")
tokenizer.fit_on_texts(np.append(q1s_train, q2s_train))
word_index = tokenizer.word_index

# Turn training data questions into word sequences. Then Pad or truncate each word sequence to be the same length
data_1 = pad_sequences(tokenizer.texts_to_sequences(q1s_train), maxlen=MAX_SEQUENCE_LENGTH)
data_2 = pad_sequences(tokenizer.texts_to_sequences(q2s_train), maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array(train["is_duplicate"])

Train questions are being prepared for LSTM...


In [12]:
# Generate the word embedding matrix from the map of high frequency words' embeddings 
nb_words = len(word_index) + 1
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
# Merge NLP features, Non-NLP features, and q features of the training data to get the final training data features
print("Train features are being merged with NLP and Non-NLP features...")
train_nlp_features = pd.read_csv("data/nlp_stemmed_features_train.csv")
train_non_nlp_features = pd.read_csv("data/non_nlp_features_train.csv")
features_train = np.hstack((train_q_features, train_nlp_features, train_non_nlp_features))

print("Same steps are being applied for test...")
# Clean texts for all the question sentences in the test data
test["question1"] = test["question1"].fillna("").apply(preprocess)
test["question2"] = test["question2"].fillna("").apply(preprocess)

# Process the test data and extract some additional features named "q features" for the test data
q1s_test, q2s_test, test_q_features = extract_features(test)

# Turn test data questions into word sequences. Then Pad or truncate each word sequence to be the same length
test_data_1 = pad_sequences(tokenizer.texts_to_sequences(q1s_test), maxlen=MAX_SEQUENCE_LENGTH)
test_data_2 = pad_sequences(tokenizer.texts_to_sequences(q2s_test), maxlen=MAX_SEQUENCE_LENGTH)

# Merge NLP features, Non-NLP features, and q features of the test data to get the final test data features
test_nlp_features = pd.read_csv("data/nlp_stemmed_features_test.csv")
test_non_nlp_features = pd.read_csv("data/non_nlp_features_test.csv")
features_test = np.hstack((test_q_features, test_nlp_features, test_non_nlp_features))

Train features are being merged with NLP and Non-NLP features...
Same steps are being applied for test...


In [None]:
# A class instance used to divide the training data to do cross validation training
skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True)
model_count = 0

# K-folds cross validation training loop
for idx_train, idx_val in skf.split(train["is_duplicate"], train["is_duplicate"]):
    print("MODEL:", model_count)
    # The question 1 word sequences of the training set
    data_1_train = data_1[idx_train]
    # The question 2 word sequences of the training set
    data_2_train = data_2[idx_train]
    # The labels of the training set
    labels_train = labels[idx_train]
    # The features of the training set
    f_train = features_train[idx_train]

    # The question 1 word sequences of the validation set
    data_1_val = data_1[idx_val]
    # The question 2 word sequences of the validation set
    data_2_val = data_2[idx_val]
    # The labels of the validation set
    labels_val = labels[idx_val]
    # The features of the validation set
    f_val = features_train[idx_val]
    
    # Word embedding layer
    embedding_layer = Embedding(nb_words,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)
    
    # LSTM layer
    lstm_layer = LSTM(75, recurrent_dropout=0.2)

    # The input question 1 word sequence
    sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype="int32")
    # The word embeddings of question 1
    embedded_sequences_1 = embedding_layer(sequence_1_input)
    # The sentense representation of question 1 generated by LSTM
    x1 = lstm_layer(embedded_sequences_1)
    
    # The input question 2 word sequence
    sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype="int32")
    # The word embeddings of question 2
    embedded_sequences_2 = embedding_layer(sequence_2_input)
    # The sentense representation of question 2 generated by LSTM
    y1 = lstm_layer(embedded_sequences_2)
    
    # The input features
    features_input = Input(shape=(f_train.shape[1],), dtype="float32")
    features_dense = BatchNormalization()(features_input)
    features_dense = Dense(200, activation="relu")(features_dense)
    features_dense = Dropout(0.2)(features_dense)
    
    # Add up the representations of question 1 and question 2
    addition = add([x1, y1])
    
    # Get the squared difference of the representations of question 1 and question 2
    minus_y1 = Lambda(lambda x: -x)(y1)
    merged = add([x1, minus_y1])
    merged = multiply([merged, merged])
    
    # Merge two sentence representations' addition and squared difference to get their interaction tensor
    merged = concatenate([merged, addition])
   
    merged = Dropout(0.4)(merged)

    # Merge two sentences' representations interaction and their other features
    merged = concatenate([merged, features_dense])

    # Then let the merged tensor pass through the following layers 
    merged = BatchNormalization()(merged)
    merged = GaussianNoise(0.1)(merged)
    merged = Dense(150, activation="relu")(merged)
    merged = Dropout(0.2)(merged)
    
    merged = BatchNormalization()(merged)
    
    # The output layer outputs the final prediction value
    out = Dense(1, activation="sigmoid")(merged)

    # Build the model
    model = Model(inputs=[sequence_1_input, sequence_2_input, features_input], outputs=out)
    model.compile(loss="binary_crossentropy",
                  optimizer="nadam")
    
    # The early stopping callback func which can stop the training if validation loss hasn't improved in specific rounds
    early_stopping = EarlyStopping(monitor="val_loss", patience=5)
    
    # The filepath where to save the best model weights that can produce the lowest validation loss in the current fold's training
    best_model_path = "best_model" + str(model_count) + ".h5"
    model_checkpoint = ModelCheckpoint(best_model_path, save_best_only=True, save_weights_only=True)

    # Train the model using the prepared data
    hist = model.fit([data_1_train, data_2_train, f_train], labels_train,
                     validation_data=([data_1_val, data_2_val, f_val], labels_val),
                     epochs=15, batch_size=BATCH_SIZE, shuffle=True,
                     callbacks=[early_stopping, model_checkpoint], verbose=1)

    # Load the best model weights of the current fold's training
    model.load_weights(best_model_path)
    
    print(model_count, "validation loss:", min(hist.history["val_loss"]))

    # Use the trained model to predict the test data's prediction values
    preds = model.predict([test_data_1, test_data_2, features_test], batch_size=BATCH_SIZE, verbose=1)
    
    # Save the prediction results of the current fold to file
    submission = pd.DataFrame({"test_id": test["test_id"], "is_duplicate": preds.ravel()})
    submission.to_csv("predictions/dn_preds" + str(model_count) + ".csv", index=False)
    
    # Go to the next fold
    model_count += 1

MODEL: 0
Train on 363860 samples, validate on 40430 samples
Epoch 1/15