In [202]:
import os
import re
import csv
import codecs
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation
from importlib import reload
from gensim.models import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from sklearn.metrics import fbeta_score, make_scorer
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
import sys
csv.field_size_limit(sys.maxsize)
#reload(sys)
#sys.setdefaultencoding('utf-8')

9223372036854775807

In [203]:
def preprocess_text(text):
    new_text = ""
    prev = ' '
    delete_1 = 0
    delete_2 = 0
    delete_3 = 0
    for i in enumerate(text):
        if (delete_1 <= 0 and delete_2 <= 0 and delete_3 <=0) and (i[1] != '{' and i[1] != '[' and i[1] != '<'):
            new_text =  new_text + i[1]
        if (i[1] == '[' and prev == '['):
            delete_2 += 1
        if (i[1] == '{' and prev == '{'):
            delete_1 += 1
        if i[1] == '}' and prev == '}':
            delete_1 -= 1
        if i[1] == ']' and prev == ']':
            delete_2 -= 1
            
        if i[1] == '<':
            delete_3 += 1
        if i[1] == '>':
            delete_3 -= 1
        prev = i[1]
    return new_text

In [159]:
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    
    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
    
    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)

In [160]:
########################################
## set directories and parameters
########################################
BASE_DIR = '~/projects/ML-Wikipedia-Runner/'
EMBEDDING_FILE = BASE_DIR + 'internal/dataset_generator/output/GoogleNews-vectors-negative300.bin'
TRAIN_DATA_FILE = '../dataset_generator/output/train.csv'
TEST_DATA_FILE =  '../dataset_generator/output/test.csv'
MAX_SEQUENCE_LENGTH = 30
MAX_NB_WORDS = 400000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.1

num_lstm = np.random.randint(175, 275)
num_dense = np.random.randint(100, 150)
rate_drop_lstm = 0.15 + np.random.rand() * 0.25
rate_drop_dense = 0.15 + np.random.rand() * 0.25

act = 'relu'
re_weight = True # whether to re-weight classes to fit the 17.5% share in test set

STAMP = 'lstm_%d_%d_%.2f_%.2f'%(num_lstm, num_dense, rate_drop_lstm, \
        rate_drop_dense)

In [161]:
########################################
## index word vectors
########################################
print('Indexing word vectors')

word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, \
        binary=True)
print('Found %s word vectors of word2vec' % len(word2vec.key_to_index))

Indexing word vectors
Found 3000000 word vectors of word2vec


In [163]:

########################################
## process texts in datasets
########################################
print('Processing text dataset')

# The function "text_to_wordlist" is from
# https://www.kaggle.com/currie32/quora-question-pairs/the-importance-of-cleaning-text

texts_1 = [] 
texts_2 = []
labels = []
names_1 = []
names_2 = []
with codecs.open(TRAIN_DATA_FILE, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)
    for values in reader:
        names_1.append(values[0])
        texts_1.append(text_to_wordlist(preprocess_text(values[1]))[:1000])
        names_2.append(values[2])
        texts_2.append(text_to_wordlist(preprocess_text(values[3]))[:1000])
        labels.append(int(values[4]))
print('Found %s texts in train.csv' % len(texts_1))
test_texts_1 = []
test_texts_2 = []
test_ids = []
with codecs.open(TEST_DATA_FILE, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)
    for values in reader:
        test_texts_1.append(text_to_wordlist(preprocess_text(values[1])))
        test_texts_2.append(text_to_wordlist(preprocess_text(values[3])))
        test_ids.append(values[4])
print('Found %s texts in test.csv' % len(test_texts_1))

Processing text dataset
Found 1004 texts in train.csv
Found 201 texts in test.csv


In [164]:


tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts_1 + texts_2 + test_texts_1 + test_texts_2)

sequences_1 = tokenizer.texts_to_sequences(texts_1)
sequences_2 = tokenizer.texts_to_sequences(texts_2)
test_sequences_1 = tokenizer.texts_to_sequences(test_texts_1)
test_sequences_2 = tokenizer.texts_to_sequences(test_texts_2)

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

data_1 = pad_sequences(sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
data_2 = pad_sequences(sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array(labels)
print('Shape of data tensor:', data_1.shape)
print('Shape of label tensor:', labels.shape)

test_data_1 = pad_sequences(test_sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
test_data_2 = pad_sequences(test_sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
test_ids = np.array(test_ids)


Found 63442 unique tokens
Shape of data tensor: (1004, 30)
Shape of label tensor: (1004,)


In [204]:
data_1.shape

(1004, 30)

In [165]:

########################################
## prepare embeddings
########################################
print('Preparing embedding matrix')

nb_words = min(MAX_NB_WORDS, len(word_index))+1

embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if word in word2vec.key_to_index:
        embedding_matrix[i] = word2vec.get_vector(word)
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

Preparing embedding matrix
Null word embeddings: 34548


In [63]:
embedding_matrix.shape

(324555, 300)

In [166]:


########################################
## sample train/validation data
########################################
#np.random.seed(1234)
perm = np.random.permutation(len(data_1))
idx_train = perm[:int(len(data_1)*(1-VALIDATION_SPLIT))]
idx_val = perm[int(len(data_1)*(1-VALIDATION_SPLIT)):]

data_1_train = np.hstack((data_1[idx_train], data_2[idx_train]))
data_2_train = np.hstack((data_2[idx_train], data_1[idx_train]))
labels_train = labels[idx_train]

data_1_val = np.hstack((data_1[idx_val], data_2[idx_val]))
data_2_val = np.hstack((data_2[idx_val], data_1[idx_val]))
labels_val = labels[idx_val]

weight_val = np.ones(len(labels_val))
if re_weight:
    weight_val *= 0.472001959
    weight_val[labels_val==0] = 1.309028344


In [177]:
# Training our model
params = {'learning_rate':0.01, 'n_estimators':600,
                    'nthread': 1, 'subsample': 0.6, 'min_child_weight': 1, 'max_depth': 5, 'gamma': 1.5, 'colsample_bytree': 0.8}
xgb_reg = XGBRegressor(**params)
#start_time = timer(None)
scores = cross_val_score(xgb_reg, data_1_train, labels_train, scoring=make_scorer(mean_squared_error),  cv=4)
print(scores.mean(), scores)
#timer(start_time) 


0.6229954650820686 [0.7362768  0.55888144 0.61938811 0.57743551]


In [197]:
xgb_reg.fit(data_1_train[:500], labels_train[:500])

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, enable_categorical=False,
             gamma=1.5, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.01, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=600, n_jobs=1, nthread=1,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=0.6,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [200]:
mean_squared_error(xgb_reg.predict(np.hstack((test_data_1, test_data_2))), test_ids)

  return f(*args, **kwargs)


1.07297732980881

In [201]:
mean_squared_error(xgb_reg.predict(data_1_train[600:700]), labels_train[600:700])

0.4669602050665907

In [185]:
xgb_reg.predict(np.hstack((test_data_1, test_data_2)))

array([3.1303465, 3.0433836, 3.136864 , 2.688777 , 2.9511845, 3.0433836,
       3.1653183, 2.5575051, 2.7755616, 2.7755616, 2.9840698, 2.2381618,
       3.0725186, 2.6757746, 2.7151797, 2.7314122, 2.6757746, 2.7151797,
       2.6008651, 2.6166809, 2.6757746, 2.259337 , 2.0247102, 2.6867118,
       2.5714781, 3.0881166, 2.6867118, 3.130042 , 2.7138758, 2.7257798,
       3.3675303, 3.050821 , 3.4508774, 3.2472515, 2.8035843, 2.6080627,
       2.860757 , 2.6160955, 3.3675303, 2.8035843, 2.6080627, 2.6080627,
       2.1389406, 2.6442332, 2.7106433, 2.8237283, 2.7888672, 2.6442332,
       2.6442332, 2.8445463, 3.2973542, 3.2265408, 2.8172848, 2.6606052,
       2.686128 , 2.831532 , 3.5706036, 3.120156 , 3.1991723, 2.771214 ,
       2.5169132, 2.8323712, 3.1766186, 3.5706036, 3.120156 , 2.5169132,
       2.5169132, 2.520507 , 2.7403638, 2.4195626, 2.6093159, 2.520507 ,
       2.7403638, 2.451008 , 2.520507 , 2.6591842, 3.0316966, 2.4128747,
       2.581084 , 1.9490469, 2.6683645, 2.783653 , 

In [187]:
test_ids

array(['2', '4', '3', '3', '1', '4', '3', '2', '3', '3', '2', '2', '1',
       '4', '3', '3', '4', '3', '3', '1', '4', '2', '2', '2', '2', '2',
       '2', '2', '2', '2', '5', '4', '3', '2', '4', '3', '3', '2', '5',
       '4', '3', '3', '1', '3', '2', '1', '2', '3', '3', '2', '4', '3',
       '3', '4', '3', '3', '5', '4', '3', '4', '3', '3', '2', '5', '4',
       '3', '3', '4', '3', '3', '1', '4', '3', '3', '4', '2', '1', '1',
       '4', '3', '3', '3', '4', '3', '3', '4', '3', '3', '4', '1', '2',
       '2', '1', '2', '2', '5', '4', '3', '4', '3', '3', '5', '4', '4',
       '3', '2', '3', '2', '3', '3', '3', '3', '3', '2', '1', '3', '1',
       '3', '4', '3', '3', '4', '3', '3', '4', '6', '5', '4', '3', '5',
       '4', '3', '4', '3', '3', '6', '4', '3', '3', '2', '3', '2', '5',
       '4', '3', '2', '4', '3', '3', '1', '5', '4', '3', '3', '2', '3',
       '3', '3', '3', '3', '2', '2', '2', '2', '3', '3', '2', '3', '2',
       '3', '3', '2', '1', '1', '2', '1', '2', '5', '4', '3', '4

In [182]:
labels_val

array([2, 2, 4, 3, 4, 4, 3, 3, 2, 3, 3, 2, 4, 4, 3, 3, 2, 5, 2, 3, 4, 2,
       3, 4, 6, 3, 3, 3, 3, 3, 3, 4, 3, 1, 2, 1, 3, 4, 3, 3, 2, 3, 3, 3,
       2, 2, 2, 4, 4, 3, 2, 4, 3, 3, 2, 4, 1, 4, 4, 1, 2, 4, 4, 3, 3, 3,
       1, 2, 3, 3, 3, 3, 3, 3, 5, 2, 5, 4, 3, 3, 3, 3, 3, 3, 3, 2, 3, 5,
       3, 3, 3, 3, 3, 4, 2, 2, 4, 5, 3, 3, 2])

In [175]:
########################################
## define the model structure
########################################
embedding_layer = Embedding(nb_words,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False)
lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)

sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH*2,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = lstm_layer(embedded_sequences_1)

sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH*2,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
y1 = lstm_layer(embedded_sequences_2)

merged = concatenate([x1, y1])
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

merged = Dense(num_dense, activation=act)(merged)
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

#preds = Dense(1, activation='sigmoid')(merged)
preds = Dense(1, activation='ReLU')(merged)



In [212]:
labels_val

array([3, 3, 3, 3, 2, 3, 2, 2, 3, 3, 4, 3, 1, 5, 4, 2, 3, 5, 3, 4, 3, 3,
       3, 3, 4, 2, 3, 1, 5, 1, 3, 1, 2, 2, 4, 3, 3, 1, 3, 3, 4, 2, 2, 3,
       2, 4, 2, 6, 4, 2, 1, 3, 3, 2, 5, 4, 2, 3, 3, 5, 3, 3, 2, 5, 3, 1,
       3, 5, 3, 2, 3, 5, 2, 2, 4, 1, 3, 2, 2, 3, 1, 2, 4, 5, 2, 3, 3, 3,
       2, 3, 4, 3, 4, 1, 4, 3, 4, 3, 4, 2, 4, 3, 3, 3, 3, 2, 3, 2, 2, 3,
       3, 4, 3, 1, 5, 4, 2, 3, 5, 3, 4, 3, 3, 3, 3, 4, 2, 3, 1, 5, 1, 3,
       1, 2, 2, 4, 3, 3, 1, 3, 3, 4, 2, 2, 3, 2, 4, 2, 6, 4, 2, 1, 3, 3,
       2, 5, 4, 2, 3, 3, 5, 3, 3, 2, 5, 3, 1, 3, 5, 3, 2, 3, 5, 2, 2, 4,
       1, 3, 2, 2, 3, 1, 2, 4, 5, 2, 3, 3, 3, 2, 3, 4, 3, 4, 1, 4, 3, 4,
       3, 4, 2, 4])

In [176]:


########################################
## train the model
########################################
model = Model(inputs=[sequence_1_input, sequence_2_input], \
        outputs=preds)
model.compile(loss='mse',
        optimizer='nadam',
        metrics=['mse'])
#model.summary()
print(STAMP)

early_stopping =EarlyStopping(monitor='val_mse', patience=3)
bst_model_path = STAMP + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

hist = model.fit([data_1_train, data_2_train], labels_train, \
        validation_data=([data_1_val, data_2_val], labels_val), \
        epochs=200, batch_size=2048, shuffle=True, \
        callbacks=[early_stopping, model_checkpoint])





lstm_179_135_0.35_0.27
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200


In [221]:
model.predict()

array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 1,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 1, 3, 3, 3, 1, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       2, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 1, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 1, 3,
       3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 1, 3, 3, 3,
       3, 3, 3, 3])

In [None]:


########################################
## train the model
########################################
model = Model(inputs=[sequence_1_input, sequence_2_input], \
        outputs=preds)
model.compile(loss='binary_crossentropy',
        optimizer='nadam',
        metrics=['acc'])
#model.summary()
print(STAMP)

early_stopping =EarlyStopping(monitor='val_loss', patience=3)
bst_model_path = STAMP + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

hist = model.fit([data_1_train, data_2_train], labels_train, \
        validation_data=([data_1_val, data_2_val], labels_val, weight_val), \
        epochs=200, batch_size=2048, shuffle=True, \
        class_weight=class_weight, callbacks=[early_stopping, model_checkpoint])

model.load_weights(bst_model_path)
bst_val_score = min(hist.history['val_loss'])



In [242]:
########################################
## make the submission
########################################
print('Start making the submission before fine-tuning')

preds = model.predict([test_data_1, test_data_2], batch_size=2048, verbose=1)
preds += model.predict([test_data_2, test_data_1], batch_size=2048, verbose=1)
preds /= 2



Start making the submission before fine-tuning


In [244]:
preds.shape

(201, 8)

In [238]:
test_ids.shape

(201,)

In [240]:
preds

array([[2.6496637, 2.769687 , 2.889439 , ..., 2.766019 , 2.6478813,
        2.1968107],
       [2.9501204, 3.0554934, 3.156005 , ..., 2.9867256, 2.8402689,
        2.4281836],
       [2.530469 , 2.6903658, 2.7935371, ..., 2.591871 , 2.4303348,
        2.0488334],
       ...,
       [3.5676847, 3.678516 , 3.6293206, ..., 3.559912 , 3.4132476,
        3.023573 ],
       [3.259972 , 3.3953993, 3.2830296, ..., 3.1620235, 2.98708  ,
        2.686414 ],
       [3.0544474, 3.1235294, 3.1655955, ..., 2.9627635, 2.891192 ,
        2.456123 ]], dtype=float32)

In [250]:
preds = model.predict([test_data_1, test_data_2])

In [249]:
preds.shape

(201, 8)

In [256]:
preds.min(axis=1)

AttributeError: 'KerasTensor' object has no attribute 'min'

# Save model