In [1]:
import os
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

Using TensorFlow backend.


In [2]:
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")
print("Train shape : ",train_df.shape)
print("Test shape : ",test_df.shape)

Train shape :  (1306122, 3)
Test shape :  (375806, 2)


In [3]:

train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=2018) 
embed_size = 300
max_features = 50000
maxlen = 100 
train_X = train_df["question_text"].fillna("_na_").values
val_X = val_df["question_text"].fillna("_na_").values
test_X = test_df["question_text"].fillna("_na_").values
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X))
train_X = tokenizer.texts_to_sequences(train_X)
val_X = tokenizer.texts_to_sequences(val_X)
test_X = tokenizer.texts_to_sequences(test_X)
train_X = pad_sequences(train_X, maxlen=maxlen)
val_X = pad_sequences(val_X, maxlen=maxlen)
test_X = pad_sequences(test_X, maxlen=maxlen)
train_y = train_df['target'].values
val_y = val_df['target'].values

In [4]:
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size)(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 300)          15000000  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 128)          140544    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 17        
Total para

In [5]:

model.fit(train_X, train_y, batch_size=512, epochs=2, validation_data=(val_X, val_y))

Train on 1175509 samples, validate on 130613 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f3617f83668>

In [6]:
pred_noemb_val_y = model.predict([val_X], batch_size=1024, verbose=1)
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    print("F1 score at threshold {0} is {1}".format(thresh, metrics.f1_score(val_y, (pred_noemb_val_y>thresh).astype(int))))

F1 score at threshold 0.1 is 0.5711791779082684
F1 score at threshold 0.11 is 0.5781787289129819
F1 score at threshold 0.12 is 0.5855770420154208
F1 score at threshold 0.13 is 0.5918898497187473
F1 score at threshold 0.14 is 0.5967095729264342
F1 score at threshold 0.15 is 0.6017385287376205
F1 score at threshold 0.16 is 0.6071992976294995
F1 score at threshold 0.17 is 0.611699759593981
F1 score at threshold 0.18 is 0.615829157072554
F1 score at threshold 0.19 is 0.6201507882111035
F1 score at threshold 0.2 is 0.6242480333179083
F1 score at threshold 0.21 is 0.6279320192892925
F1 score at threshold 0.22 is 0.6303236797274276
F1 score at threshold 0.23 is 0.6332075832099708
F1 score at threshold 0.24 is 0.6365963200849954
F1 score at threshold 0.25 is 0.6384825433976984
F1 score at threshold 0.26 is 0.6411973217802285
F1 score at threshold 0.27 is 0.6433503899845994
F1 score at threshold 0.28 is 0.645426309988979
F1 score at threshold 0.29 is 0.6469043530956469
F1 score at threshold 0.3

In [7]:
pred_noemb_test_y = model.predict([test_X], batch_size=1024, verbose=1)`

SyntaxError: invalid syntax (<ipython-input-7-eabb059634f9>, line 1)