In [1]:
import pandas as pd
from tqdm import tqdm
import nltk
import numpy as np
import math
from keras.models import Sequential
from keras.layers import LSTM, Dense, Bidirectional
from sklearn.model_selection import train_test_split 

tqdm.pandas()

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
train = pd.read_csv("./input/train.csv")
train_df, val_df = train_test_split(train, test_size=0.1,stratify=train['target'])
test = pd.read_csv("./input/test.csv")
print("Train shape : ",train_df.shape)
print("Val shape : ",val_df.shape)
print("Test shape : ",test.shape)
print(train_df.loc[train_df['target']==0].shape[0]/train_df.shape[0])
print(train_df.loc[train_df['target']==1].shape[0]/train_df.shape[0])
print(val_df.loc[val_df['target']==0].shape[0]/val_df.shape[0])
print(val_df.loc[val_df['target']==1].shape[0]/val_df.shape[0])

Train shape :  (1175509, 3)
Val shape :  (130613, 3)
Test shape :  (56370, 2)
0.9381297803759903
0.06187021962400968
0.938130201434773
0.06186979856522704


In [3]:
def build_vocab(sentences, verbose =  True):
    """
    :param sentences: list of list of words
    :return: dictionary of words and their count
    """
    vocab = {}
    for sentence in tqdm(sentences, disable = (not verbose)):
        for word in sentence:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab

In [4]:
sentences = train_df["question_text"].progress_apply(lambda x: x.split()).values
vocab = build_vocab(sentences)
print({k: vocab[k] for k in list(vocab)[:5]})

100%|██████████| 1175509/1175509 [00:08<00:00, 140842.27it/s]
100%|██████████| 1175509/1175509 [00:05<00:00, 203108.14it/s]

{'Which': 42531, 'former': 627, 'presidents': 162, 'have': 75132, 'maintained': 95}





In [5]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

In [8]:
train_df["question_text"] = train_df["question_text"].copy().apply(lemmatize_text)
val_df["question_text"] = val_df["question_text"].copy().apply(lemmatize_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [9]:
embeddings_index = {}
f = open('./input/embedding/glove.840B.300d.txt')
for line in tqdm(f):
    values = line.split(" ")
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

2196017it [03:48, 9590.74it/s] 

Found 2196016 word vectors.





In [10]:
# Convert values to embeddings
def text_to_array(text):
    empty_emb = np.zeros(300)
    text = text[:-1][:30]
    embeds = [embeddings_index.get(x, empty_emb) for x in text]
    embeds+= [empty_emb] * (30 - len(embeds))
    return np.array(embeds)

# train_vects = [text_to_array(X_text) for X_text in tqdm(train_df["question_text"])]
val_vects = np.array([text_to_array(X_text) for X_text in tqdm(val_df["question_text"])])
val_y = np.array(val_df["target"])

100%|██████████| 130613/130613 [00:40<00:00, 3191.59it/s]


In [11]:
# Data providers
batch_size = 128

def batch_gen(train_df):
    n_batches = math.ceil(len(train_df) / batch_size)
    while True: 
        train_df = train_df.sample(frac=1.)  # Shuffle the data.
        for i in range(n_batches):
            texts = train_df.iloc[i*batch_size:(i+1)*batch_size, 1]
            text_arr = np.array([text_to_array(text) for text in texts])
            yield text_arr, np.array(train_df["target"][i*batch_size:(i+1)*batch_size])

In [12]:
model = Sequential()
model.add(Bidirectional(LSTM(64, return_sequences=True),
                        input_shape=(30, 300)))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(1, activation="sigmoid"))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [15]:
mg = batch_gen(train)
model.fit_generator(mg, epochs=5,
                    steps_per_epoch=100,
                    validation_data=(val_vects, val_y),
                    verbose=True)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1f16d392b0>

In [17]:
# prediction part
batch_size = 256
def batch_gen(test_df):
    n_batches = math.ceil(len(test_df) / batch_size)
    for i in range(n_batches):
        texts = test_df.iloc[i*batch_size:(i+1)*batch_size, 1]
        text_arr = np.array([text_to_array(text) for text in texts])
        yield text_arr

test_df = pd.read_csv("./input/test.csv")

all_preds = []
for x in tqdm(batch_gen(test_df)):
    all_preds.extend(model.predict(x).flatten())

221it [01:46,  2.18it/s]


In [18]:
y_te = (np.array(all_preds) > 0.5).astype(np.int)

submit_df = pd.DataFrame({"qid": test_df["qid"], "prediction": y_te})
submit_df.to_csv("submission.csv", index=False)