In [21]:
import pandas as pd
import numpy as np
import regex as re
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem.regexp import RegexpStemmer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing .sequence import pad_sequences

from tensorflow import keras
from tensorflow.keras.layers import Conv1D, BatchNormalization, Dropout, Dense, MaxPooling1D, Activation, Flatten, LSTM, GRU,Bidirectional
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

In [29]:
# Locations of the competition CSV files, relative to the notebook.
TRAIN_PATH = '../input/nlp-getting-started/train.csv'
TEST_PATH = '../input/nlp-getting-started/test.csv'

# Text-cleaning patterns. Each match is replaced by the empty string; the
# preprocessing cell lower-cases the text first and applies the rules in the
# order digits -> mentions -> http -> urls -> symbols.
RULE_DIGITS = r'[0-9]+'
# Any "@handle" (the literal "@mention" placeholder is still covered).
# Previously only the literal text "@mention" was removed.
RULE_MENTIONS = r'@\w+'
RULE_HTTP = r'https?:\/\/\S+'
# Bare ".com" hosts with an optional "www." prefix. The previous pattern
# escaped the opening bracket (r"\[a-z]"), so its "www" branch required a
# literal "[" and never matched a real host, leaving "www." behind.
RULE_URLS = r"(?:www\.)?[a-z]+\.com"
RULE_SYMBOLS = r"[_\,\>\(\-:\)\\\/\!\.\?\@\$\];='#]"

# Suffixes stripped by the regexp stemmer in the preprocessing cell.
STEMMER_RULE = 'ing$|s$|es$|able$|d$'

In [30]:
train_data = pd.read_csv(TRAIN_PATH)
test_data = pd.read_csv(TEST_PATH)

y = train_data['target']

# Substitution rules in the order they must be applied (each match is deleted).
CLEANING_RULES = (RULE_DIGITS, RULE_MENTIONS, RULE_HTTP, RULE_URLS, RULE_SYMBOLS)


def preprocess_texts(texts, tokenizer, stemmer):
    """Clean, tokenize and stem a Series of raw texts.

    The same steps are applied to the train and test texts so both go through
    an identical pipeline (previously the steps were copy-pasted per split).

    Parameters
    ----------
    texts : pandas.Series of str
        Raw texts.
    tokenizer : object with ``tokenize(str) -> list[str]``
    stemmer : object with ``stem(str) -> str``

    Returns
    -------
    tuple
        ``(tokens, stems, documents)``: the Series of token lists, the list of
        stemmed token lists, and the stemmed tokens re-joined with single
        spaces (one string per input text).
    """
    cleaned = texts.str.lower()
    for rule in CLEANING_RULES:
        # Bind `rule` as a default argument so each lambda uses its own pattern.
        cleaned = cleaned.apply(lambda text, rule=rule: re.sub(rule, '', text))
    tokens = cleaned.apply(tokenizer.tokenize)
    stems = [[stemmer.stem(tok) for tok in doc] for doc in tokens]
    documents = [' '.join(doc) for doc in stems]
    return tokens, stems, documents


# data preprocessing
tokenizer = TreebankWordTokenizer()
# Strips the suffixes in STEMMER_RULE; `min=5` is the minimum string length
# the stemmer will touch, so shorter tokens pass through unchanged.
stemmer = RegexpStemmer(STEMMER_RULE, min=5)

token, words, train = preprocess_texts(train_data['text'], tokenizer, stemmer)
print(train[:9])

token1, words1, test = preprocess_texts(test_data['text'], tokenizer, stemmer)
print(test[:9])

In [31]:
def create_tokenizer(lines):
    """Return a new ``Tokenizer`` whose vocabulary was fit on ``lines``.

    ``lines`` is passed unchanged to ``Tokenizer.fit_on_texts``.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

def encode_docs(tokenizer, max_length, docs):
    """Convert ``docs`` to integer sequences and pad them to ``max_length``.

    The sequences come from ``tokenizer.texts_to_sequences`` and are padded at
    the end (``padding='post'``) by ``pad_sequences``, whose result is returned.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(docs),
        maxlen=max_length,
        padding='post',
    )

# define the model
def define_model(vocab_size, max_length):
    """Build and compile the binary text classifier.

    Layers, in order: a 100-dimensional embedding over ``vocab_size`` indices
    with input length ``max_length``; two bidirectional GRU layers of 32 units
    (the first returns full sequences so the second can consume them); and a
    single sigmoid output unit. Compiled with binary cross-entropy, the Adam
    optimizer and the accuracy metric.
    """
    layers = [
        Embedding(vocab_size, 100, input_length=max_length),
        Bidirectional(GRU(32, dropout=0.2, recurrent_dropout=0.1, return_sequences=True)),
        Bidirectional(GRU(32, dropout=0.2, recurrent_dropout=0.1)),
        Dense(1, activation='sigmoid'),
    ]
    classifier = keras.Sequential(layers)
    classifier.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return classifier

In [32]:
# Fit the vocabulary on the training documents only.
tokenizer = create_tokenizer(train)
vocab_size = len(tokenizer.word_index) + 1  # +1: index 0 is used for padding
print('Vocabulary size: %d' % vocab_size)
max_length = max([len(s.split()) for s in train])
print('Maximum length: %d' % max_length)
Xtrain = encode_docs(tokenizer, max_length, train)
print(Xtrain.shape)

# Encode the test documents with the SAME tokenizer and length as the training
# data. Fitting a second tokenizer on the test set (as was done before) gives
# words different integer ids and a different sequence length from what the
# model is trained on, so its predictions would be meaningless, and ids could
# exceed the embedding's vocab_size.
# NOTE: `test` is rebound from a list of strings to the padded array because
# the prediction cell reads it under this name; re-run the preprocessing cell
# before re-running this one.
test = encode_docs(tokenizer, max_length, test)
print(test.shape)

# Hold out 25% for validation. The split goes into new names so `Xtrain` and
# `y` stay the full training set, and a fixed seed makes the split repeatable.
SPLIT_SEED = 42
train_x, valid_x, train_y, valid_y = train_test_split(
    Xtrain, y, test_size=0.25, random_state=SPLIT_SEED
)

model = define_model(vocab_size, max_length)
# Stop once validation loss has not improved by at least 0.001 for 20 epochs,
# and roll back to the best weights seen.
early_stopping = keras.callbacks.EarlyStopping(
    patience=20,
    min_delta=0.001,
    restore_best_weights=True,
)

history = model.fit(
    train_x, train_y,
    validation_data=(valid_x, valid_y),
    batch_size=512,
    epochs=1000,
    callbacks=[early_stopping],
    verbose=0, # hide the output because we have so many epochs
)

history_df = pd.DataFrame(history.history)
# Start to plot at epoch 5
history_df.loc[5:, ['loss', 'val_loss']].plot()
history_df.loc[5:, ['accuracy', 'val_accuracy']].plot()
plt.show()

print(("Best Loss: {:0.4f}" + "\nBest Accuracy: {:0.4f}")\
        .format(history_df['loss'].min(),history_df['accuracy'].max()))
print(("Best Validation Loss: {:0.4f}" + "\nBest Validation Accuracy: {:0.4f}")\
        .format(history_df['val_loss'].min(),history_df['val_accuracy'].max()))

In [33]:
ids = test_data['id']
# One sigmoid probability per test row (the model ends in a single-unit
# sigmoid layer); flatten to 1-D before thresholding.
test_prob = np.asarray(model.predict(test)).ravel()
# Threshold at 0.5 and cast to int so the submission holds integer 0/1 labels.
# For probabilities in [0, 1] this gives the same labels as round(), which
# maps exactly 0.5 to 0.
test_pred = (test_prob > 0.5).astype(int)
result = pd.DataFrame({'id': ids, 'target': test_pred})
# to_csv returns None when given a path, so do not assign its return value
# back to `result`; index=False writes exactly the columns id,target.
result.to_csv('./pred.csv', index=False)