In [1]:
from nlp_tweets.preprocess import clean_text, get_sequences, get_embedding_matrix
from nlp_tweets.model import get_model
from nlp_tweets.eda import plot_roc_curve_mean, get_categorial_confusion_matrix
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve, auc
from gensim.models import KeyedVectors
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yannhallouard/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [36]:
# Input / output directories, relative to the notebook's working directory.
PATH_RAW = '../data/raw/'
PATH_PROCESSED = '../data/processed/'

# Columns kept from each CSV: the training file carries the label on top of
# the columns shared with the test file.
TEST_COLUMNS = ['id', 'text']
TRAIN_COLUMNS = TEST_COLUMNS + ['target']

# Sizes shared by the preprocessing calls and the model definition.
EMBEDDING_DIM = 300          # passed to the model as `embeding_dim`
MAX_SEQUENCE_LENGTH = 100    # passed to get_sequences as `maxlen`
MAX_NB_WORDS = 20000         # passed as `vocabulary_size`

# Loading

In [37]:
# Read the labelled tweets and keep only the columns named in TRAIN_COLUMNS.
tweets_df = pd.read_csv(PATH_RAW + 'train.csv')[TRAIN_COLUMNS]
tweets_df.head()

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,Just got sent this photo from Ruby #Alaska as ...,1


In [38]:
# Read the unlabelled tweets to predict on, restricted to TEST_COLUMNS.
tweets_test_df = pd.read_csv(PATH_RAW + 'test.csv')[TEST_COLUMNS]

# Preprocessing

In [39]:
# Run every training tweet through clean_text. The column is overwritten in
# place, so re-running this cell applies clean_text to already-cleaned text.
tweets_df['text'] = tweets_df['text'].map(clean_text)

In [40]:
# Encode the cleaned training tweets. `data` is what gets split into
# train/val/test below; the returned `tokenizer` is passed back to
# get_sequences when the test tweets are encoded.
data, tokenizer = get_sequences(vocabulary_size=MAX_NB_WORDS, maxlen=MAX_SEQUENCE_LENGTH, texts=tweets_df['text'])

In [41]:
# Encode the test tweets with the tokenizer obtained from the training text.
# The training tweets were passed through clean_text before tokenisation, so
# the same cleaning is applied here; otherwise the model would be scored on
# differently-preprocessed text. tweets_test_df itself is left untouched.
test_texts_clean = tweets_test_df['text'].map(clean_text)
test_data, _ = get_sequences(vocabulary_size=MAX_NB_WORDS, maxlen=MAX_SEQUENCE_LENGTH,
                             texts=test_texts_clean, tokenizer=tokenizer)

In [42]:
# Load the pre-trained word2vec vectors from the raw-data folder
# (binary word2vec format, hence binary=True).
pretrained_embeddings_path = PATH_RAW + "GoogleNews-vectors-negative300.bin"
word2vec = KeyedVectors.load_word2vec_format(pretrained_embeddings_path, binary=True)

In [43]:
# Build the embedding matrix from the tokenizer and the word2vec vectors.
# NOTE(review): the keyword is spelled `enbedding_dim`; presumably this matches
# the parameter name in nlp_tweets.preprocess.get_embedding_matrix - confirm
# before "fixing" the spelling here.
embedding_matrix = get_embedding_matrix(tokenizer=tokenizer, 
                                        word2vec=word2vec, 
                                        max_nb_words=MAX_NB_WORDS, 
                                        enbedding_dim=EMBEDDING_DIM)

# Train / validation / test split

In [44]:
# Fixed seed so the split (and therefore every metric and the threshold chosen
# further down) is the same on each Restart & Run All.
SPLIT_SEED = 42

# 60 % of the rows go to training; the remaining 40 % is held out ...
X_train, X_holdout, y_train, y_holdout = train_test_split(
    data, tweets_df['target'], test_size=0.4, random_state=SPLIT_SEED)
# ... and split again 75 / 25 into validation and test
# (0.25 is train_test_split's default test_size, made explicit here).
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.25, random_state=SPLIT_SEED)

print(X_train.shape, X_val.shape, X_test.shape)

(4567, 100) (2284, 100) (762, 100)


# Create Model

In [63]:
from tensorflow.keras.layers import Dense, Dropout, Activation, Input, GRU, TimeDistributed, \
    Embedding, BatchNormalization, Flatten
from tensorflow.keras.models import Model
import tensorflow
def get_model(vocabulary_size: int, embeding_dim: int, max_sequence_length: int,
              _rnn_nb: [int], _fc_nb: [int], dropout_rate: float,
              embedding_matrix=None) -> tensorflow.keras.models.Model:
    """Build and compile a stacked-GRU binary classifier over token-id sequences.

    Layers, in order: a frozen Embedding, one GRU per entry of ``_rnn_nb`` (each
    returning the full sequence), a per-timestep Dense / Dropout / Dense group
    per entry of ``_fc_nb``, then Flatten and a Dense(100) -> Dense(10) ->
    Dense(1) head with tanh activations, dropout and batch normalisation, ending
    in a sigmoid output named ``strong_out``.

    NOTE: this definition shadows the ``get_model`` imported from
    ``nlp_tweets.model`` at the top of the notebook.

    Args:
        vocabulary_size: number of rows of the embedding table.
        embeding_dim: width of each embedding vector.
        max_sequence_length: length of every input sequence.
        _rnn_nb: list of GRU unit counts, one GRU layer per entry.
        _fc_nb: list of unit counts for the per-timestep dense groups.
        dropout_rate: rate used for every Dropout layer and for the GRUs'
            ``dropout`` and ``recurrent_dropout``.
        embedding_matrix: optional array of shape
            ``(vocabulary_size, embeding_dim)`` loaded into the Embedding layer.
            The layer is not trainable, so when this is omitted the embeddings
            stay at their random initial values (the previous behaviour).

    Returns:
        The model, compiled with the Adam optimizer, binary cross-entropy loss
        and the accuracy metric.

    Raises:
        ValueError: if ``embedding_matrix`` is given with a shape other than
            ``(vocabulary_size, embeding_dim)``.
    """
    if embedding_matrix is not None:
        # Fail early with a readable message instead of deep inside Keras.
        expected_shape = (vocabulary_size, embeding_dim)
        actual_shape = tuple(np.shape(embedding_matrix))
        if actual_shape != expected_shape:
            raise ValueError('embedding_matrix has shape {}, expected {}'.format(
                actual_shape, expected_shape))

    sequence_1_input = Input(shape=(max_sequence_length,), dtype='int32')

    embedding_layer = Embedding(vocabulary_size, embeding_dim,
                                input_length=max_sequence_length, trainable=False)
    seq = embedding_layer(sequence_1_input)
    if embedding_matrix is not None:
        # Calling the layer above has built it, so its weight can be replaced now.
        embedding_layer.set_weights([np.asarray(embedding_matrix)])

    # Every GRU returns the whole sequence so the next layer still sees one
    # vector per timestep.
    for _r in _rnn_nb:
        seq = GRU(_r, activation='tanh', dropout=dropout_rate,
                  recurrent_dropout=dropout_rate, return_sequences=True)(seq)

    # Per-timestep dense layers; no activation argument is passed to them.
    for _f in _fc_nb:
        seq = TimeDistributed(Dense(_f))(seq)
        seq = Dropout(dropout_rate)(seq)
        seq = TimeDistributed(Dense(_f))(seq)

    # Classification head on the flattened sequence.
    seq = Flatten()(seq)
    seq = Dense(100)(seq)
    seq = Activation('tanh')(seq)
    seq = Dropout(dropout_rate)(seq)
    seq = BatchNormalization()(seq)
    seq = Dense(10)(seq)
    seq = Activation('tanh')(seq)
    seq = Dropout(dropout_rate)(seq)
    seq = BatchNormalization()(seq)
    seq = Dense(1)(seq)
    out = Activation('sigmoid', name='strong_out')(seq)

    model = Model(inputs=sequence_1_input, outputs=out)
    model.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [64]:
model = get_model(vocabulary_size=MAX_NB_WORDS, embeding_dim=EMBEDDING_DIM,
                  max_sequence_length=MAX_SEQUENCE_LENGTH, _rnn_nb=[128, 64, 32], _fc_nb=[32], dropout_rate=0.5)

# The Embedding layer is created with trainable=False, so unless weights are
# loaded it keeps its random initial values for the whole training run. Copy
# the word2vec-based matrix built earlier into it.
# NOTE(review): assumes embedding_matrix has shape (MAX_NB_WORDS, EMBEDDING_DIM);
# set_weights raises on any other shape - confirm against get_embedding_matrix.
embedding_layer = next(layer for layer in model.layers if isinstance(layer, Embedding))
embedding_layer.set_weights([embedding_matrix])

model.summary()

Model: "functional_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
embedding_7 (Embedding)      (None, 100, 300)          6000000   
_________________________________________________________________
gru_17 (GRU)                 (None, 100, 128)          165120    
_________________________________________________________________
gru_18 (GRU)                 (None, 100, 64)           37248     
_________________________________________________________________
gru_19 (GRU)                 (None, 100, 32)           9408      
_________________________________________________________________
time_distributed_14 (TimeDis (None, 100, 32)           1056      
_________________________________________________________________
dropout_14 (Dropout)         (None, 100, 32)         

# Train

In [None]:
# Train for a fixed 100 epochs in mini-batches of 64, reporting metrics on the
# validation split alongside the training ones.
model.fit(x=X_train, y=y_train, batch_size=64, epochs=100, validation_data=(X_val, y_val))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100


Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100

## Finding the best threshold

In [None]:
# Raw sigmoid outputs of the model on the training and validation splits;
# these feed the threshold search in the next cell.
pred_proba_train = model.predict(X_train)
pred_proba_val = model.predict(X_val)

In [None]:
# The helper's `y_test*` arguments receive the *validation* split here, so the
# test split plays no part in choosing the threshold.
# The return value is used below as the decision threshold (presumably derived
# from the ROC curves - see nlp_tweets.eda.plot_roc_curve_mean).
threshold = plot_roc_curve_mean(y_train=y_train.values, y_train_pred_proba=pred_proba_train, 
                        y_test=y_val.values, y_test_pred_proba=pred_proba_val)

## Test

In [None]:
# Loss and accuracy (the metrics set in model.compile) on the held-out test split.
# NOTE(review): `threshold` is not involved here, so this accuracy is not the
# accuracy of the thresholded predictions computed in the next cells.
model.evaluate(X_test, y_test)

In [None]:
# Flatten the model output on the test split to 1-D, then binarise it with the
# threshold found above. Despite its name, `pred_proba_test` ends up holding
# 0/1 class labels, not probabilities.
test_scores = model.predict(X_test).reshape(-1)
pred_proba_test = (test_scores > threshold).astype(int)

In [None]:
# Compare the true test labels with the thresholded predictions
# (`pred_proba_test` holds 0/1 labels at this point).
get_categorial_confusion_matrix(y_test, pred_proba_test, display=True)

# Predict

In [None]:
# Score the competition test tweets and binarise with the tuned threshold.
# The model ends in a single-unit layer, so predict() returns one column per
# row; flatten it to 1-D (as done for the held-out test split) so that
# `predictions` is a plain vector of 0/1 labels when assigned to a DataFrame
# column.
predictions = model.predict(test_data).reshape(-1)
predictions = (predictions > threshold).astype(int)

In [None]:
# Start from the sample submission, replace its `target` column with the
# predicted labels and write the result without the index column.
# Assumes sample_submission.csv lists the tweets in the same order as test.csv.
output = pd.read_csv(PATH_RAW + 'sample_submission.csv').assign(target=predictions)
output.to_csv(PATH_PROCESSED + 'to_submit.csv', index=False)