# Real or Not?  NLP with Disaster Tweets with Deep Learning

Some resources I used:
 - https://www.kaggle.com/philculliton/nlp-getting-started-tutorial
 - https://www.kaggle.com/abhishek/approaching-almost-any-nlp-problem-on-kaggle

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', 500)
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

## 1. Data Cleanup

In [2]:
print (f'Train has {len(train_df)} records\nTest has {len(test_df)} records\n')

null_train_keyword = train_df['keyword'].isnull().sum() / len(train_df) * 100
null_test_keyword = test_df['keyword'].isnull().sum() / len(test_df) * 100
null_train_location = train_df['location'].isnull().sum() / len(train_df) * 100
null_test_location = test_df['location'].isnull().sum() / len(test_df) * 100

print (f'Keyword Values: Train = {round(null_train_keyword,3)}% Test = {round(null_test_keyword,3)}%')
print (f'Location Values: Train = {round(null_train_location,3)}% Test = {round(null_test_location,3)}%')

Train has 7613 records
Test has 3263 records

Keyword Values: Train = 0.801% Test = 0.797%
Location Values: Train = 33.272% Test = 33.865%


In [3]:
# Let's see if we can use a '#' value for our keyword...
null_train_keyword_df = train_df[train_df['keyword'].isnull()]['text'].str.contains('#')

print(f'Train contains {len(null_train_keyword_df)} records with no keywords')
print(f'  {null_train_keyword_df.sum()} of which don\'t have any \'#\'')

Train contains 61 records with no keywords
  21 of which don't have any '#'


In [4]:
# Testing area:
eyes = r"[8:=;]"
nose = r"['`\-]?"
df = pd.DataFrame({'test': ['This is ALL CAPS too longggg']})
df['test'].str.replace(r" ([A-Z -_]{2,}) ", r' \1 <allcaps> ')

0    This is ALL CAPS <allcaps> too longggg
Name: test, dtype: object

In [5]:
eyes = r"[8:=;]"
nose = r"['`\-]?"
key_words = ['user', 'number', 'hashtag', 'repeat', 'smile', 
             'lolface', 'sadface', 'neutralface', 'heart',
             'elong', 'allcaps', 'url']

all_data = [train_df, test_df]

for df in all_data:
    # Replace websites URLs
    df['text'] = df['text'].str.replace('http\S+|www.\S+', '<url>', case=False)
    # Replace usernames
    df['text'] = df['text'].str.replace('@\S+', ' <user>')
    # Remove encodings like &amp; and &gt;
    df['text'] = df['text'].str.replace('&\S+;', '') # not used in GloVe
    # Replace numbers
    df['text'] = df['text'].str.replace(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>")
    # Replace hashtags
    df['text'] = df['text'].str.replace('#', '<hashtag> ')
    # Replace repeat !! ?? (not words)
    #df['text'] = df['text'].str.replace(r'(?<!\S)((\S+))(?:\s+\2)+(?!\S)', r'\1 <repeat>') # words: my misunderstanding
    df['text'] = df['text'].str.replace(r"([!?.]){2,}", r"\1 <repeat>")
    # Replace emoticons
    df['text'] = df['text'].str.replace(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>")
    df['text'] = df['text'].str.replace(r"{}{}p+".format(eyes, nose), "<lolface>")
    df['text'] = df['text'].str.replace(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>")
    df['text'] = df['text'].str.replace(r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>")
    df['text'] = df['text'].str.replace(r"<3","<heart>")
    # Elongated words like wayyyyy too longgg
    df['text'] = df['text'].str.replace(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>")
    # ALL CAPS
    df['text'] = df['text'].str.replace(r" ([A-Z -_]{2,}) ", r' \1 <allcaps> ')
    # Remove *
    df['text'] = df['text'].str.replace(r"\*", r'')

In [6]:
xtrain_full = train_df['text']
xtest_full = test_df['text']
ytrain_full = train_df['target']
xtrain, xvalid, ytrain, yvalid = train_test_split(train_df.text.values, ytrain_full, 
                                                  stratify=ytrain_full, 
                                                  random_state=42, 
                                                  test_size=0.333, shuffle=True)

print(f'Train = {len(xtrain)} records\nValidate = {len(xvalid)} records')

Train = 5077 records
Validate = 2536 records


## 2. Global Vectors

In [7]:
from tqdm.notebook import tqdm  # This is an awesome library that shows the progress of whatever tqdm() is applied to

#with open('D:\Datasets\GloVe\glove.twitter.27B.25d.txt', 'r', encoding="utf8") as f:
with open('D:\Datasets\GloVe\glove.twitter.27B.200d.txt', 'r', encoding="utf8") as f:
    embeddings_index = {}
    for line in tqdm(f):
        vals = line.rstrip().split(' ')
        embeddings_index[vals[0]] = [float(x) for x in vals[1:]]
print('Found %s word vectors.' % len(embeddings_index))  #1193514 

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Found 1193514 word vectors.


In [8]:
# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')

from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
import re

def new_tokenize(s):
    # This will regroup key_words like '<', 'hashtag', '>' into '<hashtag>'
    words = word_tokenize(s)
    new_words = []
    skip = 0
    for i, w in enumerate(words):
        if skip > 0:
            skip = skip-1
        else:
            if w == '<' and words[i+1] in key_words and words[i+2] == '>':
                new_words.append('<' + words[i+1] + '>')
                skip = 2
            else:
                new_words.append(w)
    return new_words
            

def sent2vec(s):
    words = str(s).lower()
    words = new_tokenize(words)
    words = [w for w in words if not w in stop_words]
    words = [w for w in words if w.isalpha() or re.match("<\S+>",w)]
    M = []
    for w in words:
        try:
            # This adds the np.array (size=25) of values from the GloVe file for each word to the matrix
            M.append(embeddings_index[w])
        except:
            continue
    M = np.array(M)
    
    # Now we sum up each column to create a vector of size 25
    v = M.sum(axis=0)
    
    if type(v) != np.ndarray:
        return np.zeros(300)
    
    # I don't understand what the heck this is doing...
    return v / np.sqrt((v ** 2).sum())


# Print an example to show what this does...
s = xtrain[1973]
print (s)
print (word_tokenize(s))
print (new_tokenize(s))
print (sent2vec(s))

screams internally
['screams', 'internally']
['screams', 'internally']
[ 6.70529390e-02  4.43065556e-02  1.49141888e-02  7.36671104e-02
 -1.34542076e-02  1.14275995e-01  1.26520787e-01  1.63734215e-01
  5.48288245e-04 -8.43475034e-02 -1.92787878e-02 -2.65030936e-02
 -1.26914694e-01 -4.21059174e-02  8.09426895e-03 -1.06641128e-01
  4.30478317e-02 -5.17430657e-02 -3.90803207e-02 -1.33606430e-02
  4.97997301e-02  3.16750330e-02  3.06803763e-02 -2.51013095e-02
  1.32639909e-02  3.05011067e-01 -2.26566550e-03 -1.80953834e-02
 -4.70629671e-02  2.87402219e-02 -8.27007674e-02  6.32664753e-02
  3.09941918e-02 -1.90176491e-02 -1.21079073e-01 -6.06447967e-02
 -3.78739929e-02 -2.74410781e-02  1.02910710e-01  4.38003714e-02
  1.67318672e-01 -5.89269517e-02 -2.62017222e-02  1.21699406e-01
 -1.46393897e-01  6.78174549e-02  7.50682395e-02  4.48230318e-02
 -6.71260129e-02  1.42124547e-02 -8.80199119e-02  7.07095350e-02
  1.15412804e-02  1.74016958e-02  2.89707930e-02 -1.28155360e-02
 -1.01446424e-01  8

In [9]:
from tqdm.notebook import tqdm

xtrain_glove = [sent2vec(x) for x in tqdm(xtrain)]
xvalid_glove = [sent2vec(x) for x in tqdm(xvalid)]
xtrain_full_glove = [sent2vec(x) for x in tqdm(xtrain_full)]
xtest_full_glove = [sent2vec(x) for x in tqdm(xtest_full)]

HBox(children=(FloatProgress(value=0.0, max=5077.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2536.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7613.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3263.0), HTML(value='')))




In [10]:
xtrain_glove_np = np.stack(xtrain_glove)
xvalid_glove_np = np.stack(xvalid_glove)
xtrain_full_glove_np = np.stack(xtrain_full_glove)
xtest_full_glove_np = np.stack(xtest_full_glove)

## 3. Deep Learning

In [11]:
from sklearn import preprocessing
from keras import callbacks
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.normalization import BatchNormalization

# scale the data before any neural net:
scl = preprocessing.StandardScaler()

xtrain_glove_scl = scl.fit_transform(xtrain_glove_np)
xvalid_glove_scl = scl.transform(xvalid_glove_np)
xtrain_glove_full_scl = scl.fit_transform(xtrain_full_glove_np)
xtest_glove_full_scl = scl.transform(xtest_full_glove_np)

In [12]:
#Source: https://datascience.stackexchange.com/questions/45165/how-to-get-accuracy-f1-precision-and-recall-for-a-keras-model
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [13]:
input_size = xtrain_glove_scl.shape[1]

In [14]:
# we need to binarize the labels for the neural net
ytrain_enc = np_utils.to_categorical(ytrain)
yvalid_enc = np_utils.to_categorical(yvalid)
ytrain_full_enc = np_utils.to_categorical(ytrain_full)

In [15]:
# create a simple 3 layer sequential neural net
model = Sequential()

model.add(Dense(300, input_dim=input_size, activation='relu'))
model.add(Dropout(0.3))
model.add(BatchNormalization())

model.add(Dense(300, activation='relu'))
model.add(Dropout(0.5))
model.add(BatchNormalization())

model.add(Dense(2))
model.add(Activation('softmax'))

# compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc',f1_m,precision_m, recall_m])

## 3.1 Smaller Set with Cross-Validation

In [80]:
early_stop = callbacks.EarlyStopping(monitor='val_f1_m', patience=10)

model.fit(xtrain_glove_scl, y=ytrain_enc, batch_size=64, 
          epochs=500, verbose=1, 
          validation_data=(xvalid_glove_scl, yvalid_enc),
          callbacks=[early_stop])

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500


<tensorflow.python.keras.callbacks.History at 0x1f2c303d588>

In [81]:
from sklearn import metrics

print(f'TFIDF Result:\t\t{metrics.f1_score(model.predict(xvalid_glove_scl).argmax(axis=1), yvalid)}')  # Result: 0.7081524360829716

TFIDF Result:		0.779270633397313


## 3.2 Full Set with no Cross-Validation

In [16]:
early_stop = callbacks.EarlyStopping(monitor='val_f1_m', patience=10)

model.fit(xtrain_glove_full_scl, y=ytrain_full_enc, batch_size=64, 
          epochs=20, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x29bedaad688>

In [19]:
predictions = model.predict(xtest_glove_full_scl).argmax(axis=1)
output = pd.DataFrame({'id': test_df.id, 'target': predictions})
output.to_csv('my_submission.csv', index=False)