In [104]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.datasets import imdb
from tensorflow.keras.layers import Dense, LSTM, GRU, Embedding, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
import pandas as pd
import numpy as np
import os

def get_glove_vectors(filename="data/glove.6B.200d.txt"):
    ## function from https://campus.datacamp.com/courses/recurrent-neural-networks-for-language-modeling-in-python/rnn-architecture?ex=7
    # Get all word vectors from pre-trained model
    glove_vector_dict = {}
    with open(filename, encoding="UTF-8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = values[1:]
            glove_vector_dict[word] = np.asarray(coefs, dtype='float32')
    return glove_vector_dict

In [105]:
print(tf.__version__)
print(keras.__version__)

2.9.1
2.9.0


In [106]:
import time
start = time.time()

glove_vector_dict = get_glove_vectors()

end = time.time()
print(f'elapsed seconds = {end - start}')
type(glove_vector_dict)

elapsed seconds = 19.347051858901978


dict

In [107]:
glove_vector_dict['the']

array([-7.1549e-02,  9.3459e-02,  2.3738e-02, -9.0339e-02,  5.6123e-02,
        3.2547e-01, -3.9796e-01, -9.2139e-02,  6.1181e-02, -1.8950e-01,
        1.3061e-01,  1.4349e-01,  1.1479e-02,  3.8158e-01,  5.4030e-01,
       -1.4088e-01,  2.4315e-01,  2.3036e-01, -5.5339e-01,  4.8154e-02,
        4.5662e-01,  3.2338e+00,  2.0199e-02,  4.9019e-02, -1.4132e-02,
        7.6017e-02, -1.1527e-01,  2.0060e-01, -7.7657e-02,  2.4328e-01,
        1.6368e-01, -3.4118e-01, -6.6070e-02,  1.0152e-01,  3.8232e-02,
       -1.7668e-01, -8.8153e-01, -3.3895e-01, -3.5481e-02, -5.5095e-01,
       -1.6899e-02, -4.3982e-01,  3.9004e-02,  4.0447e-01, -2.5880e-01,
        6.4594e-01,  2.6641e-01,  2.8009e-01, -2.4625e-02,  6.3302e-01,
       -3.1700e-01,  1.0271e-01,  3.0886e-01,  9.7792e-02, -3.8227e-01,
        8.6552e-02,  4.7075e-02,  2.3511e-01, -3.2127e-01, -2.8538e-01,
        1.6670e-01, -4.9707e-03, -6.2714e-01, -2.4904e-01,  2.9713e-01,
        1.4379e-01, -1.2325e-01, -5.8178e-02, -1.0290e-03, -8.21

In [108]:
import pandas as pd
import numpy as np

df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
df_train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [109]:
## Previous models ignored the keyword column in the training and test data.

## To incorporate the keyword, we will try just appending the keywords (when present) as
## an additional one or two tokens at the end of each tweet text.

## This function modifies the pandas dataframe df by
## appending the keyword (if present) to the end of the tweet.
## Keywords like 'airplane%20accident' are split into two words.
## It also writes the modified dataframe to a csv file (for debugging).
def add_keyword(df, filename=None):
    df.loc[df['keyword'].isna()==False,'text'] = df['text'] + ' ' + df['keyword'].str.replace('%20',' ')
    if filename:
        df.to_csv(f'data/{filename}',index=False)

add_keyword(df_train, filename='df_train.csv')
add_keyword(df_test, filename='df_test.csv')

In [110]:
s1 = df_train[df_train['id']==232]
print(s1)
s2 = df_train.iloc[161,3]
s2

      id              keyword   location  \
161  232  airplane%20accident  Havenford   

                                                  text  target  
161  + Nicole Fletcher one of a victim of crashed a...       1  


"+ Nicole Fletcher one of a victim of crashed airplane few times ago. \n\nThe accident left a little bit trauma for her. Although she's \n\n+ airplane accident"

In [111]:
df_train['target'].agg(sum)/len(df_train)

0.4296597924602653

In [112]:
import re

def clean_up_tweet(tweet):
    """
    Clean up the content of one tweet, removing punctuation and numbers. 
    
    Parameters:
    tweet(str):The text of the tweet
    
    Returns:
    word_list: A list of pure alphabetic words in lower case
    
    """
    ## Remove all characters execept alphabetic chars and space,
    ## convert to lower case and split on space.
    word_list = re.sub('[^A-Za-z ]+','',tweet).lower().split(' ')
    return word_list
    

In [113]:
clean_up_tweet(s2)

['',
 'nicole',
 'fletcher',
 'one',
 'of',
 'a',
 'victim',
 'of',
 'crashed',
 'airplane',
 'few',
 'times',
 'ago',
 'the',
 'accident',
 'left',
 'a',
 'little',
 'bit',
 'trauma',
 'for',
 'her',
 'although',
 'shes',
 '',
 'airplane',
 'accident']

In [114]:
train, valid = train_test_split(df_train, train_size=0.8, shuffle=True, random_state=42)
print(train.shape)
print(valid.shape)

(6090, 5)
(1523, 5)


In [115]:
train_x = train['text'].map(clean_up_tweet)
valid_x = valid['text'].map(clean_up_tweet)
test_x = df_test['text'].map(clean_up_tweet)

print(train_x.shape)
print(valid_x.shape)
print(test_x.shape)
type(train_x)

(6090,)
(1523,)
(3263,)


pandas.core.series.Series

In [116]:
train_y = np.array(train['target'], dtype=np.float32)
valid_y = np.array(valid['target'], dtype=np.float32)
print(train_y[:5])
print(train_y[-5:])
print(np.sum(train_y)/len(train_y))
print(np.sum(valid_y)/len(valid_y))

[1. 0. 1. 1. 0.]
[0. 0. 0. 1. 1.]
0.43054187192118226
0.4261326329612607


In [117]:
train['tweet_word_counts'] = [len(x) for x in train_x]
valid['tweet_word_counts'] = [len(x) for x in valid_x]
df_test['tweet_word_counts'] = [len(x) for x in test_x]
print(np.max(train['tweet_word_counts']) )
print(np.max(valid['tweet_word_counts']) )
print(np.max(df_test['tweet_word_counts']) )

55
31
35


In [118]:
def glove_word_embeddings(word_lists, pad_to=56):
    ## We plan to replace all the words in the tweets
    ## with embeddings from the GloVe dictionary, skipping
    ## any words not found, and also padding the sequence 
    ## of embeddings to a fixed length.
    
    ## If none of the words match for a given tweet we will substitute
    ## a with place holder vector of one word, "neutral".
    d = glove_vector_dict
    neutral = d["neutral"]
    placeHolder = np.array([neutral])
    padNeutral = pad_sequences(placeHolder.T, pad_to, dtype='float32')
    ## print('padNeutral',padNeutral)
    outer = []
    for word_list in word_lists:
        enc_list = []
        for word in word_list:
            ## print(word)
            if(type(d.get(word)) is np.ndarray):
                enc_list.append(d.get(word))
                ## print(d.get(word))
        if(len(enc_list) > 0):
            enc_array = np.array(enc_list)
#             print('shape: ',enc_array.shape)
#             print('enc_array = ',enc_array)
#             print('enc_array.T', enc_array.T)
            pad = pad_sequences(enc_array.T, pad_to, dtype='float32')
#             print('pad',pad)
            outer.append(pad.T)
#             print('outer shape',outer.shape)
#             print('outer', outer)
        else:
            outer.append(padNeutral.T)
    return np.array(outer)

In [119]:
print (train_x.shape)
print (valid_x.shape)
print (test_x.shape)

(6090,)
(1523,)
(3263,)


In [120]:

start = time.time()
X_train = glove_word_embeddings(train_x)
X_valid = glove_word_embeddings(valid_x)
X_test = glove_word_embeddings(test_x)
end = time.time()
print(f'elapsed seconds = {end - start}')
print(X_train.shape)
print(X_valid.shape)
print(X_test.shape)

elapsed seconds = 4.336355447769165
(6090, 56, 200)
(1523, 56, 200)
(3263, 56, 200)


In [125]:
DROPOUT = 0.2
UNITS_PER_LAYER = 64

## Try switching to a Bidirectional LSTM model, as in this example
## https://keras.io/examples/nlp/bidirectional_lstm_imdb/

from tensorflow import keras
from tensorflow.keras import layers

inputs = keras.Input(shape=(None, 200) )
x = layers.Bidirectional(GRU(units=UNITS_PER_LAYER, return_sequences=True, dropout=DROPOUT))(inputs)
x = layers.Bidirectional(GRU(units=UNITS_PER_LAYER, return_sequences=True, dropout=DROPOUT))(x)
x = layers.Bidirectional(GRU(units=UNITS_PER_LAYER, return_sequences=False, dropout=DROPOUT))(x)
# Add a classifier
outputs = layers.Dense(1,  activation='sigmoid')(x)
model = keras.Model(inputs, outputs)

opt = tf.keras.optimizers.Adam(learning_rate=0.00003)

model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

file_name = 'weights_{epoch:03d}_{val_accuracy:.4f}.hdf5'

checkpoint_filepath = os.path.join('.', 'SAVE_MODELS', file_name)

modelCheckpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

earlyStopping = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=6, restore_best_weights=True)

model.summary()

Model: "model_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_10 (InputLayer)       [(None, None, 200)]       0         
                                                                 
 bidirectional_25 (Bidirecti  (None, None, 128)        102144    
 onal)                                                           
                                                                 
 bidirectional_26 (Bidirecti  (None, None, 128)        74496     
 onal)                                                           
                                                                 
 bidirectional_27 (Bidirecti  (None, 128)              74496     
 onal)                                                           
                                                                 
 dense_13 (Dense)            (None, 1)                 129       
                                                           

In [126]:
history = model.fit(X_train, train_y, 
                    batch_size=20, 
                    epochs=100, 
                    validation_data=(X_valid,valid_y),
                    callbacks=[earlyStopping,modelCheckpoint]
                   )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 39: early stopping


In [128]:
model.evaluate(X_valid, valid_y)



[0.40956464409828186, 0.8260012865066528]

In [68]:
## Set aside full data set for retraining for competition
## see if training on the full training set improves results

full_x = df_train['text'].map(clean_up_tweet)
full_y = np.array(df_train['target'], dtype=np.float32)
X_full = glove_word_embeddings(full_x)


In [69]:
history = model.fit(X_full, full_y, 
                    batch_size=20, 
                    epochs=3, 
                   )

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [129]:
sub_num = 16

predict_proba = model.predict(X_test,batch_size=20)
predict = (predict_proba > 0.5).astype(int)

submission = pd.DataFrame(df_test['id'])
submission['target']=predict

submission.to_csv(f'data/submission{sub_num}.csv',index=False)
submission

model.evaluate(X_valid, valid_y)



[0.40956464409828186, 0.8260012865066528]