In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.datasets import imdb
from tensorflow.keras.layers import Dense, LSTM, GRU, Embedding, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
import pandas as pd
import numpy as np
import os

def get_glove_vectors(filename="data/glove.6B.100d.txt"):
    ## function from https://campus.datacamp.com/courses/recurrent-neural-networks-for-language-modeling-in-python/rnn-architecture?ex=7
    # Get all word vectors from pre-trained model
    glove_vector_dict = {}
    with open(filename, encoding="UTF-8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = values[1:]
            glove_vector_dict[word] = np.asarray(coefs, dtype='float32')
    return glove_vector_dict

In [2]:
print(tf.__version__)
print(keras.__version__)

2.9.1
2.9.0


In [3]:
import time
start = time.time()

glove_vector_dict = get_glove_vectors()

end = time.time()
print(f'elapsed seconds = {end - start}')
type(glove_vector_dict)

elapsed seconds = 10.228618621826172


dict

In [4]:
glove_vector_dict['the']

array([-0.038194, -0.24487 ,  0.72812 , -0.39961 ,  0.083172,  0.043953,
       -0.39141 ,  0.3344  , -0.57545 ,  0.087459,  0.28787 , -0.06731 ,
        0.30906 , -0.26384 , -0.13231 , -0.20757 ,  0.33395 , -0.33848 ,
       -0.31743 , -0.48336 ,  0.1464  , -0.37304 ,  0.34577 ,  0.052041,
        0.44946 , -0.46971 ,  0.02628 , -0.54155 , -0.15518 , -0.14107 ,
       -0.039722,  0.28277 ,  0.14393 ,  0.23464 , -0.31021 ,  0.086173,
        0.20397 ,  0.52624 ,  0.17164 , -0.082378, -0.71787 , -0.41531 ,
        0.20335 , -0.12763 ,  0.41367 ,  0.55187 ,  0.57908 , -0.33477 ,
       -0.36559 , -0.54857 , -0.062892,  0.26584 ,  0.30205 ,  0.99775 ,
       -0.80481 , -3.0243  ,  0.01254 , -0.36942 ,  2.2167  ,  0.72201 ,
       -0.24978 ,  0.92136 ,  0.034514,  0.46745 ,  1.1079  , -0.19358 ,
       -0.074575,  0.23353 , -0.052062, -0.22044 ,  0.057162, -0.15806 ,
       -0.30798 , -0.41625 ,  0.37972 ,  0.15006 , -0.53212 , -0.2055  ,
       -1.2526  ,  0.071624,  0.70565 ,  0.49744 , 

In [5]:
import pandas as pd
import numpy as np

df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
df_train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [6]:
df_test

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [7]:
s1 = df_train[df_train['id']==232]
print(s1)
s2 = df_train.iloc[161,3]
s2

      id              keyword   location  \
161  232  airplane%20accident  Havenford   

                                                  text  target  
161  + Nicole Fletcher one of a victim of crashed a...       1  


"+ Nicole Fletcher one of a victim of crashed airplane few times ago. \n\nThe accident left a little bit trauma for her. Although she's \n\n+"

In [8]:
df_train['target'].agg(sum)/len(df_train)

0.4296597924602653

In [9]:
import re

def clean_up_tweet(tweet):
    """
    Clean up the content of one tweet, removing punctuation and numbers. 
    
    Parameters:
    tweet(str):The text of the tweet
    
    Returns:
    word_list: A list of pure alphabetic words in lower case
    
    """
    ## Remove all characters execept alphabetic chars and space,
    ## convert to lower case and split on space.
    word_list = re.sub('[^A-Za-z ]+','',tweet).lower().split(' ')
    return word_list
    

In [10]:
clean_up_tweet(s2)

['',
 'nicole',
 'fletcher',
 'one',
 'of',
 'a',
 'victim',
 'of',
 'crashed',
 'airplane',
 'few',
 'times',
 'ago',
 'the',
 'accident',
 'left',
 'a',
 'little',
 'bit',
 'trauma',
 'for',
 'her',
 'although',
 'shes',
 '']

In [11]:
df_test['text'].head().map(clean_up_tweet)

0            [just, happened, a, terrible, car, crash]
1    [heard, about, earthquake, is, different, citi...
2    [there, is, a, forest, fire, at, spot, pond, g...
3           [apocalypse, lighting, spokane, wildfires]
4    [typhoon, soudelor, kills, , in, china, and, t...
Name: text, dtype: object

In [12]:
df_test['text'].head()

0                   Just happened a terrible car crash
1    Heard about #earthquake is different cities, s...
2    there is a forest fire at spot pond, geese are...
3             Apocalypse lighting. #Spokane #wildfires
4        Typhoon Soudelor kills 28 in China and Taiwan
Name: text, dtype: object

In [13]:
train, valid = train_test_split(df_train, train_size=0.8, shuffle=True, random_state=42)
print(train.shape)
print(valid.shape)

(6090, 5)
(1523, 5)


In [15]:
train_x = train['text'].map(clean_up_tweet)
valid_x = valid['text'].map(clean_up_tweet)
test_x = df_test['text'].map(clean_up_tweet)

print(train_x.shape)
print(valid_x.shape)
print(test_x.shape)
type(train_x)

(6090,)
(1523,)
(3263,)


pandas.core.series.Series

In [18]:
train_y = np.array(train['target'], dtype=np.float32)
valid_y = np.array(valid['target'], dtype=np.float32)
print(train_y[:5])
print(train_y[-5:])
print(np.sum(train_y)/len(train_y))
print(np.sum(valid_y)/len(valid_y))

[1. 0. 1. 1. 0.]
[0. 0. 0. 1. 1.]
0.43054187192118226
0.4261326329612607


In [20]:
train['tweet_word_counts'] = [len(x) for x in train_x]
valid['tweet_word_counts'] = [len(x) for x in valid_x]
df_test['tweet_word_counts'] = [len(x) for x in test_x]
print(np.max(train['tweet_word_counts']) )
print(np.max(valid['tweet_word_counts']) )
print(np.max(df_test['tweet_word_counts']) )

54
30
34


In [21]:
neutral = glove_vector_dict["neutral"]
placeHolder = np.array([neutral])
placeHolder

array([[-0.57835 , -0.079743,  0.23589 ,  0.14232 , -0.74898 ,  0.091366,
        -0.14814 , -0.14615 , -0.68526 ,  0.31882 , -0.56023 , -0.057425,
         0.14159 , -0.072444,  0.61525 , -0.50256 ,  0.42331 , -0.76756 ,
         0.51353 , -0.39777 ,  0.38048 ,  0.4395  ,  0.73211 ,  0.28665 ,
        -0.32091 , -0.52521 , -0.54786 ,  0.31282 , -0.027817,  0.89241 ,
         0.91175 ,  0.13016 , -0.6932  , -0.23235 ,  1.2732  ,  0.033154,
         0.5625  ,  0.26646 , -0.29519 , -1.2666  , -0.029055, -0.31218 ,
        -0.32454 , -0.3499  , -0.015618, -0.39364 , -0.37477 ,  0.27252 ,
        -1.3312  , -1.0447  ,  0.47559 , -0.23485 ,  0.1743  ,  0.68365 ,
        -0.40499 , -1.8036  ,  0.2963  , -0.070282,  1.471   , -0.20166 ,
         0.045613, -0.34433 , -0.32697 ,  0.15731 ,  0.9668  , -0.051295,
        -0.12976 ,  0.55869 , -0.20778 , -0.10335 , -0.015856, -0.78811 ,
         0.24645 ,  0.18674 ,  0.2534  , -0.7372  , -0.14293 , -0.74162 ,
         0.1976  , -0.63874 ,  0.04093

In [55]:
def glove_word_embeddings(word_lists, pad_to=54):
    ## We plan to replace all the words in the tweets
    ## with embeddings from the GloVe dictionary, skipping
    ## any words not found, and also padding the sequence 
    ## of embeddings to a fixed length.
    
    ## If none of the words match for a given tweet we will substitute
    ## a with place holder vector of one word, "neutral".
    d = glove_vector_dict
    neutral = d["neutral"]
    placeHolder = np.array([neutral])
    padNeutral = pad_sequences(placeHolder.T, pad_to, dtype='float32')
    ## print('padNeutral',padNeutral)
    outer = []
    for word_list in word_lists:
        enc_list = []
        for word in word_list:
            ## print(word)
            if(type(d.get(word)) is np.ndarray):
                enc_list.append(d.get(word))
                ## print(d.get(word))
        if(len(enc_list) > 0):
            enc_array = np.array(enc_list)
#             print('shape: ',enc_array.shape)
#             print('enc_array = ',enc_array)
#             print('enc_array.T', enc_array.T)
            pad = pad_sequences(enc_array.T, pad_to, dtype='float32')
#             print('pad',pad)
            outer.append(pad.T)
#             print('outer shape',outer.shape)
#             print('outer', outer)
        else:
            outer.append(padNeutral.T)
    return np.array(outer)

In [57]:
print (train_x.shape)
print (valid_x.shape)
print (test_x.shape)

(6090,)
(1523,)
(3263,)


In [58]:

start = time.time()
X_train = glove_word_embeddings(train_x)
X_valid = glove_word_embeddings(valid_x)
X_test = glove_word_embeddings(test_x)
end = time.time()
print(f'elapsed seconds = {end - start}')
print(X_train.shape)
print(X_valid.shape)
print(X_test.shape)

elapsed seconds = 2.5990195274353027
(6090, 54, 100)
(1523, 54, 100)
(3263, 54, 100)


In [59]:
X_train[0,53,0:99]

array([ 0.32314  , -0.28602  ,  0.25893  ,  0.47132  , -0.18749  ,
        0.27926  , -0.031222 , -0.36132  ,  0.31671  ,  0.23897  ,
        0.64852  ,  1.1412   ,  0.024164 ,  0.35895  ,  0.32754  ,
       -0.25261  ,  0.50337  , -0.45188  , -1.119    , -0.37694  ,
        1.0946   , -0.99613  , -0.027026 , -0.38558  , -0.2442   ,
        0.6179   , -0.25935  , -0.23036  ,  1.017    , -0.63974  ,
        0.13511  , -0.39997  , -0.33846  , -0.22887  ,  0.43298  ,
        0.12857  ,  0.77761  , -0.70721  ,  0.064632 , -0.60949  ,
       -0.11196  ,  0.32177  ,  1.2263   , -0.14458  ,  0.37544  ,
       -0.48593  ,  0.11136  ,  0.064378 ,  0.15517  , -0.55285  ,
        0.047002 ,  0.19373  ,  0.21567  ,  0.80757  ,  0.22276  ,
       -0.49492  ,  0.24592  , -0.96245  ,  1.3597   ,  0.083068 ,
        0.13025  ,  0.39716  , -0.66914  , -0.76966  ,  0.51586  ,
       -0.68805  , -0.1597   ,  0.89775  , -0.8484   ,  0.16297  ,
       -0.50192  , -0.74355  , -0.36056  , -0.92933  , -0.3715

In [60]:
DROPOUT = 0.2

# Build model
model = Sequential()
model.add(GRU(units=128, input_shape=(None, 100), return_sequences=True, dropout=DROPOUT))
model.add(GRU(units=128, return_sequences=True, dropout=DROPOUT))
model.add(GRU(units=128, return_sequences=False, dropout=DROPOUT))
model.add(Dense(1, activation='sigmoid'))

opt = tf.keras.optimizers.Adam(learning_rate=0.001)

model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

file_name = 'weights_{epoch:03d}_{val_accuracy:.4f}.hdf5'

checkpoint_filepath = os.path.join('.', 'SAVE_MODELS', file_name)

modelCheckpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

earlyStopping = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=6, restore_best_weights=True)

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru_3 (GRU)                 (None, None, 128)         88320     
                                                                 
 gru_4 (GRU)                 (None, None, 128)         99072     
                                                                 
 gru_5 (GRU)                 (None, 128)               99072     
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 286,593
Trainable params: 286,593
Non-trainable params: 0
_________________________________________________________________


In [61]:
history = model.fit(X_train, train_y, 
                    batch_size=20, 
                    epochs=100, 
                    validation_data=(X_valid,valid_y),
                    callbacks=[earlyStopping,modelCheckpoint]
                   )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 11: early stopping


In [76]:
## Set aside full data set for retraining for competition
## see if training on the full training set improves results

full_x = df_train['text'].map(clean_up_tweet)
full_y = np.array(df_train['target'], dtype=np.float32)
X_full = glove_word_embeddings(full_x)


In [77]:
history = model.fit(X_full, full_y, 
                    batch_size=20, 
                    epochs=5, 
                   )

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [78]:
sub_num = 3

predict_proba = model.predict(X_test,batch_size=20)
predict = (predict_proba > 0.5).astype(int)

# predict2 = np.reshape(predict,len(predict),)
# predict2

submission = pd.DataFrame(df_test['id'])
submission['target']=predict

submission.to_csv(f'data/submission{sub_num}.csv',index=False)
submission



Unnamed: 0,id,target
0,0,1
1,2,1
2,3,0
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1
