In [317]:
import pandas as pd
import numpy as np
import spacy
from spacy.lang.en.examples import sentences 
from sklearn.model_selection import train_test_split
import re
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import tensorflow as tf
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import GlobalMaxPooling1D, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
import gensim
import gensim.downloader as api
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import LSTM, Bidirectional

In [194]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [88]:
nlp = spacy.load("en_core_web_sm")

In [69]:
wv = api.load("word2vec-google-news-300")

# Preprocess the text

In [None]:
def preprocess(text):

    text = re.sub(r'http\S+|www\S+|https\S+|[^a-zA-Z0-9\s]', '', text) #remove urls and special characters
    text = re.sub(r'\s+', ' ', text) #remove extra spaces


    doc = nlp(text)
    filtered_tokens = []

    for token in doc:
        filtered_tokens.append(token.text)
        
    return " ".join(filtered_tokens) #return a processed sentence

In [198]:
df_train['simple_processed_text'] = df_train['text'].apply(preprocess)

df_train['processed_text'] = df_train['simple_processed_text'].apply(gensim.utils.simple_preprocess)

In [401]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     7613 non-null   int64 
 1   keyword                7552 non-null   object
 2   location               5080 non-null   object
 3   text                   7613 non-null   object
 4   target                 7613 non-null   int64 
 5   simple_processed_text  7613 non-null   object
 6   processed_text         7613 non-null   object
dtypes: int64(2), object(5)
memory usage: 416.5+ KB


In [405]:
#Remove duplicated tweets (some of them had different targets)

df_train = df_train[~df_train.duplicated(subset='text', keep=False)]

In [406]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7434 entries, 0 to 7612
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     7434 non-null   int64 
 1   keyword                7378 non-null   object
 2   location               4982 non-null   object
 3   text                   7434 non-null   object
 4   target                 7434 non-null   int64 
 5   simple_processed_text  7434 non-null   object
 6   processed_text         7434 non-null   object
dtypes: int64(2), object(5)
memory usage: 464.6+ KB


In [407]:
df_train.head()

Unnamed: 0,id,keyword,location,text,target,simple_processed_text,processed_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...,"[our, deeds, are, the, reason, of, this, earth..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask Canada,"[forest, fire, near, la, ronge, sask, canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to shelter in place are be...,"[all, residents, asked, to, shelter, in, place..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfires evacuation orde...,"[people, receive, wildfires, evacuation, order..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...,"[just, got, sent, this, photo, from, ruby, ala..."


## Tune the model

In [408]:
# Use the Tokenizer to find the vocabulary size of all the data

texts_list = df_train['processed_text'].tolist() #put all the processed tweets in a list

tokenizer = Tokenizer(oov_token="<OOV>") #create an instance of the Tokenizer class
tokenizer.fit_on_texts(texts_list) #fit the tokenizer on the processed tweets

vocab_size = len(tokenizer.word_index) + 1 #adding one for the padding token

print(f'Vocabulary size: {vocab_size}')

Vocabulary size: 16500


In [409]:
# Padding all the vectors to have the same dimension

sequences = tokenizer.texts_to_sequences(texts_list) #maps each word to an integer
max_len_list = [len(seq) for seq in sequences] #create a list with the dimensions of all the tweets
max_len = max(max_len_list) #find the tweet with most words

padded_encoded_text = pad_sequences(sequences, maxlen=max_len, padding='post') #all the vectors will have size 30 (max_len) with zeros at the end (pading)

In [410]:
# Create the Embedding Matrix

embedding_dim = 300 #word2vec google news has 300 domensions
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in tokenizer.word_index.items():
    if word in wv:
        embedding_matrix[i] = wv[word]
    else:
        embedding_matrix[i] = np.random.normal(size=(embedding_dim,)) #if the word isn't in the wv vocab, it will be initialized with random values

In [471]:
# Create the model

model = Sequential()
model.add(Embedding(input_dim=vocab_size,
                    output_dim=embedding_dim,
                    weights=[embedding_matrix],
                    trainable=False)) # We can test with different values for the output_dim
#model.add(Flatten())
#model.add(Conv1D(filters=128, kernel_size=3, activation='relu'))
model.add(Bidirectional(LSTM(32, return_sequences=True)))
model.add(Dropout(0.5))
#model.add(Conv1D(filters=64, kernel_size=5, activation='relu'))
#model.add(Dropout(0.5))
model.add(GlobalMaxPooling1D())
#model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.01)))
#model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

In [472]:
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [473]:
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [475]:
model.fit(padded_encoded_text, df_train['target'], epochs=10, callbacks=[early_stopping])

Epoch 1/10
[1m233/233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.8761 - loss: 0.3133
Epoch 2/10
[1m  9/233[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m3s[0m 14ms/step - accuracy: 0.9248 - loss: 0.2366

  current = self.get_monitor_value(logs)


[1m233/233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.9008 - loss: 0.2663
Epoch 3/10
[1m233/233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.9131 - loss: 0.2348
Epoch 4/10
[1m233/233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.9275 - loss: 0.1963
Epoch 5/10
[1m233/233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.9257 - loss: 0.1827
Epoch 6/10
[1m233/233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.9404 - loss: 0.1636
Epoch 7/10
[1m233/233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.9470 - loss: 0.1461
Epoch 8/10
[1m233/233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.9569 - loss: 0.1229
Epoch 9/10
[1m233/233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.9491 - loss: 0.1206
Epoch 10/10
[1m233/233[0m [32m━━━━━━━━━━

<keras.src.callbacks.history.History at 0x2347520b350>

In [476]:
def make_predictions(df, output='cnn_submission_file.csv'):

    ids = df['id'].values

    df['simple_processed_text'] = df['text'].apply(preprocess) #remove websites
    df['processed_text'] = df['simple_processed_text'].apply(gensim.utils.simple_preprocess) #preprocess using gensim function "simple_preprocess"

    tweets_list = df['processed_text'].tolist() #put all the processed tweets in a list

    sequences_tweets = tokenizer.texts_to_sequences(tweets_list) #maps each word to an integer
    padded_encoded_tweets = pad_sequences(sequences_tweets, maxlen= max_len, padding='post') #all the vectors will have size 25 with zeros at the end (pading)

    predictions = model.predict(padded_encoded_tweets) # make the predicitons
    predictions = (predictions.flatten() > 0.5).astype(int) #round the predictions to be 0 or 1 and ensure the 1D array format to be able to add to the dataframe

    df_predictions = pd.DataFrame({
        'id': ids,
        'target': predictions
    })

    df_predictions.to_csv(output, index=False)

    return df_predictions

In [477]:
make_predictions(df_test)

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step


Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1
