# **Git-Hub URL**

In [None]:
# https://github.com/aamemara/nlp-disaster-tweets.git

#**Imports**

In [38]:
from google.colab import drive
import pandas as pd
import numpy as np
import os
import string
from tensorflow import keras
from tensorflow.keras import layers, models
from tensorflow.keras.layers import SpatialDropout1D, Embedding, BatchNormalization
from tensorflow.keras.layers import Dense, Dropout, LSTM, Activation
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

#**EDA**

In [2]:
# Mount drive to load files
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
drive_path = "drive/MyDrive/Colab Notebooks/University of Colorado Boulder:  Introduction to Deep Learning/"
print(os.listdir(drive_path+"nlp-getting-started"))
df_train_val = pd.read_csv(drive_path+"nlp-getting-started/train.csv")
df_X_train_val = df_train_val.drop(columns=['target'])
df_y_train_val = df_train_val[['id', 'target']]
df_X_test = pd.read_csv(drive_path+"nlp-getting-started/test.csv")
df_y_test = pd.read_csv(drive_path+"nlp-getting-started/sample_submission.csv")
print(df_X_train_val.info(),df_y_train_val.info(),df_X_test.info(),df_y_test.info())
# Get uniques classes
labels=df_train_val['target'].unique()
print(labels)

['train.csv', 'sample_submission.csv', 'test.csv']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
dtypes: int64(1), object(3)
memory usage: 238.0+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   id      7613 non-null   int64
 1   target  7613 non-null   int64
dtypes: int64(2)
memory usage: 119.1 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158

In [40]:
df_X_train_val['num_words'] = df_X_train_val['text'].apply(lambda x: len(x.split()))
# Identify outliers based on chosen criterion
Q1 = df_X_train_val['num_words'].quantile(0.25)
Q3 = df_X_train_val['num_words'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
nltk.download('stopwords')
nltk.download('punkt')
def clean_text(text, max_words=int(upper_bound)):
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Convert to lowercase
    text = text.lower()
    # Remove extra whitespaces
    text = ' '.join(text.split())
    # Tokenize the text into words
    words = word_tokenize(text)
    # Get the list of English stop words
    stop_words = set(stopwords.words('english'))
    # Filter out stop words
    filtered_words = [word for word in words if word.lower() not in stop_words]
    # Join the filtered words back into a sentence
    text = ' '.join(filtered_words)
    # Limit the number of words
    text = ' '.join(text.split()[:max_words])
    return text

# Clean train and test data
df_X_train_val['clean text']=df_X_train_val['text'].apply(clean_text)
df_X_test['clean text']=df_X_test['text'].apply(clean_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [41]:
X_train, X_val, y_train, y_val = train_test_split(df_X_train_val['clean text'], df_y_train_val['target'], test_size=0.2, random_state=42)

In [42]:
# Tokenize the text data to compute vocabulary size and maximum sequence length
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_X_train_val['clean text'])
X_train = tokenizer.texts_to_sequences(X_train)
X_val = tokenizer.texts_to_sequences(X_val)
X_train = pad_sequences(X_train, maxlen=int(upper_bound), padding='post')
X_val = pad_sequences(X_val, maxlen=int(upper_bound), padding='post')
vocab_size = len(tokenizer.word_index) + 1  # Add 1 for padding token
embedding_dim = 100

In [52]:
X_test = tokenizer.texts_to_sequences(df_X_test['clean text'])
X_test = pad_sequences(X_test, maxlen=int(upper_bound), padding='post')

#**Model building and training**

In [46]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=int(upper_bound)))
#model.add(SpatialDropout1D(0.5))
model.add(LSTM(128, dropout=0.5, recurrent_dropout=0.5))
model.add(BatchNormalization())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.summary()
model.compile(Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics='accuracy')

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 31, 100)           2256400   
                                                                 
 lstm_3 (LSTM)               (None, 128)               117248    
                                                                 
 batch_normalization_3 (Bat  (None, 128)               512       
 chNormalization)                                                
                                                                 
 dense_6 (Dense)             (None, 64)                8256      
                                                                 
 dropout_3 (Dropout)         (None, 64)                0         
                                                                 
 dense_7 (Dense)             (None, 1)                 65        
                                                      

In [51]:
model.fit(X_train, y_train, epochs = 10, verbose=1, validation_data=(X_val, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x786020bb1a80>

#**Results**

In [53]:
y_pred_proba = model.predict(X_test)
# Convert probabilities to binary class labels (0 or 1) based on a threshold
y_pred = (y_pred_proba > 0.5).astype(int)
df_y_test['target'] = y_pred
df_y_test.to_csv(drive_path+"nlp-getting-started/my_sample_submission.csv", index=False)

