# NLP (seq2seq)

Dataset : https://kaggle.com/c/nlp-getting-started

In [1]:
! pip install kaggle



In [2]:
! mkdir ~/.kaggle

In [3]:
! cp kaggle.json ~/.kaggle/

In [4]:
! chmod 600 ~/.kaggle/kaggle.json

In [5]:
! kaggle competitions download nlp-getting-started

Downloading nlp-getting-started.zip to /content
100% 593k/593k [00:00<00:00, 1.02MB/s]
100% 593k/593k [00:00<00:00, 1.02MB/s]


In [6]:
! unzip /content/nlp-getting-started.zip

Archive:  /content/nlp-getting-started.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

In [8]:
data = pd.read_csv("/content/train.csv")

In [9]:
data.head(3)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1


In [10]:
df = data.sample(frac=1).reset_index(drop=True)

In [11]:
df.head(4)

Unnamed: 0,id,keyword,location,text,target
0,5822,hail,,@Flow397 Coming atcha from Boston. Had golfbal...,0
1,1346,blown%20up,The Grey Area,On #ThisDayInHistory in 1862 Confederate ship ...,1
2,983,blazing,"Intramuros, Manila",Come and join us Tomorrow!\nAugust 7 2015 at T...,0
3,8570,screams,,IS THE UPDATE RLY LIFE NOW IS IT IS It/Screams...,0


In [12]:
len(data['text'])

7613

In [13]:
X = df['text'].to_numpy()
y = df['target'].to_numpy()

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=101, test_size=0.2)

## Converting text to numeric form
>Method 1 : Tokenization

In [15]:
# tf.keras.layers.TextVectorization(
#     max_tokens=None, # maximum  cap of number of words in vocab, for example if 100, only most common 100 words will be in our vocab, if the word is not one of those 100 it will be represented as <OOV>
#     standardize='lower_and_strip_punctuation', # set all letters to lower case and only keep texts
#     split='whitespace', # split sequences by white spaces in text
#     ngrams=None, # groups words together, for example if 2, it will groups each two words together
#     output_mode='int',
#     output_sequence_length=None, # something like batch in images, if None it will feed the nn by length of longest text (for shorter text it will just put zeros at remaining spaces)
#     pad_to_max_tokens=True,
# )

In [16]:
MAX_VOCAB = 10000 # only 10,000 most common words
MAX_LENGTH = 20 # set maximum length 20

In [17]:
text_vec = tf.keras.layers.TextVectorization(
    max_tokens=MAX_VOCAB,
    standardize='lower_and_strip_punctuation',
    split='whitespace',
    ngrams=None,
    output_mode='int',
    output_sequence_length=MAX_LENGTH,
    pad_to_max_tokens=True,
)

In [18]:
text_vec.adapt(X_train) # training our text vectrozier on data

In [19]:
text_vec('hello world')

<tf.Tensor: shape=(20,), dtype=int64, numpy=
array([1388,   96,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0])>

In [20]:
vocab_words = text_vec.get_vocabulary()

In [21]:
vocab_words[:10] #top ten words

['', '[UNK]', 'the', 'a', 'in', 'to', 'of', 'and', 'i', 'is']

[UNK] : a words outside of our vocab

## Method 2 : Embedding

In [22]:
embedding_layer = tf.keras.layers.Embedding(
    input_dim=MAX_VOCAB, # vocab size just like Textvectorization
    output_dim=64, # size of vector (better to be divisible by 8)
    input_length=MAX_LENGTH # max length of each sequeance
)

In [23]:
embedding_layer(text_vec('Hello world'))

<tf.Tensor: shape=(20, 64), dtype=float32, numpy=
array([[-0.04146231, -0.03949057,  0.02949816, ..., -0.03194463,
        -0.02187135,  0.0457492 ],
       [-0.01423581,  0.00380057,  0.0340062 , ..., -0.00754116,
        -0.03079898,  0.00986496],
       [-0.00197541, -0.02249515,  0.04336886, ..., -0.02428993,
        -0.00752236, -0.04270637],
       ...,
       [-0.00197541, -0.02249515,  0.04336886, ..., -0.02428993,
        -0.00752236, -0.04270637],
       [-0.00197541, -0.02249515,  0.04336886, ..., -0.02428993,
        -0.00752236, -0.04270637],
       [-0.00197541, -0.02249515,  0.04336886, ..., -0.02428993,
        -0.00752236, -0.04270637]], dtype=float32)>

## Model 0 (ML): Naive Bayes
> We'll use a scikit learn (Machine learning) model for our baseline model

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# creating tokenization and modeling pipeline

model_0 = Pipeline([ #this pipline from sklearn is something like Sequential in Tensorflow
    ('tfidf', TfidfVectorizer()), # converts texts to numbers
    ('classifier', MultinomialNB()) # our model
])

# fitting our model the data
model_0.fit(X_train, y_train)

In [25]:
model_0_score = model_0.score(X_test, y_test)
model_0_score

0.8082731451083388

## Model 1 (FFN): Simple Neural Network

In [26]:
inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string) # shape is (1,) because it's going to see one sequence at time
x = text_vec(inputs) # this layer converts our text to numeric values
x = embedding_layer(x) # now passing the numeric values to our embedding layer
x = tf.keras.layers.GlobalAveragePooling1D()(x) # we need to put this layer to match shapes between output layer and the embedding layer
outputs = tf.keras.layers.Dense(units=1, activation=tf.keras.activations.sigmoid) (x) #output layer
model_1 = tf.keras.Model(inputs, outputs)

In [27]:
model_1.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

In [28]:
model_1.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7ad52255f7f0>

In [29]:
model_1_score = model_1.evaluate(X_test, y_test)
model_1_score



[0.4377887547016144, 0.8056467771530151]

## Model 2 (RNN) : Long Short Term Memory (LSTM)

In [30]:
inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vec(inputs)
x = embedding_layer(x)

# LSTM layer with return_sequences=True:
# We set return_sequences=True here because we want to return the full sequence of hidden states
# rather than just the final hidden state. This is important when stacking multiple LSTM layers
# as we need to pass the entire sequence of hidden states to the next LSTM layer.
# in other word this will keep that extra dimension
x = tf.keras.layers.LSTM(units=64, return_sequences=True)(x)

# Second LSTM layer without return_sequences:
# We don't need to return the full sequence from this layer,
# as we're only interested in the final output for classification.
# in other word this will drop that extra dimension like Global average pooling
x = tf.keras.layers.LSTM(units=64)(x)

x = tf.keras.layers.Dense(units=64, activation='relu')(x)
outputs = tf.keras.layers.Dense(units=1, activation=tf.keras.activations.sigmoid)(x)

model_2 = tf.keras.Model(inputs, outputs)

In [31]:
model_2.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

In [32]:
model_2.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [33]:
model_2_score = model_2.evaluate(X_test, y_test)
model_2_score



[0.8263959884643555, 0.7866053581237793]

# Model 3 (RNN) : GRU (Gated Recurrent Unit)

In [34]:
inputs = tf.keras.layers.Input(shape=(1,), dtype='string')
x = text_vec(inputs)
x = embedding_layer(x)
x = tf.keras.layers.GRU(units=64, return_sequences=True)(x)
x = tf.keras.layers.GRU(units=64)(x) # can put a LSTM here instead of GRU
x = tf.keras.layers.Dense(units=128, activation='relu')(x)
outputs = tf.keras.layers.Dense(units=1, activation='sigmoid')(x)

model_3 = tf.keras.Model(inputs, outputs)

In [35]:
model_3.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

In [36]:
model_3.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7ad52141a770>

In [37]:
model_3_score = model_3.evaluate(X_test, y_test)
model_3_score



[1.1598756313323975, 0.7806959748268127]

## Model 4 (RNN) : Bidirectional RNN

In [38]:
inputs = tf.keras.layers.Input(shape=(1,), dtype='string')
x = text_vec(inputs)
x = embedding_layer(x)
x = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(units=64, return_sequences=True))(x) # we pass a layer to bidirectional layer
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=64))(x)
outputs = tf.keras.layers.Dense(units=1, activation='sigmoid')(x)

model_4 = tf.keras.Model(inputs, outputs)

In [39]:
model_4.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

In [40]:
model_4.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7ad512f2e650>

In [41]:
model_4_score = model_4.evaluate(X_test, y_test)
model_4_score



[1.500759243965149, 0.7721602320671082]

## Model 5 (1D CNN) : Using 1D CNN for sequence

In [42]:
inputs = tf.keras.layers.Input(shape=(1,), dtype='string')
x = text_vec(inputs)
x = embedding_layer(x)
x = tf.keras.layers.Conv1D(filters=64, kernel_size=3, strides=1, activation='relu', padding='same')(x)
x = tf.keras.layers.GlobalMaxPool1D()(x)
outputs = tf.keras.layers.Dense(units=1, activation='sigmoid')(x)

model_5 = tf.keras.Model(inputs, outputs)

In [43]:
model_5.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

In [44]:
model_5.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7ad5126c9930>

In [45]:
model_5_score = model_5.evaluate(X_test, y_test)
model_5_score



[1.193436622619629, 0.7518056631088257]

## Model 6 : Transfer learning (Feature Extraction)
> USE Model : Universal Sentence Encoder

In [46]:
import tensorflow_hub as hub

In [47]:
use_layer = hub.KerasLayer("https://www.kaggle.com/models/google/universal-sentence-encoder/TensorFlow2/universal-sentence-encoder/2",
                           input_shape=[], # we leave it empty because it can be variable length
                           dtype=tf.string,
                           trainable=False)

In [48]:
model_6 = tf.keras.Sequential([
    use_layer,
    tf.keras.layers.Dense(units=64, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid')
], name='Transfer_learning_Model')

In [49]:
model_6.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

In [50]:
model_6.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7ad5126c9a80>

In [51]:
model_6_score = model_6.evaluate(X_test, y_test)
model_6_score



[0.4073680341243744, 0.818122148513794]

In [52]:
model_6_score[1]

0.818122148513794

In [53]:
model_scores = {
    'ML Model': model_0_score,
    'ANN Model': model_1_score,
    'LSTM Model': model_2_score,
    'GRU Model': model_3_score,
    'Bidirectional Model': model_4_score,
    '1D CNN Model': model_5_score,
    'Transfer Learning Model': model_6_score
}

df_scores = pd.DataFrame(model_scores)
df_scores = df_scores.iloc[1]
df_scores =  pd.DataFrame(df_scores).T
df_scores

Unnamed: 0,ML Model,ANN Model,LSTM Model,GRU Model,Bidirectional Model,1D CNN Model,Transfer Learning Model
1,0.808273,0.805647,0.786605,0.780696,0.77216,0.751806,0.818122
