# Bidirectional LSTM

In [4]:
import pandas as pd

In [5]:
df = pd.read_csv('train.csv', sep=';',engine='python', on_bad_lines='skip')

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,Palestinians switch off Christmas lights in Be...,"RAMALLAH, West Bank (Reuters) - Palestinians s...",1
1,1,China says Trump call with Taiwan president wo...,BEIJING (Reuters) - U.S. President-elect Donal...,1
2,2,FAIL! The Trump Organization’s Credit Score W...,While the controversy over Trump s personal ta...,0
3,3,Zimbabwe military chief's China trip was norma...,BEIJING (Reuters) - A trip to Beijing last wee...,1
4,4,THE MOST UNCOURAGEOUS PRESIDENT EVER Receives ...,There has never been a more UNCOURAGEOUS perso...,0


In [4]:
df.shape

(24353, 4)

In [5]:
df.isnull().sum()

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

In [6]:
# Indepedent feature
X = df.drop('label', axis=1)

In [7]:
# Depedent feature
y = df['label']

In [8]:
X.shape, y.shape

((24353, 3), (24353,))

In [9]:
y.value_counts()

label
1    13246
0    11107
Name: count, dtype: int64

In [10]:
import tensorflow as tf

2025-05-12 09:51:38.437802: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [11]:
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [12]:
# Vocabulary Size
voc_size = 10000

In [13]:
messages = X.copy()
messages

Unnamed: 0.1,Unnamed: 0,title,text
0,0,Palestinians switch off Christmas lights in Be...,"RAMALLAH, West Bank (Reuters) - Palestinians s..."
1,1,China says Trump call with Taiwan president wo...,BEIJING (Reuters) - U.S. President-elect Donal...
2,2,FAIL! The Trump Organization’s Credit Score W...,While the controversy over Trump s personal ta...
3,3,Zimbabwe military chief's China trip was norma...,BEIJING (Reuters) - A trip to Beijing last wee...
4,4,THE MOST UNCOURAGEOUS PRESIDENT EVER Receives ...,There has never been a more UNCOURAGEOUS perso...
...,...,...,...
24348,24348,Mexico Senate committee OK's air transport dea...,MEXICO CITY (Reuters) - A key committee in Mex...
24349,24349,BREAKING: HILLARY CLINTON’S STATE DEPARTMENT G...,IF SHE S NOT TOAST NOW THEN WE RE IN BIGGER TR...
24350,24350,trump breaks from stump speech to admire beaut...,kremlin nato was created for agression \nruss...
24351,24351,NFL PLAYER Delivers Courageous Message: Stop B...,Dallas Cowboys star wide receiver Dez Bryant t...


In [14]:
import nltk
from nltk.corpus import stopwords
import re

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/studio-lab-
[nltk_data]     user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
# Data Preprocessing
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [16]:
corpus = []

for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', '  ', messages['title'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [17]:
corpus[1]

'china say trump call taiwan presid chang island statu'

In [18]:
# One Hot Representation

onehot_repr = [one_hot(word,voc_size) for word in corpus]
onehot_repr[1]

[8533, 7592, 6614, 2676, 6473, 4175, 1336, 609, 7711]

In [19]:
# Checking Sent length in title column

from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /home/studio-lab-
[nltk_data]     user/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [20]:
length = [len(word) for word in corpus]

max_index = length.index(max(length))

print(f"Length of longest index : {length[max_index]}")
print(f"Longest Sentence : {corpus[max_index]}")

Length of longest index : 298
Longest Sentence : roger stone updat stop steal exit poll program globalist establish creat fals expect close race use fake poll rig vote machin meet expect nationalist constitut freemarket awaken rothschild soro hillari chanc steal us elect shrink day whole system rig expos american peopl fed elitist deviant panick


In [21]:
# Embedding Representation (Padding)
sent_length = 300
embedded_docs = pad_sequences(onehot_repr, padding='pre', maxlen=sent_length)
embedded_docs[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [22]:
# Creating Model
embedding_vector_feature = 300  # Dimensions
model = Sequential()
model.add(Embedding(voc_size, embedding_vector_feature))
model.add(Bidirectional(LSTM(100)))
model.add(Dense(1, activation='sigmoid'))
model.build(input_shape=(None, sent_length))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [23]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 300)         3000000   
                                                                 
 bidirectional (Bidirection  (None, 200)               320800    
 al)                                                             
                                                                 
 dense (Dense)               (None, 1)                 201       
                                                                 
Total params: 3321001 (12.67 MB)
Trainable params: 3321001 (12.67 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [24]:
import numpy as np

X_final = np.array(embedded_docs)
y_final = np.array(y)

In [27]:
X_final.shape , y_final.shape

((24353, 300), (24353,))

In [28]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size = 0.33, random_state =42)

In [29]:
# Model training

model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size = 64)

Epoch 1/10

KeyboardInterrupt: 