<a href="https://colab.research.google.com/github/altair08/FYP/blob/main/LSTM_with_and_without_subword.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [62]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

In [91]:
# Step 1: Import the required libraries
from google.colab import drive
drive.mount('/content/drive')

# Step 2: Load and preprocess the dataset
dataset = pd.read_csv('/content/drive/MyDrive/Dats/Kaggle/pre_data_train.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [92]:
dataset

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,preprocessed_text
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation edit make username hardcore metall...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,daww match background colour im seemingly stic...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,hey man im really try edit war guy constantly ...
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,cant make real suggestion improvement wonder s...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,sir hero chance remember page thats
...,...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0,second time ask view completely contradict cov...
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0,ashamed horrible thing put talk page
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0,spitzer umm there actual article prostitution ...
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0,look like actually put speedy first version de...


In [93]:
dataset['preprocessed_text'] = dataset['preprocessed_text'].astype(str)  # Convert comment_text to string
comments = dataset['preprocessed_text'].values
labels = dataset[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values

X_train, X_test, y_train, y_test = train_test_split(comments, labels, test_size=0.2, random_state=42)

# LSTM without subword embeddings

In [94]:
# Step 3: Tokenize the text data
max_words = 20000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
max_seq_length = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=max_seq_length)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_seq_length)

In [95]:
# Step 4: Create the LSTM model
embedding_dim = 100
lstm_units = 50
vocab_size = len(tokenizer.word_index) + 1
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_seq_length))
model.add(LSTM(lstm_units))
model.add(Dropout(0.1))
model.add(Dense(50, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(6, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [96]:
print(model.summary())

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 100, 100)          17427200  
                                                                 
 lstm_7 (LSTM)               (None, 50)                30200     
                                                                 
 dropout_3 (Dropout)         (None, 50)                0         
                                                                 
 dense_9 (Dense)             (None, 50)                2550      
                                                                 
 dropout_4 (Dropout)         (None, 50)                0         
                                                                 
 dense_10 (Dense)            (None, 6)                 306       
                                                                 
Total params: 17,460,256
Trainable params: 17,460,256


In [97]:
# Step 5: Train the LSTM model
batch_size = 32
epochs = 1
model.fit(X_train_pad, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_test_pad, y_test))



<keras.callbacks.History at 0x7f87f43d2350>

In [98]:
model.save('/content/drive/MyDrive/LSTM_without_subword')



In [99]:
# Display test data in dataset format
test_data = pd.DataFrame({'preprocessed_text': X_test, 'label': y_test.tolist()})
test_data.to_csv('/content/drive/MyDrive/predict_lstm_without_subword.csv', index=False)

In [100]:
test_data

Unnamed: 0,preprocessed_text,label
0,geez forgetful weve already discus marx anarch...,"[0, 0, 0, 0, 0, 0]"
1,carioca rfa thank support request adminship fi...,"[0, 0, 0, 0, 0, 0]"
2,birthday worry enjoy ur daytalke,"[0, 0, 0, 0, 0, 0]"
3,pseudoscience category im assume article pseud...,"[0, 0, 0, 0, 0, 0]"
4,phrase exist would provide search engine even ...,"[0, 0, 0, 0, 0, 0]"
...,...,...
31910,would claim part ignorant majority,"[0, 0, 0, 0, 0, 0]"
31911,lyric find german version assume usable,"[0, 0, 0, 0, 0, 0]"
31912,encyclopedia titanica reference source origina...,"[0, 0, 0, 0, 0, 0]"
31913,silly fat cow wont leave alone,"[1, 0, 1, 0, 1, 0]"


In [103]:
# Step 6: Make predictions and evaluate the model
y_pred = model.predict(X_test_pad)
y_pred = (y_pred > 0.8).astype(int)
f1_without_subword = f1_score(y_test, y_pred, average='micro')*100
accuracy_without_subword = accuracy_score(y_test, y_pred)*100

# Print the F1 score and accuracy
print("F1 score (without subword embeddings):", f1_without_subword)
print("Accuracy (without subword embeddings):", accuracy_without_subword)

F1 score (without subword embeddings): 53.372732830650094
Accuracy (without subword embeddings): 90.90709697634341


# LSTM with byte pair encoding (BPE)

In [13]:
!pip install bpemb




In [14]:
from bpemb import BPEmb

In [15]:
# Step 4: Create the BytePair Encoding tokenizer
tokenizer = BPEmb(lang="en", vs=10000, dim=100)

In [16]:
# Step 5: Tokenize the text data using BPEmb
X_train_tokens = tokenizer.encode(X_train)
X_test_tokens = tokenizer.encode(X_test)

In [17]:
# Step 6: Pad the tokenized sequences
max_seq_length = max(len(seq) for seq in X_train_seq)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_seq_length, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_seq_length, padding='post')

In [19]:
# Step 7: Create the LSTM model with subword embeddings
embedding_dim = 100
lstm_units = 64
vocab_size = tokenizer.vocab_size
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_seq_length))
model.add(LSTM(lstm_units))
model.add(Dense(6, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [20]:
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 1250, 100)         1000000   
                                                                 
 lstm_1 (LSTM)               (None, 64)                42240     
                                                                 
 dense_1 (Dense)             (None, 6)                 390       
                                                                 
Total params: 1,042,630
Trainable params: 1,042,630
Non-trainable params: 0
_________________________________________________________________
None


In [21]:
# Step 8: Train the LSTM model
batch_size = 32
epochs = 1
model.fit(X_train_pad, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_test_pad, y_test))



<keras.callbacks.History at 0x7f87b05ade40>

In [22]:
model.save('/content/drive/MyDrive/LSTM_with_BPE')



In [54]:
# Display test data in dataset format
test_data = pd.DataFrame({'preprocessed_text': X_test, 'label': y_test.tolist()})
test_data.to_csv('/content/drive/MyDrive/predict_lstm_with_BPE.csv', index=False)

In [24]:
test_data

Unnamed: 0,preprocessed_text,label
0,geez forgetful weve already discus marx anarch...,"[0, 0, 0, 0, 0, 0]"
1,carioca rfa thank support request adminship fi...,"[0, 0, 0, 0, 0, 0]"
2,birthday worry enjoy ur daytalke,"[0, 0, 0, 0, 0, 0]"
3,pseudoscience category im assume article pseud...,"[0, 0, 0, 0, 0, 0]"
4,phrase exist would provide search engine even ...,"[0, 0, 0, 0, 0, 0]"
...,...,...
31910,would claim part ignorant majority,"[0, 0, 0, 0, 0, 0]"
31911,lyric find german version assume usable,"[0, 0, 0, 0, 0, 0]"
31912,encyclopedia titanica reference source origina...,"[0, 0, 0, 0, 0, 0]"
31913,silly fat cow wont leave alone,"[1, 0, 1, 0, 1, 0]"


In [28]:
# Step 6: Make predictions and evaluate the model
y_pred = model.predict(X_test_pad)
y_pred = (y_pred > 0.5).astype(int)
f1_with_subword = f1_score(y_test, y_pred, average='micro')*100
accuracy_with_subword = accuracy_score(y_test, y_pred)*100

# Print the F1 score and accuracy
print("F1 score (with BPE):", f1_with_subword)
print("Accuracy (with BPE):", accuracy_with_subword)


F1 score (with subword embeddings): 0.0
Accuracy (with subword embeddings): 89.83550054833151


## LSTM using Bidirectional

In [85]:
from tensorflow.keras.layers import Bidirectional

In [86]:
# Step 3: Tokenize the text data
max_words = 10000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
max_seq_length = max(len(seq) for seq in X_train_seq)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_seq_length)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_seq_length)

In [87]:
# Step 4: Create the LSTM model with bidirectional layer
embedding_dim = 100
lstm_units = 64
vocab_size = len(tokenizer.word_index) + 1
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_seq_length))
model.add(Bidirectional(LSTM(lstm_units)))
model.add(Dense(6, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [88]:
# Step 7: Train the LSTM model
batch_size = 32
epochs = 1
model.fit(X_train_pad, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_test_pad, y_test))

model.save('/content/drive/MyDrive/LSTM_with_bidirectional')





In [89]:
# Display test data in dataset format
test_data = pd.DataFrame({'preprocessed_text': X_test, 'label': y_test.tolist()})
test_data.to_csv('/content/drive/MyDrive/predict_lstm_with_bidirectional.csv', index=False)

In [90]:
# Step 8: Make predictions and evaluate the model
y_pred = model.predict(X_test_pad)
y_pred = (y_pred > 0.5).astype(int)
f1_with_subword = f1_score(y_test, y_pred, average='micro')*100
accuracy_with_subword = accuracy_score(y_test, y_pred)*100

# Print the F1 score and accuracy
print("F1 score (with bidirectional):", f1_with_subword)
print("Accuracy (with bidirectionals):", accuracy_with_subword)

F1 score (with bidirectional): 69.45895836844767
Accuracy (with bidirectionals): 91.8188939370202
