<a href="https://colab.research.google.com/github/altair08/FYP/blob/main/LSTM_with_and_without_subword.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

In [None]:
# Step 1: Make Google Drive visible to this Colab runtime
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Step 2: Read the pre-processed training CSV from the mounted Drive
TRAIN_CSV_PATH = '/content/drive/MyDrive/Dats/Kaggle/pre_data_train.csv'
dataset = pd.read_csv(TRAIN_CSV_PATH)

Mounted at /content/drive


In [None]:
# Preview the loaded DataFrame; as the cell's last expression it is rendered by the notebook.
dataset

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,preprocessed_text
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation edit make username hardcore metall...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,daww match background colour im seemingly stic...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,hey man im really try edit war guy constantly ...
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,cant make real suggestion improvement wonder s...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,sir hero chance remember page thats
...,...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0,second time ask view completely contradict cov...
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0,ashamed horrible thing put talk page
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0,spitzer umm there actual article prostitution ...
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0,look like actually put speedy first version de...


In [None]:
# Build the model inputs and targets from the loaded frame, then hold out a test split.
# Missing values in 'preprocessed_text' (NaN) would become the literal word "nan" under
# astype(str) alone and be tokenized as a real token; replace them with an empty string
# first so such rows simply produce an empty sequence.
dataset['preprocessed_text'] = dataset['preprocessed_text'].fillna('').astype(str)
comments = dataset['preprocessed_text'].values

# Six binary target columns per comment (multi-label classification)
labels = dataset[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values

# 80/20 train/test split; the fixed random_state keeps the split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(comments, labels, test_size=0.2, random_state=42)

# LSTM without subword embeddings

In [None]:
# Step 3: Turn each comment into a fixed-length sequence of integer word ids
max_words = 10000  # vocabulary cap handed to the tokenizer
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)  # the vocabulary is fitted on the training split only

X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Pad both splits to the length of the longest training sequence
max_seq_length = max(map(len, X_train_seq))
X_train_pad, X_test_pad = (
    pad_sequences(seqs, maxlen=max_seq_length) for seqs in (X_train_seq, X_test_seq)
)

In [None]:
# Step 4: Create the LSTM model
embedding_dim = 100
lstm_units = 64

# The tokenizer was built with num_words=max_words, so it never emits an id >= max_words.
# Sizing the embedding by the full word_index allocates rows that can never be looked up;
# cap the table at max_words (or at the real vocabulary size if that is smaller).
vocab_size = min(max_words, len(tokenizer.word_index) + 1)

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_seq_length))
model.add(LSTM(lstm_units))
# One sigmoid unit per target column: each label gets its own independent probability
model.add(Dense(6, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# summary() prints the layer table itself and returns None, so wrapping it in print()
# only appends a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1250, 100)         17427200  
                                                                 
 lstm (LSTM)                 (None, 64)                42240     
                                                                 
 dense (Dense)               (None, 6)                 390       
                                                                 
Total params: 17,469,830
Trainable params: 17,469,830
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Step 5: Fit the LSTM on the padded training sequences
# NOTE(review): the held-out test split is also passed as the validation data here.
batch_size = 32
epochs = 1
model.fit(
    X_train_pad,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test_pad, y_test),
)

 114/3990 [..............................] - ETA: 1:40:30 - loss: 0.2106 - accuracy: 0.6976

KeyboardInterrupt: ignored

In [None]:
# Persist the current model to Google Drive (requires the Drive mount from the load cell).
model.save('/content/drive/MyDrive/LSTM_without_subword')

In [None]:
# Collect the held-out texts and their ground-truth label vectors in one frame and save it.
# The file name matches this section (LSTM without subword embeddings).
test_data = pd.DataFrame({'preprocessed_text': X_test, 'label': y_test.tolist()})
test_data.to_csv('/content/drive/MyDrive/predict_lstm_without_subword.csv', index=False)

In [None]:
# Show the frame of held-out texts and labels built in the previous cell.
test_data

In [None]:
# Step 6: Make predictions and evaluate the model
y_pred = model.predict(X_test_pad)
y_pred = (y_pred > 0.5).astype(int)  # threshold the sigmoid outputs into 0/1 labels
f1_without_subword = f1_score(y_test, y_pred, average='micro')
# NOTE(review): for multi-label targets sklearn's accuracy_score is subset accuracy
# (a row counts only if all six labels match) - confirm this is the intended metric.
accuracy_without_subword = accuracy_score(y_test, y_pred)

# Print the F1 score and accuracy as percentages.
# The scaling has to be applied to the metric itself: print() returns None, so
# multiplying the result of the call by 100 raises a TypeError.
print("F1 score (without subword embeddings):", f1_without_subword * 100)
print("Accuracy (without subword embeddings):", accuracy_without_subword * 100)

# LSTM with subword embeddings

In [None]:
# Step 3: Turn each comment into a fixed-length sequence of integer word ids
# NOTE(review): this section is titled "with subword embeddings", but the tokenizer here
# is the same word-level Keras Tokenizer as before; only max_seq_length differs - confirm.
max_words = 10000  # vocabulary cap handed to the tokenizer
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)  # the vocabulary is fitted on the training split only

X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Pad or truncate both splits to a fixed length of 100 ids
max_seq_length = 100
X_train_pad, X_test_pad = (
    pad_sequences(seqs, maxlen=max_seq_length) for seqs in (X_train_seq, X_test_seq)
)

In [None]:
# Step 4: Assemble and compile the LSTM classifier
embedding_dim = 100
lstm_units = 64
model = Sequential([
    # Embedding table sized by the tokenizer's vocabulary cap
    Embedding(max_words, embedding_dim, input_length=max_seq_length),
    LSTM(lstm_units),
    # One sigmoid unit per target column
    Dense(6, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# summary() prints the layer table itself and returns None, so wrapping it in print()
# only appends a stray "None" line to the output.
model.summary()

In [None]:
# Step 5: Fit the LSTM on the padded training sequences
# NOTE(review): the held-out test split is also passed as the validation data here.
batch_size = 32
epochs = 1
model.fit(
    X_train_pad,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test_pad, y_test),
)

In [None]:
# Persist the current model to Google Drive (requires the Drive mount from the load cell).
model.save('/content/drive/MyDrive/LSTM_with_subword')

In [None]:
# Collect the held-out texts and their ground-truth label vectors in one frame and save it.
# The file name matches this section (LSTM with subword embeddings).
test_data = pd.DataFrame({'preprocessed_text': X_test, 'label': y_test.tolist()})
test_data.to_csv('/content/drive/MyDrive/predict_lstm_with_subword.csv', index=False)

In [None]:
# Show the frame of held-out texts and labels built in the previous cell.
test_data

In [None]:
# Step 6: Make predictions and evaluate the model
y_pred = model.predict(X_test_pad)
y_pred = (y_pred > 0.5).astype(int)  # threshold the sigmoid outputs into 0/1 labels
f1_with_subword = f1_score(y_test, y_pred, average='micro')
# NOTE(review): for multi-label targets sklearn's accuracy_score is subset accuracy
# (a row counts only if all six labels match) - confirm this is the intended metric.
accuracy_with_subword = accuracy_score(y_test, y_pred)

# Print the F1 score and accuracy as percentages.
# The scaling has to be applied to the metric itself: print() returns None, so
# multiplying the result of the call by 100 raises a TypeError.
print("F1 score (with subword embeddings):", f1_with_subword * 100)
print("Accuracy (with subword embeddings):", accuracy_with_subword * 100)
