<a href="https://colab.research.google.com/github/altair08/FYP/blob/main/LSTM_without_subword.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [41]:
from google.colab import drive
# Attach Google Drive so the '/content/drive/...' paths used below resolve.
mount_point = '/content/drive'
drive.mount(mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [42]:
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Dropout
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras.callbacks import EarlyStopping
import pandas as pd
import numpy as np

In [43]:
# Load the preprocessed training CSV from the mounted Drive.
raw_dataset = pd.read_csv('/content/drive/MyDrive/Dats/Kaggle/pre_data_train.csv')

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation.
# - The raw frame keeps its own name so re-running this cell never splits an
#   already-split frame (previously `dataset` was overwritten by its own split).
# - A fixed seed makes the split, and every number derived from it, repeatable.
SPLIT_SEED = 42
dataset, test = train_test_split(raw_dataset, test_size=0.2, random_state=SPLIT_SEED)


In [44]:
# Count missing values per column of the training split.
dataset.isna().sum()

id                    0
comment_text          0
toxic                 0
severe_toxic          0
obscene               0
threat                0
insult                0
identity_hate         0
preprocessed_text    42
dtype: int64

In [45]:
# Label columns, in the order used for the target matrix.
y_list = ["toxic","severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Training texts and the matching multi-label targets as NumPy arrays.
sentiment = dataset.loc[:, 'preprocessed_text'].to_numpy()
y = dataset.loc[:, y_list].to_numpy()
sentiment

array(['fictional character category please stop remove character specifically refer lord lady categoryfictional lord lady delete perfectly relevant category without discussion whatsoever go fictional duke duchess edit blatant vandalism really quite tiresome go around revert im important figure wikipedia good thing time furthermore kindly refrain nominate every single fictional character category deletion read essay write fictional character category find quite frankly stupifying vagueness leave comment talk page youre free take look',
       'please revert hello mion think confuse regard last edit bioconversion biomass mix alcohol fuel please check link notice link basically simply dr holtzapples presentation already cite article reference present version rv dr holtzapple simply speak slide show notice already video clip dr holtzapple give presentation reference present version clip present lot thing whereas one add concentrate bioconversion biomass mix alcohol fuel exactly make think

In [46]:
# Texts of the held-out split, as a NumPy array.
predict_data = test.loc[:, 'preprocessed_text'].to_numpy()

In [47]:
# Display the held-out texts (NumPy abbreviates the middle of long arrays with "...").
predict_data

array(['well know journalist valid source cnn huffington post etc',
       'hello see list categorywikipedians germany make subcategories bundesland case would like add appropriate one see categorywikipedians germany list subcategories use english name wikipedia article tk',
       'pure tripe steal bio official website outdated way thats bad wiki practice',
       ...,
       'httpwwwmediamonitorsnetednahtml httpwwwfreerepubliccomforumaadbdhtm',
       'dyk adrian thomas dyk scene crime dyk mississippi baby tmbox type notice image text july know update fact article mississippi baby recently create substantially expand fact hiv virus reappear mississippi baby think cure welcome check many page hit article get front page live view daily total may add statistic page total finally know interest fact another recently create article please feel free sugges',
       'oh sorry think true address e one'], dtype=object)

In [48]:
# Sequence settings shared by the training and held-out texts.
MAX_WORDS = 20000  # vocabulary cap passed to the tokenizer
MAX_LEN = 100      # length every encoded sequence is padded/truncated to

# Missing texts arrive as float NaN. Casting those straight to str would turn
# them into the literal word "nan" (and teach the tokenizer that word), so map
# them to empty strings instead.
sentiment = np.array(["" if pd.isna(text) else str(text) for text in sentiment], dtype=str)
predict_data = np.array(["" if pd.isna(text) else str(text) for text in predict_data], dtype=str)

# Fit the vocabulary on the training texts only, then encode both splits with it.
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(list(sentiment))
seq = tokenizer.texts_to_sequences(sentiment)
pad = pad_sequences(seq, maxlen=MAX_LEN)
test_seq = tokenizer.texts_to_sequences(predict_data)
test_pad = pad_sequences(test_seq, maxlen=MAX_LEN)

In [49]:
from keras.layers import Reshape

# Define the model architecture without subword embeddings
def model_without_subword_embedding(max_len=100, num_labels=6, vocab_size=None, embedding_dim=128):
    """Build and compile the LSTM multi-label classifier.

    Called with no arguments this builds exactly the original network: the
    padded token-id sequence is reshaped to ``(max_len, 1)``, so the LSTM
    receives each raw integer id as a single numeric feature per timestep.
    NOTE(review): ids carry no ordinal meaning, so this representation is a
    very weak input signal; pass ``vocab_size`` to learn word vectors instead.

    Args:
        max_len: Length of each padded input sequence.
        num_labels: Number of independent sigmoid outputs.
        vocab_size: If given, a trainable ``Embedding(vocab_size, embedding_dim)``
            layer replaces the reshape. Must exceed the largest token id that
            appears in the inputs. ``None`` keeps the original reshape path.
        embedding_dim: Width of the word vectors; only used with ``vocab_size``.

    Returns:
        A compiled Keras ``Model`` with input shape ``(max_len,)`` and output
        shape ``(num_labels,)``, using binary cross-entropy and Adam.
    """
    inputs = Input(shape=(max_len,))
    if vocab_size is None:
        # Original behaviour: one scalar feature (the token id) per timestep.
        x = Reshape((max_len, 1))(inputs)
    else:
        # Local import keeps this block self-contained; the notebook's import
        # cell does not bring Embedding into scope.
        from keras.layers import Embedding
        x = Embedding(vocab_size, embedding_dim)(inputs)
    x = LSTM(50)(x)
    x = Dropout(0.1)(x)
    x = Dense(50, activation="relu")(x)
    x = Dropout(0.1)(x)
    # Sigmoid (not softmax): each label is predicted independently.
    outputs = Dense(num_labels, activation="sigmoid")(x)
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Build one instance purely to inspect the architecture. summary() prints the
# table itself and returns None, so wrapping it in print() adds a stray "None".
model = model_without_subword_embedding()
model.summary()

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 100)]             0         
                                                                 
 reshape_4 (Reshape)         (None, 100, 1)            0         
                                                                 
 lstm_4 (LSTM)               (None, 50)                10400     
                                                                 
 dropout_8 (Dropout)         (None, 50)                0         
                                                                 
 dense_8 (Dense)             (None, 50)                2550      
                                                                 
 dropout_9 (Dropout)         (None, 50)                0         
                                                                 
 dense_9 (Dense)             (None, 6)                 306 

In [50]:
# Train the model without subword embeddings
EPOCHS = 4
model_without_subword = model_without_subword_embedding()
# Patience has to be smaller than the epoch budget or the callback can never
# fire (it was 20 against 4 epochs). When it does stop training, roll back to
# the weights from the epoch with the lowest validation loss.
early = EarlyStopping(monitor="val_loss", mode="min", patience=2, restore_best_weights=True)
# Keep the History object so the per-epoch losses can be inspected later.
history = model_without_subword.fit(pad, y, batch_size=32, epochs=EPOCHS, validation_split=0.1, callbacks=[early])


Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7fd7f3c0f100>

In [51]:
# Persist the trained network. `model` is the separate, never-fitted instance
# built only for summary(), so saving it would write untrained weights.
model_without_subword.save('/content/drive/MyDrive/LSTM_without_subword')



In [61]:
# Make predictions on the test data without subword embeddings
y_prob_without_subword = model_without_subword.predict(test_pad, batch_size=1024, verbose=1)

# Binarize into a new array rather than overwriting the probabilities in place,
# so the threshold can be changed and this step re-run without predicting again.
threshold = 0.18  # Adjust this threshold as needed
y_test_without_subword = (y_prob_without_subword >= threshold).astype(int)

# One 0/1 column per label, in y_list order.
predict_labels_df_without_subword = pd.DataFrame(y_test_without_subword, columns=y_list)

# The held-out input texts (the column is named "Title" in the exported file).
predict_data_df = pd.DataFrame(predict_data, columns=["Title"])

# Both frames carry a fresh 0..n-1 index, so a column-wise concat lines the
# texts up with their predicted labels row by row.
predict_df_without_subword = pd.concat([predict_data_df, predict_labels_df_without_subword], axis=1)

# Save the DataFrame to a CSV file
predict_df_without_subword.to_csv('/content/drive/MyDrive/predict_lstm_without_subword.csv', index=False)





In [62]:
# Show the texts alongside their predicted labels (pandas elides the middle rows).
predict_df_without_subword

Unnamed: 0,Title,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,well know journalist valid source cnn huffingt...,0,0,0,0,0,0
1,hello see list categorywikipedians germany mak...,0,0,0,0,0,0
2,pure tripe steal bio official website outdated...,0,0,0,0,0,0
3,horse erect penis,1,0,0,0,0,0
4,auto archive noticebotlowercase sigmabot iiiag...,1,0,0,0,0,0
...,...,...,...,...,...,...,...
31910,update reformat ive do lot rework article incl...,0,0,0,0,0,0
31911,hi finish read message say anything two thing ...,0,0,0,0,0,0
31912,httpwwwmediamonitorsnetednahtml httpwwwfreerep...,0,0,0,0,0,0
31913,dyk adrian thomas dyk scene crime dyk mississi...,0,0,0,0,0,0


In [63]:
from sklearn.metrics import accuracy_score, f1_score, hamming_loss
import numpy as np

# Ground-truth labels for the held-out split, in the same column order as the
# predictions.
y_test_true = test[y_list].values.astype(int)

# The prediction threshold belongs to the model's probabilities, not to the
# labels: running the labels through it would silently corrupt them for any
# threshold outside (0, 1]. Check instead that they are the 0/1 indicators
# the metrics below expect.
assert np.isin(y_test_true, (0, 1)).all(), "labels are expected to be 0/1 indicators"

# Calculate accuracy and F1 score for the model without subword embeddings.
# For multi-label input, accuracy_score is subset accuracy: a row only counts
# as correct when all six labels match, so rows with no positive labels
# dominate it. Micro-F1 is the more informative number here.
accuracy_without_subword = accuracy_score(y_test_true, y_test_without_subword)*100
f1_score_without_subword = f1_score(y_test_true, y_test_without_subword, average='micro')*100

# Print accuracy and F1 score
print("Accuracy (without subword embeddings):", accuracy_without_subword)
print("F1 Score (without subword embeddings):", f1_score_without_subword)


Accuracy (without subword embeddings): 80.9650634497885
F1 Score (without subword embeddings): 13.92671113954126
