<a href="https://colab.research.google.com/github/ajaykumarmehra/Toxic-Comment-Detection-and-Classification/blob/main/LSTM_P_word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab VM; the dataset and embedding paths used by
# the later cells all live under this mount point.
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [None]:
! unzip "/content/gdrive/My Drive/dataset_toxic_comment/embeddings/7.zip" -d "/content/gdrive/My Drive/dataset_toxic_comment/embeddings"

Archive:  /content/gdrive/My Drive/dataset_toxic_comment/embeddings/7.zip
  inflating: /content/gdrive/My Drive/dataset_toxic_comment/embeddings/meta.json  
  inflating: /content/gdrive/My Drive/dataset_toxic_comment/embeddings/model.bin  
  inflating: /content/gdrive/My Drive/dataset_toxic_comment/embeddings/model.txt  
  inflating: /content/gdrive/My Drive/dataset_toxic_comment/embeddings/README  


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Dataset folder on the mounted Drive; each CSV is read into its own DataFrame.
path = '/content/gdrive/My Drive/dataset_toxic_comment/'
train, test, labels = (
    pd.read_csv(path + csv_name)
    for csv_name in ("train.csv", "test.csv", "test_labels.csv")
)

In [None]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [None]:
# Per-column flag telling whether any row has a missing value (train first, then test).
train.isnull().any(),test.isnull().any()

(id               False
 comment_text     False
 toxic            False
 severe_toxic     False
 obscene          False
 threat           False
 insult           False
 identity_hate    False
 dtype: bool, id              False
 comment_text    False
 dtype: bool)

In [None]:
# Target columns, in the order used by the label matrix.
classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
# Label matrix with one column per entry of `classes`.
y = train.loc[:, classes].values
# Raw comment strings for each split.
list_sentences_train, list_sentences_test = train["comment_text"], test["comment_text"]

In [None]:
from keras.preprocessing.text import Tokenizer

In [None]:
# Vocabulary cap handed to the tokenizer as `num_words`.
max_features = 20000
tokenizer = Tokenizer(num_words=max_features)
# The vocabulary is fitted on the training comments only.
tokenizer.fit_on_texts(list(list_sentences_train))
# Convert both splits to sequences of integer word ids.
list_tokenized_train, list_tokenized_test = (
    tokenizer.texts_to_sequences(comments)
    for comments in (list_sentences_train, list_sentences_test)
)

In [None]:
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Every id sequence is brought to this fixed length before it reaches the model.
maxlen = 200
X_t, X_te = (
    pad_sequences(sequences, maxlen=maxlen)
    for sequences in (list_tokenized_train, list_tokenized_test)
)

In [None]:
# Binary word2vec file extracted from the embeddings archive; read by loadEmbeddingMatrix().
wc_path = '/content/gdrive/My Drive/dataset_toxic_comment/embeddings/model.bin'

In [None]:
import gensim.models.keyedvectors as word2vec
import gc

In [None]:
def loadEmbeddingMatrix():
  """Build the initial weight matrix for the Keras ``Embedding`` layer.

  Loads the binary word2vec vectors stored at ``wc_path`` and, for every word in
  ``tokenizer.word_index`` that also has a pretrained vector, copies that vector
  into the row whose number equals the word's tokenizer index. Words without a
  pretrained vector keep a random draw from a normal distribution that has the
  mean and standard deviation of all pretrained values.

  Row layout: the Keras tokenizer numbers words starting at 1 and
  ``pad_sequences`` pads with 0, while an ``Embedding`` layer looks up row ``i``
  for token id ``i``. Row 0 is therefore the padding row (all zeros) and row
  ``i`` holds the vector of the word with index ``i``.

  Reads the module-level ``wc_path`` and ``tokenizer``; prints the number of
  pretrained vectors and the number of tokenizer words that were matched.

  Returns:
      numpy.ndarray of shape ``(len(tokenizer.word_index), vector_size)``. The
      row count is kept equal to ``len(tokenizer.word_index)`` so the matrix
      matches an ``Embedding`` layer built with that ``input_dim``; as a
      consequence the single word whose index equals that count has no row.
  """
  word_vectors = word2vec.KeyedVectors.load_word2vec_format(wc_path, binary=True)
  # gensim >= 4 exposes the vocabulary as `key_to_index`; older releases as `vocab`.
  vocab = getattr(word_vectors, "key_to_index", None)
  if vocab is None:
    vocab = word_vectors.vocab
  print('Loaded %s word vectors.' % len(vocab))

  # Statistics over every pretrained value, used to initialise unmatched rows.
  emb_mean, emb_std = word_vectors.vectors.mean(), word_vectors.vectors.std()
  embed_size = word_vectors.vector_size

  nb_words = len(tokenizer.word_index)
  embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
  if nb_words:
    # Token id 0 is padding only; give it a neutral all-zero vector.
    embedding_matrix[0] = 0.0

  embeddedCount = 0
  for word, i in tokenizer.word_index.items():
    # Row i must hold word i, because the Embedding layer indexes rows by token id.
    if i >= nb_words:
      continue
    if word in vocab:
      embedding_matrix[i] = word_vectors[word]
      embeddedCount += 1
  print('total embedded:',embeddedCount,'common words')

  # The pretrained vectors are large; release them before returning.
  del vocab
  del word_vectors
  gc.collect()

  return embedding_matrix

In [None]:
# Load the pretrained vectors and build the matrix that initialises the Embedding layer.
embedding_matrix = loadEmbeddingMatrix()

  


Loaded 273930 word vectors.
total embedded: 43645 common words


In [None]:
# Sanity check: (number of rows, embedding dimension).
embedding_matrix.shape

(210337, 300)

In [None]:
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D,Bidirectional
from keras.models import Model

In [None]:
# Model input: one sequence of `maxlen` token ids per sample.
inp = Input(shape=(maxlen, ))

In [None]:
# Frozen embedding lookup initialised from the pretrained matrix.
frozen_embedding = Embedding(
    len(tokenizer.word_index),
    embedding_matrix.shape[1],
    weights=[embedding_matrix],
    trainable=False,
)
x = frozen_embedding(inp)
# Bidirectional LSTM that returns its hidden state at every time step.
recurrent_cell = LSTM(
    60,
    return_sequences=True,
    name='lstm_layer',
    dropout=0.1,
    recurrent_dropout=0.1,
)
x = Bidirectional(recurrent_cell)(x)



In [None]:
# Classification head: max-pool over time, then a small dense network that ends
# in six sigmoid units.
head_layers = [
    GlobalMaxPool1D(),
    Dropout(0.1),
    Dense(50, activation="relu"),
    Dropout(0.1),
    Dense(6, activation="sigmoid"),
]
for head_layer in head_layers:
    x = head_layer(x)

In [None]:
import keras.metrics as metrics

In [None]:
# Wire the input tensor to the sigmoid head and configure training.
model = Model(inputs=inp, outputs=x)
tracked_metrics = [metrics.MeanSquaredError(), metrics.AUC()]
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=tracked_metrics)

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 200)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 200, 300)          63101100  
_________________________________________________________________
bidirectional (Bidirectional (None, 200, 120)          173280    
_________________________________________________________________
global_max_pooling1d (Global (None, 120)               0         
_________________________________________________________________
dropout (Dropout)            (None, 120)               0         
_________________________________________________________________
dense (Dense)                (None, 50)                6050      
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0     

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the padded training sequences for validation; the fixed
# random_state keeps the split the same across runs.
X_train, X_val, y_train, y_val = train_test_split(X_t,y,test_size=0.2,random_state=1) 

In [None]:
batch_size = 64
epochs = 4
# Pass the held-out split so each epoch also records validation loss/metrics in
# `hist`; validation data is only evaluated, never used to update the weights.
hist = model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [None]:
# Sigmoid scores of the trained model for the training split.
preds_train = model.predict(X_train)

In [None]:
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score

# ROC AUC of the training predictions against the training labels.
print(roc_auc_score(y_train, preds_train))

0.9862511922035293


In [None]:
# Score the held-out validation split and report its ROC AUC.
preds_val = model.predict(X_val)
val_auc = roc_auc_score(y_val, preds_val)
print(val_auc)

0.9740101626006782


In [None]:
# Keep only the label columns, then keep the test rows whose label sum is not
# negative (presumably unscored rows are marked with -1 -- verify against test_labels.csv).
labels = labels[classes]
sum_labels = labels.values.sum(axis=1)
idx = sum_labels >= 0
y_test, X_test = labels[idx], X_te[idx]

In [None]:
# Score the filtered test rows and report their ROC AUC.
preds_test = model.predict(X_test)
test_auc = roc_auc_score(y_test, preds_test)
print(test_auc)

0.9664648743249146


In [None]:
import os

import tensorflow as tf
from tensorflow import keras

# Record the TensorFlow version this notebook ran with.
print(tf.version.VERSION)

2.4.1


In [None]:
# Save the entire model to Drive as a TensorFlow SavedModel directory.
model.save('/content/gdrive/My Drive/dataset_toxic_comment/saved_model/LSTM_word2vec') 

INFO:tensorflow:Assets written to: /content/gdrive/My Drive/dataset_toxic_comment/saved_model/LSTM_word2vec/assets


In [None]:
import keras
# Reload the SavedModel from the directory it was written to.
saved_model_dir = '/content/gdrive/My Drive/dataset_toxic_comment/saved_model/LSTM_word2vec'
new_model = keras.models.load_model(saved_model_dir)

# The summary should list the same layers as the model that was saved.
new_model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 200)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 200, 300)          63101100  
_________________________________________________________________
bidirectional (Bidirectional (None, 200, 120)          173280    
_________________________________________________________________
global_max_pooling1d (Global (None, 120)               0         
_________________________________________________________________
dropout (Dropout)            (None, 120)               0         
_________________________________________________________________
dense (Dense)                (None, 50)                6050      
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0     

In [None]:
# Same filtering as before the first test evaluation: restrict to the label
# columns and keep the rows whose label sum is non-negative.
labels = labels.loc[:, classes]
sum_labels = np.sum(labels.to_numpy(), axis=1)
idx = sum_labels >= 0
X_test = X_te[idx]
y_test = labels[idx]

In [None]:
# Score the filtered test sequences with the reloaded model.
preds_test = new_model.predict(X_test)

In [None]:
from sklearn.metrics import roc_auc_score, hamming_loss, accuracy_score, log_loss
import numpy as np
# Work on plain arrays: y_test is a DataFrame, and float64 probabilities keep
# the clipping inside log_loss from rounding back to exactly 1.0.
y_true = np.asarray(y_test)
y_prob = np.asarray(preds_test, dtype=np.float64)
y_hat = y_prob > 0.5  # hard 0/1 decisions at the 0.5 threshold

print("ROC AUC Score: ", roc_auc_score(y_true, y_prob))
print("Hamming Loss: ", hamming_loss(y_true, y_hat))
# log_loss treats a 2-D prediction as one multiclass distribution per row and
# renormalises it to sum to 1, which is wrong for six independent sigmoid
# outputs. Score every label as its own binary problem and average instead.
per_label_log_loss = [
    log_loss(y_true[:, col], y_prob[:, col], labels=[0, 1])
    for col in range(y_true.shape[1])
]
print("Log Loss: ", np.mean(per_label_log_loss))
# With multilabel targets this is subset accuracy: a row counts only when all
# six labels are predicted correctly.
print("Accuracy Score: ", accuracy_score(y_true, y_hat))

0.9664648743249146
ROC AUC Score:  0.9664648743249146
Hamming Loss:  0.02778819802640491
Log Loss:  0.2897836747210841
Accuracy Score:  0.8898058707680765


In [None]:
# from sklearn.metrics import multilabel_confusion_matrix
# preds_test = (preds_test > 0.5) 
# df = pd.DataFrame(preds_test, columns = ['toxic','severe_toxic','obscene', 'threat', 'insult', 'identity_hate'])
# matrix = multilabel_confusion_matrix(y_test, df)
# print(matrix)

The cells above save the trained model, load it back, and re-evaluate the reloaded model on the test set.