In [1]:
import nltk
import string, re
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
!pip install tensorflow pandas matplotlib scikit-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
df = pd.read_csv('/content/drive/MyDrive/IR_Assignment/Project/train.csv')

In [6]:
#normalization
def lower_case(lines):
  lines=lines.lower()
  return lines

#remove punctuations
def remove_punctuation(text):
  text="".join([char for char in text if char not in string.punctuation])
  text=re.sub('[0-9]+', '', text)
  return text

#removing stopwords
def remove_stpwords(text):
  text = [word for word in text if word not in stopwords]
  return text

#creating tokens
def tokenization(text):
  text=text.strip()
  text = re.split('\W+', text)
  return text

#porter stemming
def stemming(text):
  text = [nltk.PorterStemmer().stem(word) for word in text]
  return text

#lemmatization
def lemmatizer(text):
  text = [nltk.WordNetLemmatizer().lemmatize(word) for word in text]
  return text

In [7]:
#Cleaning the document and removing punctuation,stopwords,lowering the case,tokenizing, stemming and performing lemmatization
def cleaning_doc(lines):
  text = remove_punctuation(lines)
  text = lower_case(text)
  text_tockens_created = tokenization(text)
  text_tockens_created = remove_stpwords(text_tockens_created)
  text_tockens_created = stemming(text_tockens_created)
  text_tockens_created = lemmatizer(text_tockens_created)
  return text_tockens_created

In [8]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [9]:
#Preprocessing

In [10]:
from tensorflow.keras.layers import TextVectorization

In [11]:
x = df['comment_text']

y = df[df.columns[2:]].values

In [12]:
import sys
print(sys.getrecursionlimit()) # Prints 1000

1000


In [13]:
print(x)

0         Explanation\nWhy the edits made under my usern...
1         D'aww! He matches this background colour I'm s...
2         Hey man, I'm really not trying to edit war. It...
3         "\nMore\nI can't make any real suggestions on ...
4         You, sir, are my hero. Any chance you remember...
                                ...                        
159566    ":::::And for the second time of asking, when ...
159567    You should be ashamed of yourself \n\nThat is ...
159568    Spitzer \n\nUmm, theres no actual article for ...
159569    And it looks like it was actually you who put ...
159570    "\nAnd ... I really don't think you understand...
Name: comment_text, Length: 159571, dtype: object


In [14]:
print(y)

[[0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 ...
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]]


In [15]:
MAX_FEATURES=200000 #number of words in the vocab

In [16]:
vectorizer = TextVectorization(max_tokens=MAX_FEATURES, output_sequence_length = 1800, output_mode='int')

In [17]:
vectorizer.adapt(x.values) #trained vectorizer to learn our vocab

In [18]:
vectorizer("Hello world life is amazing")

<tf.Tensor: shape=(1800,), dtype=int64, numpy=array([288, 263, 306, ...,   0,   0,   0])>

In [19]:
vectorized_text = vectorizer(x.values) #tokenized every single word in the vocab

In [20]:
vectorized_text

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  645,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2489, ...,     0,     0,     0],
       [  425,   441,    70, ...,     0,     0,     0],
       ...,
       [32445,  7392,   383, ...,     0,     0,     0],
       [    5,    12,   534, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]])>

In [21]:
#creating a tensorflow data pipeline
#data pipeline steps - MCSHABAP Map, Cache, SHuffle, BAtch, Prefetch
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text,y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16) # divided into batches
dataset = dataset.prefetch(8) # helps prevent bottlenecks

In [22]:
batch_x, batch_y = dataset.as_numpy_iterator().next()
#batch represented as text + labels

In [23]:
print(len(dataset)) #this many number of batches now
print(len(dataset)*16) #actual number
#idea of shape
print(batch_x.shape)
print(batch_y.shape)

9974
159584
(16, 1800)
(16, 6)


In [24]:
train = dataset.take(int(len(dataset)*0.7)) #"take" that partition out as train data for training. Taking 70% data for train
val = dataset.skip(int(len(dataset)*0.7)).take(int(len(dataset)*0.2))
test = dataset.skip(int(len(dataset)*0.9)).take(int(len(dataset)*0.1))

In [25]:
print(len(train))
print(len(val))
print(len(test))

6981
1994
997


CREATE SEQUENTIAL MODEL

In [26]:
#embedding layer groups similar words; creates word vectors and checks the angles between them
#Personality test for the word: Words having similar scores are grouped e.g. great and good

In [27]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [28]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, Conv1D, GlobalMaxPooling1D, concatenate

MAX_FEATURES = 10000
EMBEDDING_DIM = 100

model = Sequential()

# Embedding layer
model.add(Embedding(input_dim=MAX_FEATURES+1, output_dim=EMBEDDING_DIM))

# Bidirectional LSTM layer
model.add(Bidirectional(LSTM(units=128, activation='tanh', return_sequences=True)))
model.add(Dropout(0.2))

# Convolutional layer
model.add(Conv1D(filters=64, kernel_size=5, padding='same', activation='relu'))
model.add(GlobalMaxPooling1D())

# Dense layers
model.add(Dense(units=128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(units=64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(units=32, activation='relu'))
model.add(Dropout(0.2))

# Output layer
model.add(Dense(units=6, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

print(model.summary())


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         1000100   
                                                                 
 bidirectional (Bidirectiona  (None, None, 256)        234496    
 l)                                                              
                                                                 
 dropout (Dropout)           (None, None, 256)         0         
                                                                 
 conv1d (Conv1D)             (None, None, 64)          81984     
                                                                 
 global_max_pooling1d (Globa  (None, 64)               0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 128)               8

In [29]:
'''
model = Sequential()
#creating the embedding layer
model.add(Embedding(MAX_FEATURES+1,32))
#bidirectional: i dont like you (like read last; has + connotation) vs other direcion reading: you like dont i - connotation
#Bidirectional LSTM layer
model.add(Bidirectional(LSTM(32,activation='tanh')))
#feature extractor fully connected layers
model.add(Dense(128,activation='relu'))
model.add(Dense(256,activation='relu'))
model.add(Dense(128,activation='relu'))
#final layer
model.add(Dense(6,activation='sigmoid'))'''

"\nmodel = Sequential()\n#creating the embedding layer\nmodel.add(Embedding(MAX_FEATURES+1,32))\n#bidirectional: i dont like you (like read last; has + connotation) vs other direcion reading: you like dont i - connotation\n#Bidirectional LSTM layer\nmodel.add(Bidirectional(LSTM(32,activation='tanh')))\n#feature extractor fully connected layers\nmodel.add(Dense(128,activation='relu'))\nmodel.add(Dense(256,activation='relu'))\nmodel.add(Dense(128,activation='relu'))\n#final layer\nmodel.add(Dense(6,activation='sigmoid'))"

In [30]:
#model.compile(loss='BinaryCrossentropy', optimizer='Adam')

In [31]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         1000100   
                                                                 
 bidirectional (Bidirectiona  (None, None, 256)        234496    
 l)                                                              
                                                                 
 dropout (Dropout)           (None, None, 256)         0         
                                                                 
 conv1d (Conv1D)             (None, None, 64)          81984     
                                                                 
 global_max_pooling1d (Globa  (None, 64)               0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 128)               8

In [None]:
history = model.fit(train,epochs=5,validation_data=val)

Epoch 1/5
Epoch 2/5
 843/6981 [==>...........................] - ETA: 13:00 - loss: 0.0515 - accuracy: 0.9944

In [None]:
history.history

In [None]:
import matplotlib.pyplot as plt

# Get the training and validation loss and accuracy from the history dictionary
loss = history.history['loss']
val_loss = history.history['val_loss']
accuracy = history.history['accuracy']
val_accuracy = history.history['val_accuracy']

# Create a figure with two subplots
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))

# Plot the training and validation loss on the first subplot
ax1.plot(loss, label='Training Loss')
# ax1.plot(val_loss, label='Validation Loss')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Loss')
ax1.set_title('Training Loss')
ax1.legend()

# Plot the training and validation accuracy on the second subplot
ax2.plot(accuracy, label='Training Accuracy')
# ax2.plot(val_accuracy, label='Validation Accuracy')
ax2.set_xlabel('Epoch')
ax2.set_ylabel('Accuracy')
ax2.set_title('Training Accuracy')
ax2.legend()

# Display the plot
plt.show()

In [None]:
batch_x, batch_y = test.as_numpy_iterator().next()

In [None]:
df.columns[2:]

In [None]:
(model.predict(batch_x) > 0.5).astype(int)

In [None]:
#Evaluation of model

In [None]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [None]:
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [None]:
for batch in test.as_numpy_iterator():
  #unpack the batch
  x_true, y_true = batch
  #make a prediction
  y_that = model.predict(x_true)

  #flatten predictions
  y_true = y_true.flatten()
  y_that = y_that.flatten()

  pre.update_state(y_true, y_that)
  re.update_state(y_true, y_that)
  acc.update_state(y_true, y_that)

In [None]:
print(f'Precision: {pre.result().numpy()}, Recall: {re.result().numpy()}, Accuracy: {acc.result().numpy()}')

In [None]:
pres = pre.result().numpy()
reca = re.result().numpy()

In [None]:
f1 = 2*pres*reca/(pres+reca)

In [None]:
f1

In [None]:
#APP

In [None]:
!pip install gradio jinja2

In [None]:
import tensorflow as tf
import gradio as gr

In [None]:
model.save('toxicity.h5')

In [None]:
model = tf.keras.models.load_model('toxicity.h5')

In [None]:
def score_comment(comment):
  vectorized_comment = vectorizer([comment])
  results = model.predict(vectorized_comment)

  text = ''
  for idx, col in enumerate(df.columns[2:-1]):
    text += '{}: {}\n'.format(col, results[0][idx]>0.5)

  return text

In [None]:
interface = gr.Interface(fn=score_comment, inputs = gr.inputs.Textbox(lines=2, placeholder = 'Comment to score'), outputs = 'text')

In [None]:
interface.launch(share=True)