In [5]:
import os 
import pandas as pd 
import numpy as np 
import tensorflow as tf 
import re
import string


Taking a good look at the dataset and converting it into a dataframe

In [6]:
df = pd.read_csv('jigsaw-toxic-comment-classification-challenge/train.csv/train.csv')
df.head(50)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


Let's look at how many datapoints we have

In [7]:
df.shape

(159571, 8)

Taking a look at the columns of the dataframe

In [8]:
df.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

Let's look at the comments for a bit

In [9]:
df['comment_text']

0         Explanation\nWhy the edits made under my usern...
1         D'aww! He matches this background colour I'm s...
2         Hey man, I'm really not trying to edit war. It...
3         "\nMore\nI can't make any real suggestions on ...
4         You, sir, are my hero. Any chance you remember...
                                ...                        
159566    ":::::And for the second time of asking, when ...
159567    You should be ashamed of yourself \n\nThat is ...
159568    Spitzer \n\nUmm, theres no actual article for ...
159569    And it looks like it was actually you who put ...
159570    "\nAnd ... I really don't think you understand...
Name: comment_text, Length: 159571, dtype: object

Cleaning the data or the comment by using custom made function that uses regex ( regular expressions ) to remove punctuations and separating words for the model to understand better. 

In [12]:
def  clean_text(text):
    text =  text.lower()
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"r", "", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"'ll", " will", text)
    text = re.sub(r"'ve", " have", text)
    text = re.sub(r"'re", " are", text)
    text = re.sub(r"'d", " would", text)
    text = re.sub(r"'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"[-()#/@;:{}`+=~|.!?,]", "", text)
    text = text.translate(str.maketrans('', '', string.punctuation)) 
    text = re.sub("(\W)"," ",text) 
    text = re.sub('S*dS*s*','', text)
    return text
df["comment_text"] = df['comment_text'].apply(lambda text: clean_text(text))

Looking at the transformed comments

In [13]:
df['comment_text']

0         explanation why the eits mae une my usename ha...
1         aww he matches this backgoun colou i am seemin...
2         hey man i am eally not tying to eit wa it is j...
3          moe i cannot make any eal suggestions on impo...
4         you si ae my heo any chance you emembe what pa...
                                ...                        
159566    an fo the secon time of asking when you view c...
159567    you shoul be ashame of youself   that is a hoi...
159568    spitze   umm thees no actual aticle fo postitu...
159569    an it looks like it was actually you who put o...
159570     an  i eally o not think you unestan  i came h...
Name: comment_text, Length: 159571, dtype: object

Let's remove the stopwords 

In [14]:
def stopwords_removal(comment):
    stop_words = [
    'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', 'as', 'at',
    'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by',
    'can', 'could',
    'did', 'do', 'does', 'doing', 'down', 'during',
    'each',
    'few', 'for', 'from', 'further',
    'had', 'has', 'have', 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how',
    'i', 'if', 'in', 'into', 'is', 'it', 'its', 'itself',
    'just',
    'me', 'more', 'most', 'my', 'myself',
    'no', 'nor', 'not', 'now',
    'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours', 'ourselves', 'out', 'over', 'own',
    'same', 'she', 'should', 'so', 'some', 'such',
    'than', 'that', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'these', 'they', 'this',
    'those', 'through', 'to', 'too',
    'under', 'until', 'up', 'us',
    'very',
    'was', 'we', 'were', 'what', 'when', 'where', 'which', 'while', 'who', 'whom', 'why', 'will', 'with',
    'you', 'your', 'yours', 'yourself', 'yourselves','much'
    ]
    filtered = []
    for word in comment.split():
        if word.lower() not in stop_words:
            filtered.append(word)
    filtered_comment = ' '.join(filtered)
    return filtered_comment 

In [15]:
df["comment_text"] = df['comment_text'].apply(stopwords_removal)

After applying the stopwords_removal function let's look at our final comment text and assign it as x 

In [16]:
df['comment_text']

0         explanation eits mae une usename hacoe metalli...
1         aww matches backgoun colou seemingly stuck tha...
2         hey man eally tying eit wa guy constantly emov...
3         moe cannot make eal suggestions impovement won...
4                              si ae heo chance emembe page
                                ...                        
159566    fo secon time asking view completely contaicts...
159567    shoul ashame youself hoible thing put talk pag...
159568    spitze umm thees actual aticle fo postitution ...
159569    looks like actually put speey fist vesion elet...
159570    eally o think unestan came hee iea ba ight awa...
Name: comment_text, Length: 159571, dtype: object

In [17]:
from tensorflow.keras.layers import TextVectorization
x = df['comment_text']
y = df[df.columns[2:]].values

In [18]:
max_features = 200000 #vocabulary
vectorizer = TextVectorization(max_tokens=max_features,
                              output_sequence_length=1800,
                              output_mode='int')
vectorizer.adapt(x.values)





In [12]:
vectorized = vectorizer(x.values)

In [13]:
vectorized

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  645,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2489, ...,     0,     0,     0],
       [  425,   441,    70, ...,     0,     0,     0],
       ...,
       [32445,  7392,   383, ...,     0,     0,     0],
       [    5,    12,   534, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]], dtype=int64)>

In [14]:
#mcshbap - map,cache,shuffle, batch,prefetch
dataset = tf.data.Dataset.from_tensor_slices((vectorized,y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)

In [15]:
dataset.as_numpy_iterator().next()
#text in vectorized format and the labels ( the zeroes and ones )

(array([[    70,     15,   4242, ...,      0,      0,      0],
        [167280, 107443, 161662, ...,      0,      0,      0],
        [   242,   1967,      0, ...,      0,      0,      0],
        ...,
        [   124,      7,     13, ...,      0,      0,      0],
        [     7,     55,    105, ...,      0,      0,      0],
        [  1955,  24645,      8, ...,      0,      0,      0]], dtype=int64),
 array([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 1, 1, 0, 1, 0],
        [1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]], dtype=int64))

In [16]:
#lets save them in variables 
batch_x, batch_y = dataset.as_numpy_iterator().next()
print("Shape of batch_x : ",batch_x.shape)
print("Shape of batch_y : ",batch_y.shape)

Shape of batch_x :  (16, 1800)
Shape of batch_y :  (16, 6)


Splitting

In [17]:
#lets look at the length of the dataset
print(len(dataset))
print("Training dataset length :",int(len(dataset)*.7))

9974
Training dataset length : 6981


In [18]:
train = dataset.take(int(len(dataset)*.7))
valid = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

Progressively stepping through the batches

In [19]:
train_generator = train.as_numpy_iterator()
train_generator.next()

(array([[   51,    37,     7, ...,     0,     0,     0],
        [    3, 45685, 26501, ...,     0,     0,     0],
        [   49,   548,  2437, ...,     0,     0,     0],
        ...,
        [   41,    20,     4, ...,     0,     0,     0],
        [  151,    12,  1082, ...,     0,     0,     0],
        [   54,   201,    17, ...,     0,     0,     0]], dtype=int64),
 array([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 1],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]], dtype=int64))

creating model

In [20]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dropout,Bidirectional,Embedding,Dense

We will first create an embedding layer. +1 is because of unknown word. 

In [21]:
model = Sequential()
model.add(Embedding(max_features+1,32))
model.add(Bidirectional(LSTM(32,activation='tanh')))
model.add(Dense(128,activation='relu'))
model.add(Dense(256,activation='relu'))
model.add(Dense(128,activation='relu'))
model.add(Dense(6,activation='sigmoid'))

In [22]:
model.compile(loss='BinaryCrossentropy',optimizer='Adam')
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirection  (None, 64)                16640     
 al)                                                             
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                       

In [22]:
historical_model = model.fit(train,epochs=5,validation_data=valid)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [4]:
import matplotlib.pyplot as plt 
plt.figure(figsize=(10,8))
pd.DataFrame(historical_model.history).plot()
plt.show()

In [24]:
batch = test.as_numpy_iterator().next()
input_text = vectorizer('I fucking hate you')
model.predict(np.array([input_text]))



array([[0.9992241 , 0.56923914, 0.9911504 , 0.02571411, 0.8058593 ,
        0.07306851]], dtype=float32)

In [25]:
np.expand_dims(input_text,0)

array([[  8, 382, 363, ...,   0,   0,   0]], dtype=int64)

In [26]:
result = model.predict(np.expand_dims(input_text,0))



In [27]:

batch_x, batch_y = test.as_numpy_iterator().next()
preds = model.predict(batch_x)




In [28]:
preds.shape

(16, 6)

In [29]:
from tensorflow.keras.metrics import Precision,Recall,CategoricalAccuracy
precision = Precision()
recall = Recall()
cat_accuracy = CategoricalAccuracy() 

In [30]:
for batch in test.as_numpy_iterator():
    x_true,y_true = batch
    y_cap = model.predict(x_true)
    y_true = y_true.flatten()
    y_cap = y_cap.flatten()
    
    precision.update_state(y_true,y_cap)
    recall.update_state(y_true,y_cap)
    cat_accuracy.update_state(y_true,y_cap)
    
print(f'Precision:{precision.result().numpy()},Recall : {recall.result().numpy()},Categorical Accuracy : {cat_accuracy.result().numpy()}')

Precision:0.8615431785583496,Recall : 0.8226557970046997,Categorical Accuracy : 0.5045135617256165


In [42]:
model.save('toxic_comment_classifier.h5')

  saving_api.save_model(


In [23]:
model = tf.keras.models.load_model('toxic_comment_classifier.h5')

In [31]:
def score_comment(comment):
    vectorized_comment = vectorizer([comment])
    results = model.predict(vectorized_comment)
    text = ''
    for idx,col in enumerate(df.columns[2:-1]):
        text += '{}:{}\n'.format(col,results[0][idx] > 0.5)
    return text

In [37]:
import streamlit as st
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from joblib import load

In [39]:
df.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [38]:
def score_comment(comment):
    vectorized_comment = vectorizer([comment])
    results = model.predict(vectorized_comment)
    text = ''
    for idx,col in enumerate(df.columns[2:]):
        text += '{}:{}\n'.format(col,results[0][idx]>0.5)
    return text
def main():
    st.title("Comment Scoring App")
    
    comment = st.text_area("Enter your comment:", placeholder="Type your comment here...")
    
    if st.button("Score Comment"):
        results_text = score_comment(comment)
        st.text(results_text)

if __name__ == "__main__":
    main()

2024-01-18 23:53:12.752 
  command:

    streamlit run C:\Users\Aniket\AppData\Roaming\Python\Python311\site-packages\ipykernel_launcher.py [ARGUMENTS]


AttributeError: module 'gradio' has no attribute 'inputs'