# Natural Language Processing project on toxic comments data

## Import Libraries

In [1]:
import pandas as pd   #For data frames, reading data, data processing, and analysis
import numpy as np   #For numerical computations
import string   # For string operations and constants.
import re   # Used for regular expression matching and operations.
import nltk   # Import Natural language Toolkit
from nltk.corpus import stopwords   # Import a list of common stopwords
from nltk.tokenize import word_tokenize   # For tokenization
from nltk.stem import SnowballStemmer   # For stemming
from sklearn.feature_extraction.text import CountVectorizer     # convert collection of text into matrix of token counts
from gensim.models import Word2Vec      # For word embeddings
from sklearn.feature_extraction.text import TfidfVectorizer     # For word embeddings
from sklearn.model_selection import train_test_split     # For splitting data into train and test
from sklearn.metrics import classification_report       # For model evaluation
from keras.models import Model      # Import Keras Model class
from keras.layers import Input, Dense       # For model architecture
from transformers import BertTokenizer, TFBertModel     # Import BERT modules
import tensorflow as tf         # Import Tensorflow
from keras.utils import to_categorical       # For one-hot encoding

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Download NLTK tokenizer data
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Download NLTK Stop Words data
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Exploring data

In [4]:
# Read Data
data = pd.read_csv('C:\\Users\\LENOVO\\Desktop\\train.csv')

In [5]:
# Display Data
data

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [6]:
# Display each column with its number of nulls and data types
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


    - There are no nulls
    - All are int columns except columns [id, comment_text] which are object types
    - Total 159571 rows and 8 columns

In [7]:
# Display int columns and their counts, mean, standard deviation, minimum, maximum,and three quantiles
data.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
# Display shape of data
data.shape

(159571, 8)

    - 159571 rows and 8 columns

In [9]:
# Display types
data.dtypes

id               object
comment_text     object
toxic             int64
severe_toxic      int64
obscene           int64
threat            int64
insult            int64
identity_hate     int64
dtype: object

In [10]:
# Check for nulls data
data.isna().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

        - No nulls

In [11]:
# Display number of unique values in each column
data.nunique()

id               159571
comment_text     159571
toxic                 2
severe_toxic          2
obscene               2
threat                2
insult                2
identity_hate         2
dtype: int64

In [12]:
# Display number of values in column [toxic]
data['toxic'].value_counts()

toxic
0    144277
1     15294
Name: count, dtype: int64

In [13]:
# Display number of values in column [severe toxic]
data['severe_toxic'].value_counts()

severe_toxic
0    157976
1      1595
Name: count, dtype: int64

In [14]:
# Display number of values in column [obscene]
data['obscene'].value_counts()

obscene
0    151122
1      8449
Name: count, dtype: int64

In [15]:
# Display number of values in column [threat]
data['threat'].value_counts()

threat
0    159093
1       478
Name: count, dtype: int64

In [16]:
# Display number of values in column [insult]
data['insult'].value_counts()

insult
0    151694
1      7877
Name: count, dtype: int64

In [17]:
# Display number of values in column [identity hate]
data['identity_hate'].value_counts()

identity_hate
0    158166
1      1405
Name: count, dtype: int64

### Splitting data into appropriate and inappropriate comments for more exploring

#### Display the data which has appropriate comments

In [18]:
# Display data which has no 1 in any of the binary columns as the (zerosData)
zerosData = (data["toxic"]==0)&(data["severe_toxic"]==0)&(data["obscene"]==0)&(data["threat"]==0)&(data["insult"]==0)&(data["identity_hate"]==0)

# Put it in variable (Appropriate_comments)
Appropriate_comments = data[zerosData]

# Reset the index of the 'Appropriate_comments'
Appropriate_comments.reset_index(drop=True, inplace=True)
Appropriate_comments

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
143341,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
143342,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
143343,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
143344,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


#### Display the data which has inappropriate comments

In [19]:
# Display data which has 1 in one or more of the binary columns as the (onesData)
onesData = (data["toxic"]==1)|(data["severe_toxic"]==1)|(data["obscene"]==1)|(data["threat"]==1)|(data["insult"]==1)|(data["identity_hate"]==1)

# Put it in variable (Inappropriate_comments)
Inappropriate_comments = data[onesData]

# Reset the index of the 'Inappropriate_comments'
Inappropriate_comments.reset_index(drop=True, inplace=True)
Inappropriate_comments

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
1,0005c987bdfc9d4b,Hey... what is it..\n@ | talk .\nWhat is it......,1,0,0,0,0,0
2,0007e25b2121310b,"Bye! \n\nDon't look, come or think of comming ...",1,0,0,0,0,0
3,001810bf8c45bf5f,You are gay or antisemmitian? \n\nArchangel WH...,1,0,1,0,1,1
4,00190820581d90ce,"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!",1,0,1,0,1,0
...,...,...,...,...,...,...,...,...
16220,fef4cf7ba0012866,"""\n\n our previous conversation \n\nyou fuckin...",1,0,1,0,1,1
16221,ff39a2895fc3b40e,YOU ARE A MISCHIEVIOUS PUBIC HAIR,1,0,0,0,1,0
16222,ffa33d3122b599d6,Your absurd edits \n\nYour absurd edits on gre...,1,0,1,0,1,0
16223,ffb47123b2d82762,"""\n\nHey listen don't you ever!!!! Delete my e...",1,0,0,0,1,0


#### Get the percentage of them

In [20]:
# get the number of rows from both dataframes
num_appropriate = data[zerosData].shape[0]
num_inappropriate = data[onesData].shape[0]

# divide the number by whole length of data * 100 to get percentage
Percent_appropriate_data = (num_appropriate / len(data))*100
Percent_inappropriate_data = (num_inappropriate / len(data))*100

print("Percentage of appropriate data is: ", Percent_appropriate_data)
print("Percentage of inappropriate data is: ", Percent_inappropriate_data)

Percentage of appropriate data is:  89.83211235124176
Percentage of inappropriate data is:  10.167887648758233


## Cleaning Data

### convert data to lower case

In [21]:
data['comment_text'] = data['comment_text'].str.lower()

In [22]:
data

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation\nwhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,d'aww! he matches this background colour i'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"hey man, i'm really not trying to edit war. it...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nmore\ni can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"you, sir, are my hero. any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::and for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,you should be ashamed of yourself \n\nthat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"spitzer \n\numm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,and it looks like it was actually you who put ...,0,0,0,0,0,0


### Remove duplicated rows according to column [comment_text]

In [23]:
data.drop_duplicates(subset='comment_text', inplace=True)

In [24]:
data

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation\nwhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,d'aww! he matches this background colour i'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"hey man, i'm really not trying to edit war. it...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nmore\ni can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"you, sir, are my hero. any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::and for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,you should be ashamed of yourself \n\nthat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"spitzer \n\numm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,and it looks like it was actually you who put ...,0,0,0,0,0,0


### Remove URLs from column [comment_text]

#### Check how many rows which have URL

In [25]:
url_pattern = r'https?://\S+|www\.\S+'

# Count rows with URLs in the 'comment_text' column
rows_with_urls = data['comment_text'].str.contains(url_pattern, case=False, regex=True).sum()

# Display the number of rows with URLs
rows_with_urls

5113

#### Remove URLs from each column

In [26]:
data['comment_text'] = data['comment_text'].str.replace(url_pattern, '', case=False, regex=True)
data

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation\nwhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,d'aww! he matches this background colour i'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"hey man, i'm really not trying to edit war. it...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nmore\ni can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"you, sir, are my hero. any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::and for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,you should be ashamed of yourself \n\nthat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"spitzer \n\numm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,and it looks like it was actually you who put ...,0,0,0,0,0,0


#### Check if the URL removed successfully from rows

In [27]:
# Display number of rows after removing URL
rows_without_urls = data['comment_text'].str.contains(url_pattern, case=False, regex=True).sum()
rows_without_urls

0

### Return words to is basic form: Expand contractions

In [28]:
# Some words are written in an abbreviated way
# Will replace it with the basic form (expand it) using the following function:

def Expand_abbreviation(text):
    text = re.sub(r"what's", "what is ", text)  # Replace "what's" with "what is"
    text = re.sub(r"\'s", " ", text)  # Replace "'s" with a space in the text
    text = re.sub(r"\'ve", " have ", text)  # Replace "'ve" with " have " in the text
    text = re.sub(r"can't", "cannot ", text)  # Replace "can't" with "cannot"
    text = re.sub(r"n't", " not ", text)  # Replace "n't" with " not "
    text = re.sub(r"i'm", "i am ", text)  # Replace "i'm" with "i am"
    text = re.sub(r"\'re", " are ", text)  # Replace "'re" with " are "
    text = re.sub(r"\'d", " would ", text)  # Replace "'d" with " would "
    text = re.sub(r"\'ll", " will ", text)  # Replace "'ll" with " will "
    text = re.sub(r"\'scuse", " excuse ", text)  # Replace "'scuse" with " excuse "
    text = re.sub('\W', ' ', text)   # Replace any non-word characters with a space
    text = re.sub('\s+', ' ', text)  # Replace consecutive whitespace characters with a space
    text = text.strip(' ')  # Remove trailing spaces
    return text

# Update the column in data with the function
data['comment_text'] = data['comment_text'].map(lambda x : Expand_abbreviation(x))
data

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation why the edits made under my userna...,0,0,0,0,0,0
1,000103f0d9cfb60f,d aww he matches this background colour i am s...,0,0,0,0,0,0
2,000113f07ec002fd,hey man i am really not trying to edit war it ...,0,0,0,0,0,0
3,0001b41b1c6bb37e,more i cannot make any real suggestions on imp...,0,0,0,0,0,0
4,0001d958c54c6e35,you sir are my hero any chance you remember wh...,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,and for the second time of asking when your vi...,0,0,0,0,0,0
159567,ffea4adeee384e90,you should be ashamed of yourself that is a ho...,0,0,0,0,0,0
159568,ffee36eab5c267c9,spitzer umm theres no actual article for prost...,0,0,0,0,0,0
159569,fff125370e4aaaf3,and it looks like it was actually you who put ...,0,0,0,0,0,0


### Remove special characters in the column [comment_text]

#### Check how many rows which have special charcters

In [29]:
# Define a pattern of special characters
pattern_spec_char = r'[^A-Za-z0-9\s]+'

# Function that search for the special characters in rows
# The function returns  True if the  text  contains special characters. Otherwise, it returns  False
def num_special_characters(text):
    return bool(re.search(pattern_spec_char, text))

# Apply the function to the 'comment_text' column and get the number of rows that have special characters
num_rows_with_special_characters = data['comment_text'].apply(num_special_characters).sum()
num_rows_with_special_characters

6189

#### Remove special characters

In [30]:
# Function to remove special characters
def remove_special_characters(text):
    return re.sub(pattern_spec_char, '', text)

# Apply the remove_special_characters function to each row in [comment_text] column
data['comment_text'] = data['comment_text'].apply(remove_special_characters)
data

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation why the edits made under my userna...,0,0,0,0,0,0
1,000103f0d9cfb60f,d aww he matches this background colour i am s...,0,0,0,0,0,0
2,000113f07ec002fd,hey man i am really not trying to edit war it ...,0,0,0,0,0,0
3,0001b41b1c6bb37e,more i cannot make any real suggestions on imp...,0,0,0,0,0,0
4,0001d958c54c6e35,you sir are my hero any chance you remember wh...,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,and for the second time of asking when your vi...,0,0,0,0,0,0
159567,ffea4adeee384e90,you should be ashamed of yourself that is a ho...,0,0,0,0,0,0
159568,ffee36eab5c267c9,spitzer umm theres no actual article for prost...,0,0,0,0,0,0
159569,fff125370e4aaaf3,and it looks like it was actually you who put ...,0,0,0,0,0,0


#### Check if the special characters removed successfully from rows

In [31]:
# Display number of rows after removing special characters
num_rows_without_special_characters = data['comment_text'].apply(num_special_characters).sum()
num_rows_without_special_characters

0

### Remove the numeric and punctiuation data in the column[comment_text]

#### Check how many rows which have numeric or punctiuation

In [32]:
# Define a pattern of numerical and punctuation
numeric_or_punctuation_pattern = r'[\d' + string.punctuation + '\n]+'

# Function that search for the numeric or punctiuation in rows
# The function returns  True if the  text  contains numeric or punctiuation. Otherwise, it returns  False
def num_numeric_or_punctuation(text):
    return bool(re.search(numeric_or_punctuation_pattern, text))

# Apply the function to the 'comment_text' column and get the number of rows that have numeric or punctiuation
num_rows_with_numeric_or_punctuation = data['comment_text'].apply(num_numeric_or_punctuation).sum()
num_rows_with_numeric_or_punctuation

49913

#### Remove numeric or punctiuation

In [33]:
# Function to remove numeric [\d] and punctiuation and the new line [\n]

def remove_numeric_punctuation(data):
    data['comment_text'] = data['comment_text'].apply(lambda x: re.sub(numeric_or_punctuation_pattern, ' ', x)) # Replace them with empty string
    return data

# Apply the remove_numeric_punctuation function to each row in [comment_text] column
data = remove_numeric_punctuation(data)
data

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation why the edits made under my userna...,0,0,0,0,0,0
1,000103f0d9cfb60f,d aww he matches this background colour i am s...,0,0,0,0,0,0
2,000113f07ec002fd,hey man i am really not trying to edit war it ...,0,0,0,0,0,0
3,0001b41b1c6bb37e,more i cannot make any real suggestions on imp...,0,0,0,0,0,0
4,0001d958c54c6e35,you sir are my hero any chance you remember wh...,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,and for the second time of asking when your vi...,0,0,0,0,0,0
159567,ffea4adeee384e90,you should be ashamed of yourself that is a ho...,0,0,0,0,0,0
159568,ffee36eab5c267c9,spitzer umm theres no actual article for prost...,0,0,0,0,0,0
159569,fff125370e4aaaf3,and it looks like it was actually you who put ...,0,0,0,0,0,0


#### Check if the numeric and punctiuation removed successfully from rows

In [34]:
# Display number of rows after removing numeric and punctiuation

num_rows_without_numeric_or_punctuation = data['comment_text'].apply(num_numeric_or_punctuation).sum()
num_rows_without_numeric_or_punctuation

0

## Remove stop words

### Check how many rows which have stop words

In [35]:
# Collect all stop words and save it to variable (stop_words)
stop_words = set(stopwords.words('english'))

# Function to check for number of stop words
def num_stop_words(text):
    tokens = nltk.word_tokenize(text)
    return any(token in stop_words for token in tokens)

# Count the number of rows with stop words
num_rows_with_stop_words = data['comment_text'].apply(num_stop_words).sum()
num_rows_with_stop_words

155703

### Remove stop words

In [36]:
# Function to remove stop words

def remove_stop_words(text):
    # tokenizes the  text  into individual words using the  word_tokenize  function.
    word_tokens = word_tokenize(text)

# collect all words not included in stop_word list by iterating over each word in word_tokens and put it in filtered_text
    filtered_text = [word for word in word_tokens if word not in stop_words]

    # Return the filtered text
    return ' '.join(filtered_text)

In [37]:
# Apply the function to the column [comment_text]
data['comment_text'] = data['comment_text'].apply(remove_stop_words)
data

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation edits made username hardcore metal...,0,0,0,0,0,0
1,000103f0d9cfb60f,aww matches background colour seemingly stuck ...,0,0,0,0,0,0
2,000113f07ec002fd,hey man really trying edit war guy constantly ...,0,0,0,0,0,0
3,0001b41b1c6bb37e,make real suggestions improvement wondered sec...,0,0,0,0,0,0
4,0001d958c54c6e35,sir hero chance remember page,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,second time asking view completely contradicts...,0,0,0,0,0,0
159567,ffea4adeee384e90,ashamed horrible thing put talk page,0,0,0,0,0,0
159568,ffee36eab5c267c9,spitzer umm theres actual article prostitution...,0,0,0,0,0,0
159569,fff125370e4aaaf3,looks like actually put speedy first version d...,0,0,0,0,0,0


### Check if stop words removed successfully from rows

In [38]:
# Display number of rows after removing stop words

num_rows_without_stop_words = data['comment_text'].apply(num_stop_words).sum()
num_rows_without_stop_words

0

## Tokenize comment_text

In [39]:
# Tokenize data using function word_tokenize

def tokenize_text(text):
    tokens = word_tokenize(text)
    return tokens

In [40]:
# Create new column in the data that have the tokenized comment
data['tokenized_comment'] = data['comment_text'].apply(tokenize_text)

data

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tokenized_comment
0,0000997932d777bf,explanation edits made username hardcore metal...,0,0,0,0,0,0,"[explanation, edits, made, username, hardcore,..."
1,000103f0d9cfb60f,aww matches background colour seemingly stuck ...,0,0,0,0,0,0,"[aww, matches, background, colour, seemingly, ..."
2,000113f07ec002fd,hey man really trying edit war guy constantly ...,0,0,0,0,0,0,"[hey, man, really, trying, edit, war, guy, con..."
3,0001b41b1c6bb37e,make real suggestions improvement wondered sec...,0,0,0,0,0,0,"[make, real, suggestions, improvement, wondere..."
4,0001d958c54c6e35,sir hero chance remember page,0,0,0,0,0,0,"[sir, hero, chance, remember, page]"
...,...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,second time asking view completely contradicts...,0,0,0,0,0,0,"[second, time, asking, view, completely, contr..."
159567,ffea4adeee384e90,ashamed horrible thing put talk page,0,0,0,0,0,0,"[ashamed, horrible, thing, put, talk, page]"
159568,ffee36eab5c267c9,spitzer umm theres actual article prostitution...,0,0,0,0,0,0,"[spitzer, umm, theres, actual, article, prosti..."
159569,fff125370e4aaaf3,looks like actually put speedy first version d...,0,0,0,0,0,0,"[looks, like, actually, put, speedy, first, ve..."


## Stemming tokenized text

In [41]:
# Initialize the SnowballStemmer
stemmer = SnowballStemmer('english')

# Function applies stemming to each token in the input list using a list comprehension and returns as list
def stemming(tokens):
    return [stemmer.stem(token) for token in tokens]

In [42]:
# Create new column in the data that have the stemming comment

# Apply stemming to the tokenized comments
data['Stemming'] = data['tokenized_comment'].apply(stemming)

data

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tokenized_comment,Stemming
0,0000997932d777bf,explanation edits made username hardcore metal...,0,0,0,0,0,0,"[explanation, edits, made, username, hardcore,...","[explan, edit, made, usernam, hardcor, metalli..."
1,000103f0d9cfb60f,aww matches background colour seemingly stuck ...,0,0,0,0,0,0,"[aww, matches, background, colour, seemingly, ...","[aww, match, background, colour, seem, stuck, ..."
2,000113f07ec002fd,hey man really trying edit war guy constantly ...,0,0,0,0,0,0,"[hey, man, really, trying, edit, war, guy, con...","[hey, man, realli, tri, edit, war, guy, consta..."
3,0001b41b1c6bb37e,make real suggestions improvement wondered sec...,0,0,0,0,0,0,"[make, real, suggestions, improvement, wondere...","[make, real, suggest, improv, wonder, section,..."
4,0001d958c54c6e35,sir hero chance remember page,0,0,0,0,0,0,"[sir, hero, chance, remember, page]","[sir, hero, chanc, rememb, page]"
...,...,...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,second time asking view completely contradicts...,0,0,0,0,0,0,"[second, time, asking, view, completely, contr...","[second, time, ask, view, complet, contradict,..."
159567,ffea4adeee384e90,ashamed horrible thing put talk page,0,0,0,0,0,0,"[ashamed, horrible, thing, put, talk, page]","[asham, horribl, thing, put, talk, page]"
159568,ffee36eab5c267c9,spitzer umm theres actual article prostitution...,0,0,0,0,0,0,"[spitzer, umm, theres, actual, article, prosti...","[spitzer, umm, there, actual, articl, prostitu..."
159569,fff125370e4aaaf3,looks like actually put speedy first version d...,0,0,0,0,0,0,"[looks, like, actually, put, speedy, first, ve...","[look, like, actual, put, speedi, first, versi..."


## Bag Of Words

### Convert the Stemming column [list of words] to String and put it in column [Stemming_String]

In [43]:
data['Stemming_String'] = data['Stemming'].apply(lambda tokens: ' '.join(tokens))
data

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tokenized_comment,Stemming,Stemming_String
0,0000997932d777bf,explanation edits made username hardcore metal...,0,0,0,0,0,0,"[explanation, edits, made, username, hardcore,...","[explan, edit, made, usernam, hardcor, metalli...",explan edit made usernam hardcor metallica fan...
1,000103f0d9cfb60f,aww matches background colour seemingly stuck ...,0,0,0,0,0,0,"[aww, matches, background, colour, seemingly, ...","[aww, match, background, colour, seem, stuck, ...",aww match background colour seem stuck thank t...
2,000113f07ec002fd,hey man really trying edit war guy constantly ...,0,0,0,0,0,0,"[hey, man, really, trying, edit, war, guy, con...","[hey, man, realli, tri, edit, war, guy, consta...",hey man realli tri edit war guy constant remov...
3,0001b41b1c6bb37e,make real suggestions improvement wondered sec...,0,0,0,0,0,0,"[make, real, suggestions, improvement, wondere...","[make, real, suggest, improv, wonder, section,...",make real suggest improv wonder section statis...
4,0001d958c54c6e35,sir hero chance remember page,0,0,0,0,0,0,"[sir, hero, chance, remember, page]","[sir, hero, chanc, rememb, page]",sir hero chanc rememb page
...,...,...,...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,second time asking view completely contradicts...,0,0,0,0,0,0,"[second, time, asking, view, completely, contr...","[second, time, ask, view, complet, contradict,...",second time ask view complet contradict covera...
159567,ffea4adeee384e90,ashamed horrible thing put talk page,0,0,0,0,0,0,"[ashamed, horrible, thing, put, talk, page]","[asham, horribl, thing, put, talk, page]",asham horribl thing put talk page
159568,ffee36eab5c267c9,spitzer umm theres actual article prostitution...,0,0,0,0,0,0,"[spitzer, umm, theres, actual, article, prosti...","[spitzer, umm, there, actual, articl, prostitu...",spitzer umm there actual articl prostitut ring...
159569,fff125370e4aaaf3,looks like actually put speedy first version d...,0,0,0,0,0,0,"[looks, like, actually, put, speedy, first, ve...","[look, like, actual, put, speedi, first, versi...",look like actual put speedi first version dele...


### Create Bag Of Words

#### Display as sparse matrix

In [44]:
# Initialize the CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the stemmed string column to create BoW matrix
bow_matrix = vectorizer.fit_transform(data['Stemming_String'])

print(bow_matrix)


  (0, 37128)	1
  (0, 33259)	1
  (0, 67215)	1
  (0, 119667)	1
  (0, 47773)	1
  (0, 71213)	1
  (0, 37892)	1
  (0, 95045)	1
  (0, 120266)	1
  (0, 21362)	1
  (0, 42619)	1
  (0, 122168)	1
  (0, 77320)	1
  (0, 127935)	1
  (0, 31021)	1
  (0, 37455)	1
  (0, 87024)	1
  (0, 94205)	1
  (0, 111885)	1
  (0, 110720)	1
  (0, 83015)	1
  (0, 103119)	1
  (0, 94915)	1
  (1, 110720)	1
  (1, 8633)	1
  :	:
  (159522, 25322)	1
  (159522, 106059)	1
  (159523, 1010)	1
  (159523, 39467)	1
  (159523, 27894)	1
  (159523, 105822)	1
  (159523, 65992)	2
  (159523, 64828)	1
  (159523, 121093)	1
  (159523, 90757)	1
  (159524, 92927)	1
  (159524, 54344)	1
  (159524, 113181)	1
  (159524, 44537)	1
  (159524, 17017)	1
  (159524, 9084)	2
  (159524, 118064)	1
  (159524, 48875)	1
  (159524, 44656)	1
  (159524, 95566)	1
  (159524, 60535)	1
  (159524, 22410)	1
  (159524, 8575)	2
  (159524, 52186)	2
  (159524, 95132)	1


##### Explaining output

        -the output indicates the position in the matrix where the value is located
        -First value is the row index
        -Second value (The column index) is the word index in vocabulary
        -Third value represents the frequency of the word in that row

#### Display as Dataframe
it doesn't display all the data as fatures too large which leads to Memory Error
so in order to convert it to array then display it as datframe, we will choose max_features that frequently used in dataset (after 2000, it gives me memory error)

In [45]:
# Initialize the CountVectorizer with max_features parameter
vectorizer_df = CountVectorizer(max_features=2000)  # Choose number of most frequent features

# Fit and transform the stemmed string column to create bow matrix
bow_matrix_dataframe = vectorizer_df.fit_transform(data['Stemming_String'])

# Create a new dataframe from bow_matrix
bow_df = pd.DataFrame(bow_matrix_dataframe.toarray(), columns=vectorizer_df.get_feature_names_out())

# Display the new dataframe
bow_df

Unnamed: 0,abil,abl,absolut,absurd,abus,academ,accept,access,accord,account,...,yeah,year,yes,yesterday,yet,york,young,yourselfgo,youtub,zero
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159520,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
159521,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
159522,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
159523,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


##### Explaining output

    -Rows represent all rows of data
    -Columns are the most frequent 5000 words
    -The values in the cells refers to how many times each word appears in each row

#### Display each word occured how many times in general

In [46]:
# make empty dictionary to store the list of tokens and their counts in
bow = {}

# Iterate on each row in data
for index, row in data.iterrows():

    # Split Stemming_String column into tokens
    tokens = row['Stemming_String'].split()
    for token in tokens:
        if token in bow:
                # if token is present in bow dictionary, it increases the count of that token by 1
            bow[token] += 1
        else:
                # if not found, it adds the token to bow dictionary
            bow[token] = 1

# Create a DataFrame from the bag of words dictionary and form 2 columns
bow_mat = pd.DataFrame(bow.items(), columns=['Token', 'Count'])

# Print the DataFrame
bow_mat

Unnamed: 0,Token,Count
0,explan,2073
1,edit,41531
2,made,9681
3,usernam,1929
4,hardcor,166
...,...,...
129323,webaddress,1
129324,gratest,1
129325,hanumakonda,1
129326,automak,1


##### Explaining output

    -Dataframe of 2 columns, represent number of all words in the data (129328) and how many each word occurred in the data in general

## Word Embeddings

### TF-IDF as Dataframe

In [47]:
# Create a TF-IDF vectorizer with max_features to be able to display it as dataframe
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform data
tfidf_embeddings = tfidf_vectorizer.fit_transform(data)

# Convert the TF-IDF matrix to data frame
tfidf_df = pd.DataFrame(tfidf_embeddings.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Display the TF-IDF data frame
print(tfidf_df)

    comment_text   id  identity_hate  insult  obscene  severe_toxic  stemming  \
0            0.0  1.0            0.0     0.0      0.0           0.0       0.0   
1            1.0  0.0            0.0     0.0      0.0           0.0       0.0   
2            0.0  0.0            0.0     0.0      0.0           0.0       0.0   
3            0.0  0.0            0.0     0.0      0.0           1.0       0.0   
4            0.0  0.0            0.0     0.0      1.0           0.0       0.0   
5            0.0  0.0            0.0     0.0      0.0           0.0       0.0   
6            0.0  0.0            0.0     1.0      0.0           0.0       0.0   
7            0.0  0.0            1.0     0.0      0.0           0.0       0.0   
8            0.0  0.0            0.0     0.0      0.0           0.0       0.0   
9            0.0  0.0            0.0     0.0      0.0           0.0       1.0   
10           0.0  0.0            0.0     0.0      0.0           0.0       0.0   

    stemming_string  threat

#### Explaining Output

    - TF-IDF DataFrame provides a numerical representation of the importance of words in each document
    - Each column in the DataFrame corresponds to a unique word in data
    - Rows = rows of all the data
    - Each cell contains a TF-IDF score that shows the importance of a word in a row compared to its importance in all data

### TF-IDF as sparse matrix

In [48]:
# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the stemmed string column to create tfidf matrix
tfidf_mat = tfidf_vectorizer.fit_transform(data['Stemming_String'])
print(tfidf_mat)

  (0, 94915)	0.24886411346186896
  (0, 103119)	0.14256138093515247
  (0, 83015)	0.09190981743690825
  (0, 110720)	0.0953804621875277
  (0, 111885)	0.1752829691641729
  (0, 94205)	0.12586541775998042
  (0, 87024)	0.1040757936823214
  (0, 37455)	0.25854165220328196
  (0, 31021)	0.33282520006335103
  (0, 127935)	0.22582500651782347
  (0, 77320)	0.1401889860474827
  (0, 122168)	0.1981584004682601
  (0, 42619)	0.2677110144603826
  (0, 21362)	0.29840264995938265
  (0, 120266)	0.1413052025807894
  (0, 95045)	0.14279726940393273
  (0, 37892)	0.20995326161128855
  (0, 71213)	0.33735993155431515
  (0, 47773)	0.29498526917705326
  (0, 119667)	0.20408797173539048
  (0, 67215)	0.13943683631954326
  (0, 33259)	0.10066480870677873
  (0, 37128)	0.19508751720957498
  (1, 119823)	0.22062019877144534
  (1, 56379)	0.3165210748827097
  :	:
  (159522, 6827)	0.10236747558168392
  (159522, 1010)	0.1750935142385737
  (159523, 90757)	0.3176209272670508
  (159523, 121093)	0.3794329804511375
  (159523, 64828)	0.2

#### Explaining output

    -Most values are zeros because most words from the vocabulary won't appear in each row
    -The values in the matrix represent the TF-IDF scores for each word in each row

### Word2Vec

In [None]:
# Put stemming string column into stemmed_sentences
stemmed_strings = data['Stemming_String']

# Split each stemmed string into a list of words
String_split = [string.split() for string in stemmed_strings]

# Train the Word2Vec model
model = Word2Vec(String_split)

# Get the word embeddings for all words
word_embeddings = model.wv

# Collect word embeddings in a list
embedding_list = []

# Collect words in a list
word_list = []

# These loop iterates through each word in the vocabulary of the model
for word in word_embeddings.key_to_index:
    embedding = word_embeddings[word]
        # For each word, it collect the word's embedding vector and put it in embedding_list
    embedding_list.append(embedding)
        # It collects the words and put them in word_list
    word_list.append(word)

# Create a dataframe from the embedding list and make words as index column
embedding_dataFrame = pd.DataFrame(embedding_list, index=word_list)

# Display the dataframe
embedding_dataFrame

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
articl,0.219078,-0.405554,1.016264,2.308288,-1.326432,-0.139032,1.621936,-0.357014,-0.514282,1.153392,...,-0.170102,0.312091,1.513038,-0.424504,-0.691041,-0.361665,-1.355946,-0.330152,0.475377,-0.614569
page,-0.856372,-1.193950,1.746023,2.372608,-0.539869,-0.013681,0.137259,-2.006582,0.370486,-0.093242,...,-1.073160,0.757387,1.317964,-1.503223,-0.479901,-0.233006,1.345307,-0.674203,-0.025298,-1.222509
wikipedia,-0.072803,-1.273175,-0.512004,0.207688,0.700488,1.426814,-0.226752,-1.596084,-0.331703,1.051290,...,-1.375522,-0.475435,0.985318,-1.816379,-0.313258,0.489670,0.337506,0.850146,1.564409,-0.819561
edit,-0.115289,-0.789246,0.356960,-0.054552,-0.058556,-2.132076,2.454932,-0.994389,1.176546,0.414494,...,-1.318120,0.440471,-1.166857,-1.581744,0.433561,-1.250185,1.785059,-0.977258,0.245407,-1.130372
talk,-0.779532,0.014516,-0.139708,-0.019709,0.434057,-0.716594,1.185512,-1.172754,-0.412333,-0.618809,...,0.156723,-0.139204,2.006254,-0.787016,0.116319,-0.827861,1.539290,0.005436,0.698084,0.902387
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
polemicist,-0.077954,0.036821,0.012259,-0.078886,0.050615,-0.094117,0.021782,0.105251,-0.024276,-0.106071,...,0.023562,0.057896,0.085531,0.020951,0.098502,0.144895,-0.011880,-0.018001,0.034217,0.063197
thulean,-0.034786,0.056683,-0.028608,-0.043957,0.025530,-0.060188,0.054080,0.080069,-0.084911,-0.068908,...,0.097675,0.007424,0.061246,-0.017948,0.079600,0.086349,0.071416,-0.083655,0.013160,0.001693
pasqual,-0.070468,0.028677,0.036658,-0.024234,0.089025,-0.089597,-0.016383,0.073866,-0.121169,-0.045939,...,0.023854,0.022882,-0.044565,0.054624,0.035764,0.012332,0.024106,-0.046361,-0.014754,0.054366
fitch,-0.073939,0.069619,0.012079,-0.006601,-0.038409,-0.155369,-0.002387,0.105989,-0.014934,0.006839,...,0.077091,0.036092,0.005176,-0.002897,0.095196,0.025811,0.020767,-0.086574,0.019250,0.015994


#### Explaining output

    -The output appears as datframe
    -Each row is contains single word or token
    -Each word in the vocabulary is represented as a vector in this embedding space
    -The number of dimensions in these vectors is called the embedding dimension
    -Each dimension in the vector represents a different feature or aspect of the word's meaning or context
    -The columns represents the number of dimensions which are 100 by default

## Bert Model and Evaluate

### New column label

I made new column that take the conditions i made above to easily classify data

In [110]:
# Create a new "label" column based on conditions
data['label'] = 0  # Initialize all labels to 0
data.loc[onesData, 'label'] = 1  # Set labels to 1 for the onesData
data

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tokenized_comment,Stemming,label
0,0000997932d777bf,explanation edits made username hardcore metal...,0,0,0,0,0,0,"[explanation, edits, made, username, hardcore,...","[explan, edit, made, usernam, hardcor, metalli...",0
1,000103f0d9cfb60f,aww matches background colour seemingly stuck ...,0,0,0,0,0,0,"[aww, matches, background, colour, seemingly, ...","[aww, match, background, colour, seem, stuck, ...",0
2,000113f07ec002fd,hey man really trying edit war guy constantly ...,0,0,0,0,0,0,"[hey, man, really, trying, edit, war, guy, con...","[hey, man, realli, tri, edit, war, guy, consta...",0
3,0001b41b1c6bb37e,make real suggestions improvement wondered sec...,0,0,0,0,0,0,"[make, real, suggestions, improvement, wondere...","[make, real, suggest, improv, wonder, section,...",0
4,0001d958c54c6e35,sir hero chance remember page,0,0,0,0,0,0,"[sir, hero, chance, remember, page]","[sir, hero, chanc, rememb, page]",0
...,...,...,...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,second time asking view completely contradicts...,0,0,0,0,0,0,"[second, time, asking, view, completely, contr...","[second, time, ask, view, complet, contradict,...",0
159567,ffea4adeee384e90,ashamed horrible thing put talk page,0,0,0,0,0,0,"[ashamed, horrible, thing, put, talk, page]","[asham, horribl, thing, put, talk, page]",0
159568,ffee36eab5c267c9,spitzer umm theres actual article prostitution...,0,0,0,0,0,0,"[spitzer, umm, theres, actual, article, prosti...","[spitzer, umm, there, actual, articl, prostitu...",0
159569,fff125370e4aaaf3,looks like actually put speedy first version d...,0,0,0,0,0,0,"[looks, like, actually, put, speedy, first, ve...","[look, like, actual, put, speedi, first, versi...",0


bert model takes too much time to train on all data, so i took 60% of the data to put it in the model that take less time

In [123]:
# Take random sample of 60% from data
data_filtered = data.sample(frac=0.6, random_state=42)
data_filtered

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tokenized_comment,Stemming,label
47821,7fbe302a798dd727,thank answer removed tag,0,0,0,0,0,0,"[thank, answer, removed, tag]","[thank, answer, remov, tag]",0
86430,e72b3de63462923d,threatened pov warrior indian origin whos cont...,0,0,0,0,0,0,"[threatened, pov, warrior, indian, origin, who...","[threaten, pov, warrior, indian, origin, whos,...",0
91498,f4a78dcaf317b9ec,reading properly asked evidence continuing eng...,0,0,0,0,0,0,"[reading, properly, asked, evidence, continuin...","[read, proper, ask, evid, continu, engag, wp, ...",0
90654,f28ea0f7aea8d63d,seem fine realize much checked refs article bl...,0,0,0,0,0,0,"[seem, fine, realize, much, checked, refs, art...","[seem, fine, realiz, much, check, ref, articl,...",0
117814,7555ef26f3ca09a5,ashwinikalantri read whole wp title naming pol...,0,0,0,0,0,0,"[ashwinikalantri, read, whole, wp, title, nami...","[ashwinikalantri, read, whole, wp, titl, name,...",0
...,...,...,...,...,...,...,...,...,...,...,...
10568,1be230556f65c4a8,know vandalism getting messages anything,0,0,0,0,0,0,"[know, vandalism, getting, messages, anything]","[know, vandal, get, messag, anyth]",0
86123,e6646df0827d8cdc,get examples,0,0,0,0,0,0,"[get, examples]","[get, exampl]",0
36891,6282f1c1f5afcc66,clean stand reading spelling grammatical error...,0,0,0,0,0,0,"[clean, stand, reading, spelling, grammatical,...","[clean, stand, read, spell, grammat, error, etc]",0
29594,4e81809ce22e8322,ahhhh thanks rackin brain whole time singing f...,0,0,0,0,0,0,"[ahhhh, thanks, rackin, brain, whole, time, si...","[ahhhh, thank, rackin, brain, whole, time, sin...",0


### Implement model

In [124]:
# Split the data into text and labels
text = data_filtered['comment_text'].values
labels = data_filtered['label'].values
# Convert labels to binary matrix
labels = to_categorical(labels)

# Split the data into train and test sets with test size 20%
train_text, test_text, train_labels, test_labels = train_test_split(text, labels, test_size=0.2, random_state=42)

# Initialize bert tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Initialize bert model
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Maximum sequence length for BERT
    # Common choice for max_length is in range of 128 to 512 tokens
    # Shorter max_length values is faster so i chose 128
# Maximum sequence length for BERT
max_length = 128

# Tokenize and encode the training and test data
train_encodings = tokenizer.batch_encode_plus(train_text.tolist(), max_length=max_length, padding=True, truncation=True)
test_encodings = tokenizer.batch_encode_plus(test_text.tolist(), max_length=max_length, padding=True, truncation=True)

# Create input data for train and test
train_inputs = [np.array(train_encodings['input_ids']), np.array(train_encodings['attention_mask'])]
test_inputs = [np.array(test_encodings['input_ids']), np.array(test_encodings['attention_mask'])]

# Define input layer for input id and attention mask
input_ids = Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
attention_mask = Input(shape=(max_length,), dtype=tf.int32, name="attention_mask")

# Get the outputs of bert model
outputs = bert_model(input_ids, attention_mask=attention_mask)
pooled_output = outputs[1]

# Create a Dense layer for each label
output = Dense(2, activation='softmax')(pooled_output)
model = Model(inputs=[input_ids, attention_mask], outputs=output)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [126]:
# Compile each output with categorical_crossentropy
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model for 1 epoch and batch size 16
model.fit(train_inputs, train_labels, epochs=1, batch_size=16)



<keras.src.callbacks.History at 0x785b59d26740>

In [127]:
# Make predictions for each label in test data
predictions = model.predict(test_inputs)

# get the higger probabilty and put it in predicted labels
predicted_labels = np.argmax(predictions, axis=1)

# get the higger probabilty and put it in true labels
true_labels = np.argmax(test_labels, axis=1)

# print classification report
print(classification_report(true_labels, predicted_labels))

              precision    recall  f1-score   support

           0       0.90      1.00      0.95     17196
           1       0.00      0.00      0.00      1947

    accuracy                           0.90     19143
   macro avg       0.45      0.50      0.47     19143
weighted avg       0.81      0.90      0.85     19143



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Expaining output

    -predict on test data and print loss and accuracy for label
    -classification report that shows the model's performance metrics for both classes (0 and 1)
    -The model performs well in terms of precision, recall, and F1-score for the zero class which is the appropriate comments
    -Percision, recall, and F1-score are low for class 1, which means that the model doesn't identify the inappropriate comments correctly
    -The accuracy is high but it's not able to classify the inappropriate comments