# NLP on toxic comments
# Building Transformer from scratch using Pytorch

## Import Libraries

In [1]:
import pandas as pd   #For data frames, reading data, data processing, and analysis
import numpy as np   #For numerical computations
import string   # For string operations and constants.
import re   # Used for regular expression matching and operations.
from nltk.corpus import stopwords   # Import a list of common stopwords
import contractions     # For expand abbreviations
import nltk   # Import Natural language Toolkit
from nltk.tokenize import word_tokenize   # For tokenization
from nltk.stem import WordNetLemmatizer     # For lemmatization
import torch    # Torch library
import torch.nn as nn       # importing various classes and functions dor building neural networks
import torch.nn.functional as F     # provides various functions for performing operations
from sklearn.model_selection import train_test_split     # For splitting data into train and test
from torch.utils.data import TensorDataset,DataLoader  # Dataset: create dataset from tensors, DataLoader: load data from dataset
import torch.optim as optim     # Provides optimazation algorithms
from sklearn.metrics import classification_report     # For showing classfication metrics

In [2]:
# Download NLTK tokenizer data
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Download NLTK Stop Words data
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Exploring data

In [5]:
# Read Data
data = pd.read_csv('C:\\Users\\LENOVO\\Desktop\\train.csv')

In [6]:
# Display Data
data

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [7]:
# Display each column with its number of nulls and data types
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


- There are no nulls
- All are int columns except columns [id, comment_text] which are object types
- Total 159571 rows and 8 columns

In [8]:
# Display int columns and their counts, mean, standard deviation, minimum, maximum,and three quantiles
data.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
# Display shape of data
data.shape

(159571, 8)

Data has 159571 rows and 8 columns

In [10]:
# Display types
data.dtypes

id               object
comment_text     object
toxic             int64
severe_toxic      int64
obscene           int64
threat            int64
insult            int64
identity_hate     int64
dtype: object

all data are int except [id] and [comment_text] which are object data type

In [11]:
# Check for nulls data
data.isna().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

No nulls in the data

In [12]:
# Display number of unique values in each column
data.nunique()

id               159571
comment_text     159571
toxic                 2
severe_toxic          2
obscene               2
threat                2
insult                2
identity_hate         2
dtype: int64

All data values are unique as each id and its comment is related to only one person and the other columns are binary (0,1)

In [13]:
# Calculate the count of duplicated comments in the 'comment_text' column
duplicated_count = data['comment_text'].duplicated().sum()
print("Number of duplicated comments = ", duplicated_count)

Number of duplicated comments =  0


In [14]:
# Display number of values in column [toxic]
data['toxic'].value_counts()

toxic
0    144277
1     15294
Name: count, dtype: int64

There are 144277 people didn't write toxic comments, while the other 15294 people wrote

In [15]:
# Display number of values in column [severe toxic]
data['severe_toxic'].value_counts()

severe_toxic
0    157976
1      1595
Name: count, dtype: int64

157976 people didn't write severe toxic comments, while the other 1595 people wrote 
--> It seems to be biased towards class 0 so its unbalanced

In [16]:
# Display number of values in column [obscene]
data['obscene'].value_counts()

obscene
0    151122
1      8449
Name: count, dtype: int64

151122 people didn't write obscene comments but the other 8449 people wrote --> It seems to be biased towards class 0 so its unbalanced

In [17]:
# Display number of values in column [threat]
data['threat'].value_counts()

threat
0    159093
1       478
Name: count, dtype: int64

159093 people didn't write threat comments but the other 478 people wrote
--> It is highly biased towards class 0 so its unbalanced

In [18]:
# Display number of values in column [insult]
data['insult'].value_counts()

insult
0    151694
1      7877
Name: count, dtype: int64

151694 people didn't write insult comments but the other 7877 wrote
--> It seems to be biased towards class 0 so its unbalanced

In [19]:
# Display number of values in column [identity hate]
data['identity_hate'].value_counts()

identity_hate
0    158166
1      1405
Name: count, dtype: int64

158166 people didn't write identity hate comments but the other 1405 people wrote
--> It seems to be biased towards class 0 so its unbalanced

- From the above analsysis made with value counts for each binary column, found thet column toxic is the most balanced column compared to the others
- Also, found that number of (zeros) for each column is greater than number of (ones)
- So for reassuring this, i will split the data in two classes:

### Splitting data into appropriate and inappropriate comments for more exploring

#### Display the data which has appropriate comments

In [20]:
# Display data which has no 1 in any of the binary columns as the (zerosData)
zerosData = (data["toxic"]==0)&(data["severe_toxic"]==0)&(data["obscene"]==0)&(data["threat"]==0)&(data["insult"]==0)&(data["identity_hate"]==0)

# Put it in variable (Appropriate_comments)
Appropriate_comments = data[zerosData]

# Reset the index of the 'Appropriate_comments'
Appropriate_comments.reset_index(drop=True, inplace=True)
Appropriate_comments

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
143341,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
143342,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
143343,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
143344,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


#### Display the data which has inappropriate comments

In [21]:
# Display data which has 1 in one or more of the binary columns as the (onesData)
onesData = (data["toxic"]==1)|(data["severe_toxic"]==1)|(data["obscene"]==1)|(data["threat"]==1)|(data["insult"]==1)|(data["identity_hate"]==1)

# Put it in variable (Inappropriate_comments)
Inappropriate_comments = data[onesData]

# Reset the index of the 'Inappropriate_comments'
Inappropriate_comments.reset_index(drop=True, inplace=True)
Inappropriate_comments

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
1,0005c987bdfc9d4b,Hey... what is it..\n@ | talk .\nWhat is it......,1,0,0,0,0,0
2,0007e25b2121310b,"Bye! \n\nDon't look, come or think of comming ...",1,0,0,0,0,0
3,001810bf8c45bf5f,You are gay or antisemmitian? \n\nArchangel WH...,1,0,1,0,1,1
4,00190820581d90ce,"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!",1,0,1,0,1,0
...,...,...,...,...,...,...,...,...
16220,fef4cf7ba0012866,"""\n\n our previous conversation \n\nyou fuckin...",1,0,1,0,1,1
16221,ff39a2895fc3b40e,YOU ARE A MISCHIEVIOUS PUBIC HAIR,1,0,0,0,1,0
16222,ffa33d3122b599d6,Your absurd edits \n\nYour absurd edits on gre...,1,0,1,0,1,0
16223,ffb47123b2d82762,"""\n\nHey listen don't you ever!!!! Delete my e...",1,0,0,0,1,0


#### Get the percentage of them

In [22]:
# get the number of rows from both dataframes
num_appropriate = data[zerosData].shape[0]
num_inappropriate = data[onesData].shape[0]

# divide the number by whole length of data * 100 to get percentage
Percent_appropriate_data = (num_appropriate / len(data))*100
Percent_inappropriate_data = (num_inappropriate / len(data))*100

print("Percentage of appropriate data is: ", Percent_appropriate_data)
print("Percentage of inappropriate data is: ", Percent_inappropriate_data)

Percentage of appropriate data is:  89.83211235124176
Percentage of inappropriate data is:  10.167887648758233


- approximatly 90% of comments are appropriate comments while the othe 10% are inappropriate comments
- so the data is imbalanced as it's highly biased towards class 0

## Cleaning Data

We need to deal with:
- URLs --> by removing them
- Abbreviations --> will expand them
- Special characters --> by removing them
- Numeric and Punctuation characters --> by removing them
- Stop words --> by removing them
- Remove duplicates after cleaning

### Defining patterns

In [23]:
# Define a pattern of URL
url_pattern = r'https?://\S+|www\.\S+'

# Define a pattern of special characters
pattern_spec_char = r'[^A-Za-z0-9\s]+'

# Define a pattern of numeric and punctuation characters
numeric_punctuation_pattern = r'[\d' + string.punctuation + '\n]+'

# Collect all stop words and save it to variable (stop_words)
stop_words = set(stopwords.words('english'))

### Function to count numbers of things i want to clean it from data

In [24]:
def check_numbers(data , column):

    # Count rows with URLs in the 'comment_text' column
    print("Number of rows with URLs now = ", data['comment_text'].str.contains(url_pattern, case=False, regex=True).sum())

    # Count rows with special characters in the 'comment_text' column
    print("Number of rows with special characters now = ", data[column].str.contains(pattern_spec_char).sum())

    # Count rows with numeric and punctiuations in the 'comment_text' column
    print("Number of rows with numeric and punctiuation characters now = ", data[column].str.contains(numeric_punctuation_pattern).sum())
    
    # Count rows with stop words in the 'comment_text' column
    print("Number of rows with stop words now = ", sum(any(token in stop_words for token in nltk.word_tokenize(column)) for column in data[column]))
    

### Function to clean data

In [25]:
def clean_data(data, column):

    # Convert text to lower case
    data[column] = data[column].str.lower()

    # Remove the url by replacing it with empty string
    data[column] = data[column].str.replace(url_pattern, '', case=False, regex=True)

    # Apply function Expand contractions
    data[column] = data[column].apply(lambda x: contractions.fix(x))

    # Remove special characters 
    data[column] = data[column].apply(lambda x: re.sub(pattern_spec_char, '', x))

    # Remove the numeric and punctiuational characters 
    data[column] = data[column].apply(lambda x: re.sub(numeric_punctuation_pattern, ' ', x))
 
    # Tokenizing the text first, then remove stop words using list fo stop words provide above
    # Then join the cleaned text back into a string
    data[column] = data[column].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word not in stop_words]))

    return data

### Apply functions

In [26]:
check_numbers(data,'comment_text')

Number of rows with URLs now =  5114
Number of rows with special characters now =  155146
Number of rows with numeric and punctiuation characters now =  156447
Number of rows with stop words now =  152520


In [27]:
clean_data(data,'comment_text')

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation edits made username hardcore metal...,0,0,0,0,0,0
1,000103f0d9cfb60f,daww matches background colour seemingly stuck...,0,0,0,0,0,0
2,000113f07ec002fd,hey man really trying edit war guy constantly ...,0,0,0,0,0,0
3,0001b41b1c6bb37e,make real suggestions improvement wondered sec...,0,0,0,0,0,0
4,0001d958c54c6e35,sir hero chance remember page,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,second time asking view completely contradicts...,0,0,0,0,0,0
159567,ffea4adeee384e90,ashamed horrible thing put talk page,0,0,0,0,0,0
159568,ffee36eab5c267c9,spitzer umm actual article prostitution ring c...,0,0,0,0,0,0
159569,fff125370e4aaaf3,looks like actually put speedy first version d...,0,0,0,0,0,0


In [28]:
# To check if things removed successfully
check_numbers(data,'comment_text')

Number of rows with URLs now =  0
Number of rows with special characters now =  0
Number of rows with numeric and punctiuation characters now =  0
Number of rows with stop words now =  0


### Check for duplicates after cleaning and drop them

In [29]:
# Calculate the count of duplicated comments in the 'comment_text' column
duplicated_count = data['comment_text'].duplicated().sum()
print("Number of duplicated comments after cleaning = ", duplicated_count)

Number of duplicated comments after cleaning =  1897


In [30]:
# Drop the duplicates
data.drop_duplicates(subset='comment_text', inplace=True)

# Reset index
data.reset_index(drop=True)
data

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation edits made username hardcore metal...,0,0,0,0,0,0
1,000103f0d9cfb60f,daww matches background colour seemingly stuck...,0,0,0,0,0,0
2,000113f07ec002fd,hey man really trying edit war guy constantly ...,0,0,0,0,0,0
3,0001b41b1c6bb37e,make real suggestions improvement wondered sec...,0,0,0,0,0,0
4,0001d958c54c6e35,sir hero chance remember page,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,second time asking view completely contradicts...,0,0,0,0,0,0
159567,ffea4adeee384e90,ashamed horrible thing put talk page,0,0,0,0,0,0
159568,ffee36eab5c267c9,spitzer umm actual article prostitution ring c...,0,0,0,0,0,0
159569,fff125370e4aaaf3,looks like actually put speedy first version d...,0,0,0,0,0,0


In [31]:
duplicated_count = data['comment_text'].duplicated().sum()
duplicated_count

0

    Data now is cleaned, free from duplicates, URLs, stop words, and special, numeric, and punctuation characters

## Tokenize comment_text

In [32]:
# Tokenize data using function word_tokenize
def tokenize_text(text):
    tokens = word_tokenize(text)
    return tokens

In [33]:
# Create new column in the data that have the tokenized comment

# Apply tokenize_text to the original 'comment_text'
data['tokenized_comment'] = data['comment_text'].apply(tokenize_text)
data

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tokenized_comment
0,0000997932d777bf,explanation edits made username hardcore metal...,0,0,0,0,0,0,"[explanation, edits, made, username, hardcore,..."
1,000103f0d9cfb60f,daww matches background colour seemingly stuck...,0,0,0,0,0,0,"[daww, matches, background, colour, seemingly,..."
2,000113f07ec002fd,hey man really trying edit war guy constantly ...,0,0,0,0,0,0,"[hey, man, really, trying, edit, war, guy, con..."
3,0001b41b1c6bb37e,make real suggestions improvement wondered sec...,0,0,0,0,0,0,"[make, real, suggestions, improvement, wondere..."
4,0001d958c54c6e35,sir hero chance remember page,0,0,0,0,0,0,"[sir, hero, chance, remember, page]"
...,...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,second time asking view completely contradicts...,0,0,0,0,0,0,"[second, time, asking, view, completely, contr..."
159567,ffea4adeee384e90,ashamed horrible thing put talk page,0,0,0,0,0,0,"[ashamed, horrible, thing, put, talk, page]"
159568,ffee36eab5c267c9,spitzer umm actual article prostitution ring c...,0,0,0,0,0,0,"[spitzer, umm, actual, article, prostitution, ..."
159569,fff125370e4aaaf3,looks like actually put speedy first version d...,0,0,0,0,0,0,"[looks, like, actually, put, speedy, first, ve..."


A column appeard with list of tokenized words for each row in [comment_text]

## Lemmatize tokenized comment

- Lemmatization is the process of reducing words to their base form (lemma) 
- Lemmatization preserves the semantic meaning of words which we must consider in further analysis and modeling

In [34]:
# Initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to lemmatize a list of words
def lemmatize_words(tokenized_comment):
    return [lemmatizer.lemmatize(word) for word in tokenized_comment]

In [35]:
# Apply lemmatization to the 'tokenized_comment' column
data['lemmatized_comment'] = (data['tokenized_comment'].apply(lemmatize_words)).apply(lambda tokens: ' '.join(tokens))
data

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tokenized_comment,lemmatized_comment
0,0000997932d777bf,explanation edits made username hardcore metal...,0,0,0,0,0,0,"[explanation, edits, made, username, hardcore,...",explanation edits made username hardcore metal...
1,000103f0d9cfb60f,daww matches background colour seemingly stuck...,0,0,0,0,0,0,"[daww, matches, background, colour, seemingly,...",daww match background colour seemingly stuck t...
2,000113f07ec002fd,hey man really trying edit war guy constantly ...,0,0,0,0,0,0,"[hey, man, really, trying, edit, war, guy, con...",hey man really trying edit war guy constantly ...
3,0001b41b1c6bb37e,make real suggestions improvement wondered sec...,0,0,0,0,0,0,"[make, real, suggestions, improvement, wondere...",make real suggestion improvement wondered sect...
4,0001d958c54c6e35,sir hero chance remember page,0,0,0,0,0,0,"[sir, hero, chance, remember, page]",sir hero chance remember page
...,...,...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,second time asking view completely contradicts...,0,0,0,0,0,0,"[second, time, asking, view, completely, contr...",second time asking view completely contradicts...
159567,ffea4adeee384e90,ashamed horrible thing put talk page,0,0,0,0,0,0,"[ashamed, horrible, thing, put, talk, page]",ashamed horrible thing put talk page
159568,ffee36eab5c267c9,spitzer umm actual article prostitution ring c...,0,0,0,0,0,0,"[spitzer, umm, actual, article, prostitution, ...",spitzer umm actual article prostitution ring c...
159569,fff125370e4aaaf3,looks like actually put speedy first version d...,0,0,0,0,0,0,"[looks, like, actually, put, speedy, first, ve...",look like actually put speedy first version de...


Column appered with the lemmatized version of the tokenized comment column 

### Check for duplicates after lemmatization and drop them

In [36]:
# Calculate the count of duplicated comments in the 'lemmatized_comment' column
duplicated_count = data['lemmatized_comment'].duplicated().sum()
print("Number of duplicated comments after lemmatization = ", duplicated_count)

Number of duplicated comments after lemmatization =  37


In [37]:
# Drop duplicates
data.drop_duplicates(subset='lemmatized_comment', inplace=True)
data.reset_index(drop = True)
data

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tokenized_comment,lemmatized_comment
0,0000997932d777bf,explanation edits made username hardcore metal...,0,0,0,0,0,0,"[explanation, edits, made, username, hardcore,...",explanation edits made username hardcore metal...
1,000103f0d9cfb60f,daww matches background colour seemingly stuck...,0,0,0,0,0,0,"[daww, matches, background, colour, seemingly,...",daww match background colour seemingly stuck t...
2,000113f07ec002fd,hey man really trying edit war guy constantly ...,0,0,0,0,0,0,"[hey, man, really, trying, edit, war, guy, con...",hey man really trying edit war guy constantly ...
3,0001b41b1c6bb37e,make real suggestions improvement wondered sec...,0,0,0,0,0,0,"[make, real, suggestions, improvement, wondere...",make real suggestion improvement wondered sect...
4,0001d958c54c6e35,sir hero chance remember page,0,0,0,0,0,0,"[sir, hero, chance, remember, page]",sir hero chance remember page
...,...,...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,second time asking view completely contradicts...,0,0,0,0,0,0,"[second, time, asking, view, completely, contr...",second time asking view completely contradicts...
159567,ffea4adeee384e90,ashamed horrible thing put talk page,0,0,0,0,0,0,"[ashamed, horrible, thing, put, talk, page]",ashamed horrible thing put talk page
159568,ffee36eab5c267c9,spitzer umm actual article prostitution ring c...,0,0,0,0,0,0,"[spitzer, umm, actual, article, prostitution, ...",spitzer umm actual article prostitution ring c...
159569,fff125370e4aaaf3,looks like actually put speedy first version d...,0,0,0,0,0,0,"[looks, like, actually, put, speedy, first, ve...",look like actually put speedy first version de...


## Create Vocabulary

The primary goal of creating a vocabulary is to map words in column ['comment_text'] to unique numerical indices to allow the model to work with numerical data

In [38]:
def create_vocab(data, column):
    
    # Create a dictionary that maps words to their indices and make '<PAD>' mapped to 0
    word_to_index = {'<PAD>': 0}
    
    # Assign word in the order they appear in the data
    idx = 1

    # Iterate through each comment in the column
    for comment in data[column]:

        # Split the comment into words
        words = comment.split()

        # Iterate through each word in the comment
        for word in words:

            # If the word is not found in (word_to_index), it will be added with a unique index
            if word not in word_to_index:
                word_to_index[word] = idx

                # After the word is added, the index increase by 1 to have a new word 
                idx += 1
    
    # Create a dictionary that maps indices back to words
    index_to_word = {index: word for word, index in word_to_index.items()}
    
    return word_to_index, index_to_word

In [39]:
# Call the function create_vocab()
word_to_index, index_to_word = create_vocab(data,'lemmatized_comment')

# Get the size of the vocabulary
vocab_size = len(word_to_index)

print('Vocabulary size = ',vocab_size)
word_to_index

Vocabulary size =  204909


{'<PAD>': 0,
 'explanation': 1,
 'edits': 2,
 'made': 3,
 'username': 4,
 'hardcore': 5,
 'metallica': 6,
 'fan': 7,
 'reverted': 8,
 'vandalism': 9,
 'closure': 10,
 'gas': 11,
 'voted': 12,
 'new': 13,
 'york': 14,
 'doll': 15,
 'fac': 16,
 'please': 17,
 'remove': 18,
 'template': 19,
 'talk': 20,
 'page': 21,
 'since': 22,
 'retired': 23,
 'daww': 24,
 'match': 25,
 'background': 26,
 'colour': 27,
 'seemingly': 28,
 'stuck': 29,
 'thanks': 30,
 'january': 31,
 'utc': 32,
 'hey': 33,
 'man': 34,
 'really': 35,
 'trying': 36,
 'edit': 37,
 'war': 38,
 'guy': 39,
 'constantly': 40,
 'removing': 41,
 'relevant': 42,
 'information': 43,
 'talking': 44,
 'instead': 45,
 'seems': 46,
 'care': 47,
 'formatting': 48,
 'actual': 49,
 'info': 50,
 'make': 51,
 'real': 52,
 'suggestion': 53,
 'improvement': 54,
 'wondered': 55,
 'section': 56,
 'statistic': 57,
 'later': 58,
 'subsection': 59,
 'type': 60,
 'accident': 61,
 'think': 62,
 'reference': 63,
 'may': 64,
 'need': 65,
 'tidying': 6

- Local dictionary is made that have the unique words of data corresponding to their indices (by row)
- word to index dictionary that have the word as the index and corresponds to its unique number 
- index to word dictionary that have the index as the unique numbers correspond to their words
- The vocabulary has 204909 words

## Padding

The padding function is designed to prepare the lemmatized comment text to get into the transformer model

### calculate the mean of the max word counts of all rows 

this let us know the maximum sequence length the padding shoulf have to cover most of the sequence in text 

In [40]:
# Define a function to count the number of words in a text
def count_words(text):
    words = text.split()
    return len(words)

# Calculate the maximum word count in each row and store it in a list
max_word_counts = data['lemmatized_comment'].apply(count_words)

# Calculate the mean of the maximum word counts of all rows
mean_max_word_count = max_word_counts.mean()

print("Mean of the maximum word counts of all rows: ",mean_max_word_count)


Mean of the maximum word counts of all rows:  33.57228315687307


Let's take 50 as a maximum sequence length to cover a good sequence of the text

In [41]:
def padding(data,column, word_to_index, max_seq_length=50):

    # Create an empty list to store sequences
    sequences = []

    # Iterate through each comment in the column
    for comment in data[column]:

        # Split the comment into words
        words = comment.split()

        # Convert words to their corresponding indices using the word-to-index dictionary made before
        sequence = [word_to_index.get(word, word_to_index["<PAD>"]) for word in words]

        # Pad the sequence with '<PAD>' indices to reach the maximum sequence length mentioned (50)
        sequence = sequence[:max_seq_length]
        sequence.extend([word_to_index["<PAD>"]] * (max_seq_length - len(sequence)))

        # Append the sequence to the list of sequences (The empty list i created above)
        sequences.append(sequence)

    # Convert the list of sequences array
    padded_sequences = np.array(sequences)
    
    return padded_sequences

In [42]:
# Call the function padding()
padded_sequences = padding(data,'lemmatized_comment', word_to_index)

# Create a new column called ('Padding') and store padded_sequences in it
data['Padding'] = list(padded_sequences)
data

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tokenized_comment,lemmatized_comment,Padding
0,0000997932d777bf,explanation edits made username hardcore metal...,0,0,0,0,0,0,"[explanation, edits, made, username, hardcore,...",explanation edits made username hardcore metal...,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
1,000103f0d9cfb60f,daww matches background colour seemingly stuck...,0,0,0,0,0,0,"[daww, matches, background, colour, seemingly,...",daww match background colour seemingly stuck t...,"[24, 25, 26, 27, 28, 29, 30, 20, 31, 32, 0, 0,..."
2,000113f07ec002fd,hey man really trying edit war guy constantly ...,0,0,0,0,0,0,"[hey, man, really, trying, edit, war, guy, con...",hey man really trying edit war guy constantly ...,"[33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 4..."
3,0001b41b1c6bb37e,make real suggestions improvement wondered sec...,0,0,0,0,0,0,"[make, real, suggestions, improvement, wondere...",make real suggestion improvement wondered sect...,"[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6..."
4,0001d958c54c6e35,sir hero chance remember page,0,0,0,0,0,0,"[sir, hero, chance, remember, page]",sir hero chance remember page,"[92, 93, 94, 95, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
...,...,...,...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,second time asking view completely contradicts...,0,0,0,0,0,0,"[second, time, asking, view, completely, contr...",second time asking view completely contradicts...,"[1204, 605, 2774, 469, 1103, 10165, 6301, 1153..."
159567,ffea4adeee384e90,ashamed horrible thing put talk page,0,0,0,0,0,0,"[ashamed, horrible, thing, put, talk, page]",ashamed horrible thing put talk page,"[8172, 2702, 586, 990, 20, 21, 0, 0, 0, 0, 0, ..."
159568,ffee36eab5c267c9,spitzer umm actual article prostitution ring c...,0,0,0,0,0,0,"[spitzer, umm, actual, article, prostitution, ...",spitzer umm actual article prostitution ring c...,"[89207, 10570, 49, 82, 7284, 1891, 24476, 3595..."
159569,fff125370e4aaaf3,looks like actually put speedy first version d...,0,0,0,0,0,0,"[looks, like, actually, put, speedy, first, ve...",look like actually put speedy first version de...,"[276, 277, 760, 990, 182, 74, 487, 178, 276, 0..."


The padding function converts the lemmatized comment text into numerical sequences with word-to-index (got from the vocabulary made) and sequence length (50) and return this sequence as array for each row

## New column label

I made new column that take the conditions i made above to easily classify data

I made this to convert this problem to a binary classification label to solve the imbalcing of the data

In [43]:
# Create a new "label" column based on conditions
data['label'] = 0  # Initialize all labels to 0
data.loc[onesData, 'label'] = 1  # Set labels to 1 for the onesData

# Drop the other labels as they are not important now
data.drop(['toxic','severe_toxic','obscene','threat','insult','identity_hate'],axis=1,inplace=True)
data

Unnamed: 0,id,comment_text,tokenized_comment,lemmatized_comment,Padding,label
0,0000997932d777bf,explanation edits made username hardcore metal...,"[explanation, edits, made, username, hardcore,...",explanation edits made username hardcore metal...,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",0
1,000103f0d9cfb60f,daww matches background colour seemingly stuck...,"[daww, matches, background, colour, seemingly,...",daww match background colour seemingly stuck t...,"[24, 25, 26, 27, 28, 29, 30, 20, 31, 32, 0, 0,...",0
2,000113f07ec002fd,hey man really trying edit war guy constantly ...,"[hey, man, really, trying, edit, war, guy, con...",hey man really trying edit war guy constantly ...,"[33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 4...",0
3,0001b41b1c6bb37e,make real suggestions improvement wondered sec...,"[make, real, suggestions, improvement, wondere...",make real suggestion improvement wondered sect...,"[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...",0
4,0001d958c54c6e35,sir hero chance remember page,"[sir, hero, chance, remember, page]",sir hero chance remember page,"[92, 93, 94, 95, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0...",0
...,...,...,...,...,...,...
159566,ffe987279560d7ff,second time asking view completely contradicts...,"[second, time, asking, view, completely, contr...",second time asking view completely contradicts...,"[1204, 605, 2774, 469, 1103, 10165, 6301, 1153...",0
159567,ffea4adeee384e90,ashamed horrible thing put talk page,"[ashamed, horrible, thing, put, talk, page]",ashamed horrible thing put talk page,"[8172, 2702, 586, 990, 20, 21, 0, 0, 0, 0, 0, ...",0
159568,ffee36eab5c267c9,spitzer umm actual article prostitution ring c...,"[spitzer, umm, actual, article, prostitution, ...",spitzer umm actual article prostitution ring c...,"[89207, 10570, 49, 82, 7284, 1891, 24476, 3595...",0
159569,fff125370e4aaaf3,looks like actually put speedy first version d...,"[looks, like, actually, put, speedy, first, ve...",look like actually put speedy first version de...,"[276, 277, 760, 990, 182, 74, 487, 178, 276, 0...",0


        Now data is cleaned and prepared to enter the transformer

## Transformer

Using pytorch, the steps are:
- Define the basic building blocks: Multi-Head Attention, Position-wise Feed-Forward Networks, Positional Encoding
- Build the Encoder layer
- Create a complete Transformer model
- Split data into train and test
- Train the model
- Test the model

Since it's a classification model, we don't need a decoder layer

A decoder is in sequence-to-sequence models or autoencoders for tasks like machine translation or text generation, that's why i won't use it

I'll use only the encoder layer as i'm directly mapping input data to the label

### Multi-Head Attention

MultiHeadAttention mechanism is a fundamental building block of Transformer model

The input sequence is linearly transformed into three separate sets of vectors (keys, queries, and values) for each attention head

Each attention head measures the similarity between the queries and keys and computes a set of attention weights 

These weights determine how much each position in the input should contribute to the output

The outputs of all attention heads are then combined to produce the final output, which called [ weighted sum of the values ]

In [44]:
class MultiHeadAttention(nn.Module):

    def __init__(self, d_model, n_heads):
        super(MultiHeadAttention, self).__init__()

        # Initialize the MultiHeadAttention layer
        self.d_model = d_model      # Dimensionality of input data
        self.n_heads = n_heads      # Number of attention heads
        self.head_dim = d_model // n_heads      # Dimensionality of each attention head

        # Linear transformations for Key, Query, Value, and the Output
        self.W_K = nn.Linear(d_model, d_model)
        self.W_Q = nn.Linear(d_model, d_model)
        self.W_V = nn.Linear(d_model, d_model)
        self.W_O = nn.Linear(d_model, d_model)

    def forward(self, Q, K, V, mask=None):
        
        # Get the batch size from the input tensor
        batch_size = Q.shape[0]

        # Project the input Key, Query, and Value tensors using the linear transformation
        K = self.W_K(K)
        Q = self.W_Q(Q)
        V = self.W_V(V)

        # Reshape the projected tensors to support multi-head attention
        K = K.view(batch_size, -1, self.n_heads, self.head_dim)
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim)

        # Compute the dot product of Q and K for each head
            # bqhd, bkhd (for performing tensor contractions):
                # b: batch size
                # q: query dimentions 
                # h: number of attention heads
                # d: dimention of each head
                # k: key dimentions
            # bhqk (the output tensor):
                # b: batch size (the same)
                # h: number of attention heads
                # q: query dimentions (swapped with k)
                # k: key dimentions (swapped with q)
        QK = torch.einsum('bqhd,bkhd->bhqk', Q, K)

        # Scale the dot products by the square root of the head dimension
        scores = QK / torch.sqrt(torch.tensor(self.head_dim).float())

        # Apply mask to the attention scores to mask out certain values 
        if mask is not None:
            # Expand the mask dimensions to match the shape of the attention scores
            mask = mask.unsqueeze(1).unsqueeze(2)
         # Set certain elements of attention scores to a low value (-1e9) and ignore them in computations
            scores.masked_fill_(mask == 0, -1e9)

        # Compute attention weights using softmax
        attention_weights = F.softmax(scores, dim=-1)

        # Calculates the weighted sum of values based on the attention scores for each head 
                # First input tensor (bhqv):
                    # b: batch size
                    # h: number of attetion heads
                    # q: query dimentions
                    # v: value dimentions
                # Second input tensor (bvhd):
                    # b: batch size
                    # v: value dimentions
                    # h: number of attention heads
                    # d: dimention of each attention head
                # Output tensor (bqhd):
                    # b: batch size
                    # q: query dimentions
                    # h: number of attention heads
                    # d: dimention of each head
        out = torch.einsum('bhqv,bvhd->bqhd', attention_weights, V)

        # Reshape the output 
        out = out.reshape(batch_size, -1, self.d_model)

        # Put the output in the output linear transformation
        out = self.W_O(out)

        # Return the final output
        return out

### Position-wise Feed-Forward Networks

 This class introduces changes in dimensions to each position's representation in the input sequence
 
 It allows the model to capture complex relationships within the data while maintaining the overall structure of the original representation

 The feedforward neural network has 2 fully connected layers, the input go through the first layer through RELU activation function then pass through the second layer to produce the final output

In [45]:
class FeedForward(nn.Module):

    def __init__(self, d_model, d_ff):
        super(FeedForward, self).__init__()
        
        # Initialize the first fully connected layer 
        # Input size is the hidden stated dimentions (d_model) 
        # Output size is the feed forward dimentions (d_ff)
        self.fc1 = nn.Linear(d_model, d_ff)
        
        # Initialize the second fully connected layer 
        # Input size is the feed forward dimentions (d_model) 
        # Output size is the hidden stated dimentions (d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        
        # Apply ReLU activation function to the first layer output
        x = F.relu(self.fc1(x))
        
        # move the output of the first layer to the second layer
        x = self.fc2(x)
        
        # Return the final output
        return x

### Positional Encoding

This class adds positional information to the input data in a way that the transformer model can use, enabling it to capture sequential patterns and relationships when processing data

In [46]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, max_seq_len):
        super(PositionalEncoding, self).__init__()
        
        # Initialize a zeros positional encoding matrix of shape (max_seq_len, d_model)  
        self.pe = torch.zeros(max_seq_len, d_model)
        
        # Create a tensor containing values from 0 to max_seq_len - 1
        position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
        
        # Calculate exponents and put it in tensor to create sinusoidal positional encodings
        div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float) * (-np.log(10000.0) / d_model))
        
        # Calculate sine and cosine positional encodings 
        self.pe[:, 0::2] = torch.sin(position * div_term) 
        self.pe[:, 1::2] = torch.cos(position * div_term) 
        
        # Add an additional dimension to make it suitable with the model input
        self.pe = self.pe.unsqueeze(0)

    def forward(self, x):
        
        # Add the positional encodings to the input x
        return x + self.pe[:, :x.size(1)]

### Encoder Layer

The Encoder Layer consists of a self-attention mechanism with multiple heads, a feed-forward neural network, and layer normalization after each operation

The Transformer Encoder stacks multiple Encoder Layers to create the encoder, making it suitable to the sequence modeling

In [47]:
class EncoderLayer(nn.Module):

    def __init__(self, d_model, n_heads, d_ff):
        super(EncoderLayer, self).__init__()
        
        # Self-attention mechanism with multiple heads
        self.self_attention = MultiHeadAttention(d_model, n_heads)
        
        # Feed-forward neural network
        self.feed_forward = FeedForward(d_model, d_ff)
        
        # Layer normalization after self-attention
        self.norm1 = nn.LayerNorm(d_model)
        
        # Layer normalization after feed-forward
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x, mask):
        
        # Multi-head self-attention operation
        attention_output = self.self_attention(x, x, x, mask)
        
        # Residual connection and layer normalization
        x = x + attention_output
        x = self.norm1(x)
        
        # Feed-forward neural network operation
        feed_forward_output = self.feed_forward(x)
        
        # Residual connection and layer normalization
        x = x + feed_forward_output
        x = self.norm2(x)
        
        # Return the final output 
        return x

In [48]:
class TransformerEncoder(nn.Module):

    def __init__(self, num_layers, d_model, n_heads, d_ff):
        super(TransformerEncoder, self).__init__()

        # Stacking multiple encoder layers to create the encoder
        self.layers = nn.ModuleList([EncoderLayer(d_model, n_heads, d_ff) for _ in range(num_layers)])
    
    def forward(self, x, mask):
        for layer in self.layers:
            
            # Forward passes the input through each Encoder Layer
            x = layer(x, mask)
        
        # Return the final output
        return x


### Complete Transformer Model

This class combines all the above building blocks to create the full transformer architecture 

In [49]:
class Transformer(nn.Module):

    def __init__(self, num_layers, d_model, n_heads, d_ff, input_vocab_size, max_seq_len, num_classes):
        super(Transformer, self).__init__()

        # Embedding layer to convert input tokens to dense vector
        self.embedding = nn.Embedding(input_vocab_size, d_model)

        # Positional encoding to provide positional information to the model
        self.positional_encoding = PositionalEncoding(d_model, max_seq_len)

        # Stacking multiple encoder layers to create the encoder
        self.encoder = TransformerEncoder(num_layers, d_model, n_heads, d_ff)

        # Fully connected layer for classification
        self.fc = nn.Linear(d_model, num_classes)

    def forward(self, x, mask):
        
        # Embedding input tokens
        x = self.embedding(x)

        # Add positional encoding to the embedded tokens
        x = self.positional_encoding(x)

        # Embedded tokens pass through the encoder
        x = self.encoder(x, mask)

        # Global average pooling to obtain a fixed-size representation of the sequence
        x = x.mean(dim=1)  

        # The fixed-size representation pass through the fully connected layer for classification
        x = self.fc(x)

        # Return the final output
        return x

Now the Transformer architecture is completed and ready to implement it for train and test

## Train-Test split

In [50]:
# Converts array padded_sequences to a pyTorch tensor X and the .long() method is used to ensure that the tensor contains integer values
X = torch.from_numpy(padded_sequences).long()

# This line converts the values of column label (which is an array) to a pytorch tensor y and the .long() method is used to ensure that the tensor contains integer values
y = torch.from_numpy(data['label'].values).long()  


# Split the data into train, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Number of samples in each batch
batch_size = 64

# Create DataLoader for training, validation, and testing
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = TensorDataset(X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

## Define hyperparameters - Implement the model, loss and optimizer

In [55]:
# After hyperparameter tuning, these are the best hyperparameters i reached to give the best accuracy in train and test for the model

num_layers = 6
d_model = 512
n_heads = 8
d_ff = 2048
input_vocab_size = vocab_size  
max_seq_len = 50    # Same as the max_seq_len of padding
num_classes = 2     # As it is binary classification
learning_rate = 0.0001
patience = 3        # Number of times the model will continue training if there is no improvement

In [56]:
# Create an instance of the transformer model
model = Transformer(num_layers, d_model, n_heads, d_ff, input_vocab_size, max_seq_len, num_classes)

# Define the loss function
criterion = nn.CrossEntropyLoss()

# Initialize the optinmizer
optimizer = optim.Adam(model.parameters(), lr=learning_rate) 

## Train Model

### Define the train function

In [57]:
def train(model, train_loader, val_loader, criterion, optimizer, num_epochs, patience):

    # train model
    model.train()
    
    # Initialize the best validation loss to a large value (positive infinity)
    best_valid_loss = float('inf')  

    # Initialize a counter for consecutive epochs with no improvement
    consecutive_no_improvement = 0  
    
    for epoch in range(num_epochs):

        # Initialize the loss, correct predictions, and total samples in each epoch
        running_loss = 0.0
        correct_predictions = 0
        total_samples = 0
        
        for batch_idx, (inputs, targets) in enumerate(train_loader, 1):

            # Initialize gradients to zero
            optimizer.zero_grad()

            # Create mask to ignore padding tokens 
            mask = (inputs != 0)

            # Pass inputs and mask through the model to predict the output
            outputs = model(inputs, mask)

            # Calculate loss between the predicted output and the target
            loss = criterion(outputs, targets)

            # Backpropagate gradients
            loss.backward()

            # Update the parameters in the model
            optimizer.step()
            
            # Add the loss
            running_loss += loss.item()
            
            # Get the predicted labels
            _, predicted = torch.max(outputs, 1)

            # Calculate the correct predictions
            correct_predictions += (predicted == targets).sum().item()

            # Count the total samples in the batch
            total_samples += targets.size(0)
            
            print(f"\rEpoch {epoch + 1}/{num_epochs}", end='', flush=True)
        
        # Calculate epoch loss and accuracy
        epoch_loss = running_loss / len(train_loader)
        epoch_accuracy = 100 * correct_predictions / total_samples
        
    ## Validation
        # Evaluate model
        model.eval()

        # Initialize the validation loss, the correct predictions, and total samples
        valid_loss = 0.0
        valid_correct = 0
        valid_total = 0
        
        # Disable gradient calculation
        with torch.no_grad():
            for valid_inputs, valid_targets in val_loader:

                # Create mask
                valid_mask = (valid_inputs != 0)

                # Pass validation inputs and mask through the model to predict the output
                valid_outputs = model(valid_inputs, valid_mask)

                # Calculate loss between the predicted output and the target
                valid_loss_batch = criterion(valid_outputs, valid_targets)

                # add the loss
                valid_loss += valid_loss_batch.item()
                
                # Get the predicted labels
                _, valid_predicted = torch.max(valid_outputs, 1)

                # Calculate the correct predictions
                valid_correct += (valid_predicted == valid_targets).sum().item()

                # Count the total samples in the batch
                valid_total += valid_targets.size(0)

        # Calculate epoch loss and accuracy for validation
        valid_accuracy = 100 * valid_correct / valid_total
        avg_valid_loss = valid_loss / len(val_loader)
        
        print(f"\rEpoch {epoch + 1}/{num_epochs}, Train Loss: {epoch_loss:.4f}, Train Accuracy: {epoch_accuracy:.2f}%, " \
              f"Valid Loss: {avg_valid_loss:.4f}, Valid Accuracy: {valid_accuracy:.2f}%")
        
        # Check if the validation loss has improved

        # If the average validation loss is better than the current best validation loss, the best validation loss is updated
        if avg_valid_loss < best_valid_loss:
            best_valid_loss = avg_valid_loss
            consecutive_no_improvement = 0      # the consecutive no improvent is reset to 0 as it improved
        else:
            consecutive_no_improvement += 1     # Add the counter of no improvements by 1
        
        # If there is no improvement for 'patience' consecutive epochs, stop training
        if consecutive_no_improvement >= patience:
            print("-----------\nNo improvements in validation loss\nTraining Stopped")
            break
        
        # Train model again
        model.train()

### Train

In [58]:
# Call the train function with 10 epochs
num_epochs = 10
train(model, train_loader, val_loader, criterion, optimizer, num_epochs, patience)

Epoch 1/10, Train Loss: 0.1755, Train Accuracy: 94.44%, Valid Loss: 0.1503, Valid Accuracy: 94.99%
Epoch 2/10, Train Loss: 0.1158, Train Accuracy: 95.95%, Valid Loss: 0.1395, Valid Accuracy: 95.20%
Epoch 3/10, Train Loss: 0.0808, Train Accuracy: 97.08%, Valid Loss: 0.1481, Valid Accuracy: 95.17%
Epoch 4/10, Train Loss: 0.0472, Train Accuracy: 98.28%, Valid Loss: 0.1645, Valid Accuracy: 94.86%
Epoch 5/10, Train Loss: 0.0282, Train Accuracy: 99.03%, Valid Loss: 0.1949, Valid Accuracy: 94.72%
-----------
No improvements in validation loss
Training Stopped


In epoch 2, both the training loss and validation loss decreased, indicating that the model is improving its fit to the data

Training accuracy increased, showing that the model is learning more from the training data

Validation accuracy also increased, indicating better generalization.

The model acchieved highest training accuracy at epoch 5 but the validation loss increased from epoch 3 till epoch 5 and the valid accuracy decreased slightly in epoch 5

That's why i made early stopping, i keep the modell running with no improvements in validation loss for 3 epochs incase it improves but it stopped to not cause overfitting

The model achieved its best validation accuracy in epoch 2


## Test the model

### Define the test function

In [59]:
def test(model, test_loader, criterion):

    # Make the model evaluate
    model.eval()

    # Initialize the correct predictions, total samples, and total loss
    correct = 0
    total = 0
    total_loss = 0.0

    # Initiatlize 2 lists of all the predicted and all the true values
    all_predicted = []
    all_targets = []

    # Disable gradient computation
    with torch.no_grad():
        for inputs, targets in test_loader:

            # Create a binary mask: True values are non-padding tokens, and False values are padding tokens
            # Ignore padding tokens when calculating loss
            mask = (inputs != 0)

            # Pass the model the input data to make predictions
            outputs = model(inputs, mask)

            # Calculate the loss between model predictions and actual labels
            loss = criterion(outputs, targets)

            # Update the total loss
            total_loss += loss.item()

            # Find the class with the highest prediction for each sample in the batch
            _, predicted = torch.max(outputs, 1)

            # Update the number of samples processed
            total += targets.size(0)

            # Count the correct predictions by comparing the predictions with the actual labels
            correct += (predicted == targets).sum().item()

            # Append predicted and true labels for classification report
            all_predicted.extend(predicted.tolist())
            all_targets.extend(targets.tolist())

    # Calculate the accuracy
    accuracy = 100 * correct / total

    # Calculate the average loss
    average_loss = total_loss / len(test_loader)

    print(f"Test Loss: {average_loss:.4f}, Test Accuracy: {accuracy:.2f}%")

    # Generate and print the classification report

    report = classification_report(all_targets, all_predicted)
    print("\n" + report)

### Test

In [60]:
# Call the test function
test(model, test_loader, criterion)

Test Loss: 0.1910, Test Accuracy: 94.90%

              precision    recall  f1-score   support

           0       0.97      0.97      0.97     28249
           1       0.76      0.74      0.75      3279

    accuracy                           0.95     31528
   macro avg       0.87      0.86      0.86     31528
weighted avg       0.95      0.95      0.95     31528



The model shows great performance with high test accuracy, indicating that it can correctly classify with 94.90% accuracy

For class 0, the model has high precision, recall, and F1-score, suggesting it performs well in correctly classifying this class

For class 1, the model's performance is slightly lower, with lower precision, recall, and F1-score values, indicating that it may have some difficulty in correctly classifying this class. But the F1-score (0.75) for this class suggests a reasonable balance between precision and recall

The model over all shows that it's good for correctly classyfing classes 0 and 1