Importing Dataset

In [1]:
import pandas as pd

dataset = pd.read_csv('data/train.csv', encoding='ISO-8859-1');
test_data = pd.read_csv('data/test.csv', encoding='ISO-8859-1');
testlabels_data = pd.read_csv('data/test_labels.csv', encoding='ISO-8859-1');

dataset.drop('id', inplace=True, axis=1) #dropping id column from training data
dataset.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explination\nWhy the edits made under my usern...,0,0,0,0,0,0
1,D'aww! He matches this bickground colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not tryin to edit war. It'...",0,0,0,0,0,0
3,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


Ensuring Comments are in String Format

In [2]:
dataset = dataset.astype({'comment_text':'string'})
dataset.dtypes

comment_text     string
toxic             int64
severe_toxic      int64
obscene           int64
threat            int64
insult            int64
identity_hate     int64
dtype: object

Coverting to Lower Case

In [3]:
dataset.comment_text = dataset.comment_text.str.lower()

dataset.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,explination why the edits made under my userna...,0,0,0,0,0,0
1,d'aww! he matches this bickground colour i'm s...,0,0,0,0,0,0
2,"hey man, i'm really not tryin to edit war. it'...",0,0,0,0,0,0
3,""" more i can't make any real suggestions on ...",0,0,0,0,0,0
4,"you, sir, are my hero. any chance you remember...",0,0,0,0,0,0


Removing Non-alphabetic Characters

In [4]:
import re
# comment = re.sub('[^A-Za-z]', ' ', dataset)
dataset.comment_text = dataset.comment_text.str.replace('[^a-zA-Z]', ' ', regex=True)
dataset.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,explination why the edits made under my userna...,0,0,0,0,0,0
1,d aww he matches this bickground colour i m s...,0,0,0,0,0,0
2,hey man i m really not tryin to edit war it ...,0,0,0,0,0,0
3,more i can t make any real suggestions on ...,0,0,0,0,0,0
4,you sir are my hero any chance you remember...,0,0,0,0,0,0


Removing Stop Words

In [5]:
from nltk.corpus import stopwords

stop = stopwords.words('english')

dataset['comment_text'] = dataset['comment_text'].apply(lambda x: ' '.join(item for item in x.split() if item not in stop))
#dataset['comment_text'] = dataset['comment_text'].apply(lambda y:"".join(y))
dataset = dataset.astype({'comment_text':'string'})
dataset.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,explination edits made username hardcore metal...,0,0,0,0,0,0
1,aww matches bickground colour seemingly stuckk...,0,0,0,0,0,0
2,hey man really tryin edit war guy constantly r...,0,0,0,0,0,0
3,make real suggestions improvement wondered sec...,0,0,0,0,0,0
4,sir hero chance remember page,0,0,0,0,0,0


In [6]:
import numpy
dataset.dtypes

comment_text     string
toxic             int64
severe_toxic      int64
obscene           int64
threat            int64
insult            int64
identity_hate     int64
dtype: object

Implemeting Spell Correction

In [7]:
# from textblob import TextBlob

from autocorrect import Speller

spell = Speller(lang='en')

dataset['comment_text'].loc[:5] = dataset['comment_text'].loc[:5].apply(lambda txt: ''.join(spell(txt)))
dataset = dataset.astype({'comment_text':'string'})
dataset.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,explanation edits made username hardcore metal...,0,0,0,0,0,0
1,www matches background colour seemingly stuck ...,0,0,0,0,0,0
2,hey man really trying edit war guy constantly ...,0,0,0,0,0,0
3,make real suggestions improvement wondered sec...,0,0,0,0,0,0
4,sir hero chance remember page,0,0,0,0,0,0


In [8]:
dataset.dtypes

comment_text     string
toxic             int64
severe_toxic      int64
obscene           int64
threat            int64
insult            int64
identity_hate     int64
dtype: object

Implementing Lemmitization

In [9]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import nltk

# w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = WordNetLemmatizer()

# def get_lemmatization(text):
#     # Tokenize: Split the sentence into words
#     word_list = nltk.word_tokenize(text)
#     # Lemmatize list of words and join
#     lemmatized_output = ' '.join([lemmatizer.lemmatize(w) for w in word_list])
#     return lemmatized_output
# dataset['comment_text'] = dataset['comment_text'].apply(get_lemmatization)

# word_list = nltk.word_tokenize(str(dataset['comment_text']))
# for w in word_list:
dataset['comment_text'] = dataset['comment_text'].apply(lambda text: ' '.join([lemmatizer.lemmatize(w) for w in nltk.word_tokenize(text)]))

dataset.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,explanation edits made username hardcore metal...,0,0,0,0,0,0
1,www match background colour seemingly stuck th...,0,0,0,0,0,0
2,hey man really trying edit war guy constantly ...,0,0,0,0,0,0
3,make real suggestion improvement wondered sect...,0,0,0,0,0,0
4,sir hero chance remember page,0,0,0,0,0,0


Implementing CountVectorizer with bigrams

In [None]:
from sklearn.feature_extraction.text import CountVectorizer


CountVec = CountVectorizer(ngram_range=(3,3)) # to use bigrams ngram_range=(2,2)

totalcomments = len(dataset.comment_text)
loops = 200
batchsize = totalcomments/loops

for i in range(loops):
    start = i*batchsize
    end = i*batchsize + batchsize
    #transform
    Count_data = CountVec.fit_transform(dataset['comment_text'].loc[start:end])
    #create dataframe
    cv_dataframe=pd.DataFrame(Count_data.toarray(),columns=CountVec.get_feature_names_out())
    if i>0:
        cv_dataframe.loc[start:end]=pd.DataFrame(Count_data.toarray(),columns=CountVec.get_feature_names_out())
        

cv_dataframe.head()

Feature Extraction with Tf-Idf vectorizer without smooth IDF

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

#without smooth IDF
print("Without Smoothing:")
#define tf-idf
tf_idf_vec = TfidfVectorizer(use_idf=True, 
                        smooth_idf=False,  
                        ngram_range=(3,3)) # to use only  bigrams ngram_range=(2,2)

totalcomments = len(dataset.comment_text)
loops = 200
batchsize = totalcomments/loops

for i in range(loops):
    start = i*batchsize
    end = i*batchsize + batchsize

    #transform
    tf_idf_data = tf_idf_vec.fit_transform(dataset['comment_text'].loc[start:end])
    
    #create dataframe
    tf_idf_dataframe=pd.DataFrame(tf_idf_data.toarray(),columns=tf_idf_vec.get_feature_names_out())
    if i>0:
        tf_idf_dataframe.loc[start:end]=pd.DataFrame(tf_idf_data.toarray(),columns=tf_idf_vec.get_feature_names_out())

tf_idf_dataframe.head()

Without Smoothing:


Unnamed: 0,aa best handled,abandon supplement gng,abbe fecamp maitre,abc news sure,aberration dungeon dragon,abhishek removed image,abiding least well,ability judge see,abkhazia south ossetia,able cool explain,...,zionism anti semite,zionist entity agenda,zionist lie filth,zizou comment public,zizou mrazi watching,zlykinskyja either main,zombie computer prevent,zone picture like,zone planetary characteristic,zone turn first
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Feature Extraction with Tf-Idf vectorizer with smooth IDF

In [12]:
#with smooth
print("With Smoothing:")

#define tf-idf
tf_idf_vec = TfidfVectorizer(use_idf=True, 
                        smooth_idf=True,  
                        ngram_range=(3,3)) # to use only  bigrams ngram_range=(2,2)

totalcomments = len(dataset.comment_text)
loops = 50
batchsize = totalcomments/loops

for i in range(loops):
    start = i*batchsize
    end = i*batchsize + batchsize

    #transform
    tf_idf_data = tf_idf_vec.fit_transform(dataset['comment_text'].loc[start:end])
    
    #create dataframe
    tf_idf_dataframe=pd.DataFrame(tf_idf_data.toarray(),columns=tf_idf_vec.get_feature_names_out())
    if i>0:
        tf_idf_dataframe.loc[start:end]=pd.DataFrame(tf_idf_data.toarray(),columns=tf_idf_vec.get_feature_names_out())

tf_idf_dataframe.head()

With Smoothing:


Unnamed: 0,aa best handled,aaa mean old,aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaany article create,aaron home run,ab aft wessonsuoum,ab augustine immigrant,ab canonicus sachem,abadan oil refinery,abandon concept george,abandon hope ye,...,zuckerberg allegedly used,zuckerberg called early,zuckerberg friend shortly,zuckerberg http www,zuckerberg ims confirmed,zuckerberg ims wont,zuckerberg new yorker,zuckerberg social networking,zuckerbergs hacker way,zyxw nice work
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Split train and test data

In [20]:
y = tf_idf_dataframe.iloc[:, 0]
y = y.astype('string')
# split train and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(tf_idf_dataframe, y)

Model Training

In [24]:
# Naive Bayes 
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predict Class
y_pred = classifier.predict(X_test)

# Accuracy 
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)

In [23]:
accuracy

1.0