# Imports

In [1]:
pip install contractions

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting pyahocorasick
  Downloading pyahocorasick-2.0.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.8/101.8 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.2 contractions-0.1.73 pyahocorasick-2.0.0 textsearch-0.0.24
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
!pip install nlpaug

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting gdown>=4.0.0
  Downloading gdown-4.7.1-py3-none-any.whl (15 kB)
Installing collected packages: gdown, nlpaug
Successfully installed gdown-4.7.1 nlpaug-1.1.11
[0m

In [4]:
import re
import string
import pandas as pd
import contractions
import numpy as np
import nltk

from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from textblob import Word
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Data Augmentation
import nlpaug.augmenter.word as naw
from tqdm import tqdm
from sklearn.utils import shuffle

import gensim

In [5]:
label = 'insult'
other_labels = ['id', 'toxic', 'severe_toxic' ,'obscene', 'threat', 'insult', 'identity_hate']
other_labels.remove(label)

n_augmentation = 1000

# Dataset

In [7]:
df = pd.read_csv('/kaggle/input/toxicity-dataset/train.csv')
df.drop(other_labels, axis=1,inplace=True)

In [8]:
# Balance dataset
df_label_0 = df[df[label] == 0]
df_label_1 = df[df[label] == 1]

len_0 = len(df_label_0)
len_1 = len(df_label_1)

# Result: 24470 comments for training step
df_label_0_balanced = df_label_0.sample(n=len_1, replace=False, random_state=42)
df_balanced = pd.concat([df_label_0_balanced, df_label_1], ignore_index=True)
df_balanced = df_balanced.sample(frac=1, random_state=42)
df_balanced = df_balanced.reset_index(drop=True)

In [9]:
# Split the data into training and test sets with an 80:20 ratio
train_df, test_df = train_test_split(df_balanced, test_size=0.2, random_state=42)

In [10]:
train_df[train_df[label]==1].count()

comment_text    6334
insult          6334
dtype: int64

In [11]:
train_df

Unnamed: 0,comment_text,insult
2075,get fucking lost i know you are sad but keep i...,1
13806,"""\n\n""""Arguing with idiots: It's like playing ...",1
949,"For a guideline for the page, we have 2 versio...",0
9054,Check Out...=\nThe talk page for The Cheetah G...,0
10113,FFS \n\nYou sad little cock muncher.,1
...,...,...
5191,"""\n ...",1
13418,hey dickwads. stop hijacking this wiki. these ...,1
5390,"""\n I AM NOT THE """"SAN DIEGO IP""""!! Go ahead ...",0
860,"""\n\nAnybody??? It's still unsourced and STILL...",0


# Preprocess dataset

In [12]:
def clean_text(text):
    url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    # emoticon_pattern = re.compile("|".join(re.escape(x) for x in emoticon_map.keys()))
    # emoji_pattern = re.compile("|".join(re.escape(x) for x in emoji_map.keys()))
    punctuation_remover = str.maketrans('', '', string.punctuation)
    digit_remover = str.maketrans('', '', string.digits)
        
    text = contractions.fix(text)  # from he's to he is 
    text = re.sub(url_pattern, '', text)  # replace all URLs in the text with an empty string
    text = text.lower()
    # text = emoticon_pattern.sub(lambda x: emoticon_map[x.group()], text)  # convert emoticons to word
    # cleaned_text = emoji_pattern.sub(lambda x: emoji_map[x.group()], text)  # convert emojis to word
    text = text.translate(punctuation_remover)  # remove punctuation from text
    text = text.translate(digit_remover)  # remove digits from text
    text = " ".join(text.strip().split())
    
    return text

In [13]:
# Stemmatization and Stop Words removal
def preprocess(text, stem = True):
    stemmer = SnowballStemmer("english")
    stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
                  'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 
                  'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those'
                  'a','an','the','and','but','same','so','than','too','very','s','t','can','will','just',
                  'if','or','because','as','until','while','of','at','by','for','with','about','against','between','into','through','during','before','after',
                  'above','below','to','from','up','down','in','out','on','off','over','under','again','further','then','once','here','there',
                  'when','where','why','how','all','any','both','each','few','more','most','other','some','such','no','nor','only','own']
    tokens = []
    stop_words = set(stopwords.words('english'))  # You can replace 'english' with the appropriate language
    for token in text.split():
        if token not in stop_words:
            if stem:
                tokens.append(stemmer.stem(token))
            else:
                tokens.append(token)
    return " ".join(tokens)

In [14]:
train_df.comment_text = train_df.comment_text.apply(lambda x: clean_text(x))
train_df.comment_text = train_df.comment_text.apply(lambda x: preprocess(x))

test_df.comment_text = test_df.comment_text.apply(lambda x: clean_text(x))
test_df.comment_text = test_df.comment_text.apply(lambda x: clean_text(x))

In [15]:
train_df.comment_text

2075                           get fuck lost know sad keep
13806    argu idiot like play chess pigeon matter good ...
949      guidelin page version current go around encycl...
9054     check talk page cheetah girl girl group ad som...
10113                           ffs sad littl cock muncher
                               ...                        
5191     yeah could farix know wikipedia place add info...
13418    hey dickwad stop hijack wiki final day ytmnd n...
5390     san diego ip go ahead block zero interest part...
860      anybodi still unsourc still look bit wierd way...
7270                                        check whilehow
Name: comment_text, Length: 12603, dtype: object

# TF-IDF Vectoriser

In [16]:
vectorizer = TfidfVectorizer(max_features=5000, stop_words=None, ngram_range = (1,3))

# Vectorize dataset

In [17]:
vectorizer.fit(train_df['comment_text'])  # build the vocabulary
    
# fit the vectorizer on the text data and transform it into a sparse matrix
x_train = vectorizer.transform(train_df['comment_text'])
y_train = train_df['insult']

x_test = vectorizer.transform(test_df['comment_text'])
y_test = test_df['insult']

# SVM Classification with Grid Search

In [20]:
from sklearn.pipeline import Pipeline

svm = Pipeline([('svc', SVC())])

param_grid = {'svc__C': [0.1, 1, 10],
              'svc__kernel': ['linear', 'rbf', 'sigmoid'],
              'svc__gamma': [0.01, 0.1, 1]}

grid_search = GridSearchCV(svm, param_grid, cv=5, n_jobs=-1, verbose=10)
history = grid_search.fit(x_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [22]:
best_svm = grid_search.best_estimator_
print("Accuracy on test set:", best_svm.score(x_test, y_test))

Accuracy on test set: 0.8730561726436052
              precision    recall  f1-score   support

           0       0.84      0.92      0.88      1608
           1       0.91      0.82      0.86      1543

    accuracy                           0.87      3151
   macro avg       0.88      0.87      0.87      3151
weighted avg       0.88      0.87      0.87      3151

[CV 4/5; 1/27] START svc__C=0.1, svc__gamma=0.01, svc__kernel=linear............
[CV 4/5; 1/27] END svc__C=0.1, svc__gamma=0.01, svc__kernel=linear;, score=0.892 total time=  17.0s
[CV 2/5; 2/27] START svc__C=0.1, svc__gamma=0.01, svc__kernel=rbf...............
[CV 2/5; 2/27] END svc__C=0.1, svc__gamma=0.01, svc__kernel=rbf;, score=0.503 total time=  33.7s
[CV 5/5; 2/27] START svc__C=0.1, svc__gamma=0.01, svc__kernel=rbf...............
[CV 5/5; 2/27] END svc__C=0.1, svc__gamma=0.01, svc__kernel=rbf;, score=0.502 total time=  34.7s
[CV 1/5; 4/27] START svc__C=0.1, svc__gamma=0.1, svc__kernel=linear.............
[CV 1/5; 4/27]

In [23]:
y_pred = best_svm.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.84      0.92      0.88      1608
           1       0.91      0.82      0.86      1543

    accuracy                           0.87      3151
   macro avg       0.88      0.87      0.87      3151
weighted avg       0.88      0.87      0.87      3151



In [24]:
from joblib import dump
dump(best_svm, 'svm_model.joblib')

['svm_model.joblib']

# Random Forest Classification with Grid Search

In [25]:
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 5, 10],  # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]  # Minimum number of samples required to be at a leaf node
}

rf_classifier = RandomForestClassifier()

grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5)
history = grid_search.fit(x_train, y_train)

In [29]:
classifier = grid_search.best_estimator_

print("Accuracy on test set:", classifier.score(x_test, y_test))
y_pred = classifier.predict(x_test)
print(classification_report(y_test,y_pred))

Accuracy on test set: 0.8676610599809584
              precision    recall  f1-score   support

           0       0.84      0.92      0.88      1608
           1       0.90      0.82      0.86      1543

    accuracy                           0.87      3151
   macro avg       0.87      0.87      0.87      3151
weighted avg       0.87      0.87      0.87      3151



In [48]:
sentence = [clean_text("you are sooo S_T_U_P_I_D")]
sentence = vectorizer.transform(sentence)
prediction = classifier.predict(sentence)
print(prediction)

[1]


In [49]:
from joblib import dump
dump(classifier, 'random_forest_model.joblib')

['random_forest_model.joblib']

# K-NN with Grid Search

In [52]:
from sklearn.neighbors import KNeighborsClassifier
pipeline = Pipeline([('knn', KNeighborsClassifier())])

param_grid = {
    'knn__n_neighbors': [2, 3, 5, 7, 9, 11, 13, 15]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, scoring="accuracy", verbose=10)
history = grid_search.fit(x_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [53]:
classifier = grid_search.best_estimator_
print("Accuracy on test set:", classifier.score(x_test, y_test))
y_pred = classifier.predict(x_test)
print(classification_report(y_test,y_pred))

Accuracy on test set: 0.6708981275785465
              precision    recall  f1-score   support

           0       0.61      0.95      0.75      1608
           1       0.88      0.38      0.53      1543

    accuracy                           0.67      3151
   macro avg       0.75      0.67      0.64      3151
weighted avg       0.74      0.67      0.64      3151



In [54]:
from joblib import dump
dump(classifier, 'knn_model.joblib')

['knn_model.joblib']

# Logistic Regression

In [57]:
from sklearn.linear_model import LogisticRegression
pipeline = Pipeline([('logreg', LogisticRegression())])

param_grid = {
    'logreg__C': [0.1, 1.0, 10.0],
    'logreg__penalty': ['l2']
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, scoring="accuracy", verbose=10)
history = grid_search.fit(x_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


In [58]:
classifier = grid_search.best_estimator_
print("Accuracy on test set:", classifier.score(x_test, y_test))

y_pred = classifier.predict(x_test)
print(classification_report(y_test,y_pred))

Accuracy on test set: 0.8743256109171692
              precision    recall  f1-score   support

           0       0.85      0.92      0.88      1608
           1       0.91      0.83      0.87      1543

    accuracy                           0.87      3151
   macro avg       0.88      0.87      0.87      3151
weighted avg       0.88      0.87      0.87      3151



In [59]:
from joblib import dump
dump(classifier, 'logistic_regression_model.joblib')

['logistic_regression_model.joblib']

# Naive Bayes

In [23]:
from sklearn.naive_bayes import BernoulliNB

pipeline = Pipeline([('nb', BernoulliNB())])

param_grid = {
    'nb__alpha': [0.1, 0.5, 1.0, 5.0, 7.0, 10, 15]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, scoring="accuracy", verbose=10)
grid_search.fit(x_train, y_train)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


GridSearchCV(cv=5, estimator=Pipeline(steps=[('nb', BernoulliNB())]), n_jobs=-1,
             param_grid={'nb__alpha': [0.1, 0.5, 1.0, 5.0, 7.0, 10, 15]},
             scoring='accuracy', verbose=10)

In [24]:
classifier = grid_search.best_estimator_
print("Accuracy on test set:", classifier.score(x_test, y_test))

y_pred = classifier.predict(x_test)
print(classification_report(y_test,y_pred))

Accuracy on test set: 0.6401142494446208
              precision    recall  f1-score   support

           0       0.91      0.33      0.48      1608
           1       0.58      0.97      0.72      1543

    accuracy                           0.64      3151
   macro avg       0.74      0.65      0.60      3151
weighted avg       0.75      0.64      0.60      3151

[CV 4/5; 1/3] START nb__alpha=0.1...............................................
[CV 4/5; 1/3] END ................nb__alpha=0.1;, score=0.755 total time=   0.0s
[CV 2/5; 3/3] START nb__alpha=10................................................
[CV 2/5; 3/3] END .................nb__alpha=10;, score=0.706 total time=   0.0s
[CV 1/5; 1/7] START nb__alpha=0.1...............................................
[CV 1/5; 1/7] END ................nb__alpha=0.1;, score=0.761 total time=   0.0s
[CV 5/5; 1/7] START nb__alpha=0.1...............................................
[CV 5/5; 1/7] END ................nb__alpha=0.1;, score=0.756 tot

In [61]:
from joblib import dump
dump(classifier, 'naive_bayes.joblib')

['naive_bayes.joblib']

[CV 4/5; 1/7] START knn__n_neighbors=3..........................................
[CV 4/5; 1/7] END ...........knn__n_neighbors=3;, score=0.637 total time=   1.6s
[CV 1/5; 2/7] START knn__n_neighbors=5..........................................
[CV 1/5; 2/7] END ...........knn__n_neighbors=5;, score=0.604 total time=   1.4s
[CV 5/5; 2/7] START knn__n_neighbors=5..........................................
[CV 5/5; 2/7] END ...........knn__n_neighbors=5;, score=0.627 total time=   1.4s
[CV 4/5; 3/7] START knn__n_neighbors=7..........................................
[CV 4/5; 3/7] END ...........knn__n_neighbors=7;, score=0.634 total time=   1.4s
[CV 3/5; 4/7] START knn__n_neighbors=9..........................................
[CV 3/5; 4/7] END ...........knn__n_neighbors=9;, score=0.603 total time=   1.3s
[CV 2/5; 5/7] START knn__n_neighbors=11.........................................
[CV 2/5; 5/7] END ..........knn__n_neighbors=11;, score=0.602 total time=   1.4s
[CV 2/5; 6/7] START knn__n_n

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

[CV 1/5; 1/7] START knn__n_neighbors=3..........................................
[CV 1/5; 1/7] END ...........knn__n_neighbors=3;, score=0.687 total time=   1.7s
[CV 5/5; 1/7] START knn__n_neighbors=3..........................................
[CV 5/5; 1/7] END ...........knn__n_neighbors=3;, score=0.641 total time=   1.3s
[CV 4/5; 2/7] START knn__n_neighbors=5..........................................
[CV 4/5; 2/7] END ...........knn__n_neighbors=5;, score=0.637 total time=   1.4s
[CV 3/5; 3/7] START knn__n_neighbors=7..........................................
[CV 3/5; 3/7] END ...........knn__n_neighbors=7;, score=0.617 total time=   1.4s
[CV 2/5; 4/7] START knn__n_neighbors=9..........................................
[CV 2/5; 4/7] END ...........knn__n_neighbors=9;, score=0.600 total time=   1.3s
[CV 1/5; 5/7] START knn__n_neighbors=11.........................................
[CV 1/5; 5/7] END ..........knn__n_neighbors=11;, score=0.598 total time=   1.3s
[CV 5/5; 5/7] START knn__n_n

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist