In [1]:
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import string
import nltk

In [2]:
df = pd.read_csv("../data/train.csv")
print(df.shape)
print(df.dtypes)

(159571, 8)
id               object
comment_text     object
toxic             int64
severe_toxic      int64
obscene           int64
threat            int64
insult            int64
identity_hate     int64
dtype: object


In [3]:
comment = df["comment_text"]

_label = df[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]]
_label.head(5)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0


In [4]:
label = _label.to_numpy()

In [5]:
labels = []
comments = []
multi_labels = []

for i in range(comment.shape[0]):
    if len(comment[i]) < 600:
        is_toxic = 1 if 1 in label[i] else 0
        labels.append(is_toxic)
        multi_labels.append(label[i])
        comments.append(comment[i])

labels = np.asarray(labels)
multi_labels = np.asarray(multi_labels)
# comments = np.asarray(comments)
print(len(comments))
new_df = pd.DataFrame(list(zip(comments, labels)), columns=["comments", "is_toxic"])
new_df.head(10)

132327


Unnamed: 0,comments,is_toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"You, sir, are my hero. Any chance you remember...",0
4,"""\n\nCongratulations from me as well, use the ...",0
5,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1
6,Your vandalism to the Matt Shirvington article...,0
7,Sorry if the word 'nonsense' was offensive to ...,0
8,alignment on this subject and which are contra...,0
9,bbq \n\nbe a man and lets discuss it-maybe ove...,0


In [6]:
# punctuations
punctuations = string.punctuation.replace("\n", "") + "0123456789"
print(string.punctuation.replace("\n", "") + "0123456789")

translation_table = str.maketrans(punctuations, " " * len(punctuations))

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~0123456789


In [7]:
from nltk.stem import PorterStemmer, WordNetLemmatizer


lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

nltk.download("wordnet")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
# caring
# lem -> care
# stem -> car

for i in range(len(comments)):
    comments[i] = comments[i].lower().translate(translation_table)

    word_arr = []
    for word in comments[i].split(" "):
        word_arr.append(stemmer.stem(lemmatizer.lemmatize(word, pos="v")))
    comments[i] = re.sub("\W+", " ", " ".join(word_arr))

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
vectorizer = TfidfVectorizer(stop_words='english')
tf = vectorizer.fit_transform(comments)

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(tf, multi_labels, test_size=0.25, random_state=0)

In [13]:
from sklearn.svm import SVC
from skmultilearn.problem_transform import BinaryRelevance

In [14]:
print(y_train[0:10])

[[0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]]


In [15]:
classifier = BinaryRelevance(classifier=SVC(), require_dense=[False, True])

In [16]:
classifier.fit(X_train, y_train)

BinaryRelevance(classifier=SVC(), require_dense=[False, True])

In [22]:
from sklearn.metrics import multilabel_confusion_matrix

In [19]:
y_pred = classifier.predict(X_test)

In [23]:
conf_mat = multilabel_confusion_matrix(y_test, y_pred)

In [24]:
conf_mat

array([[[29427,   189],
        [ 1306,  2160]],

       [[32722,    25],
        [  299,    36]],

       [[31048,   129],
        [  585,  1320]],

       [[32964,     1],
        [  114,     3]],

       [[31075,   232],
        [  783,   992]],

       [[32743,    15],
        [  276,    48]]], dtype=int64)

In [49]:
labal_arr = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

print(f"                  recall    precision   f1 score")
# print("Recall      Precision   f1")
for i in range(len(conf_mat)):
    recall = conf_mat[i][1][1] / (conf_mat[i][1][1] + conf_mat[i][0][1])
    precision = conf_mat[i][1][1] / (conf_mat[i][1][1] + conf_mat[i][1][0])
    f1 = 2 * (precision * recall) / (precision + recall)
    print(labal_arr[i].ljust(15, " "), str(round(recall, 3)).center(10," "), str(round(precision, 3)).center(10, " "), str(round(f1, 3)).center(12, " "))
    # print(f"{round(recall, 3)}       {round(precision, 3)}           {round(f1, 3)}")
    # print(f"{round(recall, 3)}       {round(precision, 3)}    {round(f1, 3)}")

# print(f1_score(y_test, y_pred))
# print(f1_score(y_test, y_pred))

                  recall    precision   f1 score
toxic              0.92      0.623       0.743    
severe_toxic       0.59      0.107       0.182    
obscene           0.911      0.693       0.787    
threat             0.75      0.026        0.05    
insult             0.81      0.559       0.662    
identity_hate     0.762      0.148       0.248    


In [70]:
for i in range(len(conf_mat)):
    print(labal_arr[i].ljust(15, " "), end="")
    print(conf_mat[0][0].tolist())
    print(" " * 15, end="")
    print(conf_mat[0][1].tolist())
    print()

toxic          [29427, 189]
               [1306, 2160]

severe_toxic   [29427, 189]
               [1306, 2160]

obscene        [29427, 189]
               [1306, 2160]

threat         [29427, 189]
               [1306, 2160]

insult         [29427, 189]
               [1306, 2160]

identity_hate  [29427, 189]
               [1306, 2160]

