In [1]:
import pandas as pd
data = pd.read_csv('toxic_data.csv')
print('Shape of the data: ', data.shape)
data.head()

Shape of the data:  (159571, 8)


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [2]:
y_cols = list(data.columns[2:])
is_multilabel = (data[y_cols].sum(axis=1) >1).count()
print('is_multilabel count: ', is_multilabel)

data['non_toxic'] = 1-data[y_cols].max(axis=1)
y_cols += ['non_toxic']

is_multilabel count:  159571


In [3]:
from sklearn.model_selection import train_test_split

def get_train_test_val(data):
    X_data = data['comment_text'].values
    y_data = data[list(data.columns[2:])].values
    X, X_test, y, y_test = train_test_split(X_data, y_data, test_size=0.1, train_size=0.9)
    X_train, X_val, y_train, y_val = train_test_split(X,y,test_size = 0.25,train_size =0.75)

    print(
        'X_train shape', X_train.shape,
        '\ny_train shape', y_train.shape,
        '\nX_test shape', X_test.shape,
        '\ny_test shape', y_test.shape,
        '\nX_val shape', X_val.shape,
        '\ny_val shape', y_val.shape,

    )
    return X_train, X_val, X_test, y_train, y_val, y_test

X_train, X_val, X_test, y_train, y_val, y_test = get_train_test_val(data)

X_train shape (107709,) 
y_train shape (107709, 7) 
X_test shape (15958,) 
y_test shape (15958, 7) 
X_val shape (35904,) 
y_val shape (35904, 7)


In [4]:
X_train[420]

'Okay, I got it. Thanks.'

In [5]:
import numpy as np

def label_encoder(array):
    rows, cols = array.shape
    label = np.zeros(rows)
    rows, cols = array.shape
    for i in range(rows):
        for j in range(cols):
            if(array[i][j] == 1):
                label[i] = j
    
    return label

In [6]:
y_train = label_encoder(y_train)
y_val = label_encoder(y_val)
y_test = label_encoder(y_test)

In [7]:
import numpy as np
import nltk

nltk.download('stopwords')

from nltk.corpus import stopwords
nltk.download('punkt')

import re

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))
REPLACE_IP_ADDRESS = re.compile(r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b')

def text_prepare(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.replace('\n', ' ').lower()# lowercase text
    text = REPLACE_IP_ADDRESS.sub('', text) # remove ip address
    text = REPLACE_BY_SPACE_RE.sub(' ',text)# replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('',text)# delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join([w for w in text.split() if not w in STOPWORDS])# delete stopwords from text
    return text

X_train = [text_prepare(x) for x in X_train]
X_val = [text_prepare(x) for x in X_val]
X_test = [text_prepare(x) for x in X_test]
X_train[:2]

[nltk_data] Downloading package stopwords to /home/sysadm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/sysadm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['ketuanan melayu never mentioned constitution mentions hak keistimewaan orang melayu nothing mentioned aborigin rights like countries like canada singapore ketuanan melayu invented im sure exactly first used one may say derivation article 153 gone far beneficial abused used tool barisan nasional gain rationale behind hak keistimewaan orang melayu claimed malays aborigins although dont really agree peranakans settled malacca 15th centuries true aborigins none orang aslis lets assume fact malays aborigins true article 153 never introduced malays first policy rather means protect malay rights well rights races country article 153 never controversial rather abused certain politicians hope makes clear prominent example happens country right government scholarship read almost everywhere isnt already official discrimination read like btn courses always refers chinese include peranakan generations indians pendatang fact parameswara pendatang also khir toyo although malaysia officially enthnoc

In [8]:
from scipy import sparse as sp_sparse

words_counts = {}
for comments in X_train:
    for word in comments.split():
        if word not in words_counts:
            words_counts[word] = 1
        words_counts[word] += 1
        
DICT_SIZE = 10000 # Test with multiple values
POPULAR_WORDS = sorted(words_counts, key=words_counts.get, reverse=True)[:DICT_SIZE]
WORDS_TO_INDEX = {key: rank for rank, key in enumerate(POPULAR_WORDS, 0)}
INDEX_TO_WORDS = {index:word for word, index in WORDS_TO_INDEX.items()}
ALL_WORDS = WORDS_TO_INDEX.keys()

In [9]:
def my_bag_of_words(text, words_to_index, dict_size):
    """
        text: a string
        dict_size: size of the dictionary
        
        return a vector which is a bag-of-words representation of 'text'
    """
    result_vector = np.zeros(dict_size)
    for word in text.split(' '):
        if word in words_to_index:
            result_vector[words_to_index[word]] +=1
    return result_vector

X_train_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_train])
X_val_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_val])
X_test_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_test])
print('X_train shape ', X_train_mybag.shape, '\nX_val shape ', X_val_mybag.shape, '\nX_test shape ', X_test_mybag.shape)

X_train shape  (107709, 10000) 
X_val shape  (35904, 10000) 
X_test shape  (15958, 10000)


In [10]:
from sklearn.svm import SVC

clf = SVC(gamma='auto')
clf.fit(X_train_mybag, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [11]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

y_val_predicted_labels_mybag = clf.predict(X_val_mybag)

print('Accuracy: ', accuracy_score(y_val, y_val_predicted_labels_mybag))
print('F1-score macro: ', f1_score(y_val, y_val_predicted_labels_mybag, average='macro'))
print('F1-score micro: ', f1_score(y_val, y_val_predicted_labels_mybag, average='micro'))
print('F1-score weighted: ', f1_score(y_val, y_val_predicted_labels_mybag, average='weighted'))

Accuracy:  0.9001225490196079
F1-score macro:  0.1428666837261821
F1-score micro:  0.9001225490196079
F1-score weighted:  0.8545045696869781


In [12]:
y_test_predicted_labels_mybag = clf.predict(X_test_mybag)

print('Accuracy: ', accuracy_score(y_test, y_test_predicted_labels_mybag))

Accuracy:  0.9008647700213059


In [13]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(random_state = 420)
clf.fit(X_train_mybag, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=420, splitter='best')

In [14]:
y_val_predicted_labels_mybag = clf.predict(X_val_mybag)

print('Accuracy: ', accuracy_score(y_val, y_val_predicted_labels_mybag))
print('F1-score macro: ', f1_score(y_val, y_val_predicted_labels_mybag, average='macro'))
print('F1-score micro: ', f1_score(y_val, y_val_predicted_labels_mybag, average='micro'))
print('F1-score weighted: ', f1_score(y_val, y_val_predicted_labels_mybag, average='weighted'))

Accuracy:  0.8876726827094474
F1-score macro:  0.3241923457275972
F1-score micro:  0.8876726827094473
F1-score weighted:  0.8918587799838765


In [15]:
y_test_predicted_labels_mybag = clf.predict(X_test_mybag)

print('Accuracy: ', accuracy_score(y_test, y_test_predicted_labels_mybag))

Accuracy:  0.8890838450933701


In [16]:
from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier(n_neighbors = 7)
clf.fit(X_train_mybag, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                     weights='uniform')

In [17]:
y_val_predicted_labels_mybag = clf.predict(X_val_mybag)

print('Accuracy: ', accuracy_score(y_val, y_val_predicted_labels_mybag))
print('F1-score macro: ', f1_score(y_val, y_val_predicted_labels_mybag, average='macro'))
print('F1-score micro: ', f1_score(y_val, y_val_predicted_labels_mybag, average='micro'))
print('F1-score weighted: ', f1_score(y_val, y_val_predicted_labels_mybag, average='weighted'))

Accuracy:  0.9122381907308378
F1-score macro:  0.2507564828588634
F1-score micro:  0.9122381907308378
F1-score weighted:  0.8843700160458178


In [18]:
y_test_predicted_labels_mybag = clf.predict(X_test_mybag)

print('Accuracy: ', accuracy_score(y_test, y_test_predicted_labels_mybag))

Accuracy:  0.9133976688808121


In [19]:
from scipy import sparse as sp_sparse

words_counts = {}
for comments in X_train:
    for word in comments.split():
        if word not in words_counts:
            words_counts[word] = 1
        words_counts[word] += 1
        
DICT_SIZE = 5000 #Test with multiple values
POPULAR_WORDS = sorted(words_counts, key=words_counts.get, reverse=True)[:DICT_SIZE]
WORDS_TO_INDEX = {key: rank for rank, key in enumerate(POPULAR_WORDS, 0)}
INDEX_TO_WORDS = {index:word for word, index in WORDS_TO_INDEX.items()}
ALL_WORDS = WORDS_TO_INDEX.keys()

In [20]:
def my_bag_of_words(text, words_to_index, dict_size):
    """
        text: a string
        dict_size: size of the dictionary
        
        return a vector which is a bag-of-words representation of 'text'
    """
    result_vector = np.zeros(dict_size)
    for word in text.split(' '):
        if word in words_to_index:
            result_vector[words_to_index[word]] +=1
    return result_vector

X_train_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_train])
X_val_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_val])
X_test_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_test])
print('X_train shape ', X_train_mybag.shape, '\nX_val shape ', X_val_mybag.shape, '\nX_test shape ', X_test_mybag.shape)

X_train shape  (107709, 5000) 
X_val shape  (35904, 5000) 
X_test shape  (15958, 5000)


In [21]:
from sklearn.svm import SVC

clf = SVC(gamma='auto')
clf.fit(X_train_mybag, y_train)

y_val_predicted_labels_mybag = clf.predict(X_val_mybag)

print('Accuracy: ', accuracy_score(y_val, y_val_predicted_labels_mybag))
print('F1-score macro: ', f1_score(y_val, y_val_predicted_labels_mybag, average='macro'))
print('F1-score micro: ', f1_score(y_val, y_val_predicted_labels_mybag, average='micro'))
print('F1-score weighted: ', f1_score(y_val, y_val_predicted_labels_mybag, average='weighted'))

Accuracy:  0.9019050802139037
F1-score macro:  0.15328861357286658
F1-score micro:  0.9019050802139037
F1-score weighted:  0.858769002837596


In [22]:
y_test_predicted_labels_mybag = clf.predict(X_test_mybag)

print('Accuracy: ', accuracy_score(y_test, y_test_predicted_labels_mybag))

Accuracy:  0.9028073693445294


In [23]:
clf = DecisionTreeClassifier(random_state = 420)
clf.fit(X_train_mybag, y_train)

y_val_predicted_labels_mybag = clf.predict(X_val_mybag)

print('Accuracy: ', accuracy_score(y_val, y_val_predicted_labels_mybag))
print('F1-score macro: ', f1_score(y_val, y_val_predicted_labels_mybag, average='macro'))
print('F1-score micro: ', f1_score(y_val, y_val_predicted_labels_mybag, average='micro'))
print('F1-score weighted: ', f1_score(y_val, y_val_predicted_labels_mybag, average='weighted'))

Accuracy:  0.8814338235294118
F1-score macro:  0.3247828240220447
F1-score micro:  0.8814338235294117
F1-score weighted:  0.8874982497325395


In [24]:
y_test_predicted_labels_mybag = clf.predict(X_test_mybag)

print('Accuracy: ', accuracy_score(y_test, y_test_predicted_labels_mybag))

Accuracy:  0.8840706855495676


In [25]:
clf = KNeighborsClassifier(n_neighbors = 7)
clf.fit(X_train_mybag, y_train)

y_val_predicted_labels_mybag = clf.predict(X_val_mybag)

print('Accuracy: ', accuracy_score(y_val, y_val_predicted_labels_mybag))
print('F1-score macro: ', f1_score(y_val, y_val_predicted_labels_mybag, average='macro'))
print('F1-score micro: ', f1_score(y_val, y_val_predicted_labels_mybag, average='micro'))
print('F1-score weighted: ', f1_score(y_val, y_val_predicted_labels_mybag, average='weighted'))

Accuracy:  0.9119875222816399
F1-score macro:  0.25615620836028585
F1-score micro:  0.9119875222816399
F1-score weighted:  0.8849256340804693


In [26]:
y_test_predicted_labels_mybag = clf.predict(X_test_mybag)

print('Accuracy: ', accuracy_score(y_test, y_test_predicted_labels_mybag))

Accuracy:  0.913147010903622


In [27]:
from scipy import sparse as sp_sparse

words_counts = {}
for comments in X_train:
    for word in comments.split():
        if word not in words_counts:
            words_counts[word] = 1
        words_counts[word] += 1
        
DICT_SIZE = 15000 #Test with multiple values
POPULAR_WORDS = sorted(words_counts, key=words_counts.get, reverse=True)[:DICT_SIZE]
WORDS_TO_INDEX = {key: rank for rank, key in enumerate(POPULAR_WORDS, 0)}
INDEX_TO_WORDS = {index:word for word, index in WORDS_TO_INDEX.items()}
ALL_WORDS = WORDS_TO_INDEX.keys()

In [28]:
def my_bag_of_words(text, words_to_index, dict_size):
    """
        text: a string
        dict_size: size of the dictionary
        
        return a vector which is a bag-of-words representation of 'text'
    """
    result_vector = np.zeros(dict_size)
    for word in text.split(' '):
        if word in words_to_index:
            result_vector[words_to_index[word]] +=1
    return result_vector

X_train_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_train])
X_val_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_val])
X_test_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_test])
print('X_train shape ', X_train_mybag.shape, '\nX_val shape ', X_val_mybag.shape, '\nX_test shape ', X_test_mybag.shape)

X_train shape  (107709, 15000) 
X_val shape  (35904, 15000) 
X_test shape  (15958, 15000)


In [29]:
clf = SVC(gamma='auto')
clf.fit(X_train_mybag, y_train)

y_val_predicted_labels_mybag = clf.predict(X_val_mybag)

print('Accuracy: ', accuracy_score(y_val, y_val_predicted_labels_mybag))
print('F1-score macro: ', f1_score(y_val, y_val_predicted_labels_mybag, average='macro'))
print('F1-score micro: ', f1_score(y_val, y_val_predicted_labels_mybag, average='micro'))
print('F1-score weighted: ', f1_score(y_val, y_val_predicted_labels_mybag, average='weighted'))

Accuracy:  0.8998440285204992
F1-score macro:  0.14056989255594243
F1-score micro:  0.8998440285204992
F1-score weighted:  0.8536176850951761


In [30]:
y_test_predicted_labels_mybag = clf.predict(X_test_mybag)

print('Accuracy: ', accuracy_score(y_test, y_test_predicted_labels_mybag))

Accuracy:  0.9006767765384134


In [31]:
clf = DecisionTreeClassifier(random_state = 420)
clf.fit(X_train_mybag, y_train)

y_val_predicted_labels_mybag = clf.predict(X_val_mybag)

print('Accuracy: ', accuracy_score(y_val, y_val_predicted_labels_mybag))
print('F1-score macro: ', f1_score(y_val, y_val_predicted_labels_mybag, average='macro'))
print('F1-score micro: ', f1_score(y_val, y_val_predicted_labels_mybag, average='micro'))
print('F1-score weighted: ', f1_score(y_val, y_val_predicted_labels_mybag, average='weighted'))

Accuracy:  0.8919618983957219
F1-score macro:  0.34455099573133285
F1-score micro:  0.8919618983957219
F1-score weighted:  0.8950971644847524


In [32]:
y_test_predicted_labels_mybag = clf.predict(X_test_mybag)

print('Accuracy: ', accuracy_score(y_test, y_test_predicted_labels_mybag))

Accuracy:  0.890023812507833


In [33]:
clf = KNeighborsClassifier(n_neighbors = 7)
clf.fit(X_train_mybag, y_train)

y_val_predicted_labels_mybag = clf.predict(X_val_mybag)

print('Accuracy: ', accuracy_score(y_val, y_val_predicted_labels_mybag))
print('F1-score macro: ', f1_score(y_val, y_val_predicted_labels_mybag, average='macro'))
print('F1-score micro: ', f1_score(y_val, y_val_predicted_labels_mybag, average='micro'))
print('F1-score weighted: ', f1_score(y_val, y_val_predicted_labels_mybag, average='weighted'))

Accuracy:  0.9117368538324421
F1-score macro:  0.25425470020795093
F1-score micro:  0.9117368538324421
F1-score weighted:  0.8837380749470691


In [34]:
y_test_predicted_labels_mybag = clf.predict(X_test_mybag)

print('Accuracy: ', accuracy_score(y_test, y_test_predicted_labels_mybag))

Accuracy:  0.9140243138237875


In [39]:
from tabulate import tabulate
 
# assign data
mydata = [
    ["SVM", "10000", "90 %", "90 %"],
    ["DT", "10000", "88 %", "88 %"],
    ["KNN", "10000", "91 %", "91 %"],
    ["SVM", "5000", "90 %", "90 %"],
    ["DT", "5000", "88 %", "88 %"],
    ["KNN", "5000", "91 %", "91 %"],
    ["SVM", "15000", "89 %", "90 %"],
    ["DT", "15000", "88 %", "88 %"],
    ["KNN", "15000", "91 %", "91 %"]
]
 
# create header
head = ["Classifier", "Dictionary Size", "Validation Accuracy", "Test Accuracy"]
 
# display table
print(tabulate(mydata, headers=head, tablefmt="grid"))

+--------------+-------------------+-----------------------+-----------------+
| Classifier   |   Dictionary Size | Validation Accuracy   | Test Accuracy   |
| SVM          |             10000 | 90 %                  | 90 %            |
+--------------+-------------------+-----------------------+-----------------+
| DT           |             10000 | 88 %                  | 88 %            |
+--------------+-------------------+-----------------------+-----------------+
| KNN          |             10000 | 91 %                  | 91 %            |
+--------------+-------------------+-----------------------+-----------------+
| SVM          |              5000 | 90 %                  | 90 %            |
+--------------+-------------------+-----------------------+-----------------+
| DT           |              5000 | 88 %                  | 88 %            |
+--------------+-------------------+-----------------------+-----------------+
| KNN          |              5000 | 91 %           