In [1]:
import os

import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import word_tokenize
from nltk.classify import apply_features

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

In [2]:
# Raise pandas' display limits: up to 999 characters per cell and up to 999 rows
pd.set_option("display.max_colwidth", 999)
pd.set_option("display.max_rows", 999)

In [3]:
# Location of the pre-cleaned comments CSV. The default is the original local path;
# set the CLEANED_DATA_CSV environment variable to point the notebook at another
# copy of the file without editing this cell.
DATA_PATH = os.environ.get("CLEANED_DATA_CSV", "C:/Users/User/Downloads/cleaned_data.csv")
df = pd.read_csv(DATA_PATH)

In [4]:
# (rows, columns) of the loaded frame
df.shape

(50960, 10)

In [5]:
# preview the first rows of the loaded frame
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean,cleaned_comment_text
0,2f52adcf5a111cd3,"That is about it for for now. Primarily, I worked on citations (either adding or updating) and related format or reconciling previous contributions into a more NPOV (or BPOV) format. Feel free to smoke it over.",0,0,0,0,0,0,1,primarily work citations either add update relate format reconcile previous contributions npov bpov format feel free smoke
1,819b3339c747286f,"""\n I wasn't aware that peer-reviewed studies with minimal to no methodological flaws (you know.. what the page i linked is citing) constitute """"biased sources"""". *restrains self from becoming extremely sarcastic* """,0,0,0,0,0,0,1,aware peerreviewed study minimal methodological flaw know page link cite constitute bias source restrain self become extremely sarcastic
2,b66e5fffbd70f8fe,"""\nIt's fine to edit for personal gain so long as you're editing according to basic policies. Contributing to Wikipedia is a hobby for most people, and there is an endless number of motivations for getting involved. â''''''Â |Â Talk """,0,0,0,0,0,0,1,fine edit personal gain long edit accord basic policies contribute wikipedia hobby people endless number motivations get involve talk
3,fd7f2ec6efe0315d,"I did not add these words to the PLANS website. I have no contact with PLANS. PLANS is a skeptic site and the language you quote is strong but not offensive to a most people. PLANS is cited as evidence of a balancing point of view, Very little of its content is present in the Wikipedia article.",0,0,0,0,0,0,1,add word plan website contact plan plan skeptic site language quote strong offensive people plan cite evidence balance point view little content present wikipedia article
4,f78b624060552c1a,"""\n\n List of recent changes \n\nRequested by Sarge Baldy, even though they've already been discussed extensively.\n\n We don't need minutia about Cromwell in the intro.\n This article is about the political meaning of anarchism """"the belief that forms of rulership are undesireable, and should be abolished.""""\n We don't need the gobbletygook about """"degrees of commonality and conflict.""""\n Please don't hide the fact that Proudhon was anti-communist.\n Proudhon details should go in the Proudhon article, not here.\n Anarchist """"schools"""" should precede all the sundry issue-oriented sects.\n Bullshit about misc. non-anarchist leftie movements don't belong in the [i]anarchism without adjectives[/i] paragraph.\n Ancap books should not be censored.""",0,0,0,0,0,0,1,list recent change request sarge baldy even though already discuss extensively need minutia cromwell intro article political mean anarchism belief form rulership undesireable abolish need gobbletygook degrees commonality conflict please hide fact proudhon anticommunist proudhon detail go proudhon article anarchist school precede sundry issueoriented sects bullshit misc nonanarchist leftie movements belong anarchism without adjectives i paragraph ancap book censor


In [6]:
# count missing values in every column
df.isna().sum()

id                      0
comment_text            0
toxic                   0
severe_toxic            0
obscene                 0
threat                  0
insult                  0
identity_hate           0
clean                   0
cleaned_comment_text    0
dtype: int64

In [7]:
# Subset the dataframe: keep the id, the cleaned text and the six label columns.
# Columns are picked by name so the subset does not depend on the CSV's column order.
# The order below matters: later cells read the text as column 1 and the labels as
# columns 2..7 by position.
df1 = df.loc[:, ['id', 'cleaned_comment_text',
                 'toxic', 'severe_toxic', 'obscene',
                 'threat', 'insult', 'identity_hate']]

In [8]:
# preview the subset: id, cleaned text, then the six label columns
df1.head()

Unnamed: 0,id,cleaned_comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,2f52adcf5a111cd3,primarily work citations either add update relate format reconcile previous contributions npov bpov format feel free smoke,0,0,0,0,0,0
1,819b3339c747286f,aware peerreviewed study minimal methodological flaw know page link cite constitute bias source restrain self become extremely sarcastic,0,0,0,0,0,0
2,b66e5fffbd70f8fe,fine edit personal gain long edit accord basic policies contribute wikipedia hobby people endless number motivations get involve talk,0,0,0,0,0,0
3,fd7f2ec6efe0315d,add word plan website contact plan plan skeptic site language quote strong offensive people plan cite evidence balance point view little content present wikipedia article,0,0,0,0,0,0
4,f78b624060552c1a,list recent change request sarge baldy even though already discuss extensively need minutia cromwell intro article political mean anarchism belief form rulership undesireable abolish need gobbletygook degrees commonality conflict please hide fact proudhon anticommunist proudhon detail go proudhon article anarchist school precede sundry issueoriented sects bullshit misc nonanarchist leftie movements belong anarchism without adjectives i paragraph ancap book censor,0,0,0,0,0,0


### Train / Test Split

In [9]:
# Hold out 20% of the rows as the test set, with a fixed seed for repeatability.
# Text and labels stay together in one frame because the nltk-style feature
# selection below works on whole rows.
train, test = train_test_split(
    df1,
    test_size=0.2,
    random_state=2019,
)

In [10]:
# preview the first rows of the training split
train.head()

Unnamed: 0,id,cleaned_comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
30200,26de3070fdf8f9de,perhaps perhaps che pig,1,0,0,0,1,0
34498,f19946d1a3097deb,three women elgar life watch min tv documentary elgar present important new information surprise even biographers raise question mind earth article reach fa status without mention least windflower whose existence well know doubt elgar affairs heart central creativity huge influence work repeat article reach present state barely mention central aspect life,0,0,0,0,0,0
25278,b5bd02e258f0f42f,edit infobox add audio p nks performance live nd annual grammy award release digital download februar source also add album version song release digital download single june source write release infobox january us radio february digital download live nd annual grammy award digital download,0,0,0,0,0,0
29073,aef5fc6d99e28f6f,weird indeed put four tildes keep come unsigned anyway edit signature think work,0,0,0,0,0,0
9290,aef41c4a4d12c0f3,contrary recommendation,0,0,0,0,0,0


In [11]:
# preview the first rows of the test split
test.head()

Unnamed: 0,id,cleaned_comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
15420,2cbd8b508cc7c2ac,look good change source talk theme musical style specify kissoff anthem indeed true judge lyric write good talk,0,0,0,0,0,0
19180,0f03dd668d7fc2f0,really dog photo dog steal post page nastered priyadarshivishal,1,0,0,0,1,0
46897,f5e25d1b15a9dbf4,listen children night music make,0,0,0,0,0,0
4110,18772a6a840177fb,thank thank redirect link idk already page,0,0,0,0,0,0
23094,743f63f120b08059,tribe war muslims,0,0,0,0,0,0


### Feature Selection

In [12]:
def extract_label_features(df, label, flag, topwords):
    """Return the most frequent words among the rows where ``df[label] == flag``.

    The text is read from the column at position 1 of ``df``. Every text is
    tokenized with ``word_tokenize``; the extra stopwords 'wikipedia' and 'wiki'
    and all tokens of two characters or fewer are dropped. The number of
    distinct remaining words is printed as a side effect.

    Parameters
    ----------
    df : DataFrame whose column at position 1 holds the text.
    label : name of the label column to filter on.
    flag : value the label column must equal (1 or 0).
    topwords : how many of the most common words to return.

    Returns
    -------
    list of str
        Up to ``topwords`` words, most frequent first.
    """
    # rows whose label column equals the requested flag, as plain Python lists
    matching_rows = df[df[label] == flag].values.tolist()

    # additional stopwords removed on top of the minimum-length filter
    stop_list = ['wikipedia', 'wiki']

    # tokenize the text of each row and pool every token that survives the filters
    kept_tokens = []
    for row in matching_rows:
        for token in word_tokenize(row[1]):
            if token not in stop_list and len(token) > 2:
                kept_tokens.append(token)

    # frequency distribution over the pooled tokens; report the vocabulary size
    token_counts = nltk.FreqDist(kept_tokens)
    print(len(token_counts))

    # keep only the words, discarding their counts
    return [token for token, _count in token_counts.most_common(topwords)]

In [13]:
# top 1000 words among training comments labelled toxic (prints the vocabulary size)
toxic1 = extract_label_features(df=train, label='toxic', flag=1, topwords=1000)

12505


In [14]:
# show the selected words for the toxic label (long output)
print(toxic1)

['fuck', 'shit', 'pig', 'nigger', 'gay', 'wanker', 'suck', 'ball', 'like', 'get', 'faggot', 'page', 'edit', 'know', 'block', 'make', 'super', 'buttsecks', 'bullshit', 'say', 'people', 'stupid', 'article', 'talk', 'moron', 'poop', 'huge', 'ass', 'want', 'think', 'one', 'rape', 'stop', 'anal', 'would', 'asshole', 'eat', 'hate', 'cock', 'bollocks', 'ban', 'kill', 'yourselfgo', 'time', 'bitch', 'see', 'keep', 'bunksteve', 'come', 'right', 'even', 'take', 'need', 'person', 'delete', 'use', 'work', 'piece', 'life', 'hell', 'try', 'remove', 'love', 'fack', 'give', 'little', 'fag', 'vomit', 'really', 'look', 'please', 'idiot', 'hey', 'fucker', 'whore', 'tell', 'good', 'god', 'user', 'big', 'call', 'fire', 'post', 'well', 'jdelanoy', 'dont', 'cocksucker', 'penis', 'nice', 'anthony', 'also', 'fat', 'way', 'write', 'drink', 'bradbury', 'still', 'attack', 'cunt', 'mothjer', 'comment', 'nothing', 'back', 'never', 'leave', 'many', 'guy', 'read', 'mean', 'information', 'lick', 'bleachanhero', 'source

In [14]:
# top 300 words among training comments labelled severe_toxic
severe1 = extract_label_features(df=train, label='severe_toxic', flag=1, topwords=300)

1897


In [15]:
# top 300 words among training comments labelled obscene
obscene1 = extract_label_features(df=train, label='obscene', flag=1, topwords=300)

7877


In [16]:
# top 300 words among training comments labelled threat
threat1 = extract_label_features(df=train, label='threat', flag=1, topwords=300)

952


In [17]:
# top 300 words among training comments labelled insult
insult1 = extract_label_features(df=train, label='insult', flag=1, topwords=300)

7500


In [18]:
# top 300 words among training comments labelled identity_hate
identity1 = extract_label_features(df=train, label='identity_hate', flag=1, topwords=300)

2516


In [19]:
# Combine the word features of all labels. The per-label lists share many words
# (2500 entries requested in total), so repeats are dropped while keeping first-seen
# order. document_features builds a dict keyed by word, so the resulting feature set
# is unchanged; each document simply stops re-checking the same word several times.
word_features = list(dict.fromkeys(toxic1 + severe1 + obscene1 + threat1 + insult1 + identity1))

In [20]:
# Define function for feature extraction
def document_features(document):
    """Map a text to a dict of boolean "contains(<word>)" features.

    The text is tokenized with ``word_tokenize``. For every word in the
    module-level ``word_features`` list the result holds one entry, keyed
    ``'contains(<word>)'``, that is True when the word is among the tokens.
    """
    # a set gives constant-time membership checks for the lookup below
    present = set(word_tokenize(document))
    return {'contains(%s)' % word: (word in present) for word in word_features}

### Multilabel Classification

In [21]:
# Turn the training comments into a feature matrix
train1 = train.values.tolist()
train2 = [row[1] for row in train1]  # position 1 of each row is the cleaned text
train_feature = list(map(document_features, train2))
v = DictVectorizer()  # fitted on the training dicts here; reused as-is for the test set
train_feature1 = v.fit_transform(train_feature)

In [22]:
# Turn the test comments into a feature matrix with the vectorizer fitted on train
test1 = test.values.tolist()
test2 = [row[1] for row in test1]  # position 1 of each row is the cleaned text
test_feature = list(map(document_features, test2))
test_feature1 = v.transform(test_feature)

In [23]:
# one-step pipeline: a one-vs-rest wrapper around multinomial Naive Bayes
NB_pipeline = Pipeline(steps=[('clf', OneVsRestClassifier(estimator=MultinomialNB()))])

In [24]:
categories = [
    'toxic', 'severe_toxic', 'obscene',
    'threat', 'insult', 'identity_hate',
]

# Fit and score one label at a time. NB_pipeline is refit on every pass, so after
# the loop it holds the model for the last category only.
for category in categories:
    print('... Processing {}'.format(category))

    # fit on this label's training column, then predict it for the test rows
    y_pred = NB_pipeline.fit(train_feature1, train[category]).predict(test_feature1)

    # accuracy, confusion matrix and per-class report against the true test labels
    print('Test accuracy is {}'.format(accuracy_score(test[category], y_pred)))
    print('\nConfusion matrix:\n', confusion_matrix(test[category], y_pred))
    print('\nClasification report:\n', classification_report(test[category], y_pred))

... Processing toxic
Test accuracy is 0.9448587127158555

Confusion matrix:
 [[8907  304]
 [ 258  723]]

Clasification report:
               precision    recall  f1-score   support

           0       0.97      0.97      0.97      9211
           1       0.70      0.74      0.72       981

    accuracy                           0.94     10192
   macro avg       0.84      0.85      0.84     10192
weighted avg       0.95      0.94      0.95     10192

... Processing severe_toxic
Test accuracy is 0.9721350078492935

Confusion matrix:
 [[9841  256]
 [  28   67]]

Clasification report:
               precision    recall  f1-score   support

           0       1.00      0.97      0.99     10097
           1       0.21      0.71      0.32        95

    accuracy                           0.97     10192
   macro avg       0.60      0.84      0.65     10192
weighted avg       0.99      0.97      0.98     10192

... Processing obscene
Test accuracy is 0.9668367346938775

Confusion matrix:
 [[94

In [25]:
# Same estimator as above, this time trained on all label columns in one call
multilabel_pipeline = Pipeline(steps=[('clf', OneVsRestClassifier(estimator=MultinomialNB()))])

categories = [
    'toxic', 'severe_toxic', 'obscene',
    'threat', 'insult', 'identity_hate',
]

# fit on the six label columns together, then predict all of them for the test rows
y_pred = multilabel_pipeline.fit(train_feature1, train[categories]).predict(test_feature1)

In [29]:
# (test rows, feature columns produced by the vectorizer)
test_feature1.shape

(10192, 1103)

In [26]:
# predicted label matrix: one row per test comment, one column per label
y_pred

array([[0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1]])

In [27]:
# (test rows, labels)
y_pred.shape

(10192, 6)

In [28]:
# number of label columns, taken from the label list instead of a hard-coded 6
no_of_labels = len(categories)

In [32]:
# true labels of the test rows: columns 2..7 (by position) are the six label columns
test.iloc[:,2:8]

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
15420,0,0,0,0,0,0
19180,1,0,0,0,1,0
46897,0,0,0,0,0,0
4110,0,0,0,0,0,0
23094,0,0,0,0,0,0
...,...,...,...,...,...,...
10664,0,0,0,0,0,0
31309,1,0,1,0,1,0
29287,0,0,0,0,0,0
15328,0,0,0,0,0,0


In [33]:
# the same label slice as a NumPy array
test.iloc[:,2:8].values

array([[0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 0, 0]], dtype=int64)

In [34]:
# True label matrix for the test rows. Columns are selected by name with the same
# `categories` list used to train the multilabel model, so y_true and y_pred are
# guaranteed to share their column order.
y_true = test[categories].values

In [35]:
# true label matrix: one row per test comment, one column per label
y_true

array([[0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 0, 0]], dtype=int64)

In [36]:
# should match y_pred.shape
y_true.shape

(10192, 6)

In [37]:
# one 2x2 confusion matrix per label, stacked along the first axis
mcm = multilabel_confusion_matrix(y_true=y_true, y_pred=y_pred)
print(mcm)

[[[ 8907   304]
  [  258   723]]

 [[ 9841   256]
  [   28    67]]

 [[ 9406   237]
  [  101   448]]

 [[10007   153]
  [   19    13]]

 [[ 9389   289]
  [  130   384]]

 [[ 9857   237]
  [   34    64]]]


In [38]:
# Split each label's 2x2 confusion matrix into TP, FP, FN, TN

TP, FP, FN, TN = [], [], [], []

for i in range(no_of_labels):
    # each matrix is laid out as [[TN, FP], [FN, TP]]; ravel() flattens it row by row
    TN_i, FP_i, FN_i, TP_i = mcm[i].ravel()

    TP.append(TP_i)
    FP.append(FP_i)
    FN.append(FN_i)
    TN.append(TN_i)

# one list per line, in the order TP, FP, FN, TN
print(TP, FP, FN, TN, sep='\n')

# e.g. TP for label 0 ==> TP[0]

[723, 67, 448, 13, 384, 64]
[304, 256, 237, 153, 289, 237]
[258, 28, 101, 19, 130, 34]
[8907, 9841, 9406, 10007, 9389, 9857]


In [39]:
# per-label accuracy: correct predictions (TP + TN) over all predictions

accuracy = []
for i in range(no_of_labels):
    n_correct = TP[i] + TN[i]
    n_total = n_correct + FP[i] + FN[i]
    accuracy_i = n_correct / n_total
    accuracy.append(accuracy_i)

print(accuracy)

[0.9448587127158555, 0.9721350078492935, 0.9668367346938775, 0.9831240188383046, 0.9588893249607535, 0.9734105180533752]


In [40]:
# Per-label precision, recall and F1 score.
# NOTE: the list named `f1_score` rebinds the `f1_score` function imported from
# sklearn.metrics at the top of the notebook; after this cell that name refers to
# the list.

precision, recall, f1_score = [], [], []

for i in range(no_of_labels):
    predicted_positive = TP[i] + FP[i]
    actual_positive = TP[i] + FN[i]

    # a zero denominator is replaced by True (== 1), so the metric becomes 0
    # instead of dividing by zero
    precision_i = TP[i] / (predicted_positive if predicted_positive else True)
    recall_i = TP[i] / (actual_positive if actual_positive else True)

    pr_sum = precision_i + recall_i
    f1_score_i = (2 * precision_i * recall_i) / (pr_sum if pr_sum else True)

    precision.append(precision_i)
    recall.append(recall_i)
    f1_score.append(f1_score_i)

# one list per line, in the order precision, recall, f1
print(precision, recall, f1_score, sep='\n')

[0.7039922103213242, 0.20743034055727555, 0.654014598540146, 0.0783132530120482, 0.5705794947994056, 0.21262458471760798]
[0.7370030581039755, 0.7052631578947368, 0.8160291438979964, 0.40625, 0.7470817120622568, 0.6530612244897959]
[0.7201195219123505, 0.3205741626794259, 0.726094003241491, 0.13131313131313133, 0.6470092670598147, 0.32080200501253137]


In [41]:
# average accuracy: unweighted mean of the per-label accuracies

avg_accuracy = sum(accuracy) / no_of_labels
print(avg_accuracy)

0.9665423861852434


In [42]:
# Micro averaging: pool TP / FP / FN over all labels, then compute each metric once.
# `or 1` guards the denominators the same way the per-label precision/recall/F1
# computation does: if a denominator is zero the metric is reported as 0 instead
# of dividing by zero.

micro_precision = sum(TP) / ((sum(TP) + sum(FP)) or 1)
micro_recall = sum(TP) / ((sum(TP) + sum(FN)) or 1)
micro_f1_score = (2 * micro_precision * micro_recall) / ((micro_precision + micro_recall) or 1)

print(micro_precision)
print(micro_recall)
print(micro_f1_score)

0.5351181102362205
0.748788012340238
0.6241734019103601
