In [22]:
import os
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#read training csv file from dataset
data_path = os.path.join('..', '..', 'jigsaw-unintended-bias-in-toxicity-classification', 'train.csv')
data_raw = pd.read_csv(data_path)

In [35]:
categories = list(data_raw.columns.values)
categories = categories[3:7]
print(categories)


['severe_toxicity', 'obscene', 'identity_attack', 'insult']


In [24]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem.snowball import SnowballStemmer
import re

import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\famou\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
def cleanHtml(sentence):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', str(sentence))
    return cleantext


def cleanPunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    cleaned = cleaned.strip()
    cleaned = cleaned.replace("\n"," ")
    return cleaned


def keepAlpha(sentence):
    alpha_sent = ""
    for word in sentence.split():
        alpha_word = re.sub('[^a-z A-Z]+', ' ', word)
        alpha_sent += alpha_word
        alpha_sent += " "
    alpha_sent = alpha_sent.strip()
    return alpha_sent

# stem data to combine words with similar meanings
stemmer = SnowballStemmer("english")
def stemming(sentence):
    stemSentence = ""
    for word in sentence.split():
        stem = stemmer.stem(word)
        stemSentence += stem
        stemSentence += " "
    stemSentence = stemSentence.strip()
    return stemSentence

In [26]:
data = data_raw
data = data_raw.loc[np.random.choice(data_raw.index, size=2000)]
stop_words = set(stopwords.words('english'))
stop_words.update(['zero','one','two','three','four','five','six','seven','eight','nine','ten','may','also','across','among','beside','however','yet','within'])
re_stop_words = re.compile(r"\b(" + "|".join(stop_words) + ")\\W", re.I)
def removeStopWords(sentence):
    global re_stop_words
    return re_stop_words.sub(" ", sentence)

data['comment_text'] = data['comment_text'].apply(removeStopWords)
data.head()

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
1368365,5790013,0.142857,False Equivalency called 'civil war' sl...,0.0,0.142857,0.0,0.142857,0.0,,,...,367558,approved,0,2,1,0,10,0.0,0,7
127031,397377,0.0,Jerry Swanson absolutely right. Tour operator...,0.0,0.0,0.0,0.0,0.0,,,...,141986,approved,0,0,0,3,0,0.0,0,4
1227133,5614509,0.0,great many Non-aboriginals simply getting fa...,0.0,0.0,0.0,0.0,0.0,,,...,356531,approved,1,0,0,25,2,0.0,0,4
1278372,5676670,0.0,since 1978. Alaska non-stop social expe...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,360046,approved,0,0,0,2,1,0.0,4,4
1482520,5934438,0.4,makes really wonder combined corporate int...,0.0,0.0,0.0,0.4,0.0,0.0,0.0,...,376312,rejected,0,0,0,0,0,0.0,6,10


In [27]:
data['comment_text'] = data['comment_text'].str.lower()
data['comment_text'] = data['comment_text'].apply(cleanHtml)
data['comment_text'] = data['comment_text'].apply(cleanPunc)
data['comment_text'] = data['comment_text'].apply(keepAlpha)
data['comment_text'] = data['comment_text'].apply(stemming)
data.head()

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
1368365,5790013,0.142857,false equivalency called civil war slavery eve...,0.0,0.142857,0.0,0.142857,0.0,,,...,367558,approved,0,2,1,0,10,0.0,0,7
127031,397377,0.0,jerry swanson absolutely right tour operators ...,0.0,0.0,0.0,0.0,0.0,,,...,141986,approved,0,0,0,3,0,0.0,0,4
1227133,5614509,0.0,great many non aboriginals simply getting fati...,0.0,0.0,0.0,0.0,0.0,,,...,356531,approved,1,0,0,25,2,0.0,0,4
1278372,5676670,0.0,since alaska non stop social experiment come...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,360046,approved,0,0,0,2,1,0.0,4,4
1482520,5934438,0.4,makes really wonder combined corporate intelli...,0.0,0.0,0.0,0.4,0.0,0.0,0.0,...,376312,rejected,0,0,0,0,0,0.0,6,10


In [29]:
#split data
from sklearn.model_selection import train_test_split

train, test = train_test_split(data, random_state=42, test_size=0.30, shuffle=True)

print(train.shape)
print(test.shape)

(1400, 45)
(600, 45)


In [30]:
train_text = train['comment_text']
test_text = test['comment_text']

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')
vectorizer.fit(train_text)
vectorizer.fit(test_text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 3), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents='unicode',
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [32]:
x_train = vectorizer.transform(train_text)
y_train = train.drop(labels = ['id','comment_text'], axis=1)

x_test = vectorizer.transform(test_text)
y_test = test.drop(labels = ['id','comment_text'], axis=1)

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from skmultilearn.problem_transform import LabelPowerset

In [36]:
%%time

from IPython.display import Markdown, display
def printmd(string):
    display(Markdown(string))

# Using pipeline for applying logistic regression and one vs rest classifier
LogReg_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=-1)),
            ])

for category in categories:
    printmd('**Processing {} comments...**'.format(category))
    
    # Training logistic regression model on train data
    LogReg_pipeline.fit(x_train, train[category])
    
    # calculating test accuracy
    prediction = LogReg_pipeline.predict(x_test)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))
    print("\n")

**Processing severe_toxicity comments...**

ValueError: Unknown label type: (1798975    0.0
1503034    0.1
685141     0.0
504795     0.0
1282993    0.0
          ... 
1160928    0.0
73149      0.0
1699302    0.0
607985     0.0
1069835    0.0
Name: severe_toxicity, Length: 1400, dtype: float64,)

In [21]:
%%time

# initialize label powerset multi-label classifier
classifier = LabelPowerset(LogisticRegression())

# train
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)

# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions))
print("\n")

TypeError: no supported conversion for types: (dtype('O'),)