In [1]:
# Import libraries 
import pandas as pd
import numpy as np
import re, string

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt
import seaborn as sns

import warnings

warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
test_label = pd.read_csv('test_labels.csv')

In [3]:
train.shape

(159571, 8)

In [4]:
test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [6]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153164 entries, 0 to 153163
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            153164 non-null  object
 1   comment_text  153164 non-null  object
dtypes: object(2)
memory usage: 2.3+ MB


In [7]:
train.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
train["comment_text"].fillna("unknown", inplace=True)
test["comment_text"].fillna("unknown", inplace=True)

In [9]:
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s):
    return re_tok.sub(r' \1 ', s).split()

# Change -1 to 0 cause y_test and predictions has different values interpreations 
def zerone(y_test):
    for i in range(len(y_test)):
        if y_test[i]==-1:
            y_test[i]=0
    return y_test

## Using Naive Bayes

In [12]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

for i, j in enumerate(label_cols):
    # Implement fitting to every column
    X_train = train.comment_text
    y_train = train[j]
    X_test = test.comment_text
    y_test = test_label[j]
    
    # transoform to vector
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
    
    #realization of Naive Bayes
    x1, y1 = X_train, y_train
    p = x1[y1==1].sum(0)+1
    q = x1[y1==0].sum(0)+1
    r = np.log((p/p.sum())/(q/q.sum()))
    b = np.log(len(p)/len(q))
    
    #predict
    pre_preds = X_test@r.T+b
    preds = pre_preds.T>0
    y_test = list(zerone(y_test))
    acc = (preds==y_test).mean()
    print(j, ":", acc)

toxic : 0.869714815491891
severe_toxic : 0.9734728787443525
obscene : 0.9141443158966859
threat : 0.9825285315087097
insult : 0.9208169021441069
identity_hate : 0.9748635449583453


In [14]:
for i, j in enumerate(label_cols):
    # Implement fitting to every column
    X_train = train.comment_text
    y_train = train[j]
    X_test = test.comment_text
    y_test = test_label[j]
    
    # transoform to vector
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
    
    
    lgr = LogisticRegression(C=0.1)
    lgr.fit(X_train.sign(), y_train)
    #predict
    preds = lgr.predict(X_test)
    y_test = list(zerone(y_test))
    acc = (preds==y_test).mean()
    print(j, ":", acc)

toxic : 0.9492504766133034
severe_toxic : 0.9976038755843409
obscene : 0.9718667572014311
threat : 0.9986223916847301
insult : 0.977129090386775
identity_hate : 0.9953513880546343
