In [1]:
import pandas as pd

TARGET_COLS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
comments_df = pd.read_csv("data/toxic-comment-classification-challenge/train.csv")

from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    comments_df[['comment_text']], comments_df[TARGET_COLS], random_state=10)
X_train.head()

Unnamed: 0,comment_text
34852,"This is a straw man argument, Mr Merkey. Nobo..."
17133,"ARC Gritt, the fucking cunt of all cunts, ruin..."
124232,a whole week; couldn't you have said something...
52766,NIGHTSTALLION IS A CUNT
45760,"Welcome!\n\nHello, , and welcome to Wikipedia!..."


In [3]:
comments_df.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

In [4]:
import re

import nltk
from nltk.stem import SnowballStemmer

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
GOOD_SYMBOLS = "€\?"
GOOD_SYMBOLS_RE = re.compile('([' + GOOD_SYMBOLS + '])')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z '+ GOOD_SYMBOLS + ']')
ADD_SPACES_SYMBOLS_RE = re.compile("([\?])")
STEMMER = SnowballStemmer('english')

class TextPreprocessor:
        
    def transfrom_text(self, text):
        text = re.sub(GOOD_SYMBOLS_RE, r"\1", text) #process good symbols
        text = text.lower()
        text = re.sub(REPLACE_BY_SPACE_RE, " ", text) # process bad symbols
        text = re.sub(BAD_SYMBOLS_RE, "", text) # process bad symbols
        text = re.sub(ADD_SPACES_SYMBOLS_RE, r" \1 ", text)
        test = " ".join([STEMMER.stem(word) for word in text.split()])
        return text
    
    def transform(self, series):
        return series.apply(lambda text: self.transfrom_text(text))
    
from sklearn.feature_extraction.text import TfidfVectorizer

class Vectorizer:

    def __init__(self):
        self.vectorizer = TfidfVectorizer(min_df=4, max_df=0.9, ngram_range=(1, 2), token_pattern='(\S+)')
        
    def fit(self, column):
        self.vectorizer.fit(column)
        
    def transform(self, column):
        return self.vectorizer.transform(column)
    
class TfidfPreprocessor:
    
    def __init__(self, colname="text"):
        self.colname = colname
        self.preprocessor = TextPreprocessor()
        self.vectorizer = Vectorizer()
           
    def fit(self, X):
        print("preprocessor...")
        X_fe = pd.DataFrame({self.colname: self.preprocessor.transform(X[self.colname])})
        print("vectorizer...")
        self.vectorizer.fit(X_fe[self.colname])
        
    def transform(self, X=None, message=None):
        if message is not None:
            X = pd.DataFrame({self.colname: [message]})
        X_fe = pd.DataFrame({self.colname: self.preprocessor.transform(X[self.colname])})        
        return self.vectorizer.transform(X_fe[self.colname])
    
    def fit_transform(self, X):
        self.fit(X)
        return self.vectorizer.transform(X[self.colname])

In [8]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score, recall_score

def scores(y, predicted):
    return {
        'accuracy': accuracy_score(y, predicted),
        'f1-score': f1_score(y, predicted, average='weighted'),
        "roc_auc": roc_auc_score(y, predicted),
        'average-precision': average_precision_score(y, predicted)}

In [9]:
N = 1000
X_train = X_train.iloc[:N]
y_train = y_train[:N]
tfidf_preprocessor = TfidfPreprocessor("comment_text")
X_train_preproc = tfidf_preprocessor.fit_transform(X_train)
X_val_preproc = tfidf_preprocessor.transform(X_val)

preprocessor...
vectorizer...


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

class SklearnModel:
    
    def __init__(self):
        clf = LogisticRegression(class_weight='balanced')
        self.model = OneVsRestClassifier(clf)
    
    def fit(self, X, y):
        self.model.fit(X, y)
        
    def predict(self, X):
        return self.model.predict(X)

In [10]:
model = SklearnModel()
model.fit(X_train_preproc, y_train)
y_train_hat = model.predict(X_train_preproc)
scores(y_train, y_train_hat)

{'accuracy': 0.981,
 'average-precision': 0.8416666666666667,
 'f1-score': 0.9817152805862259,
 'roc_auc': 0.989432703003337}

In [11]:
y_val_hat = model.predict(X_val_preproc)
scores(y_val, y_val_hat)

{'accuracy': 0.9208382422981476,
 'average-precision': 0.3568701470236531,
 'f1-score': 0.9194257057461095,
 'roc_auc': 0.7493119095597254}