In [1]:
import numpy as np
import pandas as pd

comments_df = pd.read_csv("data/toxic-comment-classification-challenge/train.csv")

from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(comments_df[['comment_text']], comments_df['toxic'], random_state=10)
X_train.head()

Unnamed: 0,comment_text
34852,"This is a straw man argument, Mr Merkey. Nobo..."
17133,"ARC Gritt, the fucking cunt of all cunts, ruin..."
124232,a whole week; couldn't you have said something...
52766,NIGHTSTALLION IS A CUNT
45760,"Welcome!\n\nHello, , and welcome to Wikipedia!..."


In [2]:
import re

import nltk
from nltk.stem import SnowballStemmer

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
GOOD_SYMBOLS = "€\?"
GOOD_SYMBOLS_RE = re.compile('([' + GOOD_SYMBOLS + '])')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z '+ GOOD_SYMBOLS + ']')
ADD_SPACES_SYMBOLS_RE = re.compile("([\?])")
STEMMER = SnowballStemmer('english')

class TextPreprocessor:
        
    def transfrom_text(self, text):
        text = re.sub(GOOD_SYMBOLS_RE, r"\1", text) #process good symbols
        text = text.lower()
        text = re.sub(REPLACE_BY_SPACE_RE, " ", text) # process bad symbols
        text = re.sub(BAD_SYMBOLS_RE, "", text) # process bad symbols
        text = re.sub(ADD_SPACES_SYMBOLS_RE, r" \1 ", text)
        test = " ".join([STEMMER.stem(word) for word in text.split()])
        return text
    
    def transform(self, series):
        return series.apply(lambda text: self.transfrom_text(text))
    
from sklearn.feature_extraction.text import TfidfVectorizer

class Vectorizer:

    def __init__(self):
        self.vectorizer = TfidfVectorizer(min_df=4, max_df=0.9, ngram_range=(1, 2), token_pattern='(\S+)')
        
    def fit(self, column):
        self.vectorizer.fit(column)
        
    def transform(self, column):
        return self.vectorizer.transform(column)
    
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,\
    average_precision_score, roc_auc_score, recall_score

def scores(y, predicted):
    return {
        'accuracy': accuracy_score(y, predicted),
        'precision': precision_score(y, predicted),
        'recall': recall_score(y, predicted),
        'f1-score': f1_score(y, predicted),
        "roc_auc": roc_auc_score(y, predicted),
        'average-precision': average_precision_score(y, predicted)}


In [3]:
class TfidfPreprocessor:
    
    def __init__(self, colname="text"):
        self.colname = colname
        self.preprocessor = TextPreprocessor()
        self.vectorizer = Vectorizer()
           
    def fit(self, X):
        print("preprocessor...")
        X_fe = pd.DataFrame({self.colname: self.preprocessor.transform(X[self.colname])})
        print("vectorizer...")
        self.vectorizer.fit(X_fe[self.colname])
        
    def transform(self, X=None, message=None):
        if message is not None:
            X = pd.DataFrame({self.colname: [message]})
        X_fe = pd.DataFrame({self.colname: self.preprocessor.transform(X[self.colname])})        
        return self.vectorizer.transform(X_fe[self.colname])
    
    def fit_transform(self, X):
        self.fit(X)
        return self.vectorizer.transform(X[self.colname])

In [5]:
N = 1000
X_train = X_train.iloc[:N]
y_train = y_train[:N]
tfidf_preprocessor = TfidfPreprocessor("comment_text")
X_train_preproc = tfidf_preprocessor.fit_transform(X_train)
X_val_preproc = tfidf_preprocessor.transform(X_val)

preprocessor...
vectorizer...


## Sklearn model

In [20]:
from sklearn.linear_model import LogisticRegression
    
class SklearnModel:
    
    def __init__(self):
        self.model = LogisticRegression(class_weight='balanced')
    
    def fit(self, X, y):
        self.model.fit(X, y)
        
    def predict(self, X):
        return self.model.predict(X)  

In [27]:
sklearn_model = SklearnModel()
sklearn_model.fit(X_train_preproc, y_train)
y_train_hat = sklearn_model.predict(X_train_preproc)
scores(y_train, y_train_hat)

{'accuracy': 0.953358177777035,
 'average-precision': 0.6641958777165086,
 'f1-score': 0.8007424858999072,
 'precision': 0.6819066147859922,
 'recall': 0.969738889849559,
 'roc_auc': 0.96067231602142}

In [28]:
y_val_hat = sklearn_model.predict(X_val_preproc)
scores(y_val, y_val_hat)

{'accuracy': 0.9331963001027749,
 'average-precision': 0.5239641176536308,
 'f1-score': 0.7035265324285237,
 'precision': 0.6010264208325413,
 'recall': 0.848175965665236,
 'roc_auc': 0.8950682123362819}

## Keras Model

In [6]:
input_dim = len(tfidf_preprocessor.vectorizer.vectorizer.vocabulary_)

In [29]:
from keras.models import Sequential
from keras.layers import Dense, Activation

class KerasModel:
    
    def __init__(self, input_dim=None):
        self.input_dim = input_dim
        self.model = None
        
    def create_model(self):
        self.model = Sequential()
        self.model.add(Dense(units=1, input_dim=self.input_dim, activation='sigmoid'))
        self.model.compile(optimizer='adam', loss='binary_crossentropy', metrics=["accuracy"])
        return self
        
    def fit(self, X, y, **kwargs):
        self.model.fit(X, y, epochs=10, batch_size=32, **kwargs)
    
    def predict(self, X):
        return (self.model.predict(X)[:, 0] >= 0.5).astype(int)

In [30]:
keras_model = KerasModel(input_dim).create_model()
keras_model.fit(X_train_preproc, y_train, validation_data=(X_val_preproc, y_val))
y_train_hat = keras_model.predict(X_train_preproc)
scores(y_train, y_train_hat)

Train on 119678 samples, validate on 39893 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


{'accuracy': 0.9646217349888868,
 'average-precision': 0.6677545326382022,
 'f1-score': 0.7876629889669008,
 'precision': 0.9377836159541437,
 'recall': 0.6789728514611794,
 'roc_auc': 0.8370768874739668}

In [26]:
y_val_hat = keras_model.predict(X_val_preproc)
scores(y_val, y_val_hat)

{'accuracy': 0.9511944451407515,
 'average-precision': 0.5258250188839367,
 'f1-score': 0.666209497685582,
 'precision': 0.923040380047506,
 'recall': 0.5211909871244635,
 'roc_auc': 0.7583557590122525}

## PyTorch

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

X_train_preproc_torch = X_train_preproc.tocoo()
y_train_torch = torch.tensor(y_train.values)

In [15]:
class PytorchNet(nn.Module):  # inheriting from nn.Module!

    def __init__(self, input_dim):
        super().__init__()
        self.fc = nn.Linear(input_dim, 1)
        
    def forward(self, X):
        return F.sigmoid(self.fc(X))
    


In [17]:
net = PytorchNet(input_dim)
print(net)

PytorchNet(
  (fc): Linear(in_features=3497, out_features=1, bias=True)
)


In [12]:
model = nn.Sequential(nn.Linear(input_dim, 1),
                     nn.Sigmoid())

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

X = X_train_preproc
X_dense = X.todense().astype(np.float32)
y = y_train

X_coo = X.tocoo()
X_torch = torch.from_numpy(X_dense)
        
y_torch = torch.tensor(y.values.astype(np.float32))

for epoch in range(50):
    y_pred = model(X_torch)
    loss = criterion(y_pred, y_torch)
    print('epoch: ', epoch,' loss: ', loss.item())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

epoch:  0  loss:  0.6875982880592346
epoch:  1  loss:  0.6614923477172852
epoch:  2  loss:  0.6365846395492554
epoch:  3  loss:  0.6128840446472168
epoch:  4  loss:  0.5903888940811157
epoch:  5  loss:  0.5690901279449463
epoch:  6  loss:  0.5489687919616699
epoch:  7  loss:  0.5299959778785706
epoch:  8  loss:  0.5121391415596008
epoch:  9  loss:  0.49535948038101196
epoch:  10  loss:  0.4796106815338135
epoch:  11  loss:  0.46484601497650146
epoch:  12  loss:  0.45101508498191833
epoch:  13  loss:  0.4380668103694916
epoch:  14  loss:  0.4259493350982666
epoch:  15  loss:  0.4146103858947754
epoch:  16  loss:  0.4039996862411499
epoch:  17  loss:  0.3940677046775818
epoch:  18  loss:  0.38476622104644775
epoch:  19  loss:  0.376049280166626
epoch:  20  loss:  0.3678734302520752
epoch:  21  loss:  0.3601960837841034
epoch:  22  loss:  0.3529788851737976
epoch:  23  loss:  0.3461852967739105
epoch:  24  loss:  0.339780330657959
epoch:  25  loss:  0.33373236656188965
epoch:  26  loss:  

  "Please ensure they have the same size.".format(target.size(), input.size()))


In [10]:
type(X_dense)

numpy.matrixlib.defmatrix.matrix

In [37]:
for epoch in range(50):
    # Forward Propagation
    y_pred = model(x)
    # Compute and print loss
    loss = criterion(y_pred, y)
    print('epoch: ', epoch,' loss: ', loss.item())
    # Zero the gradients
    optimizer.zero_grad()
    
    # perform a backward pass (backpropagation)
    loss.backward()
    
    # Update the parameters
    optimizer.step()

<generator object Module.parameters at 0x12a18e200>

In [39]:
type(X_train_preproc)

scipy.sparse.csr.csr_matrix

In [40]:
X_train_preproc.tocoo()

<119678x225329 sparse matrix of type '<class 'numpy.float64'>'
	with 8403625 stored elements in COOrdinate format>

In [43]:
type(y_train)

pandas.core.series.Series

In [None]:
torch.tensor(y_train.va)

In [32]:
X_train_preproc

<119678x225329 sparse matrix of type '<class 'numpy.float64'>'
	with 8403625 stored elements in Compressed Sparse Row format>

In [33]:
input_dim

225329

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])