In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.stem.porter import *
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction import stop_words
import re
import string

from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
# Loss function used to evaluate model performance: the mean of the
# per-label (column-wise) log losses, as required by the Kaggle challenge
# description.
def loss(y, y_pred):
    """Return the column-wise log loss averaged over all label columns.

    Parameters
    ----------
    y : array-like of shape (n_samples, n_labels)
        Ground-truth labels, one column per label.
    y_pred : array-like of shape (n_samples, n_labels)
        Predicted probabilities, aligned column-for-column with ``y``.

    Returns
    -------
    float
        Mean of ``log_loss`` evaluated independently on each column.

    Raises
    ------
    AssertionError
        If ``y`` and ``y_pred`` do not have the same shape.
    """
    # Convert up front so DataFrames (which do not support ``obj[:, i]``)
    # are accepted alongside plain NumPy arrays.
    y = np.asarray(y)
    y_pred = np.asarray(y_pred)
    assert y.shape == y_pred.shape
    # The original body referenced an undefined ``y_true``; the ground truth
    # is the ``y`` parameter.
    column_losses = [log_loss(y[:, i], y_pred[:, i]) for i in range(y.shape[1])]
    return np.array(column_losses).mean()

In [3]:
# Load the train and test sets and stack them into one frame so that text
# preprocessing is applied to both in a single pass.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# Number of training rows; the first nrow_train rows of df come from train.
nrow_train = train.shape[0]
# ``axis`` must be passed by keyword: pandas >= 2.0 rejects it positionally.
df = pd.concat([train, test], axis=0)

In [4]:
# Preview the first rows of the combined train + test frame.
df.head()

Unnamed: 0,comment_text,id,identity_hate,insult,obscene,severe_toxic,threat,toxic
0,Explanation\nWhy the edits made under my usern...,0000997932d777bf,0.0,0.0,0.0,0.0,0.0,0.0
1,D'aww! He matches this background colour I'm s...,000103f0d9cfb60f,0.0,0.0,0.0,0.0,0.0,0.0
2,"Hey man, I'm really not trying to edit war. It...",000113f07ec002fd,0.0,0.0,0.0,0.0,0.0,0.0
3,"""\nMore\nI can't make any real suggestions on ...",0001b41b1c6bb37e,0.0,0.0,0.0,0.0,0.0,0.0
4,"You, sir, are my hero. Any chance you remember...",0001d958c54c6e35,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# A customized tokenize function that removes stop words and punctuation.
stop = set(stopwords.words('english'))

# Compiled once instead of on every call. Matches any punctuation character,
# digit, carriage return, tab or newline.
_NON_WORD_CHARS = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')


def tokenize(text):
    """Split ``text`` into lower-cased word tokens.

    Punctuation, digits and CR/TAB/LF characters are replaced by spaces, the
    text is segmented with ``sent_tokenize`` and each sentence is split with
    ``word_tokenize``. Tokens are dropped when their lower-cased form is in
    ``stop``, when they contain no ASCII letter, or when they are shorter
    than three characters.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    list of str
        The surviving tokens, lower-cased. If a ``TypeError`` is raised while
        processing (e.g. ``text`` is not a string), the offending value and
        the error are printed and an empty list is returned, so the return
        type is always a list.
    """
    try:
        text = _NON_WORD_CHARS.sub(" ", text)  # remove punctuation

        words = [w for sent in sent_tokenize(text) for w in word_tokenize(sent)]
        return [
            w.lower()
            for w in words
            if w.lower() not in stop
            and re.search('[a-zA-Z]', w)
            and len(w) >= 3
        ]
    except TypeError as e:
        print(text, e)
        # Previously fell through and returned None implicitly.
        return []

In [None]:
# TF-IDF over 1- to 3-grams produced by the custom tokenizer, keeping terms
# with a document frequency of at least 10 and at most 50000 features.
comments_tfidf = TfidfVectorizer(
    tokenizer=tokenize,
    ngram_range=(1, 3),
    min_df=10,
    max_features=50000,
)
# Learn the vocabulary from every comment in df and vectorize them.
X_comments = comments_tfidf.fit_transform(df['comment_text'])

It takes about 7 minutes on my laptop to run the TfidfVectorizer, which is quite acceptable. We can further consider tuning its parameters to get better results.

In [11]:
# Dimensions of the TF-IDF matrix.
X_comments.shape

(312735, 50000)

#### Here we try a toy baseline model using just the TF-IDF matrix (`X_comments`)
We train 6 binary logistic regressions, one for each label.

In [27]:
# Target matrix: one column per label.
y_train = train[
    ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
]
# The first nrow_train rows of the TF-IDF matrix are kept as training features.
X_train = X_comments[:nrow_train]
X_train.shape

(159571, 50000)

In [38]:
# Train one independent binary logistic regression per label column.
n_labels = y_train.shape[1]
lr = LogisticRegression()  # not referenced by the fitting loop below
regressions = []
for label_idx in range(n_labels):
    classifier = LogisticRegression()
    classifier.fit(X_train, y_train.iloc[:, label_idx])
    regressions.append(classifier)

In [44]:
# Make predictions: for every label, store the predicted probability of
# class 1 on the training matrix.
n_samples = X_train.shape[0]
predictions = np.zeros((n_samples, n_labels))
for label_idx, model in enumerate(regressions):
    class_probs = model.predict_proba(X_train)
    # Column of predict_proba that corresponds to class label 1.
    positive_col = np.flatnonzero(model.classes_ == 1)[0]
    predictions[:, label_idx] = class_probs[:, positive_col]

In [45]:
# Display the predicted class-1 probabilities (one column per label).
predictions

array([[ 0.01559351,  0.00257332,  0.00595983,  0.00097346,  0.00698601,
         0.00175674],
       [ 0.00909485,  0.00189685,  0.00318422,  0.00113489,  0.00331545,
         0.00159802],
       [ 0.05446283,  0.00261198,  0.02431941,  0.00118505,  0.02135235,
         0.00213078],
       ..., 
       [ 0.03334593,  0.00292708,  0.0152995 ,  0.00161255,  0.01782044,
         0.00349827],
       [ 0.0227402 ,  0.00144671,  0.0110745 ,  0.00125142,  0.01313697,
         0.00242814],
       [ 0.06069957,  0.00214726,  0.0261893 ,  0.00221813,  0.01860758,
         0.00250948]])