In [0]:
import pandas as pd 
import numpy as np
import pickle
import re
import string
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.utils import shuffle

from google.colab import files

In [0]:
# setting fixed seed value for consistency in results
seed = 7
np.random.seed(seed)

In [0]:
# fetching train and test sets - https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data
# reuploaded on Drive for reproducability - https://drive.google.com/file/d/1K0o5SGAYtraa4p0h3qqIe3k1QjdUwJCW/view?usp=sharing

train_data = pd.read_csv('https://drive.google.com/uc?export=view&id=1K0o5SGAYtraa4p0h3qqIe3k1QjdUwJCW')
test_data = pd.read_csv('https://drive.google.com/uc?export=view&id=1kYR1n0runN4MujEo7JN8I0UGLFMRh6aZ')

In [77]:
test_data.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [6]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train_data['clean'] = 1 - train_data[label_cols].max(axis=1)
train_data.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805,0.898321
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342,0.302226
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [0]:
train_data['comment_text'].fillna("unknown", inplace=True)
test_data['comment_text'].fillna("unknown", inplace=True)

In [0]:
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')

def tokenize(s): 
  return re_tok.sub(r' \1 ', s).split()

In [0]:
n = train_data.shape[0]
vec = TfidfVectorizer(ngram_range=(1,2), 
                      tokenizer=tokenize,
                      min_df=3, 
                      max_df=0.9, 
                      strip_accents='unicode', 
                      use_idf=1,
                      smooth_idf=1, 
                      sublinear_tf=1
                     )

train_term_doc = vec.fit_transform(train_data['comment_text'])
test_term_doc = vec.transform(test_data['comment_text'])


In [32]:
test_term_doc

<153164x383170 sparse matrix of type '<class 'numpy.float64'>'
	with 15115206 stored elements in Compressed Sparse Row format>

In [0]:
X = train_term_doc

def pr(y_i, y):
    p = X[y==y_i].sum(0)
    return (p+1) / ((y==y_i).sum()+1)

In [0]:
def log_count(y):
  return np.log(pr(1,y) / pr(0,y))

In [0]:
clf = LogisticRegression(C = 2, 
                         dual = True)

In [0]:
test_term_doc = vec.transform(test_data['comment_text'])

In [72]:
# building the model and making predictions

test_x = test_term_doc
preds = np.zeros((len(train_data), len(label_cols)))

for i, j in enumerate(label_cols):
  y = train_data[j].values
  r = log_count(y)
#   X_nb = X.multiply(r)
  X_nb = X
  clf = clf.fit(X_nb, y)
  
  model_name = j+'model.pickle'
  
  pickle.dump(clf, open(model_name, 'wb'))

  from google.colab import files
  files.download(model_name)
  
#   preds[:,i] = clf.predict_proba(X_nb)[:,1]



In [0]:
y = train_data[label_cols[5]].values
r = log_count(y)
X_nb = X.multiply(r)
clf = clf.fit(X_nb, y)
  
model_name = label_cols[5]+'model.pickle'
  
pickle.dump(clf, open(model_name, 'wb'))

files.download(model_name)

In [0]:
# X_nb
# unseen_tf_doc

test_term_doc = vec.transform(test_data['comment_text'])

In [76]:
test_term_doc

<153164x383170 sparse matrix of type '<class 'numpy.float64'>'
	with 15115206 stored elements in Compressed Sparse Row format>

In [79]:
unseen_comment = {
    'comment_text': 'Son of a bitch'
}

# y = train_data[label_cols[5]].values[0:383170]
# r = log_count(y)

unseen_tf_doc = vec.transform([unseen_comment['comment_text']])
unseen_tf_doc

# unseen_tf_doc.multiply(r)
# clf.predict_proba(unseen_tf_doc.multiply(r))
  

<1x383170 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>