In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

## Data Exploration

In [2]:
# The six label columns of train.csv; the modeling cells fit one binary
# classifier per entry, in this order.
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [3]:
# Load the labelled training split and the unlabelled test split.
# Paths are relative to the notebook's working directory.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [4]:
# Preview the training frame: id, comment_text and the six 0/1 label columns.
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
# Preview the test frame: only id and comment_text, no label columns.
test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [9]:
# Number of test comments.
# NOTE(review): redundant with the row-count printout in the cell that follows.
len(test)

153164

In [10]:
# Number of training comments.
# NOTE(review): redundant with the row-count printout in the cell that follows.
len(train)

159571

In [6]:
# Row counts of both splits. len_of_rows is deliberately a float: the
# per-class percentage cell divides by it, and a float denominator gives
# true division on Python 2 as well as Python 3.
len_of_rows = float(len(train))

# print() with one pre-formatted string behaves the same on Python 2 and 3
# (the old `print "a", b` statement is a SyntaxError on Python 3) and matches
# the print(...format(...)) style used by the modeling cells.
print("total number of training rows {}".format(len_of_rows))
print("total number of test rows {}".format(len(test)))

total number of training rows 159571.0
total number of test rows 153164


In [7]:
# Share of training comments carrying each label, in percent.
# len_of_rows is the float row count of `train` set in the row-count cell.
for class_label in class_names:
    positive_share = train[class_label].sum() / len_of_rows * 100
    # Single-argument print() works identically on Python 2 and 3; the
    # original print statement is a SyntaxError on Python 3.
    print("percentage of comments with class {} is {}".format(class_label, positive_share))

"""Observation: few classes like threat, hate etc 
have severe class imbalance, we might need to address this"""

# How?

percentage of comments with class toxic is 9.58444830201
percentage of comments with class severe_toxic is 0.999555056997
percentage of comments with class obscene is 5.29482174079
percentage of comments with class threat is 0.299553176956
percentage of comments with class insult is 4.9363606169
percentage of comments with class identity_hate is 0.880485802558


'Observation: few classes like threat, hate etc \nhave severe class imbalance, we might need to address this'

In [8]:
# Select the rows labelled toxic and take the first five.
# NOTE(review): this result is never displayed -- the string literal below,
# not this expression, is the last expression of the cell, so only the string
# is echoed. Move the observation into a comment/markdown cell (or make this
# the last line) if the rows are meant to be shown as evidence.
train[train['toxic'] == 1].head()

"""Observation: Classes are not exclusive"""

# How?

'Observation: Classes are not exclusive'

## Feature Extraction

In [9]:
# Raw comment text of each split, plus both splits stacked (train first,
# then test); all_text is what the vectorizer is fitted on.
train_text = train['comment_text']
test_text = test['comment_text']
all_text = pd.concat([train_text, test_text])

In [10]:
# Word-level TF-IDF vectorizer (configured here, fitted in the next cell).
#   token_pattern r'\w{1,}' : any run of one or more word characters is a
#                             token, so single-character tokens are kept.
#   ngram_range=(1, 3)      : unigrams, bigrams and trigrams.
#   max_features=10000      : vocabulary capped at 10000 terms -- these are
#                             n-gram terms, not only single words.
word_n_gram_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 3),
    max_features=10000)


""" Consider unigram, bigram and trigram words while creating the TF-IDF based document term matrix, 
remove stopwords, retain only 10000 top words"""

' Consider unigram, bigram and trigram words while creating the TF-IDF based document term matrix, \nremove stopwords, retain only 10000 top words'

In [11]:
# Fit the vectorizer on train + test text combined (all_text).
# NOTE(review): this lets unlabelled test text influence the feature space.
# That is workable for a fixed competition test set, but the CV scores below
# then do not reflect performance on genuinely unseen text -- confirm this
# is intended.
word_n_gram_vectorizer.fit(all_text)

""" We will create the DTM on all documents so that we don't miss any words from the dictionary """

" We will create the DTM on all documents so that we don't miss any words from the dictionary "

In [12]:
# Vectorize the training comments with the already-fitted vectorizer.
train_features = word_n_gram_vectorizer.transform(train_text)

"""train feature set"""

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


'train feature set'

In [13]:
# Vectorize the test comments with the same fitted vectorizer, so train and
# test share one feature space.
test_features = word_n_gram_vectorizer.transform(test_text)

"""test feature set"""

'test feature set'

## Modeling without addressing class imbalance

In [14]:
# Baseline: one independent binary logistic regression per label, with no
# class re-weighting. Collects the per-label CV score and builds the
# submission frame (one probability column per label).
scores = []
submission = pd.DataFrame.from_dict({'id': test['id']})
for class_name in class_names:
    train_target = train[class_name]
    # The 'sag' solver shuffles the samples, so pin random_state: without it
    # the CV scores, the report and the submitted probabilities change from
    # run to run.
    classifier = LogisticRegression(C=0.1, solver='sag', random_state=0)

    # Mean ROC AUC over 5 cross-validation folds.
    cv_score = np.mean(cross_val_score(classifier, train_features, train_target, cv=5, scoring='roc_auc'))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    # Refit on the full training set for the report and the submission.
    classifier.fit(train_features, train_target)
    prediction_on_training = classifier.predict(train_features)
    # Detailed classification report for each class.
    # NOTE(review): computed on the same rows the model was just fitted on,
    # so precision/recall here are in-sample and optimistic.
    print(classification_report(train_target, prediction_on_training))

    # Predicted probability of the positive class on the test set, for the
    # submission.
    submission[class_name] = classifier.predict_proba(test_features)[:, 1]

print('Total CV score is {}'.format(np.mean(scores)))

submission.to_csv('submission.csv', index=False)



"""Observation: Recall for the classes of interest if very low, 
we need to invest more time in feature extraction and modelling """

CV score for class toxic is 0.957901486377
             precision    recall  f1-score   support

          0       0.94      1.00      0.97    144277
          1       0.98      0.39      0.56     15294

avg / total       0.94      0.94      0.93    159571

CV score for class severe_toxic is 0.983996016804
             precision    recall  f1-score   support

          0       0.99      1.00      1.00    157976
          1       0.54      0.07      0.13      1595

avg / total       0.99      0.99      0.99    159571

CV score for class obscene is 0.980180866174
             precision    recall  f1-score   support

          0       0.97      1.00      0.98    151122
          1       0.97      0.43      0.59      8449

avg / total       0.97      0.97      0.96    159571

CV score for class threat is 0.978334229857


  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      1.00      1.00    159093
          1       0.00      0.00      0.00       478

avg / total       0.99      1.00      1.00    159571

CV score for class insult is 0.969819873914
             precision    recall  f1-score   support

          0       0.97      1.00      0.98    151694
          1       0.87      0.32      0.47      7877

avg / total       0.96      0.96      0.96    159571

CV score for class identity_hate is 0.96813275868
             precision    recall  f1-score   support

          0       0.99      1.00      1.00    158166
          1       0.67      0.02      0.04      1405

avg / total       0.99      0.99      0.99    159571

Total CV score is 0.973060871968


'Observation: Recall for the classes of interest if very low, \nwe need to invest more time in feature extraction and modelling '

## Modeling with Class Weights to address class imbalance

In [15]:
# Same per-label logistic regression as the baseline cell, but with
# class_weight="balanced", which weights each class inversely to its
# frequency so the rare positive class counts as much as the negatives.
# NOTE(review): this cell duplicates the baseline cell except for
# class_weight; a shared function taking class_weight would remove the copy.
scores = []
submission = pd.DataFrame.from_dict({'id': test['id']})
for class_name in class_names:
    train_target = train[class_name]
    # The 'sag' solver shuffles the samples, so pin random_state: without it
    # the CV scores, the report and the submitted probabilities change from
    # run to run.
    classifier = LogisticRegression(C=0.1, solver='sag', class_weight="balanced", random_state=0)

    # Mean ROC AUC over 5 cross-validation folds.
    cv_score = np.mean(cross_val_score(classifier, train_features, train_target, cv=5, scoring='roc_auc'))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    # Refit on the full training set for the report and the submission.
    classifier.fit(train_features, train_target)
    prediction_on_training = classifier.predict(train_features)
    # Detailed classification report for each class.
    # NOTE(review): computed on the same rows the model was just fitted on,
    # so precision/recall here are in-sample and optimistic.
    print(classification_report(train_target, prediction_on_training))

    # Predicted probability of the positive class on the test set, for the
    # submission.
    submission[class_name] = classifier.predict_proba(test_features)[:, 1]

print('Total CV score is {}'.format(np.mean(scores)))

# Overwrites the submission.csv written by the baseline modeling cell.
submission.to_csv('submission.csv', index=False)



"""Observation: Recall has imporoved quite a bit
just by adding class weights proportionate, to class frequencies """

CV score for class toxic is 0.962312144152
             precision    recall  f1-score   support

          0       0.99      0.94      0.96    144277
          1       0.60      0.88      0.71     15294

avg / total       0.95      0.93      0.94    159571





CV score for class severe_toxic is 0.985191406134
             precision    recall  f1-score   support

          0       1.00      0.97      0.99    157976
          1       0.25      0.98      0.40      1595

avg / total       0.99      0.97      0.98    159571

CV score for class obscene is 0.982842462534
             precision    recall  f1-score   support

          0       1.00      0.97      0.98    151122
          1       0.66      0.92      0.77      8449

avg / total       0.98      0.97      0.97    159571

CV score for class threat is 0.959992548284
             precision    recall  f1-score   support

          0       1.00      0.91      0.96    159093
          1       0.03      0.95      0.06       478

avg / total       1.00      0.91      0.95    159571

CV score for class insult is 0.973818635561
             precision    recall  f1-score   support

          0       1.00      0.96      0.98    151694
          1       0.52      0.91      0.66      7877

avg / total

'Observation: Recall has imporoved quite a bit\njust by adding class weights proportionate, to class frequencies '

In [19]:
import pickle

In [35]:
# Inspect the type of the object currently bound to `classifier`.
# NOTE(review): looks like a leftover debugging cell; consider removing it.
type(classifier)

sklearn.linear_model.logistic.LogisticRegression

In [36]:
import pickle

In [77]:
# Round-trip `classifier` through pickle and check that the reloaded object
# still predicts.
# NOTE(review): `classifier` is whatever the name currently holds -- after
# the modeling loop that is the model for the last label only -- and the
# fitted vectorizer is not saved alongside it; confirm that is intended.

# Pickle data is bytes, so the file must be opened in binary mode: text mode
# ('w'/'r') raises TypeError on Python 3 and can corrupt the stream on Windows.
with open('classifier.pkl', 'wb') as f:
    pickle.dump(classifier, f)

# Safe to unpickle here only because the file was written just above;
# never pickle.load a file from an untrusted source.
with open('classifier.pkl', 'rb') as fr:
    pikl_obj = pickle.load(fr)
    # Single-argument print() works on both Python 2 and 3.
    print(pikl_obj.predict_proba(train_features))

[[0.93177029 0.06822971]
 [0.95762334 0.04237666]
 [0.98221175 0.01778825]
 ...
 [0.93909784 0.06090216]
 [0.9640783  0.0359217 ]
 [0.97046664 0.02953336]]
