In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [2]:
# Cap on the number of rows read from each CSV so the notebook iterates quickly.
N_ROWS = 10000

# `nrows` stops parsing after N_ROWS records instead of loading the whole file
# and slicing it afterwards; the resulting frames hold the same leading rows.
train = pd.read_csv('input/train.csv', nrows=N_ROWS)
test = pd.read_csv('input/test.csv', nrows=N_ROWS)
# Truncated to the same length as `test`: its `id` column labels the test predictions.
sample = pd.read_csv('input/sample_submission.csv', nrows=N_ROWS)

In [3]:
# Preview the first rows: id, comment_text and the six label columns.
train.head()


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
import warnings
warnings.simplefilter("ignore", UserWarning)

In [5]:
# Normalise the comment column of both frames: missing comments become the
# '__nocomment__' placeholder, then every comment is lower-cased.
for frame in (train, test):
    filled = frame['comment_text'].fillna('__nocomment__')
    frame['comment_text'] = filled.map(lambda text: text.lower())

In [6]:
def customAuc(yActual, yPred):
    """Return the area under the ROC curve for one label.

    yActual holds the true labels and yPred the predicted scores. Both are
    passed straight to ``metrics.roc_curve`` and the resulting curve is
    integrated with ``metrics.auc``.
    """
    falsePos, truePos, _thresholds = metrics.roc_curve(yActual, yPred)
    return metrics.auc(falsePos, truePos)


# Target label columns; one binary classifier is trained per entry.
columnList = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Hold out 30% of the rows for validation; the fixed seed keeps the shuffled split repeatable.
xTrain, xValid, yTrain, yValid = train_test_split(
    train['comment_text'].values,
    train[columnList],
    test_size=0.3,
    shuffle=True,
    random_state=42,
)


In [7]:
# Building basic models: TF-IDF text vectorizers.
# The idf/tf switches are passed as real booleans: scikit-learn releases that
# validate parameter types do not accept the integer 1 in their place.

# Word-level features: 1- to 3-grams over \w+ tokens, English stop words removed.
tfv = TfidfVectorizer(min_df=3, max_features=20000, strip_accents='unicode', analyzer='word',
                      token_pattern=r'\w{1,}', ngram_range=(1, 3), use_idf=True, smooth_idf=True,
                      sublinear_tf=True, stop_words='english')

# Character-level features: 2- to 4-character n-grams. token_pattern and
# stop_words are omitted because they only apply when analyzer == 'word'.
cfv = TfidfVectorizer(min_df=3, max_features=50000, strip_accents='unicode', analyzer='char',
                      ngram_range=(2, 4), use_idf=True, smooth_idf=True, sublinear_tf=True)


In [8]:
train_text, test_text = train['comment_text'], test['comment_text']
all_text = pd.concat([train_text, test_text])

# The word vectorizer learns its vocabulary and idf weights from every comment
# in train and test, which includes the rows held out as xValid.
# NOTE(review): fitting on xTrain alone would keep hold-out text out of the
# features — confirm which behaviour is intended.
tfv.fit(all_text)


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=20000, min_df=3,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=1,
        stop_words='english', strip_accents='unicode', sublinear_tf=1,
        token_pattern='\\w{1,}', tokenizer=None, use_idf=1,
        vocabulary=None)

In [9]:
# Class balance of every label in the training split.
# (The former leading `yTrain.head()` was dropped: as a non-final expression
# its result was discarded and never displayed.)
for col in columnList:
    display(yTrain[col].value_counts())


0    6331
1     669
Name: toxic, dtype: int64

0    6928
1      72
Name: severe_toxic, dtype: int64

0    6618
1     382
Name: obscene, dtype: int64

0    6976
1      24
Name: threat, dtype: int64

0    6648
1     352
Name: insult, dtype: int64

0    6931
1      69
Name: identity_hate, dtype: int64

In [10]:
# Class balance of every label in the validation split.
# (The former leading `yValid.head()` was dropped: as a non-final expression
# its result was discarded and never displayed.)
for col in columnList:
    display(yValid[col].value_counts())

0    2698
1     302
Name: toxic, dtype: int64

0    2971
1      29
Name: severe_toxic, dtype: int64

0    2855
1     145
Name: obscene, dtype: int64

0    2991
1       9
Name: threat, dtype: int64

0    2858
1     142
Name: insult, dtype: int64

0    2985
1      15
Name: identity_hate, dtype: int64

In [11]:
# Project the three text sets onto the fitted word-level TF-IDF vocabulary.
xTrainTfv, xValidTfv, xTestTfv = (
    tfv.transform(docs) for docs in (xTrain, xValid, test.comment_text.values)
)

# The char-level (cfv) features and their hstack with the word features are not used in this run.

In [12]:
# Shape check: (rows, features) of the transformed test matrix.
xTestTfv.shape


(10000, 20000)

In [13]:
# Fitting a simple Logistic Regression on TFIDF
# --- Parameter Tuning ---
# --- Added Class Weight : Score Improved ---
# --- Changed Penalty to l1 : No improvement ---
# --- CV : No improvement ---

# One binary problem per label. The same estimator object is refit for each
# label, so after the loop `clf` holds the model of the last label only.
clf = LogisticRegression(C=1, class_weight='balanced')

# Collect the positive-class probabilities per label and build the frames once,
# instead of growing empty DataFrames column by column through `.loc`.
validProba = {}
testProba = {}
for col in columnList:
    # Announce the label before the (slow) fit so progress is visible while it runs.
    print('Fitting ...', col)
    clf.fit(xTrainTfv, yTrain[col])
    validProba[col] = clf.predict_proba(xValidTfv)[:, 1]
    testProba[col] = clf.predict_proba(xTestTfv)[:, 1]

predValid = pd.DataFrame(validProba, columns=columnList)
# Test predictions carry the ids taken from the sample submission file.
predTest = pd.DataFrame({'id': sample['id'], **testProba}, columns=['id'] + columnList)

# ROC AUC per label on the validation split. The list keeps its historical
# name although it stores AUC values, not log loss.
logLossValid = [customAuc(yValid[col], predValid[col]) for col in columnList]

print('Logistic AUC :', np.mean(logLossValid))

Fitting ... toxic
Fitting ... severe_toxic
Fitting ... obscene
Fitting ... threat
Fitting ... insult
Fitting ... identity_hate
Logistic AUC : 0.967166154471
