In [0]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack

In [0]:
train = pd.read_csv('train.csv').fillna(' ')
test = pd.read_csv('test.csv').fillna(' ')

In [0]:
train_text = train['comment_text']
test_text = test['comment_text']

In [0]:
class_names = list(train)[2:]

In [0]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf = True,
    strip_accents = 'unicode',
    analyzer = 'word',
    token_pattern = r'\w{1,}',
    stop_words = 'english',
    ngram_range = (1, 1),
    max_features = 10000)

In [0]:
train_word_features = word_vectorizer.fit_transform(train_text)

In [7]:
train_word_features

<159571x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 3681492 stored elements in Compressed Sparse Row format>

In [0]:
test_word_features = word_vectorizer.transform(test_text)

In [0]:
scores = []
submission = pd.DataFrame.from_dict({'id': test['id']})

In [20]:
for class_name in class_names:
  train_target = train[class_name]
  classifier = LogisticRegression(C=0.1, solver='sag')

  cv_score = np.mean(cross_val_score(classifier, train_word_features, train_target, cv=3, scoring='roc_auc'))
  scores.append(cv_score)
  print('CV score for class {} is {}'.format(class_name, cv_score))

  classifier.fit(train_word_features, train_target)
  submission[class_name] = classifier.predict_proba(test_word_features)[:, 1] # only take the probabilities of being class 1 (YES class)

CV score for class toxic is 0.9583375820792469
CV score for class severe_toxic is 0.9846172710595207
CV score for class obscene is 0.9807977355773619
CV score for class threat is 0.976800494609043
CV score for class insult is 0.9699894937133021
CV score for class identity_hate is 0.9674462884710441


In [31]:
print(scores) # mean CV score of each label in a list

[0.9583376568539411, 0.9583375820792469, 0.9846172710595207, 0.9807977355773619, 0.976800494609043, 0.9699894937133021, 0.9674462884710441]


In [32]:
print('Total CV score is {}'.format(np.mean(scores)))

Total CV score is 0.9709037889090657


In [0]:
submission.to_csv('submission.csv', index=False)