In [1]:
# Import data using get_data.py module 
from get_data import load_data
import numpy as np

In [2]:
# Load the data
X_train, X_val, X_test, y_train, y_val, y_test = load_data()

In [3]:
# Conerting text to TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer(decode_error='ignore',stop_words='english')
train_tfidf = vect.fit_transform(X_train)
val_tfidf = vect.transform(X_val)
test_tfidf = vect.transform(X_test)

In [4]:
# Creating empty prediction array
col = ['toxic', 'severe_toxic', 'obscene', 'threat','insult', 'identity_hate']
pred_train = np.zeros((X_train.shape[0],len(col)))
pred_test = np.zeros((X_test.shape[0],len(col)))
pred_val = np.zeros((X_val.shape[0],len(col)))

In [5]:
# Load Logistic Regression model
from sklearn.linear_model import LogisticRegression
LogR = LogisticRegression()

In [6]:
# Predict on train, val and test datasets
for i,x in enumerate(col):
    LogR.fit(train_tfidf, y_train[x])
    pred_train[:,i] = LogR.predict_proba(train_tfidf)[:,1]
    pred_val[:,i] = LogR.predict_proba(val_tfidf)[:,1]
    pred_test[:,i] = LogR.predict_proba(test_tfidf)[:,1]
    print(x,"predicted!")

toxic predicted!
severe_toxic predicted!
obscene predicted!
threat predicted!
insult predicted!
identity_hate predicted!


In [7]:
from sklearn import metrics
for i,x in enumerate(col):
    print(x,"Train AUC:",metrics.roc_auc_score(y_train[x], pred_train[:,i]))
    print(x,"Val AUC:",metrics.roc_auc_score(y_val[x], pred_val[:,i]))
    print(x,"Test AUC:",metrics.roc_auc_score(y_test[x], pred_test[:,i]))

toxic Train AUC: 0.985669145484
toxic Val AUC: 0.969281957227
toxic Test AUC: 0.967624722551
severe_toxic Train AUC: 0.992689695181
severe_toxic Val AUC: 0.985720379516
severe_toxic Test AUC: 0.981633391905
obscene Train AUC: 0.993966841859
obscene Val AUC: 0.982284100316
obscene Test AUC: 0.987497090156
threat Train AUC: 0.9956640451
threat Val AUC: 0.971606244941
threat Test AUC: 0.981785843241
insult Train AUC: 0.988622465591
insult Val AUC: 0.973958306984
insult Test AUC: 0.975756320238
identity_hate Train AUC: 0.991503895837
identity_hate Val AUC: 0.975949369693
identity_hate Test AUC: 0.965025580003


In [8]:
fpr, tpr, thresholds = metrics.roc_curve(y_train[x], pred_train[:,i], pos_label=2)
print(col[i]," Train AUC:",metrics.roc_auc_score(y_test[x], pred_test[:,i]))

identity_hate  Train AUC: 0.965025580003


