In [1]:
import re
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Relative working directory for this notebook: '.input' joined with 'logistic'
# using the platform's path separator.
WORKING_DIR = os.path.join(
    '.input',
    'logistic',
)

In [3]:
# Read the training CSV into a DataFrame. The path is relative to the
# directory the notebook kernel runs in.
quora = pd.read_csv(filepath_or_buffer='../data/.input/train.csv')

In [4]:
# Preview the first ten rows of the loaded frame.
quora.head(n=10)

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0
5,00004f9a462a357c33be,"Is Gaza slowly becoming Auschwitz, Dachau or T...",0
6,00005059a06ee19e11ad,Why does Quora automatically ban conservative ...,0
7,0000559f875832745e2e,Is it crazy if I wash or wipe my groceries off...,0
8,00005bd3426b2d0c8305,"Is there such a thing as dressing moderately, ...",0
9,00006e6928c5df60eacb,Is it just me or have you ever been in this ph...,0


In [5]:
# Split the frame into train/test partitions using train_test_split's default
# proportions; the fixed seed makes the split repeatable.
quora_train, quora_test = train_test_split(quora, random_state=42)

# Separate the question text (model input) from the `target` column (label)
# for each partition.
quora_train_data, quora_train_labels = quora_train['question_text'], quora_train['target']
quora_test_data, quora_test_labels = quora_test['question_text'], quora_test['target']

In [6]:
%time
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

train_X = vectorizer.fit_transform(quora_train_data)
train_y = quora_train_labels

Wall time: 0 ns


In [7]:
# Encode the held-out questions with the vectorizer fitted on the training
# split (transform only, so the vocabulary is not refitted on test data).
test_y = quora_test_labels
test_X = vectorizer.transform(raw_documents=quora_test_data)

In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

params = {
    'n_estimators': sp_randint(5, 200), 
    'min_samples_split': sp_randint(2, 11), 
    'bootstrap': [True, False], 
    'criterion': ["gini", "entropy"]}
clf = RandomForestClassifier(random_state=42)
random_search = RandomizedSearchCV(clf, param_distributions=params,
                                   n_iter=20, cv=3, n_jobs=-1)

random_search.fit(train_X, train_y)

In [None]:
# The search was created with the default refit=True, so best_estimator_ has
# already been refitted on the full (train_X, train_y) with the winning
# parameters. Fitting it again would only repeat the most expensive step.
clf = random_search.best_estimator_

In [None]:
# plt, roc_auc_score and roc_curve come from the import cell at the top of the
# notebook; they are not re-imported here.

# Class probabilities for the held-out split; column index 1 is taken as the
# score for the positive class (the second entry of clf.classes_).
test_predict_proba = clf.predict_proba(test_X)
test_predict_proba_pos = test_predict_proba[:, 1]

auc = roc_auc_score(test_y, test_predict_proba_pos)
curve = roc_curve(test_y, test_predict_proba_pos)  # (fpr, tpr, thresholds)
fpr, tpr = curve[0], curve[1]

# ROC curve plus the diagonal reference line, with a small margin around the
# unit square so the curve's end points are visible.
fig, ax = plt.subplots()
ax.plot(fpr, tpr)
ax.plot([0, 1], [0, 1])
ax.set(xlabel='FPR', ylabel='TPR', title='test AUC = %f' % (auc))
ax.axis([-0.05, 1.05, -0.05, 1.05]);

In [None]:
# np and the sklearn metric functions come from the import cell at the top of
# the notebook; they are not re-imported here.

def to_labels(proba, threshold=0.5):
    """Convert positive-class probabilities to hard 0/1 labels.

    Parameters
    ----------
    proba : array-like of float
        Scores to threshold.
    threshold : float, default 0.5
        A score strictly greater than this maps to 1, otherwise 0.

    Returns
    -------
    numpy.ndarray of int
        Labels with the same shape as `proba`.
    """
    # A single vectorized comparison replaces a per-element Python lambda and
    # also handles empty input.
    return (np.asarray(proba) > threshold).astype(int)


test_predict = to_labels(test_predict_proba_pos)

print('accuracy', accuracy_score(test_y, test_predict))
print('precision', precision_score(test_y, test_predict))
print('recall', recall_score(test_y, test_predict))
print('f1', f1_score(test_y, test_predict))
# Last expression: the confusion matrix is shown as the cell's output.
confusion_matrix(test_y, test_predict)
