In [1]:
import numpy as np
import pandas as pd
pd.set_option('max_colwidth', 240)

In [2]:
df = pd.read_csv('train.csv', index_col = 'qid')

In [3]:
from sklearn.model_selection import train_test_split
X, Y = df.question_text, df.target
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25)

In [4]:
from sklearn import metrics
def maxFscore(estimator, X, y):
    scores = estimator.predict_proba(X)
    pr, re, th = metrics.precision_recall_curve(y_test, scores[:,1])
    pr, re, th = pr[:-2], re[:-2], th[:-1]
    fs = 2*np.divide(np.multiply(pr, re), np.add(pr, re))
    return np.max(fs)

In [10]:
from time import time

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ParameterGrid
from sklearn.pipeline import Pipeline

parameters = {
    'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),  # unigrams or bigrams
    'vect__binary': (True, False),
    'vect__stop_words': ['english'],
    
    'lr__C': (1e-2, 1e-1, 1, 10)
}
pgrid = ParameterGrid(parameters)

Automatically created module for IPython interactive environment


In [11]:
print("Performing grid search...")
print("parameters:")
pprint(parameters)
t0 = time()
best_fscore = 0.0
best_params = None
results = []

for p in pgrid:
    print("testing: ", p)
    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('lr', LogisticRegression()),
    ]).set_params(**p)
    pipeline.fit(X_train, y_train)
    fscore_p = maxFscore(pipeline, X_test, y_test)
    results.append((p, fscore_p))
    if fscore_p > best_fscore:
        best_params = p
        best_fscore = fscore_p
        print("new best fscore: ", best_fscore)
        
print("done in %0.3fs" % (time() - t0))

Performing grid search...
parameters:
{'lr__C': (0.01, 0.1, 1, 10),
 'vect__binary': (True, False),
 'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),
 'vect__stop_words': ['english']}
testing:  {'lr__C': 0.01, 'vect__binary': True, 'vect__ngram_range': (1, 1), 'vect__stop_words': 'english'}
new best fscore:  0.5551907374605781
testing:  {'lr__C': 0.01, 'vect__binary': True, 'vect__ngram_range': (1, 2), 'vect__stop_words': 'english'}
new best fscore:  0.5608747428818881
testing:  {'lr__C': 0.01, 'vect__binary': True, 'vect__ngram_range': (1, 3), 'vect__stop_words': 'english'}
new best fscore:  0.561124588459539
testing:  {'lr__C': 0.01, 'vect__binary': False, 'vect__ngram_range': (1, 1), 'vect__stop_words': 'english'}
testing:  {'lr__C': 0.01, 'vect__binary': False, 'vect__ngram_range': (1, 2), 'vect__stop_words': 'english'}
new best fscore:  0.5616273056369591
testing:  {'lr__C': 0.01, 'vect__binary': False, 'vect__ngram_range': (1, 3), 'vect__stop_words': 'english'}
new best fscore:  0.

In [14]:
pl = Pipeline([
        ('vect', CountVectorizer()),
        ('lr', LogisticRegression()),
    ]).set_params(**best_params)

In [15]:
pl.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words='english',
        s...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [16]:
scores = pl.predict_proba(X_test)
pr, re, th = metrics.precision_recall_curve(y_test, scores[:,1])
pr, re, th = pr[:-2], re[:-2], th[:-1]
fs = 2*np.divide(np.multiply(pr, re), np.add(pr, re))
opt_th = th[np.argmax(fs)]

In [19]:
y_pred = scores[:,1] > opt_th
cfm = metrics.confusion_matrix(y_test, y_pred)

In [20]:
cfm

array([[295580,  10674],
       [  6334,  13943]])

In [22]:
results

[({'lr__C': 0.01,
   'vect__binary': True,
   'vect__ngram_range': (1, 1),
   'vect__stop_words': 'english'},
  0.5551907374605781),
 ({'lr__C': 0.01,
   'vect__binary': True,
   'vect__ngram_range': (1, 2),
   'vect__stop_words': 'english'},
  0.5608747428818881),
 ({'lr__C': 0.01,
   'vect__binary': True,
   'vect__ngram_range': (1, 3),
   'vect__stop_words': 'english'},
  0.561124588459539),
 ({'lr__C': 0.01,
   'vect__binary': False,
   'vect__ngram_range': (1, 1),
   'vect__stop_words': 'english'},
  0.5563386297625149),
 ({'lr__C': 0.01,
   'vect__binary': False,
   'vect__ngram_range': (1, 2),
   'vect__stop_words': 'english'},
  0.5616273056369591),
 ({'lr__C': 0.01,
   'vect__binary': False,
   'vect__ngram_range': (1, 3),
   'vect__stop_words': 'english'},
  0.5617302493241213),
 ({'lr__C': 0.1,
   'vect__binary': True,
   'vect__ngram_range': (1, 1),
   'vect__stop_words': 'english'},
  0.5995827819026043),
 ({'lr__C': 0.1,
   'vect__binary': True,
   'vect__ngram_range': (1