In [3]:
import pandas as pd
import glob
from sklearn.feature_extraction.text import CountVectorizer

## Reading in data

Let's start out with the train / dev subsets proposed by the authors. 

In [2]:
train = pd.read_csv('bias_data/WNC/biased.word.train', sep='\t', header=None)
dev = pd.read_csv('bias_data/WNC/biased.word.dev', sep='\t', header=None)
test = pd.read_csv('bias_data/WNC/biased.word.test', sep='\t', header=None)

In [3]:
print("Train length: {}".format(len(train)))
print("Dev length: {}".format(len(dev)))
print("Test length: {}".format(len(test)))

Train length: 53803
Dev length: 700
Test length: 1000


In [4]:
def read_process_data(train_path, dev_path):
    train = pd.read_csv(train_path, sep='\t', header=None)
    dev = pd.read_csv(dev_path, sep = '\t', header=None)
    
    print("Train length: {}".format(len(train)))
    print("Dev length: {}".format(len(dev)))
    
    #Rename columns
    train.columns = ['id', 'annot_old', 'annot_new', 'biased', 'unbiased', 'tags', 'roots']
    dev.columns = ['id', 'annot_old', 'annot_new', 'biased', 'unbiased', 'tags', 'roots']
    
    #Process and recombine training data: 
    train_biased, dev_biased = pd.DataFrame(train.biased), pd.DataFrame(dev.biased)
    train_unbiased, dev_unbiased = pd.DataFrame(train.unbiased), pd.DataFrame(dev.unbiased)
    
    
    train_biased['label'], dev_biased['label'] = [1]*len(train_biased), [1]*len(dev_biased)
    train_unbiased['label'], dev_unbiased['label'] = [0]*len(train_unbiased), [0]*len(dev_unbiased)

    #Fix colnames
    train_biased.columns, dev_biased.columns = ['text', 'label'], ['text', 'label']
    train_unbiased.columns, dev_unbiased.columns = ['text', 'label'], ['text', 'label']
    #Combine
    train_all, dev_all = pd.concat([train_biased, train_unbiased]), pd.concat([dev_biased, dev_unbiased])
    
    return train_all, dev_all

In [5]:
train, dev = read_process_data('bias_data/WNC/biased.word.train', 'bias_data/WNC/biased.word.dev')

Train length: 53803
Dev length: 700


In [79]:
train.columns = ['id', 'annot_old', 'annot_new', 'biased', 'unbiased', 'tags', 'roots']
dev.columns = ['id', 'annot_old', 'annot_new', 'biased', 'unbiased', 'tags', 'roots']

In [80]:
train_biased = pd.DataFrame(train.biased)
train_unbiased = pd.DataFrame(train.unbiased)

#Add labels
train_biased['label'] = [1]*len(train_biased)
train_unbiased['label'] = [0]*len(train_unbiased)

#Fix colnames
train_biased.columns = ['text', 'label']
train_unbiased.columns = ['text', 'label']

In [81]:
#Shmush
train_all = pd.concat([train_biased, train_unbiased])

In [104]:
dev

Unnamed: 0,text,label
0,in addition to sponsoring palestinian terror a...,1
1,the game is currently played in 47 countries w...,1
2,no part of the valley lies in the area current...,1
3,scholars perceived that it was discordant with...,1
4,"since the chinese civil war in 1949, taiwan ha...",1
...,...,...
695,in 2008 five pharmaceutical companies received...,0
696,"the palm, a steakhouse restaurant chain origin...",0
697,d.c. united's early successes,0
698,on 29 june 2007 price gave birth to her third ...,0


In [None]:
dev_biased = pd.DataFrame(dev.biased)

# Experimentation on the smaller, subset
## Construct bag-of-words representations.

In [105]:
#Initialize vectorizer
vectorizer = CountVectorizer()

In [106]:
X_train = vectorizer.fit_transform(train.text)
y_train = train.label

X_dev = vectorizer.transform(dev.text)
y_dev = dev.label

In [109]:
print(X_train.shape)
print(X_dev.shape)


(107606, 79211)
(1400, 79211)


In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

### Attempt 1A: Plain bag of words into logistic regression

In [116]:
#Was running out of iterations hahaa
lr = LogisticRegression(max_iter=1000)

In [117]:
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [126]:
lr.predict(X_train)

array([1, 1, 1, ..., 0, 1, 0])

In [129]:
print("Training set accuracy: {}".format(lr.score(X_train, y_train)))
print("Val set accuracy: {}".format(lr.score(X_dev, y_dev)))

Training set accuracy: 0.77603479359887
Val set accuracy: 0.6814285714285714


### Attempt 1B: Plain bag of words into random forest
Notice the SEVERE overfitting here, which is strange given the model used. 

In [131]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [132]:
print("Training set accuracy: {}".format(rf.score(X_train, y_train)))
print("Val set accuracy: {}".format(rf.score(X_dev, y_dev)))

Training set accuracy: 0.9942103600170994
Val set accuracy: 0.6757142857142857


### Attempt 2: TF-IDF

In [134]:
from sklearn.feature_extraction.text import TfidfVectorizer as Tfidf

In [135]:
tfidf = Tfidf()
X_train = tfidf.fit_transform(train.text)
y_train = train.label

X_dev = tfidf.transform(dev.text)
y_dev = dev.label

### Attempt 2a: TF-IDF Logistic Regression

In [136]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
#This one finishes in way fewer iterations! Interesting

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [138]:
print("Training set accuracy: {}".format(lr.score(X_train, y_train)))
print("Val set accuracy: {}".format(lr.score(X_dev, y_dev)))

Training set accuracy: 0.706196680482501
Val set accuracy: 0.6842857142857143


### Attempt 2b: TF-IDF Random Forest
Still overfitting, which makes sense...

In [139]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [140]:
print("Training set accuracy: {}".format(rf.score(X_train, y_train)))
print("Val set accuracy: {}".format(rf.score(X_dev, y_dev)))

Training set accuracy: 0.9942475326654647
Val set accuracy: 0.6564285714285715


## Attempt 3: Using embeddings?

Plain rf on the data was taking way too long, imo

In [9]:
from zeugma.embeddings import EmbeddingTransformer
from sklearn.
glove = EmbeddingTransformer('glove')
X_train = glove.transform(train.text)
y_train = train.label


In [10]:
X_dev = glove.transform(dev.text)
y_dev = dev.label

In [13]:
rf = RandomForestClassifier()

In [16]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [18]:
rf.score(X_dev, y_dev)

0.5457142857142857

In [19]:
lr = LogisticRegression().fit(X_train,y_train)
print(lr.score(X_train, y_train))
print(lr.score(X_dev, y_dev))

0.5556381614408118
0.5557142857142857


## It appears that using GloVe embeddings, while it speeds computation time for random forest, degrades accuracy a LOT
Looks like tf-idf with logistic hecking regression is the winner. Let's tune 'er up

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer as Tfidf

tfidf = Tfidf()
X_train = tfidf.fit_transform(train.text)
y_train = train.label

X_dev = tfidf.transform(dev.text)
y_dev = dev.label

### Attempt 2a: TF-IDF Logistic Regression

lr = LogisticRegression()
lr.fit(X_train, y_train)
#This one finishes in way fewer iterations! Interesting

print("Training set accuracy: {}".format(lr.score(X_train, y_train)))
print("Val set accuracy: {}".format(lr.score(X_dev, y_dev)))

Training set accuracy: 0.706196680482501
Val set accuracy: 0.6842857142857143


# Data preparation for the full dataset


In [58]:
full = pd.read_csv('bias_data/WNC/biased.full', sep='\t', header=None, error_bad_lines=False)

b'Skipping line 60908: expected 7 fields, saw 9\n'


In [59]:
full.head()[4][3]

'dennis the menace is an american animated series produced by dic entertainment, based on the comic strip by hank ketcham.'

In [60]:
full_biased = pd.DataFrame(full[3])
full_unbiased = pd.DataFrame(full[4])

In [61]:
full_biased['label'] = [1]*len(full_biased)
full_unbiased['label'] = [0]*len(full_unbiased)


full_biased.columns = ['text', 'label']
full_unbiased.columns = ['text', 'label']

In [62]:
full = pd.concat([full_biased, full_unbiased])

In [63]:
full.head()

Unnamed: 0,text,label
0,"during the campaign, controversy erupted over ...",1
1,nicaea was convoked by the emperor constantine...,1
2,it was rather unfortunate that he vehemently o...,1
3,dennis the menace is an american animated seri...,1
4,"today, on large farms, motorcycles, dogs or me...",1


In [64]:
full.tail()

Unnamed: 0,text,label
181468,"arguably the most notable was dale earnhardt, ...",0
181469,because of the genetic prepotency of the ancie...,0
181470,"influenced by serge gainsbourg, the velvet und...",0
181471,"northern indiana, for example, contains the in...",0
181472,mythology is alive and well in the modern age ...,0


In [67]:
#Shuffle full training set
full = full.sample(random_state=42, frac=1, replace=False)

In [74]:
full_train = full.iloc[0:308504]
full_dev = full.iloc[308504:326651]
full_test = full.iloc[326651:]

In [75]:
full_train.to_csv('processed_data/full_train.csv', index=False, header=False)
full_dev.to_csv('processed_data/full_dev.csv', index=False, header=False)
full_test.to_csv('processed_data/full_test.csv', index=False, header=False)

In [77]:
from sklearn.feature_extraction.text import TfidfVectorizer as Tfidf

tfidf = Tfidf()
X_train = tfidf.fit_transform(full_train.text)
y_train = full_train.label

X_dev = tfidf.transform(full_dev.text)
y_dev = full_dev.label

### Attempt 2a: TF-IDF Logistic Regression

lr = LogisticRegression(max_iter=500)
lr.fit(X_train, y_train)
#This one finishes in way fewer iterations! Interesting

print("Training set accuracy: {}".format(lr.score(X_train, y_train)))
print("Val set accuracy: {}".format(lr.score(X_dev, y_dev)))

Training set accuracy: 0.6789928169488888
Val set accuracy: 0.5785529288587645


In [80]:
X_train = tfidf.fit_transform(train.text)
y_train = train.label

X_dev = tfidf.transform(dev.text)
y_dev = dev.label

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

print("Training set accuracy: {}".format(lr.score(X_train, y_train)))
print("Val set accuracy: {}".format(lr.score(X_dev, y_dev)))

Training set accuracy: 0.706196680482501
Val set accuracy: 0.6842857142857143


In [93]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ('tfidf', Tfidf()),
    ('clf', LogisticRegression(max_iter=1000))
])


parameters = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__ngram_range': [(1, 1), (1,2), (1,3)],
    'clf__C': [0.001, 0.01, 0.1, 1, 10]
}

In [94]:
from sklearn.model_selection import GridSearchCV
search = GridSearchCV(pipeline, parameters, verbose=2)
search.fit(train.text, train.label)

Fitting 5 folds for each of 45 candidates, totalling 225 fits
[CV] clf__C=0.001, tfidf__max_df=0.25, tfidf__ngram_range=(1, 1) .....


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  clf__C=0.001, tfidf__max_df=0.25, tfidf__ngram_range=(1, 1), total=   2.6s
[CV] clf__C=0.001, tfidf__max_df=0.25, tfidf__ngram_range=(1, 1) .....


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.6s remaining:    0.0s


[CV]  clf__C=0.001, tfidf__max_df=0.25, tfidf__ngram_range=(1, 1), total=   2.6s
[CV] clf__C=0.001, tfidf__max_df=0.25, tfidf__ngram_range=(1, 1) .....
[CV]  clf__C=0.001, tfidf__max_df=0.25, tfidf__ngram_range=(1, 1), total=   2.6s
[CV] clf__C=0.001, tfidf__max_df=0.25, tfidf__ngram_range=(1, 1) .....
[CV]  clf__C=0.001, tfidf__max_df=0.25, tfidf__ngram_range=(1, 1), total=   2.6s
[CV] clf__C=0.001, tfidf__max_df=0.25, tfidf__ngram_range=(1, 1) .....
[CV]  clf__C=0.001, tfidf__max_df=0.25, tfidf__ngram_range=(1, 1), total=   2.6s
[CV] clf__C=0.001, tfidf__max_df=0.25, tfidf__ngram_range=(1, 2) .....
[CV]  clf__C=0.001, tfidf__max_df=0.25, tfidf__ngram_range=(1, 2), total=   8.2s
[CV] clf__C=0.001, tfidf__max_df=0.25, tfidf__ngram_range=(1, 2) .....
[CV]  clf__C=0.001, tfidf__max_df=0.25, tfidf__ngram_range=(1, 2), total=   8.3s
[CV] clf__C=0.001, tfidf__max_df=0.25, tfidf__ngram_range=(1, 2) .....
[CV]  clf__C=0.001, tfidf__max_df=0.25, tfidf__ngram_range=(1, 2), total=   8.2s
[CV] cl

[CV]  clf__C=0.01, tfidf__max_df=0.25, tfidf__ngram_range=(1, 3), total=  16.3s
[CV] clf__C=0.01, tfidf__max_df=0.25, tfidf__ngram_range=(1, 3) ......
[CV]  clf__C=0.01, tfidf__max_df=0.25, tfidf__ngram_range=(1, 3), total=  16.3s
[CV] clf__C=0.01, tfidf__max_df=0.25, tfidf__ngram_range=(1, 3) ......
[CV]  clf__C=0.01, tfidf__max_df=0.25, tfidf__ngram_range=(1, 3), total=  16.8s
[CV] clf__C=0.01, tfidf__max_df=0.25, tfidf__ngram_range=(1, 3) ......
[CV]  clf__C=0.01, tfidf__max_df=0.25, tfidf__ngram_range=(1, 3), total=  16.5s
[CV] clf__C=0.01, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1) .......
[CV]  clf__C=0.01, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), total=   2.6s
[CV] clf__C=0.01, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1) .......
[CV]  clf__C=0.01, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), total=   2.6s
[CV] clf__C=0.01, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1) .......
[CV]  clf__C=0.01, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), total=   2.6s
[CV] clf__C=0.01,

[CV]  clf__C=0.1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), total=  10.0s
[CV] clf__C=0.1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2) ........
[CV]  clf__C=0.1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), total=  10.5s
[CV] clf__C=0.1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2) ........
[CV]  clf__C=0.1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), total=   8.6s
[CV] clf__C=0.1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2) ........
[CV]  clf__C=0.1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), total=   8.7s
[CV] clf__C=0.1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 3) ........
[CV]  clf__C=0.1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 3), total=  17.2s
[CV] clf__C=0.1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 3) ........
[CV]  clf__C=0.1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 3), total=  20.4s
[CV] clf__C=0.1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 3) ........
[CV]  clf__C=0.1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 3), total=  22.2s
[CV] clf__C=0.1, tfidf__max_

[CV]  clf__C=1, tfidf__max_df=0.75, tfidf__ngram_range=(1, 1), total=   3.3s
[CV] clf__C=1, tfidf__max_df=0.75, tfidf__ngram_range=(1, 1) .........
[CV]  clf__C=1, tfidf__max_df=0.75, tfidf__ngram_range=(1, 1), total=   4.8s
[CV] clf__C=1, tfidf__max_df=0.75, tfidf__ngram_range=(1, 1) .........
[CV]  clf__C=1, tfidf__max_df=0.75, tfidf__ngram_range=(1, 1), total=   4.9s
[CV] clf__C=1, tfidf__max_df=0.75, tfidf__ngram_range=(1, 2) .........
[CV]  clf__C=1, tfidf__max_df=0.75, tfidf__ngram_range=(1, 2), total=  17.8s
[CV] clf__C=1, tfidf__max_df=0.75, tfidf__ngram_range=(1, 2) .........
[CV]  clf__C=1, tfidf__max_df=0.75, tfidf__ngram_range=(1, 2), total=  17.4s
[CV] clf__C=1, tfidf__max_df=0.75, tfidf__ngram_range=(1, 2) .........
[CV]  clf__C=1, tfidf__max_df=0.75, tfidf__ngram_range=(1, 2), total=  16.5s
[CV] clf__C=1, tfidf__max_df=0.75, tfidf__ngram_range=(1, 2) .........
[CV]  clf__C=1, tfidf__max_df=0.75, tfidf__ngram_range=(1, 2), total=  16.5s
[CV] clf__C=1, tfidf__max_df=0.75, 

[CV]  clf__C=10, tfidf__max_df=0.75, tfidf__ngram_range=(1, 3), total= 1.0min
[CV] clf__C=10, tfidf__max_df=0.75, tfidf__ngram_range=(1, 3) ........
[CV]  clf__C=10, tfidf__max_df=0.75, tfidf__ngram_range=(1, 3), total= 1.3min


[Parallel(n_jobs=1)]: Done 225 out of 225 | elapsed: 57.9min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                      

In [99]:
best = search.best_estimator_

In [101]:
best.get_params()

{'memory': None,
 'steps': [('tfidf',
   TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                   dtype=<class 'numpy.float64'>, encoding='utf-8',
                   input='content', lowercase=True, max_df=0.25, max_features=None,
                   min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                   smooth_idf=True, stop_words=None, strip_accents=None,
                   sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                   tokenizer=None, use_idf=True, vocabulary=None)),
  ('clf',
   LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
                      intercept_scaling=1, l1_ratio=None, max_iter=1000,
                      multi_class='auto', n_jobs=None, penalty='l2',
                      random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                      warm_start=False))],
 'verbose': False,
 'tfidf': TfidfVectorizer(analyzer='word', binary=False, decode_error=

C = 10, max_iter 100, max-df = 0.25

In [102]:
from sklearn.pipeline import Pipeline
fine_pipe = Pipeline([
    ('tfidf', Tfidf()),
    ('clf', LogisticRegression(max_iter=1000))
])


fine_parameters = {
    'tfidf__max_df': (0.25, 0.375, 0.5),
    'tfidf__ngram_range': [(1,2)],
    'clf__C': [1, 10, 25, 50]
}

In [104]:
fine_search = GridSearchCV(fine_pipe, fine_parameters, verbose=2)
fine_search.fit(train.text, train.label)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] clf__C=1, tfidf__max_df=0.25, tfidf__ngram_range=(1, 2) .........


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  clf__C=1, tfidf__max_df=0.25, tfidf__ngram_range=(1, 2), total=  15.0s
[CV] clf__C=1, tfidf__max_df=0.25, tfidf__ngram_range=(1, 2) .........


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   15.0s remaining:    0.0s


[CV]  clf__C=1, tfidf__max_df=0.25, tfidf__ngram_range=(1, 2), total=  14.3s
[CV] clf__C=1, tfidf__max_df=0.25, tfidf__ngram_range=(1, 2) .........
[CV]  clf__C=1, tfidf__max_df=0.25, tfidf__ngram_range=(1, 2), total=  14.9s
[CV] clf__C=1, tfidf__max_df=0.25, tfidf__ngram_range=(1, 2) .........
[CV]  clf__C=1, tfidf__max_df=0.25, tfidf__ngram_range=(1, 2), total=  11.3s
[CV] clf__C=1, tfidf__max_df=0.25, tfidf__ngram_range=(1, 2) .........
[CV]  clf__C=1, tfidf__max_df=0.25, tfidf__ngram_range=(1, 2), total=  11.3s
[CV] clf__C=1, tfidf__max_df=0.375, tfidf__ngram_range=(1, 2) ........
[CV]  clf__C=1, tfidf__max_df=0.375, tfidf__ngram_range=(1, 2), total=  14.1s
[CV] clf__C=1, tfidf__max_df=0.375, tfidf__ngram_range=(1, 2) ........
[CV]  clf__C=1, tfidf__max_df=0.375, tfidf__ngram_range=(1, 2), total=  13.4s
[CV] clf__C=1, tfidf__max_df=0.375, tfidf__ngram_range=(1, 2) ........
[CV]  clf__C=1, tfidf__max_df=0.375, tfidf__ngram_range=(1, 2), total=  13.0s
[CV] clf__C=1, tfidf__max_df=0.3

[CV]  clf__C=50, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), total= 1.1min
[CV] clf__C=50, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2) .........
[CV]  clf__C=50, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), total=  57.2s
[CV] clf__C=50, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2) .........
[CV]  clf__C=50, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), total=  58.8s


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed: 38.4min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                      

In [105]:
fine_search.best_estimator_.get_params()

{'memory': None,
 'steps': [('tfidf',
   TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                   dtype=<class 'numpy.float64'>, encoding='utf-8',
                   input='content', lowercase=True, max_df=0.25, max_features=None,
                   min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                   smooth_idf=True, stop_words=None, strip_accents=None,
                   sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                   tokenizer=None, use_idf=True, vocabulary=None)),
  ('clf',
   LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
                      intercept_scaling=1, l1_ratio=None, max_iter=1000,
                      multi_class='auto', n_jobs=None, penalty='l2',
                      random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                      warm_start=False))],
 'verbose': False,
 'tfidf': TfidfVectorizer(analyzer='word', binary=False, decode_error=

In [117]:
fine = fine_search.best_estimator_

## Grid search on the smaller training set

In [112]:
from sklearn.pipeline import Pipeline
final_pipe = Pipeline([
    ('tfidf', Tfidf()),
    ('clf', LogisticRegression(max_iter=1000))
])


final_params = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__min_df': (0.001, 0.01, 0.05),
    'tfidf__ngram_range': [(1, 1), (1,2)],
    'clf__C': [0.01, 0.1, 1, 10]
}

In [113]:
final_grid = GridSearchCV(final_pipe, final_params, verbose=2)
final_grid.fit(train.text, train.label)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV] clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   2.5s
[CV] clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.5s remaining:    0.0s


[CV]  clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   2.4s
[CV] clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   2.5s
[CV] clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   2.5s
[CV] clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   2.5s
[CV] clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2), total=   7.6s
[CV] clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf

[CV]  clf__C=0.01, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2), total=   6.7s
[CV] clf__C=0.01, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.01, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2), total=   6.7s
[CV] clf__C=0.01, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.01, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2), total=   6.6s
[CV] clf__C=0.01, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.01, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2), total=   7.4s
[CV] clf__C=0.01, tfidf__max_df=0.5, tfidf__min_df=0.05, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.01, tfidf__max_df=0.5, tfidf__min_df=0.05, tfidf__ngram_range=(1, 1), total=   3.0s
[CV] clf__C=0.01, tfidf__max_df=0.5, tfidf__min_df=0.05, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.01, tfidf__max_df=0.5, tfidf__min_df=0.05, tfidf__ngram_range=(1, 1), 

[CV]  clf__C=0.1, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   2.5s
[CV] clf__C=0.1, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.1, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   2.7s
[CV] clf__C=0.1, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.1, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   2.5s
[CV] clf__C=0.1, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.1, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   2.5s
[CV] clf__C=0.1, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.1, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2), total=   6.9s
[CV] clf__C=0.1, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.1, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_ran

[CV]  clf__C=0.1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2), total=   6.7s
[CV] clf__C=0.1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2), total=   7.3s
[CV] clf__C=0.1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2), total=   7.1s
[CV] clf__C=0.1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2), total=   6.9s
[CV] clf__C=0.1, tfidf__max_df=0.5, tfidf__min_df=0.05, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.1, tfidf__max_df=0.5, tfidf__min_df=0.05, tfidf__ngram_range=(1, 1), total=   2.4s
[CV] clf__C=0.1, tfidf__max_df=0.5, tfidf__min_df=0.05, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.1, tfidf__max_df=0.5, tfidf__min_df=0.05, tfidf__ngram_range=(1, 1), total=   2.

[CV]  clf__C=1, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   2.7s
[CV] clf__C=1, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=1, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   2.7s
[CV] clf__C=1, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=1, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   2.7s
[CV] clf__C=1, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=1, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   2.7s
[CV] clf__C=1, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=1, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2), total=   8.0s
[CV] clf__C=1, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=1, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2), total=   7.

[CV]  clf__C=1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2), total=   6.8s
[CV] clf__C=1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2), total=   6.8s
[CV] clf__C=1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2), total=   6.8s
[CV] clf__C=1, tfidf__max_df=0.5, tfidf__min_df=0.05, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=1, tfidf__max_df=0.5, tfidf__min_df=0.05, tfidf__ngram_range=(1, 1), total=   2.3s
[CV] clf__C=1, tfidf__max_df=0.5, tfidf__min_df=0.05, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=1, tfidf__max_df=0.5, tfidf__min_df=0.05, tfidf__ngram_range=(1, 1), total=   2.3s
[CV] clf__C=1, tfidf__max_df=0.5, tfidf__min_df=0.05, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=1, tfidf__max_df=0.5, tfidf__min_df=0.05, tfidf__ngram_range=(1, 1), total=   2.3s
[CV] clf__C=1, tfid

[CV]  clf__C=10, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   3.3s
[CV] clf__C=10, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=10, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   3.3s
[CV] clf__C=10, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=10, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2), total=   9.5s
[CV] clf__C=10, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=10, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2), total=   9.3s
[CV] clf__C=10, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=10, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2), total=   9.3s
[CV] clf__C=10, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=10, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2), 

[CV]  clf__C=10, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2), total=   7.4s
[CV] clf__C=10, tfidf__max_df=0.5, tfidf__min_df=0.05, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=10, tfidf__max_df=0.5, tfidf__min_df=0.05, tfidf__ngram_range=(1, 1), total=   2.5s
[CV] clf__C=10, tfidf__max_df=0.5, tfidf__min_df=0.05, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=10, tfidf__max_df=0.5, tfidf__min_df=0.05, tfidf__ngram_range=(1, 1), total=   2.5s
[CV] clf__C=10, tfidf__max_df=0.5, tfidf__min_df=0.05, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=10, tfidf__max_df=0.5, tfidf__min_df=0.05, tfidf__ngram_range=(1, 1), total=   2.6s
[CV] clf__C=10, tfidf__max_df=0.5, tfidf__min_df=0.05, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=10, tfidf__max_df=0.5, tfidf__min_df=0.05, tfidf__ngram_range=(1, 1), total=   2.8s
[CV] clf__C=10, tfidf__max_df=0.5, tfidf__min_df=0.05, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=10, tfidf__max_df=0.5, tfidf__min_df=0.05, tfidf__ngram_range=(1, 1), total=   2.6s
[CV] clf

[Parallel(n_jobs=1)]: Done 360 out of 360 | elapsed: 30.5min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                      

In [115]:
final_grid.best_estimator_.get_params()

{'memory': None,
 'steps': [('tfidf',
   TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                   dtype=<class 'numpy.float64'>, encoding='utf-8',
                   input='content', lowercase=True, max_df=0.25, max_features=None,
                   min_df=0.001, ngram_range=(1, 2), norm='l2', preprocessor=None,
                   smooth_idf=True, stop_words=None, strip_accents=None,
                   sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                   tokenizer=None, use_idf=True, vocabulary=None)),
  ('clf',
   LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                      intercept_scaling=1, l1_ratio=None, max_iter=1000,
                      multi_class='auto', n_jobs=None, penalty='l2',
                      random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                      warm_start=False))],
 'verbose': False,
 'tfidf': TfidfVectorizer(analyzer='word', binary=False, decode_err

In [116]:
final_grid.best_estimator_.score(dev.text, dev.label)

0.6357142857142857

In [121]:
fine['clf'].coef_

array([[ 0.12384133, -0.04475776,  0.05132599, ...,  0.18047819,
        -0.04571422, -0.04571422]])

In [123]:
from sklearn.pipeline import Pipeline
final_pipe = Pipeline([
    ('tfidf', Tfidf()),
    ('clf', LogisticRegression(max_iter=1000))
])


final_params = {
    'tfidf__max_df': (0.25, 0.5, 0.75, 1),
    'tfidf__min_df': (0, 0.001, 0.01, 0.05),
    'tfidf__ngram_range': [(1, 1), (1,2), (1, 3)],
    'clf__C': [0.01, 0.1, 1, 10]
}

In [124]:
ultimate_grid = GridSearchCV(final_pipe, final_params, verbose=2)
ultimate_grid.fit(train.text, train.label)

Fitting 5 folds for each of 192 candidates, totalling 960 fits
[CV] clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 1), total=   2.9s
[CV] clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.9s remaining:    0.0s


[CV]  clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 1), total=   2.8s
[CV] clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 1), total=   2.8s
[CV] clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 1), total=   2.7s
[CV] clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 1), total=   2.9s
[CV] clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 2), total=   9.6s
[CV] clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 2), total=   9.0s
[CV] clf

[CV]  clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 1), total=   2.3s
[CV] clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 1), total=   2.3s
[CV] clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 1), total=   2.3s
[CV] clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 1), total=   2.3s
[CV] clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 2), total=   6.6s
[CV] clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_ran

[CV]  clf__C=0.01, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 1), total=   2.5s
[CV] clf__C=0.01, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.01, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 1), total=   2.6s
[CV] clf__C=0.01, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.01, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 1), total=   2.5s
[CV] clf__C=0.01, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.01, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 1), total=   2.4s
[CV] clf__C=0.01, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.01, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2), total=   7.2s
[CV] clf__C=0.01, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.01, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2), 

[CV]  clf__C=0.01, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   2.7s
[CV] clf__C=0.01, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.01, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   2.6s
[CV] clf__C=0.01, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.01, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   2.5s
[CV] clf__C=0.01, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.01, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2), total=   7.2s
[CV] clf__C=0.01, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.01, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2), total=   6.8s
[CV] clf__C=0.01, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.01, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf

[CV]  clf__C=0.01, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 1), total=   2.4s
[CV] clf__C=0.01, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.01, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 1), total=   2.5s
[CV] clf__C=0.01, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.01, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 1), total=   2.6s
[CV] clf__C=0.01, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.01, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 2), total=   7.1s
[CV] clf__C=0.01, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.01, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 2), total=   7.0s
[CV] clf__C=0.01, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.01, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 2), total=   7.0s
[CV] clf__C=0.01, tfidf__max_df=1, tfidf_

ValueError: max_df corresponds to < documents than min_df



[CV]  clf__C=0.01, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   1.9s
[CV] clf__C=0.01, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.01, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   1.9s
[CV] clf__C=0.01, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.01, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   1.9s
[CV] clf__C=0.01, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.01, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   2.0s
[CV] clf__C=0.01, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.01, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   2.0s
[CV] clf__C=0.01, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.01, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2), total=   5.

[CV]  clf__C=0.1, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 1), total=   2.7s
[CV] clf__C=0.1, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.1, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 1), total=   2.9s
[CV] clf__C=0.1, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.1, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 1), total=   3.0s
[CV] clf__C=0.1, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.1, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 1), total=   2.9s
[CV] clf__C=0.1, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.1, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 2), total=   9.4s
[CV] clf__C=0.1, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.1, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 2), total=   8.9s
[CV] clf__C=0.1, tf

[CV]  clf__C=0.1, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 1), total=   2.5s
[CV] clf__C=0.1, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.1, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 1), total=   2.5s
[CV] clf__C=0.1, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.1, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 1), total=   2.5s
[CV] clf__C=0.1, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.1, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 2), total=   7.3s
[CV] clf__C=0.1, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.1, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 2), total=   7.4s
[CV] clf__C=0.1, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.1, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 2), 

[CV]  clf__C=0.1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 1), total=   2.9s
[CV] clf__C=0.1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 1), total=   2.8s
[CV] clf__C=0.1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2), total=   7.7s
[CV] clf__C=0.1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2), total=   7.7s
[CV] clf__C=0.1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2), total=   7.8s
[CV] clf__C=0.1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2), total=   7.

[CV]  clf__C=0.1, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   2.9s
[CV] clf__C=0.1, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.1, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2), total=   7.5s
[CV] clf__C=0.1, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.1, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2), total=   7.6s
[CV] clf__C=0.1, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.1, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2), total=   7.7s
[CV] clf__C=0.1, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.1, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2), total=   7.4s
[CV] clf__C=0.1, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.1, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_ran

[CV]  clf__C=0.1, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 1), total=   2.8s
[CV] clf__C=0.1, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.1, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 2), total=   8.0s
[CV] clf__C=0.1, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.1, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 2), total=   7.9s
[CV] clf__C=0.1, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.1, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 2), total=   8.0s
[CV] clf__C=0.1, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.1, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 2), total=   8.0s
[CV] clf__C=0.1, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.1, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 2), total=   8.1s
[CV] clf__C=0.1, tfidf__max_df=1, tfidf__min_df=0, t

ValueError: max_df corresponds to < documents than min_df



[CV]  clf__C=0.1, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   2.2s
[CV] clf__C=0.1, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.1, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   2.3s
[CV] clf__C=0.1, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.1, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   2.2s
[CV] clf__C=0.1, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.1, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   2.2s
[CV] clf__C=0.1, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.1, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   2.2s
[CV] clf__C=0.1, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.1, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2), total=   6.3s
[CV] clf

[CV]  clf__C=1, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 1), total=   4.4s
[CV] clf__C=1, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=1, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 1), total=   4.2s
[CV] clf__C=1, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=1, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 1), total=   4.1s
[CV] clf__C=1, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=1, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 2), total=  18.1s
[CV] clf__C=1, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=1, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 2), total=  17.1s
[CV] clf__C=1, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=1, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 2), total=  17.5s
[CV] clf__C=1, tfidf__max_df=0.25, tfidf_

[CV]  clf__C=1, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 1), total=   2.7s
[CV] clf__C=1, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=1, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 2), total=   7.7s
[CV] clf__C=1, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=1, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 2), total=   7.8s
[CV] clf__C=1, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=1, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 2), total=   8.0s
[CV] clf__C=1, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=1, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 2), total=   8.4s
[CV] clf__C=1, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=1, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 2), total=   7.9s
[CV] clf

[CV]  clf__C=1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2), total=   7.8s
[CV] clf__C=1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2), total=   7.7s
[CV] clf__C=1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2), total=   7.7s
[CV] clf__C=1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2), total=   7.6s
[CV] clf__C=1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 3) 
[CV]  clf__C=1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 3), total=  14.7s
[CV] clf__C=1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 3) 
[CV]  clf__C=1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 3), total=  14.8s
[CV] clf__C=1, tfid

[CV]  clf__C=1, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2), total=   8.4s
[CV] clf__C=1, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=1, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2), total=   7.9s
[CV] clf__C=1, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 3) 
[CV]  clf__C=1, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 3), total=  15.1s
[CV] clf__C=1, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 3) 
[CV]  clf__C=1, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 3), total=  15.3s
[CV] clf__C=1, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 3) 
[CV]  clf__C=1, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 3), total=  15.2s
[CV] clf__C=1, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 3) 
[CV]  clf__C=1, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 3), total=  15.

[CV]  clf__C=1, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 3), total=  14.7s
[CV] clf__C=1, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 3) 
[CV]  clf__C=1, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 3), total=  14.7s
[CV] clf__C=1, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 3) 
[CV]  clf__C=1, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 3), total=  14.8s
[CV] clf__C=1, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 3) 
[CV]  clf__C=1, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 3), total=  14.7s
[CV] clf__C=1, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 3) 
[CV]  clf__C=1, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 3), total=  14.7s
[CV] clf__C=1, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 


ValueError: max_df corresponds to < documents than min_df



[CV]  clf__C=1, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   2.1s
[CV] clf__C=1, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=1, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   2.1s
[CV] clf__C=1, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=1, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   2.1s
[CV] clf__C=1, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=1, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   2.1s
[CV] clf__C=1, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=1, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   2.1s
[CV] clf__C=1, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=1, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2), total=   5.9s
[CV] clf__C=1, tfidf__max_df=1

[CV]  clf__C=10, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 1), total=   7.7s
[CV] clf__C=10, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=10, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 1), total=   8.1s
[CV] clf__C=10, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=10, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 2), total=  26.8s
[CV] clf__C=10, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=10, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 2), total=  33.7s
[CV] clf__C=10, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=10, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 2), total=  31.5s
[CV] clf__C=10, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=10, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 2), total=  15.0s
[CV] clf__C=10, tfidf__max_df=

[CV]  clf__C=10, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 1), total=   2.3s
[CV] clf__C=10, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=10, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 2), total=   6.5s
[CV] clf__C=10, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=10, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 2), total=   6.5s
[CV] clf__C=10, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=10, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 2), total=   6.5s
[CV] clf__C=10, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=10, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 2), total=   6.4s
[CV] clf__C=10, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=10, tfidf__max_df=0.25, tfidf__min_df=0.05, tfidf__ngram_range=(1, 2), total=   6.

[CV]  clf__C=10, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2), total=   6.8s
[CV] clf__C=10, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=10, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2), total=   6.7s
[CV] clf__C=10, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=10, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2), total=   6.8s
[CV] clf__C=10, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=10, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2), total=   6.8s
[CV] clf__C=10, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=10, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2), total=   6.8s
[CV] clf__C=10, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 3) 
[CV]  clf__C=10, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 3), total=  13.2s
[CV] clf

[CV]  clf__C=10, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2), total=   9.5s
[CV] clf__C=10, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=10, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2), total=   8.2s
[CV] clf__C=10, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=10, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2), total=   8.8s
[CV] clf__C=10, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=10, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2), total=   9.1s
[CV] clf__C=10, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 3) 
[CV]  clf__C=10, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 3), total=  15.1s
[CV] clf__C=10, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 3) 
[CV]  clf__C=10, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 3), 

[CV]  clf__C=10, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 2), total=   6.6s
[CV] clf__C=10, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=10, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 2), total=   6.6s
[CV] clf__C=10, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=10, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 2), total=   6.5s
[CV] clf__C=10, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 3) 
[CV]  clf__C=10, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 3), total=  13.2s
[CV] clf__C=10, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 3) 
[CV]  clf__C=10, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 3), total=  13.2s
[CV] clf__C=10, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 3) 
[CV]  clf__C=10, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_range=(1, 3), total=  13.3s
[CV] clf__C=10, tfidf__max_df=1, tfidf__min_df=0, tfidf__ngram_

ValueError: max_df corresponds to < documents than min_df



[CV]  clf__C=10, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   1.8s
[CV] clf__C=10, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=10, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   1.8s
[CV] clf__C=10, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=10, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   1.8s
[CV] clf__C=10, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=10, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   1.8s
[CV] clf__C=10, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=10, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   1.8s
[CV] clf__C=10, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=10, tfidf__max_df=1, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2), total=   5.2s
[CV] clf__C=10, tfi

[Parallel(n_jobs=1)]: Done 960 out of 960 | elapsed: 154.1min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                      

In [126]:
ultimate_grid.best_estimator_.get_params()

{'memory': None,
 'steps': [('tfidf',
   TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                   dtype=<class 'numpy.float64'>, encoding='utf-8',
                   input='content', lowercase=True, max_df=0.25, max_features=None,
                   min_df=0, ngram_range=(1, 2), norm='l2', preprocessor=None,
                   smooth_idf=True, stop_words=None, strip_accents=None,
                   sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                   tokenizer=None, use_idf=True, vocabulary=None)),
  ('clf',
   LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
                      intercept_scaling=1, l1_ratio=None, max_iter=1000,
                      multi_class='auto', n_jobs=None, penalty='l2',
                      random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                      warm_start=False))],
 'verbose': False,
 'tfidf': TfidfVectorizer(analyzer='word', binary=False, decode_error=

In [181]:
ultimate_grid.best_estimator_.score(dev.text, dev.label)

0.7021428571428572

# Data preparation for the full dataset


In [58]:
full = pd.read_csv('bias_data/WNC/biased.full', sep='\t', header=None, error_bad_lines=False)

b'Skipping line 60908: expected 7 fields, saw 9\n'


In [59]:
full.head()[4][3]

'dennis the menace is an american animated series produced by dic entertainment, based on the comic strip by hank ketcham.'

In [60]:
full_biased = pd.DataFrame(full[3])
full_unbiased = pd.DataFrame(full[4])

In [61]:
full_biased['label'] = [1]*len(full_biased)
full_unbiased['label'] = [0]*len(full_unbiased)


full_biased.columns = ['text', 'label']
full_unbiased.columns = ['text', 'label']

In [62]:
full = pd.concat([full_biased, full_unbiased])

In [63]:
full.head()

Unnamed: 0,text,label
0,"during the campaign, controversy erupted over ...",1
1,nicaea was convoked by the emperor constantine...,1
2,it was rather unfortunate that he vehemently o...,1
3,dennis the menace is an american animated seri...,1
4,"today, on large farms, motorcycles, dogs or me...",1


In [64]:
full.tail()

Unnamed: 0,text,label
181468,"arguably the most notable was dale earnhardt, ...",0
181469,because of the genetic prepotency of the ancie...,0
181470,"influenced by serge gainsbourg, the velvet und...",0
181471,"northern indiana, for example, contains the in...",0
181472,mythology is alive and well in the modern age ...,0


In [67]:
#Shuffle full training set
full = full.sample(random_state=42, frac=1, replace=False)

In [74]:
full_train = full.iloc[0:308504]
full_dev = full.iloc[308504:326651]
full_test = full.iloc[326651:]

In [75]:
full_train.to_csv('processed_data/full_train.csv', index=False, header=False)
full_dev.to_csv('processed_data/full_dev.csv', index=False, header=False)
full_test.to_csv('processed_data/full_test.csv', index=False, header=False)

In [77]:
from sklearn.feature_extraction.text import TfidfVectorizer as Tfidf

tfidf = Tfidf()
X_train = tfidf.fit_transform(full_train.text)
y_train = full_train.label

X_dev = tfidf.transform(full_dev.text)
y_dev = full_dev.label

### Attempt 2a: TF-IDF Logistic Regression

lr = LogisticRegression(max_iter=500)
lr.fit(X_train, y_train)
#This one finishes in way fewer iterations! Interesting

print("Training set accuracy: {}".format(lr.score(X_train, y_train)))
print("Val set accuracy: {}".format(lr.score(X_dev, y_dev)))

Training set accuracy: 0.6789928169488888
Val set accuracy: 0.5785529288587645


In [80]:
X_train = tfidf.fit_transform(train.text)
y_train = train.label

X_dev = tfidf.transform(dev.text)
y_dev = dev.label

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

print("Training set accuracy: {}".format(lr.score(X_train, y_train)))
print("Val set accuracy: {}".format(lr.score(X_dev, y_dev)))

Training set accuracy: 0.706196680482501
Val set accuracy: 0.6842857142857143


In [81]:
X_dev_full = tfidf.transform(full_dev.text)
y_dev_full = full_dev.label

In [82]:
lr.score(X_dev_full, y_dev_full)

0.6361933101890119

# Applying these findings to the full dataset
First, get a baseline for non-hyperparameter tune bag of words and tfidf.

As expected, RandomForestClassifier did not complete, even when run overnight. 
This could possibly be mitigated by performing dimensionality reduction using pretrained embeddings like GloVe or word2vec, but as we saw in our exploration from the earlier set, this resulted in markedly poor (sub-logistic-regression) performance on the small set. 

In [140]:
train_full = pd.read_csv('processed_data/full_train.csv', header=None)
dev_full = pd.read_csv('processed_data/full_dev.csv', header=None)

train_full.columns = ['text', 'label']
dev_full.columns = ['text', 'label']

In [141]:
#Strip punctuation
import re
train_full['text'] = train_full.text.apply(lambda x: re.sub("\W", " ", x).lower())
dev_full['text'] = dev_full.text.apply(lambda x: re.sub("\W", " ", x).lower())

### Baseline logistic regression with CountVectorizer

These results are a bit confusing and concerning in light of the results of our gridsearch on the smaller subset. There, the optimal value of C was found to be 10, which is much higher than (and as such, invokes LESS regularization than) our value of 1 here. This is concerning because 1 seems to be overfitting on the whole dataset, which would suggest that a bit more hyperparameter tuning may be necessary.

In [146]:
basic_pipe = Pipeline([
    ('cv', CountVectorizer()),
    ('clf', LogisticRegression(max_iter=2000))
])

In [147]:
basic_pipe.fit(train_full.text, train_full.label)
print("Training accuracy {}".format(basic_pipe.score(train_full.text, train_full.label)))
print("Validation accuracy {}".format(basic_pipe.score(dev_full.text, dev_full.label)))

Training accuracy 0.7282822913155097
Validation accuracy 0.5127569295200308


### Baseline logistic regression with Tfidf

Still a bit of overfitting evident here, but tfidf has drastically improved performance. 

In [148]:
tfidf_pipe = Pipeline([
    ('tfidf', Tfidf()),
    ('clf', LogisticRegression(max_iter=2000))
])

In [149]:
tfidf_pipe.fit(train_full.text, train_full.label)
print("Training accuracy {}".format(tfidf_pipe.score(train_full.text, train_full.label)))
print("Validation accuracy {}".format(tfidf_pipe.score(dev_full.text, dev_full.label)))

Training accuracy 0.6789928169488888
Validation accuracy 0.5785529288587645


### Trying the best hyperparameter combination from our smaller-set experimentation

As predicted above, using our previously-determined "ideal" hyperparameter for C turned out to be vastly insufficient regularization for this model when trained on the entire training set. It looks like we will need to run a (small) gridsearch on this full dataset in order to get better results.

In [155]:
optimized_pipe = Pipeline([
    ('tfidf', Tfidf(ngram_range=(1,2), min_df=0, max_df=0.25)),
    ('clf', LogisticRegression(C=10, penalty="l2", max_iter=2000))
])

In [158]:
optimized_pipe.fit(train_full.text, train_full.label)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.25, max_features=None,
                                 min_df=0, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LogisticRegression(C=10, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling

In [159]:
print("Training accuracy {}".format(optimized_pipe.score(train_full.text, train_full.label)))
print("Validation accuracy {}".format(optimized_pipe.score(dev_full.text, dev_full.label)))

Training accuracy 0.932775587998859
Validation accuracy 0.3763707499862236


### Grid search over the whole dataset
Using a strongly reduced parameter grid here because the computational costs are so extensive. 

In [160]:
fullset_pipe = Pipeline([
    ('tfidf', Tfidf()),
    ('clf', LogisticRegression(max_iter=2000))
])


fullset_params = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__min_df': (0, 0.001, 0.01),
    'tfidf__ngram_range': [(1, 1), (1,2)],
    'clf__C': [0.01, 0.1, 1]
}

In [163]:
fullset_grid = GridSearchCV(fullset_pipe, fullset_params, cv=3, verbose=2)

In [164]:
fullset_grid.fit(train_full.text, train_full.label)

Fitting 3 folds for each of 54 candidates, totalling 162 fits
[CV] clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 1), total=   7.8s
[CV] clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.8s remaining:    0.0s


[CV]  clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 1), total=   7.6s
[CV] clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 1), total=   7.6s
[CV] clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 2), total=  29.2s
[CV] clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 2), total=  28.3s
[CV] clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0, tfidf__ngram_range=(1, 2), total=  27.1s
[CV] clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.01, tfidf__max_df=0.25, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   7.0s


[CV]  clf__C=0.01, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2), total=  22.4s
[CV] clf__C=0.01, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.01, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 2), total=  23.1s
[CV] clf__C=0.01, tfidf__max_df=0.75, tfidf__min_df=0.01, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.01, tfidf__max_df=0.75, tfidf__min_df=0.01, tfidf__ngram_range=(1, 1), total=   6.7s
[CV] clf__C=0.01, tfidf__max_df=0.75, tfidf__min_df=0.01, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.01, tfidf__max_df=0.75, tfidf__min_df=0.01, tfidf__ngram_range=(1, 1), total=   6.7s
[CV] clf__C=0.01, tfidf__max_df=0.75, tfidf__min_df=0.01, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.01, tfidf__max_df=0.75, tfidf__min_df=0.01, tfidf__ngram_range=(1, 1), total=   6.5s
[CV] clf__C=0.01, tfidf__max_df=0.75, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.01, tfidf__max_df=0.75, tfidf__min_df=0.01, tfidf__ngram_

[CV]  clf__C=0.1, tfidf__max_df=0.75, tfidf__min_df=0, tfidf__ngram_range=(1, 1), total=   8.5s
[CV] clf__C=0.1, tfidf__max_df=0.75, tfidf__min_df=0, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.1, tfidf__max_df=0.75, tfidf__min_df=0, tfidf__ngram_range=(1, 2), total=  31.3s
[CV] clf__C=0.1, tfidf__max_df=0.75, tfidf__min_df=0, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.1, tfidf__max_df=0.75, tfidf__min_df=0, tfidf__ngram_range=(1, 2), total=  31.1s
[CV] clf__C=0.1, tfidf__max_df=0.75, tfidf__min_df=0, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=0.1, tfidf__max_df=0.75, tfidf__min_df=0, tfidf__ngram_range=(1, 2), total=  31.4s
[CV] clf__C=0.1, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.1, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   7.4s
[CV] clf__C=0.1, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.1, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   7.4s
[CV

[CV]  clf__C=1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 1), total=   7.9s
[CV] clf__C=1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 1), total=   7.7s
[CV] clf__C=1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 1), total=   7.7s
[CV] clf__C=1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2), total=  25.7s
[CV] clf__C=1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2), total=  25.2s
[CV] clf__C=1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2) 
[CV]  clf__C=1, tfidf__max_df=0.5, tfidf__min_df=0.01, tfidf__ngram_range=(1, 2), total=  24.8s
[CV] clf__C=1, tfid

[Parallel(n_jobs=1)]: Done 162 out of 162 | elapsed: 49.1min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        n

In [165]:
fullset_grid.best_estimator_.get_params()

{'memory': None,
 'steps': [('tfidf',
   TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                   dtype=<class 'numpy.float64'>, encoding='utf-8',
                   input='content', lowercase=True, max_df=0.75, max_features=None,
                   min_df=0.001, ngram_range=(1, 1), norm='l2', preprocessor=None,
                   smooth_idf=True, stop_words=None, strip_accents=None,
                   sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                   tokenizer=None, use_idf=True, vocabulary=None)),
  ('clf',
   LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                      intercept_scaling=1, l1_ratio=None, max_iter=2000,
                      multi_class='auto', n_jobs=None, penalty='l2',
                      random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                      warm_start=False))],
 'verbose': False,
 'tfidf': TfidfVectorizer(analyzer='word', binary=False, decode_err

These results make sense in light of the struggles of the past model—while 0.25 max_df was certainly great for reducing dimensionality in the face of millions of bigrams, a higher max_df is needed in the unigram case. Also, moving to the unigram case makes sense, given the likely enormous dimensionality of bigrams in these sentences of almost completely unrelated data. 

In [168]:
fullset_best = fullset_grid.best_estimator_
print("Grid search best estimator TRAIN accuracy: {}".format(fullset_best.score(train_full.text, train_full.label)))
print("Grid search best estimator VAL accuracy: {}".format(fullset_best.score(dev_full.text, dev_full.label)))

Grid search best estimator TRAIN accuracy: 0.6130422944273007
Grid search best estimator VAL accuracy: 0.6017523557612828


This is, by far, the best accuracy we've seen so far! With the extent to which small hyperparameter changes are providing massive improvements, it seems worth it to do one last grid search on a FINER grain around the "approximately-right" hyperparameters we have figured out.

In [176]:
finest_pipe = Pipeline([
    ('tfidf', Tfidf()),
    ('clf', LogisticRegression(max_iter=2000))
])


finest_params = {
    'tfidf__max_df': (0.6, 0.75, 0.9, 0.999),
    'tfidf__min_df': [0, 0.001, 0.005],
    'tfidf__ngram_range': [(1, 1)],
    'clf__C':  [0.5, 1, 2, 5]
}

In [177]:
finest_grid = GridSearchCV(finest_pipe, finest_params, cv=3, verbose=2)
finest_grid.fit(train_full.text, train_full.label)

Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV] clf__C=0.5, tfidf__max_df=0.6, tfidf__min_df=0, tfidf__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  clf__C=0.5, tfidf__max_df=0.6, tfidf__min_df=0, tfidf__ngram_range=(1, 1), total=  11.5s
[CV] clf__C=0.5, tfidf__max_df=0.6, tfidf__min_df=0, tfidf__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   11.5s remaining:    0.0s


[CV]  clf__C=0.5, tfidf__max_df=0.6, tfidf__min_df=0, tfidf__ngram_range=(1, 1), total=  11.5s
[CV] clf__C=0.5, tfidf__max_df=0.6, tfidf__min_df=0, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.5, tfidf__max_df=0.6, tfidf__min_df=0, tfidf__ngram_range=(1, 1), total=   9.7s
[CV] clf__C=0.5, tfidf__max_df=0.6, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.5, tfidf__max_df=0.6, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   9.2s
[CV] clf__C=0.5, tfidf__max_df=0.6, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.5, tfidf__max_df=0.6, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   8.9s
[CV] clf__C=0.5, tfidf__max_df=0.6, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.5, tfidf__max_df=0.6, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   8.9s
[CV] clf__C=0.5, tfidf__max_df=0.6, tfidf__min_df=0.005, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=0.5, tfidf__max_df=0.6, tfidf__min_df=0.005, tfidf__ngram_range=(1, 1), total=   7.6

[CV]  clf__C=1, tfidf__max_df=0.75, tfidf__min_df=0, tfidf__ngram_range=(1, 1), total=  14.1s
[CV] clf__C=1, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=1, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   9.9s
[CV] clf__C=1, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=1, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   9.5s
[CV] clf__C=1, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=1, tfidf__max_df=0.75, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=   8.6s
[CV] clf__C=1, tfidf__max_df=0.75, tfidf__min_df=0.005, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=1, tfidf__max_df=0.75, tfidf__min_df=0.005, tfidf__ngram_range=(1, 1), total=   8.1s
[CV] clf__C=1, tfidf__max_df=0.75, tfidf__min_df=0.005, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=1, tfidf__max_df=0.75, tfidf__min_df=0.005, tfidf__ngram_range=(1, 1), total=   8.4s
[

[CV]  clf__C=2, tfidf__max_df=0.9, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=  15.4s
[CV] clf__C=2, tfidf__max_df=0.9, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=2, tfidf__max_df=0.9, tfidf__min_df=0.001, tfidf__ngram_range=(1, 1), total=  10.9s
[CV] clf__C=2, tfidf__max_df=0.9, tfidf__min_df=0.005, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=2, tfidf__max_df=0.9, tfidf__min_df=0.005, tfidf__ngram_range=(1, 1), total=   8.0s
[CV] clf__C=2, tfidf__max_df=0.9, tfidf__min_df=0.005, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=2, tfidf__max_df=0.9, tfidf__min_df=0.005, tfidf__ngram_range=(1, 1), total=   9.0s
[CV] clf__C=2, tfidf__max_df=0.9, tfidf__min_df=0.005, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=2, tfidf__max_df=0.9, tfidf__min_df=0.005, tfidf__ngram_range=(1, 1), total=   8.5s
[CV] clf__C=2, tfidf__max_df=0.999, tfidf__min_df=0, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=2, tfidf__max_df=0.999, tfidf__min_df=0, tfidf__ngram_range=(1, 1), total=  17.1s
[CV] clf__C=

[CV]  clf__C=5, tfidf__max_df=0.999, tfidf__min_df=0.005, tfidf__ngram_range=(1, 1), total=   7.6s
[CV] clf__C=5, tfidf__max_df=0.999, tfidf__min_df=0.005, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=5, tfidf__max_df=0.999, tfidf__min_df=0.005, tfidf__ngram_range=(1, 1), total=   7.8s
[CV] clf__C=5, tfidf__max_df=0.999, tfidf__min_df=0.005, tfidf__ngram_range=(1, 1) 
[CV]  clf__C=5, tfidf__max_df=0.999, tfidf__min_df=0.005, tfidf__ngram_range=(1, 1), total=   7.9s


[Parallel(n_jobs=1)]: Done 144 out of 144 | elapsed: 27.1min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        n

In [178]:
finest = finest_grid.best_estimator_

In [179]:
finest.get_params()

{'memory': None,
 'steps': [('tfidf',
   TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                   dtype=<class 'numpy.float64'>, encoding='utf-8',
                   input='content', lowercase=True, max_df=0.9, max_features=None,
                   min_df=0.001, ngram_range=(1, 1), norm='l2', preprocessor=None,
                   smooth_idf=True, stop_words=None, strip_accents=None,
                   sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                   tokenizer=None, use_idf=True, vocabulary=None)),
  ('clf',
   LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
                      intercept_scaling=1, l1_ratio=None, max_iter=2000,
                      multi_class='auto', n_jobs=None, penalty='l2',
                      random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                      warm_start=False))],
 'verbose': False,
 'tfidf': TfidfVectorizer(analyzer='word', binary=False, decode_er

In [180]:
print("Fine grid search best estimator TRAIN accuracy: {}".format(finest.score(train_full.text, train_full.label)))
print("Fine search best estimator VAL accuracy: {}".format(finest.score(dev_full.text, dev_full.label)))

Fine grid search best estimator TRAIN accuracy: 0.6126889764800456
Fine search best estimator VAL accuracy: 0.6019727778696203


# Interpreting model coefficients
We will do this both for the smaller and larger models, i.e. the one-word-only model as well as the any-change model.

In [None]:
#Smaller model
small_tfidf = ultimate_grid.best_estimator_['tfidf']
small_lr = ultimate_grid.best_estimator_['clf']

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer as Tfidf
from sklearn.linear_model import LogisticRegression
small_tfidf = Tfidf(max_df=0.25, min_df=0, ngram_range=(1,2)).fit(train.text)
small_lr = LogisticRegression(C=10, max_iter=2000).fit(small_tfidf.transform(train.text), train.label)

In [7]:
#Get vocabulary
small_vocab = small_tfidf.vocabulary_


#Refit the model, this time with standardized parameters
X_small = small_tfidf.transform(train.text)

In [10]:
#Use a hack to get the column-wise variance of a sparse matrix: mean of square minus squared mean,
c = X_small.copy();
c.data **=2
square_means = c.mean(0)
mean_square = X_small.mean(0)

In [31]:
import numpy as np
mean_square = mean_square.reshape(mean_square.shape[1],)

In [39]:
variances = square_means - np.multiply(mean_square, mean_square) 
stdevs = np.sqrt(variances)

In [40]:
#Now, get coefficients from small_lr. 

coeffs = small_lr.coef_

In [44]:
#Standardize the coefficients by dividing through by the standard deviation...
coeffs / stdevs

matrix([[ 37.32641704, -51.89403094,  55.85468215, ..., 243.12130867,
         -36.82628071, -36.82628071]])

In [96]:
coef_df = pd.DataFrame((coeffs/stdevs).flatten()).T.reset_index(drop=False)

In [100]:
small_ivocab = {val:key for key, val in small_vocab.items()}

In [106]:
res = []
for i in range(len(coef_df)):
    res.append(small_ivocab[i])

In [107]:
coef_df['word'] = res

In [113]:
coef_df['abs_score'] = coef_df[0].apply(abs)

In [116]:
coef_df.sort_values('abs_score', ascending=False).head(20)

Unnamed: 0,index,0,word,abs_score
231854,231854,4333.787541,exquisite,4333.787541
712352,712352,4293.934045,wisely,4293.934045
237089,237089,4251.358508,fascinating,4251.358508
329938,329938,4172.149638,infamously,4172.149638
602622,602622,4112.839867,stunning,4112.839867
544128,544128,4059.141778,rightfully,4059.141778
552365,552365,4057.730319,sadly,4057.730319
250088,250088,4000.752426,foolishly,4000.752426
391374,391374,3987.320827,magnificent,3987.320827
82951,82951,3983.399969,astonishing,3983.399969


In [119]:
coef_df.sort_values('abs_score', ascending=False).tail(20)

Unnamed: 0,index,0,word,abs_score
74813,74813,0.007784,around grove,0.007784
711087,711087,0.007784,wilts and,0.007784
711086,711086,0.007784,wilts,0.007784
104811,104811,0.007784,berks,0.007784
104812,104812,0.007784,berks canal,0.007784
641767,641767,0.007784,the wilts,0.007784
384505,384505,0.007784,local walks,0.007784
690198,690198,0.007784,visible providing,0.007784
53216,53216,0.007784,and berks,0.007784
567315,567315,0.007177,services civil,0.007177
