In [2]:
import pandas as pd
import glob
from sklearn.feature_extraction.text import CountVectorizer

In [39]:
glob.glob('bias_data/WNC/*')

['bias_data/WNC/biased.word.dev',
 'bias_data/WNC/revision_topics.csv',
 'bias_data/WNC/biased.word.test',
 'bias_data/WNC/neutral',
 'bias_data/WNC/biased.word.train',
 'bias_data/WNC/biased.full']

## Reading in data

Let's start out with the train / dev subsets proposed by the authors. 

In [3]:
train = pd.read_csv('bias_data/WNC/biased.word.train', sep='\t', header=None)
dev = pd.read_csv('bias_data/WNC/biased.word.dev', sep='\t', header=None)
test = pd.read_csv('bias_data/WNC/biased.word.test', sep='\t', header=None)

In [4]:
print("Train length: {}".format(len(train)))
print("Dev length: {}".format(len(dev)))
print("Test length: {}".format(len(test)))

Train length: 53803
Dev length: 700
Test length: 1000


In [5]:
def read_process_data(train_path, dev_path):
    train = pd.read_csv(train_path, sep='\t', header=None)
    dev = pd.read_csv(dev_path, sep = '\t', header=None)
    
    print("Train length: {}".format(len(train)))
    print("Dev length: {}".format(len(dev)))
    
    #Rename columns
    train.columns = ['id', 'annot_old', 'annot_new', 'biased', 'unbiased', 'tags', 'roots']
    dev.columns = ['id', 'annot_old', 'annot_new', 'biased', 'unbiased', 'tags', 'roots']
    
    #Process and recombine training data: 
    train_biased, dev_biased = pd.DataFrame(train.biased), pd.DataFrame(dev.biased)
    train_unbiased, dev_unbiased = pd.DataFrame(train.unbiased), pd.DataFrame(dev.unbiased)
    
    
    train_biased['label'], dev_biased['label'] = [1]*len(train_biased), [1]*len(dev_biased)
    train_unbiased['label'], dev_unbiased['label'] = [0]*len(train_unbiased), [0]*len(dev_unbiased)

    #Fix colnames
    train_biased.columns, dev_biased.columns = ['text', 'label'], ['text', 'label']
    train_unbiased.columns, dev_unbiased.columns = ['text', 'label'], ['text', 'label']
    #Combine
    train_all, dev_all = pd.concat([train_biased, train_unbiased]), pd.concat([dev_biased, dev_unbiased])
    
    return train_all, dev_all

In [6]:
train, dev = read_process_data('bias_data/WNC/biased.word.train', 'bias_data/WNC/biased.word.dev')

Train length: 53803
Dev length: 700


In [79]:
train.columns = ['id', 'annot_old', 'annot_new', 'biased', 'unbiased', 'tags', 'roots']
dev.columns = ['id', 'annot_old', 'annot_new', 'biased', 'unbiased', 'tags', 'roots']

In [80]:
train_biased = pd.DataFrame(train.biased)
train_unbiased = pd.DataFrame(train.unbiased)

#Add labels
train_biased['label'] = [1]*len(train_biased)
train_unbiased['label'] = [0]*len(train_unbiased)

#Fix colnames
train_biased.columns = ['text', 'label']
train_unbiased.columns = ['text', 'label']

In [81]:
#Shmush
train_all = pd.concat([train_biased, train_unbiased])

In [104]:
dev

Unnamed: 0,text,label
0,in addition to sponsoring palestinian terror a...,1
1,the game is currently played in 47 countries w...,1
2,no part of the valley lies in the area current...,1
3,scholars perceived that it was discordant with...,1
4,"since the chinese civil war in 1949, taiwan ha...",1
...,...,...
695,in 2008 five pharmaceutical companies received...,0
696,"the palm, a steakhouse restaurant chain origin...",0
697,d.c. united's early successes,0
698,on 29 june 2007 price gave birth to her third ...,0


In [None]:
dev_biased = pd.DataFrame(dev.biased)

## Construct bag-of-words representations.

In [105]:
#Initialize vectorizer
vectorizer = CountVectorizer()

In [106]:
X_train = vectorizer.fit_transform(train.text)
y_train = train.label

X_dev = vectorizer.transform(dev.text)
y_dev = dev.label

In [109]:
print(X_train.shape)
print(X_dev.shape)


(107606, 79211)
(1400, 79211)


In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

### Attempt 1A: Plain bag of words into logistic regression

In [116]:
#Was running out of iterations hahaa
lr = LogisticRegression(max_iter=1000)

In [117]:
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [126]:
lr.predict(X_train)

array([1, 1, 1, ..., 0, 1, 0])

In [129]:
print("Training set accuracy: {}".format(lr.score(X_train, y_train)))
print("Val set accuracy: {}".format(lr.score(X_dev, y_dev)))

Training set accuracy: 0.77603479359887
Val set accuracy: 0.6814285714285714


### Attempt 1B: Plain bag of words into random forest
Notice the SEVERE overfitting here, which is strange given the model used. 

In [131]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [132]:
print("Training set accuracy: {}".format(rf.score(X_train, y_train)))
print("Val set accuracy: {}".format(rf.score(X_dev, y_dev)))

Training set accuracy: 0.9942103600170994
Val set accuracy: 0.6757142857142857


### Attempt 2: TF-IDF

In [134]:
from sklearn.feature_extraction.text import TfidfVectorizer as Tfidf

In [135]:
tfidf = Tfidf()
X_train = tfidf.fit_transform(train.text)
y_train = train.label

X_dev = tfidf.transform(dev.text)
y_dev = dev.label

### Attempt 2a: TF-IDF Logistic Regression

In [136]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
#This one finishes in way fewer iterations! Interesting

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [138]:
print("Training set accuracy: {}".format(lr.score(X_train, y_train)))
print("Val set accuracy: {}".format(lr.score(X_dev, y_dev)))

Training set accuracy: 0.706196680482501
Val set accuracy: 0.6842857142857143


### Attempt 2b: TF-IDF Random Forest
Still overfitting, which makes sense...

In [139]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [140]:
print("Training set accuracy: {}".format(rf.score(X_train, y_train)))
print("Val set accuracy: {}".format(rf.score(X_dev, y_dev)))

Training set accuracy: 0.9942475326654647
Val set accuracy: 0.6564285714285715


In [142]:
pd.Series(y_dev).value_counts()

1    700
0    700
Name: label, dtype: int64

In [1]:
!pip install zeugma

Collecting zeugma
  Downloading zeugma-0.48.tar.gz (10 kB)
Collecting gensim>=3.5.0
  Downloading gensim-3.8.3-cp37-cp37m-macosx_10_9_x86_64.whl (24.2 MB)
[K     |████████████████████████████████| 24.2 MB 5.3 MB/s eta 0:00:01
Collecting tensorflow>=1.5.0
  Downloading tensorflow-2.3.1-cp37-cp37m-macosx_10_9_x86_64.whl (165.1 MB)
[K     |████████████████████████████████| 165.1 MB 40 kB/s  eta 0:00:013
[?25hCollecting keras>=2.1.3
  Downloading Keras-2.4.3-py2.py3-none-any.whl (36 kB)
Collecting smart-open>=1.8.1
  Downloading smart_open-4.0.1.tar.gz (117 kB)
[K     |████████████████████████████████| 117 kB 17.2 MB/s eta 0:00:01
Collecting tensorflow-estimator<2.4.0,>=2.3.0
  Downloading tensorflow_estimator-2.3.0-py2.py3-none-any.whl (459 kB)
[K     |████████████████████████████████| 459 kB 12.8 MB/s eta 0:00:01
[?25hCollecting grpcio>=1.8.6
  Downloading grpcio-1.34.0-cp37-cp37m-macosx_10_10_x86_64.whl (3.6 MB)
[K     |████████████████████████████████| 3.6 MB 6.7 MB/s eta 0:00:0

Installing collected packages: smart-open, gensim, tensorflow-estimator, grpcio, google-pasta, opt-einsum, termcolor, protobuf, keras-preprocessing, gast, astunparse, absl-py, pyasn1, rsa, cachetools, pyasn1-modules, google-auth, tensorboard-plugin-wit, google-auth-oauthlib, markdown, tensorboard, tensorflow, keras, zeugma
Successfully installed absl-py-0.11.0 astunparse-1.6.3 cachetools-4.1.1 gast-0.3.3 gensim-3.8.3 google-auth-1.23.0 google-auth-oauthlib-0.4.2 google-pasta-0.2.0 grpcio-1.34.0 keras-2.4.3 keras-preprocessing-1.1.2 markdown-3.3.3 opt-einsum-3.3.0 protobuf-3.14.0 pyasn1-0.4.8 pyasn1-modules-0.2.8 rsa-4.6 smart-open-4.0.1 tensorboard-2.4.0 tensorboard-plugin-wit-1.7.0 tensorflow-2.3.1 tensorflow-estimator-2.3.0 termcolor-1.1.0 zeugma-0.48


## Attempt 3: Using embeddings?

Plain rf on the data was taking way too long, imo

In [9]:
from zeugma.embeddings import EmbeddingTransformer
from sklearn.
glove = EmbeddingTransformer('glove')
X_train = glove.transform(train.text)
y_train = train.label


In [10]:
X_dev = glove.transform(dev.text)
y_dev = dev.label

In [13]:
rf = RandomForestClassifier()

In [16]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [18]:
rf.score(X_dev, y_dev)

0.5457142857142857

In [19]:
lr = LogisticRegression().fit(X_train,y_train)
print(lr.score(X_train, y_train))
print(lr.score(X_dev, y_dev))

0.5556381614408118
0.5557142857142857


## Well, this is all quite a bummer, isn't it? 
Looks like tf-idf with logistic hecking regression is the winner. Let's tune 'er up

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer as Tfidf

tfidf = Tfidf()
X_train = tfidf.fit_transform(train.text)
y_train = train.label

X_dev = tfidf.transform(dev.text)
y_dev = dev.label

### Attempt 2a: TF-IDF Logistic Regression

lr = LogisticRegression()
lr.fit(X_train, y_train)
#This one finishes in way fewer iterations! Interesting

print("Training set accuracy: {}".format(lr.score(X_train, y_train)))
print("Val set accuracy: {}".format(lr.score(X_dev, y_dev)))

Training set accuracy: 0.706196680482501
Val set accuracy: 0.6842857142857143


In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = 

## We'll come back to that. 

Let's try doing the bigger set...?

In [25]:
glob.glob('bias_data/WNC/*')

['bias_data/WNC/biased.word.dev',
 'bias_data/WNC/revision_topics.csv',
 'bias_data/WNC/biased.word.test',
 'bias_data/WNC/neutral',
 'bias_data/WNC/biased.word.train',
 'bias_data/WNC/biased.full']

In [58]:
full = pd.read_csv('bias_data/WNC/biased.full', sep='\t', header=None, error_bad_lines=False)

b'Skipping line 60908: expected 7 fields, saw 9\n'


In [59]:
full.head()[4][3]

'dennis the menace is an american animated series produced by dic entertainment, based on the comic strip by hank ketcham.'

In [60]:
full_biased = pd.DataFrame(full[3])
full_unbiased = pd.DataFrame(full[4])

In [61]:
full_biased['label'] = [1]*len(full_biased)
full_unbiased['label'] = [0]*len(full_unbiased)


full_biased.columns = ['text', 'label']
full_unbiased.columns = ['text', 'label']

In [62]:
full = pd.concat([full_biased, full_unbiased])

In [63]:
full.head()

Unnamed: 0,text,label
0,"during the campaign, controversy erupted over ...",1
1,nicaea was convoked by the emperor constantine...,1
2,it was rather unfortunate that he vehemently o...,1
3,dennis the menace is an american animated seri...,1
4,"today, on large farms, motorcycles, dogs or me...",1


In [64]:
full.tail()

Unnamed: 0,text,label
181468,"arguably the most notable was dale earnhardt, ...",0
181469,because of the genetic prepotency of the ancie...,0
181470,"influenced by serge gainsbourg, the velvet und...",0
181471,"northern indiana, for example, contains the in...",0
181472,mythology is alive and well in the modern age ...,0


In [67]:
full = full.sample(random_state=42, frac=1, replace=False)

In [68]:
full.head(250)

Unnamed: 0,text,label
130694,"on 24 january 1953 mau mau, possibly former se...",0
160318,support for windows media drm (wmdrm) (incompa...,0
98343,"when the kingdom became independent, it was co...",0
35321,in may 2007 farley mowat was claimed to be hea...,0
136178,world war ii left the united kingdom with an a...,0
...,...,...
143303,"fifteen of the sixteen german states, positing...",1
30326,the powerbook 500 series was the mainstay of t...,0
58750,this model has been rejected by the scientific...,0
75834,"on 29 august 2010, bogdan made his premier lea...",1


In [74]:
full_train = full.iloc[0:308504]
full_dev = full.iloc[308504:326651]
full_test = full.iloc[326651:]

In [72]:
len(full) * 0.90

326651.4

In [75]:
full_train.to_csv('processed_data/full_train.csv', index=False, header=False)
full_dev.to_csv('processed_data/full_dev.csv', index=False, header=False)
full_test.to_csv('processed_data/full_test.csv', index=False, header=False)

In [77]:
from sklearn.feature_extraction.text import TfidfVectorizer as Tfidf

tfidf = Tfidf()
X_train = tfidf.fit_transform(full_train.text)
y_train = full_train.label

X_dev = tfidf.transform(full_dev.text)
y_dev = full_dev.label

### Attempt 2a: TF-IDF Logistic Regression

lr = LogisticRegression(max_iter=500)
lr.fit(X_train, y_train)
#This one finishes in way fewer iterations! Interesting

print("Training set accuracy: {}".format(lr.score(X_train, y_train)))
print("Val set accuracy: {}".format(lr.score(X_dev, y_dev)))

Training set accuracy: 0.6789928169488888
Val set accuracy: 0.5785529288587645


In [80]:
X_train = tfidf.fit_transform(train.text)
y_train = train.label

X_dev = tfidf.transform(dev.text)
y_dev = dev.label

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

print("Training set accuracy: {}".format(lr.score(X_train, y_train)))
print("Val set accuracy: {}".format(lr.score(X_dev, y_dev)))

Training set accuracy: 0.706196680482501
Val set accuracy: 0.6842857142857143


In [81]:
X_dev_full = tfidf.transform(full_dev.text)
y_dev_full = full_dev.label

In [82]:
lr.score(X_dev_full, y_dev_full)

0.6361933101890119

In [93]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ('tfidf', Tfidf()),
    ('clf', LogisticRegression(max_iter=1000))
])


parameters = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__ngram_range': [(1, 1), (1,2), (1,3)],
    'clf__C': [0.001, 0.01, 0.1, 1, 10]
}

In [94]:
from sklearn.model_selection import GridSearchCV
search = GridSearchCV(pipeline, parameters, verbose=2)
search.fit(train.text, train.label)

Fitting 5 folds for each of 45 candidates, totalling 225 fits
[CV] clf__C=0.001, tfidf__max_df=0.25, tfidf__ngram_range=(1, 1) .....


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  clf__C=0.001, tfidf__max_df=0.25, tfidf__ngram_range=(1, 1), total=   2.6s
[CV] clf__C=0.001, tfidf__max_df=0.25, tfidf__ngram_range=(1, 1) .....


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.6s remaining:    0.0s


[CV]  clf__C=0.001, tfidf__max_df=0.25, tfidf__ngram_range=(1, 1), total=   2.6s
[CV] clf__C=0.001, tfidf__max_df=0.25, tfidf__ngram_range=(1, 1) .....
[CV]  clf__C=0.001, tfidf__max_df=0.25, tfidf__ngram_range=(1, 1), total=   2.6s
[CV] clf__C=0.001, tfidf__max_df=0.25, tfidf__ngram_range=(1, 1) .....
[CV]  clf__C=0.001, tfidf__max_df=0.25, tfidf__ngram_range=(1, 1), total=   2.6s
[CV] clf__C=0.001, tfidf__max_df=0.25, tfidf__ngram_range=(1, 1) .....
[CV]  clf__C=0.001, tfidf__max_df=0.25, tfidf__ngram_range=(1, 1), total=   2.6s
[CV] clf__C=0.001, tfidf__max_df=0.25, tfidf__ngram_range=(1, 2) .....
[CV]  clf__C=0.001, tfidf__max_df=0.25, tfidf__ngram_range=(1, 2), total=   8.2s
[CV] clf__C=0.001, tfidf__max_df=0.25, tfidf__ngram_range=(1, 2) .....
[CV]  clf__C=0.001, tfidf__max_df=0.25, tfidf__ngram_range=(1, 2), total=   8.3s
[CV] clf__C=0.001, tfidf__max_df=0.25, tfidf__ngram_range=(1, 2) .....
[CV]  clf__C=0.001, tfidf__max_df=0.25, tfidf__ngram_range=(1, 2), total=   8.2s
[CV] cl

[CV]  clf__C=0.01, tfidf__max_df=0.25, tfidf__ngram_range=(1, 3), total=  16.3s
[CV] clf__C=0.01, tfidf__max_df=0.25, tfidf__ngram_range=(1, 3) ......
[CV]  clf__C=0.01, tfidf__max_df=0.25, tfidf__ngram_range=(1, 3), total=  16.3s
[CV] clf__C=0.01, tfidf__max_df=0.25, tfidf__ngram_range=(1, 3) ......
[CV]  clf__C=0.01, tfidf__max_df=0.25, tfidf__ngram_range=(1, 3), total=  16.8s
[CV] clf__C=0.01, tfidf__max_df=0.25, tfidf__ngram_range=(1, 3) ......
[CV]  clf__C=0.01, tfidf__max_df=0.25, tfidf__ngram_range=(1, 3), total=  16.5s
[CV] clf__C=0.01, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1) .......
[CV]  clf__C=0.01, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), total=   2.6s
[CV] clf__C=0.01, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1) .......
[CV]  clf__C=0.01, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), total=   2.6s
[CV] clf__C=0.01, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1) .......
[CV]  clf__C=0.01, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), total=   2.6s
[CV] clf__C=0.01,

[CV]  clf__C=0.1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), total=  10.0s
[CV] clf__C=0.1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2) ........
[CV]  clf__C=0.1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), total=  10.5s
[CV] clf__C=0.1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2) ........
[CV]  clf__C=0.1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), total=   8.6s
[CV] clf__C=0.1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2) ........
[CV]  clf__C=0.1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), total=   8.7s
[CV] clf__C=0.1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 3) ........
[CV]  clf__C=0.1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 3), total=  17.2s
[CV] clf__C=0.1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 3) ........
[CV]  clf__C=0.1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 3), total=  20.4s
[CV] clf__C=0.1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 3) ........
[CV]  clf__C=0.1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 3), total=  22.2s
[CV] clf__C=0.1, tfidf__max_

[CV]  clf__C=1, tfidf__max_df=0.75, tfidf__ngram_range=(1, 1), total=   3.3s
[CV] clf__C=1, tfidf__max_df=0.75, tfidf__ngram_range=(1, 1) .........
[CV]  clf__C=1, tfidf__max_df=0.75, tfidf__ngram_range=(1, 1), total=   4.8s
[CV] clf__C=1, tfidf__max_df=0.75, tfidf__ngram_range=(1, 1) .........
[CV]  clf__C=1, tfidf__max_df=0.75, tfidf__ngram_range=(1, 1), total=   4.9s
[CV] clf__C=1, tfidf__max_df=0.75, tfidf__ngram_range=(1, 2) .........
[CV]  clf__C=1, tfidf__max_df=0.75, tfidf__ngram_range=(1, 2), total=  17.8s
[CV] clf__C=1, tfidf__max_df=0.75, tfidf__ngram_range=(1, 2) .........
[CV]  clf__C=1, tfidf__max_df=0.75, tfidf__ngram_range=(1, 2), total=  17.4s
[CV] clf__C=1, tfidf__max_df=0.75, tfidf__ngram_range=(1, 2) .........
[CV]  clf__C=1, tfidf__max_df=0.75, tfidf__ngram_range=(1, 2), total=  16.5s
[CV] clf__C=1, tfidf__max_df=0.75, tfidf__ngram_range=(1, 2) .........
[CV]  clf__C=1, tfidf__max_df=0.75, tfidf__ngram_range=(1, 2), total=  16.5s
[CV] clf__C=1, tfidf__max_df=0.75, 

[CV]  clf__C=10, tfidf__max_df=0.75, tfidf__ngram_range=(1, 3), total= 1.0min
[CV] clf__C=10, tfidf__max_df=0.75, tfidf__ngram_range=(1, 3) ........
[CV]  clf__C=10, tfidf__max_df=0.75, tfidf__ngram_range=(1, 3), total= 1.3min


[Parallel(n_jobs=1)]: Done 225 out of 225 | elapsed: 57.9min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                      

In [99]:
best = search.best_estimator_

In [101]:
best.get_params()

{'memory': None,
 'steps': [('tfidf',
   TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                   dtype=<class 'numpy.float64'>, encoding='utf-8',
                   input='content', lowercase=True, max_df=0.25, max_features=None,
                   min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                   smooth_idf=True, stop_words=None, strip_accents=None,
                   sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                   tokenizer=None, use_idf=True, vocabulary=None)),
  ('clf',
   LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
                      intercept_scaling=1, l1_ratio=None, max_iter=1000,
                      multi_class='auto', n_jobs=None, penalty='l2',
                      random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                      warm_start=False))],
 'verbose': False,
 'tfidf': TfidfVectorizer(analyzer='word', binary=False, decode_error=

C = 10, max_iter 100, max-df = 0.25

In [102]:
from sklearn.pipeline import Pipeline
fine_pipe = Pipeline([
    ('tfidf', Tfidf()),
    ('clf', LogisticRegression(max_iter=1000))
])


fine_parameters = {
    'tfidf__max_df': (0.25, 0.375, 0.5),
    'tfidf__ngram_range': [(1,2)],
    'clf__C': [1, 10, 25, 50]
}

In [None]:
fine_search = GridSearchCV(fine_pipe, fine_parameters, verbose=2)
fine_search.fit(train.text, train.label)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] clf__C=1, tfidf__max_df=0.25, tfidf__ngram_range=(1, 2) .........


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  clf__C=1, tfidf__max_df=0.25, tfidf__ngram_range=(1, 2), total=  15.0s
[CV] clf__C=1, tfidf__max_df=0.25, tfidf__ngram_range=(1, 2) .........


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   15.0s remaining:    0.0s


[CV]  clf__C=1, tfidf__max_df=0.25, tfidf__ngram_range=(1, 2), total=  14.3s
[CV] clf__C=1, tfidf__max_df=0.25, tfidf__ngram_range=(1, 2) .........
[CV]  clf__C=1, tfidf__max_df=0.25, tfidf__ngram_range=(1, 2), total=  14.9s
[CV] clf__C=1, tfidf__max_df=0.25, tfidf__ngram_range=(1, 2) .........
[CV]  clf__C=1, tfidf__max_df=0.25, tfidf__ngram_range=(1, 2), total=  11.3s
[CV] clf__C=1, tfidf__max_df=0.25, tfidf__ngram_range=(1, 2) .........
[CV]  clf__C=1, tfidf__max_df=0.25, tfidf__ngram_range=(1, 2), total=  11.3s
[CV] clf__C=1, tfidf__max_df=0.375, tfidf__ngram_range=(1, 2) ........
[CV]  clf__C=1, tfidf__max_df=0.375, tfidf__ngram_range=(1, 2), total=  14.1s
[CV] clf__C=1, tfidf__max_df=0.375, tfidf__ngram_range=(1, 2) ........
[CV]  clf__C=1, tfidf__max_df=0.375, tfidf__ngram_range=(1, 2), total=  13.4s
[CV] clf__C=1, tfidf__max_df=0.375, tfidf__ngram_range=(1, 2) ........
[CV]  clf__C=1, tfidf__max_df=0.375, tfidf__ngram_range=(1, 2), total=  13.0s
[CV] clf__C=1, tfidf__max_df=0.3