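Not an ideal approach. Note: in the cells below the train/test split is done *before* `fit_transform` (the vectorizer is fitted on the training articles only); the remaining weakness is that the manual grid search scores every hyper-parameter combination on the held-out test split, so the test set ends up being used for model selection.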

In [1]:
import numpy as np
import os
import pandas as pd
import time

from itertools import product

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

def dataset_file_path(fname):
    """Build the path to ``fname`` inside the ``dataset`` directory.

    The ``dataset`` directory is resolved against the current working
    directory at call time.
    """
    dataset_dir = os.path.join(os.getcwd(), "dataset")
    return os.path.join(dataset_dir, fname)

# Load the articles DataFrame from <cwd>/dataset/articles_dataframe.pkl.
# NOTE(review): unpickling can execute arbitrary code embedded in the file —
# only load this pickle if it comes from a trusted source.
df = pd.read_pickle(dataset_file_path("articles_dataframe.pkl"))

# Extra tokens to drop in addition to scikit-learn's built-in English stop words
# (they look like URL / markup residue — presumably scraping leftovers; confirm).
additional_stopwords = ['amp', 'com', 'href', 'htm', 'html', 'http', 'https', 'php', 'searchindex', 'solr', 'www']
# Union of both sets, materialised as a list; it is handed to TfidfVectorizer as `stop_words`.
stop_words = list(text.ENGLISH_STOP_WORDS.union(additional_stopwords))

In [2]:
# Manual grid search over TfidfVectorizer settings, scored with a logistic
# regression classifier. Candidate values for each vectorizer hyper-parameter:
cv_max_features = [1000, 2500]
cv_ngram_upper_range = [1, 2]
cv_min_df = [5, 10]
cv_max_df = [0.6, 0.7]

# fetch cartesian product of feature ranges for grid search
cv_params = list(product(cv_max_features, cv_ngram_upper_range, cv_min_df, cv_max_df))

# Stratified 80/20 split with a fixed seed. x_train / x_test keep every column
# of df; only the 'article' column is vectorized below.
x_train, x_test, y_train, y_test = train_test_split(
    df,
    df['bias'],
    test_size=0.2,
    random_state=42,
    stratify=df['bias'])

# NOTE(review): every combination is scored on the test split, so the test set
# is effectively used for model selection. Treat the scores as optimistic, or
# score on a validation split / with cross-validation instead.

# One row per combination:
# [score, max_features, ngram_upper_range, min_df, max_df, vec_time, model_time]
cv_result = []
for max_features, ngram_upper_range, min_df, max_df in cv_params:

    print("----------------")
    print(f"max_features {max_features}, ngram_upper_range {ngram_upper_range}, min_df {min_df}, max_df {max_df}")

    vectorizer = TfidfVectorizer(
        lowercase=True,
        stop_words=stop_words,
        max_features=max_features,
        min_df=min_df,
        max_df=max_df,
        ngram_range=(1, ngram_upper_range))

    # perf_counter() is monotonic, so elapsed times cannot be skewed by
    # wall-clock adjustments the way time.time() differences can.
    vec_start = time.perf_counter()
    # Vocabulary and IDF weights are learned from the training articles only.
    bow_train = vectorizer.fit_transform(x_train['article'])
    vec_time = time.perf_counter() - vec_start
    print(f"vec_time {vec_time:.1f}")

    logregmodel = LogisticRegression(max_iter=200, random_state=42, solver='lbfgs', penalty='l2')
    # Fit directly on the sparse TF-IDF matrix: LogisticRegression accepts
    # sparse input, and densifying it into a DataFrame allocates
    # n_documents x max_features floats for no benefit.
    model_start = time.perf_counter()
    logregmodel.fit(bow_train, y_train)
    model_time = time.perf_counter() - model_start

    # Reuse the fitted vocabulary for the held-out articles (transform, not fit_transform).
    bow_test = vectorizer.transform(x_test['article'])

    # Mean accuracy on the held-out split.
    model_score = logregmodel.score(bow_test, y_test)

    cv_result.append([model_score, max_features, ngram_upper_range, min_df, max_df, vec_time, model_time])
    print(f"model_time {model_time:.1f}")
    print(f"model_score {model_score:.3f}")

cv_df = pd.DataFrame(
    cv_result,
    columns=['model_score', 'max_features', 'ngram_upper_range', 'min_df', 'max_df', 'vectorizer_time', 'train_time'])
# These display options are set globally and stay in effect for later cells.
pd.options.display.float_format = "{:,.2f}".format
pd.options.display.max_rows = len(cv_result)  # show every combination without truncation
display(cv_df)

----------------
max_features 1000, ngram_upper_range 1, min_df 5, max_df 0.6
vec_time 24.4
model_time 10.7
model_score 0.721
----------------
max_features 1000, ngram_upper_range 1, min_df 5, max_df 0.7
vec_time 24.0
model_time 10.6
model_score 0.721
----------------
max_features 1000, ngram_upper_range 1, min_df 10, max_df 0.6
vec_time 24.5
model_time 10.0
model_score 0.721
----------------
max_features 1000, ngram_upper_range 1, min_df 10, max_df 0.7
vec_time 24.5
model_time 10.5
model_score 0.721
----------------
max_features 1000, ngram_upper_range 2, min_df 5, max_df 0.6
vec_time 124.9
model_time 10.5
model_score 0.723
----------------
max_features 1000, ngram_upper_range 2, min_df 5, max_df 0.7
vec_time 125.5
model_time 10.1
model_score 0.723
----------------
max_features 1000, ngram_upper_range 2, min_df 10, max_df 0.6
vec_time 126.3
model_time 12.6
model_score 0.723
----------------
max_features 1000, ngram_upper_range 2, min_df 10, max_df 0.7
vec_time 127.3
model_time 10.0
mo

Unnamed: 0,model_score,max_features,ngram_upper_range,min_df,max_df,vectorizer_time,train_time
0,0.72,1000,1,5,0.6,24.44,10.73
1,0.72,1000,1,5,0.7,23.96,10.64
2,0.72,1000,1,10,0.6,24.46,10.03
3,0.72,1000,1,10,0.7,24.51,10.47
4,0.72,1000,2,5,0.6,124.87,10.49
5,0.72,1000,2,5,0.7,125.47,10.14
6,0.72,1000,2,10,0.6,126.26,12.6
7,0.72,1000,2,10,0.7,127.29,10.03
8,0.77,2500,1,5,0.6,23.79,44.29
9,0.77,2500,1,5,0.7,25.09,42.46


In [2]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Stratified 80/20 hold-out split with a fixed seed; x_train / x_test keep
# every column of df, only the 'article' column is fed to the pipeline.
x_train, x_test, y_train, y_test = train_test_split(
    df,
    df['bias'],
    stratify=df['bias'],
    test_size=0.2,
    random_state=42,
)

# Vectorizer and classifier chained into one estimator, so the grid search
# fits the TF-IDF step together with the model on each training fold.
pipeline = Pipeline(steps=[
    ("vec", TfidfVectorizer(stop_words=stop_words, lowercase=True)),
    ("lr", LogisticRegression(penalty='l2', solver='lbfgs', max_iter=200, random_state=42)),
])

# Search space, addressed as <step name>__<parameter>. Only a single candidate
# is active; the wider grid that is currently switched off is:
#   vec__max_features: [1000, 2500]
#   vec__ngram_range:  [(1,1), (1,2)]
#   vec__min_df:       [5, 10]
#   vec__max_df:       [0.6, 0.7]
parameters = dict(
    vec__max_features=[1000],
    vec__ngram_range=[(1, 1)],
    vec__min_df=[5],
    vec__max_df=[0.6],
)

cv_grid = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring="accuracy",
    n_jobs=1,
    verbose=3,
)

# Last expression of the cell: fit() returns the search object, which the
# notebook then renders.
cv_grid.fit(x_train['article'], y_train)

# Hold-out evaluation of the best pipeline, kept disabled:
# lrc = cv_grid.best_estimator_
# lrc.score(x_test['article'], y_test)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END vec__max_df=0.6, vec__max_features=1000, vec__min_df=5, vec__ngram_range=(1, 1);, score=0.725 total time=  29.7s
[CV 2/5] END vec__max_df=0.6, vec__max_features=1000, vec__min_df=5, vec__ngram_range=(1, 1);, score=0.714 total time=  28.6s
[CV 3/5] END vec__max_df=0.6, vec__max_features=1000, vec__min_df=5, vec__ngram_range=(1, 1);, score=0.719 total time=  29.2s
[CV 4/5] END vec__max_df=0.6, vec__max_features=1000, vec__min_df=5, vec__ngram_range=(1, 1);, score=0.714 total time=  29.7s
[CV 5/5] END vec__max_df=0.6, vec__max_features=1000, vec__min_df=5, vec__ngram_range=(1, 1);, score=0.718 total time=  29.3s


0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'vec__max_df': [0.6], 'vec__max_features': [1000], 'vec__min_df': [5], 'vec__ngram_range': [(1, ...)]}"
,scoring,'accuracy'
,n_jobs,1
,refit,True
,cv,
,verbose,3
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,"['via', 'hers', ...]"
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,200


In [3]:
# Best pipeline found by the grid search (available because the search ran with refit=True).
pipe = cv_grid.best_estimator_

In [4]:
# Sanity check: the best estimator is the whole Pipeline, not just the classifier.
type(pipe)

sklearn.pipeline.Pipeline

In [6]:
# Pull the TF-IDF step out of the pipeline by its step name.
vec = pipe['vec']

In [7]:
# Sanity check on the type of the extracted step.
type(vec)

sklearn.feature_extraction.text.TfidfVectorizer

In [8]:
# Pull the logistic-regression step out of the pipeline by its step name.
lr_mode = pipe['lr']

In [9]:
# Display the classifier so its parameters can be inspected.
lr_mode

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,200
