# Multinomial Logistic Regression

Reference: [scikit-learn user guide: Logistic regression](https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression)

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, make_scorer
import pandas as pd
from pprint import pprint
import pickle

Using TensorFlow backend.


## Set the model name

We'll use this name as an identifier for referencing files and results.

In [3]:
# Identifier for this model: stored in the 'model' column of the results table
# and embedded in the pickle file names written when the model is saved.
model_name = 'multi_lr'

## Load the training dataset

In [4]:
# Read the training set; the first CSV column is used as the row index.
df_train = pd.read_csv('../data/training.csv', index_col=0)

# 'article_words' feeds the vectorizer, 'topic' is the class label to predict.
X_train, y_train = (
    df_train[column].values for column in ('article_words', 'topic')
)

# Sanity check: both arrays must have one entry per article.
print(X_train.shape, y_train.shape)

(9500,) (9500,)


## Setup the scoring

These metrics are collected during cross-validation on both the training and validation splits, and are used for model comparison and evaluation. The metric used for model selection is **macro F1 excluding the `IRRELEVANT` class**. Excluding the majority class keeps it from skewing model selection, and macro averaging weights each of the remaining relevant classes equally.

In [5]:
# Every topic seen in training except 'IRRELEVANT'; the scorers below are
# restricted to these labels.
labels = set(y_train)
labels.remove('IRRELEVANT')
relevant_labels = list(labels)

# Model-selection metric: macro F1 over the relevant topics only.
custom_f1 = make_scorer(f1_score, labels=relevant_labels, average='macro')

# Build the scorer mapping in a fixed order (micro variants first, then the
# '_macro' variants), since the key order determines the column order of the
# results table assembled after the grid search.
metric_functions = {
    'precision': precision_score,
    'recall': recall_score,
    'f1': f1_score,
}

scorer = {'accuracy': 'accuracy'}
for key_suffix, averaging in (('', 'micro'), ('_macro', 'macro')):
    for metric_name, metric_function in metric_functions.items():
        scorer[f'{metric_name}{key_suffix}'] = make_scorer(
            metric_function, labels=relevant_labels, average=averaging
        )
scorer['custom'] = custom_f1

## Setup the pipeline

In [6]:
# The three stages, in execution order: TF-IDF features -> SMOTE oversampling
# -> logistic regression. The imblearn pipeline is used because SMOTE is a
# sampler rather than a transformer.
vectorizer = TfidfVectorizer()
oversampler = SMOTE(random_state=0, n_jobs=-1)
classifier = LogisticRegression(random_state=0)

# Step names are derived from the class names (tfidfvectorizer, smote,
# logisticregression); the grid-search parameter keys rely on them.
clf = make_pipeline_imb(vectorizer, oversampler, classifier)

## Review configurable parameters

For each step in the pipeline, research what each parameter does and decide on a range to test in our grid search.

In [7]:
# Each pipeline step is a (name, estimator) pair; list every tunable
# parameter together with its current value.
for step_name, estimator in clf.steps:
    print(step_name)
    pprint(estimator.get_params())

tfidfvectorizer
{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': <class 'numpy.float64'>,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 1),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': None,
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}
smote
{'k_neighbors': 5, 'n_jobs': -1, 'random_state': 0, 'sampling_strategy': 'auto'}
logisticregression
{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 0,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}


## Select grid parameters

In [8]:
# Keys follow the '<step name>__<parameter>' convention of the pipeline.
# Only the classifier's C and solver are actually searched (3 x 3 = 9
# candidates); the vectorizer settings are pinned to a single value each.
grid_params = dict(
    logisticregression__C=[0.1, 0.5, 1.0],
    logisticregression__solver=['newton-cg', 'lbfgs', 'saga'],
    tfidfvectorizer__max_features=[20_000],
    tfidfvectorizer__max_df=[0.5],
    tfidfvectorizer__sublinear_tf=[True],
    tfidfvectorizer__lowercase=[False],
)

## Execute the grid search

In [None]:
%%time
grid = GridSearchCV(clf,
                    grid_params,
                    cv=5,
                    n_jobs=-1,
                    scoring=scorer,
                    refit='custom',
                    verbose=1,
                    return_train_score=True)

grid.fit(X_train, y_train)

print(f'Best cross-validation score: {grid.best_score_:.2f}')
print(f'Best params: {grid.best_params_}')

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


## Refit the best model 

Here we take the best model found during cross-validation and fit it with the entire training dataset.

In [49]:
# The grid search was run with refit='custom', so GridSearchCV has already
# refit the best parameter combination on the whole of X_train / y_train.
# best_estimator_ is therefore ready to use as-is; calling fit() on it again
# would only repeat that expensive fit and produce the same model, because the
# pipeline's random_state values are fixed.
best_clf = grid.best_estimator_
best_clf

Pipeline(memory=None,
         steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=False, max_df=0.5,
                                 max_features=20000, min_df=1,
                                 ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=True,
                                 toke...
                ('smote',
                 SMOTE(k_neighbors=5, n_jobs=-1, random_state=0,
                       sampling_strategy='auto')),
                ('logisticregression',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
              

## Save model and cross-validation statistics

In [17]:
# Position of the winning candidate in cv_results_. GridSearchCV exposes it
# directly as best_index_ (for multi-metric scoring it refers to the metric
# named in `refit`, here 'custom'), so there is no need to search the params
# list for best_params_.
best_idx = grid.best_index_

# One-row summary: the model identifier plus the mean and std of every metric
# on the cross-validation train and test (validation) folds for that candidate.
best_stats = {
    'model': model_name,
}

for metric in scorer:
    for dataset in ['train', 'test']:
        for stat in ['mean', 'std']:
            metric_item = f'{stat}_{dataset}_{metric}'
            best_stats[metric_item] = grid.cv_results_[metric_item][best_idx]

df_models_clf = pd.DataFrame(best_stats, index=[0])

# Persist the fitted pipeline and its cross-validation statistics, both keyed
# by model_name.
with open(f'../data/pickles/best_{model_name}.pickle', 'wb') as output:
    pickle.dump(best_clf, output)

with open(f'../data/pickles/df_results_{model_name}.pickle', 'wb') as output:
    pickle.dump(df_models_clf, output)

df_models_clf

Unnamed: 0,model,mean_train_accuracy,std_train_accuracy,mean_test_accuracy,std_test_accuracy,mean_train_precision,std_train_precision,mean_test_precision,std_test_precision,mean_train_recall,...,mean_test_recall_macro,std_test_recall_macro,mean_train_f1_macro,std_train_f1_macro,mean_test_f1_macro,std_test_f1_macro,mean_train_custom,std_train_custom,mean_test_custom,std_test_custom
0,multi_lr,0.897237,0.001914,0.772842,0.006188,0.824478,0.002984,0.655177,0.008556,0.921737,...,0.70917,0.017699,0.890682,0.002822,0.655718,0.018572,0.890682,0.002822,0.655718,0.018572
