# Bibliotecas

In [1]:
import re
import pandas as pd
import seaborn as sns
import numpy as np
from unidecode import unidecode
import spacy
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings("ignore")

# Funções customizadas

In [2]:
def remove_punctuation(text):
    """Remove punctuation and numbers from text

    Args:
        text (str): A string

    Returns:
        str: string without punctuation and numbers
    """
    return re.sub(r'[^a-zA-Z\s]', ' ', text)

def remove_multiple_blank_spaces(text):
    """Remove multiple blank spaces from text

    Args:
        text (str): A string

    Returns:
        str: string with only one space between words
    """
    return re.sub(r'\s+', ' ', text)

# Carrega dado

In [3]:
%%time
# Load data
data = pd.read_csv("Input/archive.zip", low_memory=False)
data.head()

CPU times: user 6.42 s, sys: 656 ms, total: 7.07 s
Wall time: 7.07 s


Unnamed: 0,title,text,date,category,subcategory,link
0,"Lula diz que está 'lascado', mas que ainda tem...",Com a possibilidade de uma condenação impedir ...,2017-09-10,poder,,http://www1.folha.uol.com.br/poder/2017/10/192...
1,"'Decidi ser escrava das mulheres que sofrem', ...","Para Oumou Sangaré, cantora e ativista malines...",2017-09-10,ilustrada,,http://www1.folha.uol.com.br/ilustrada/2017/10...
2,Três reportagens da Folha ganham Prêmio Petrob...,Três reportagens da Folha foram vencedoras do ...,2017-09-10,poder,,http://www1.folha.uol.com.br/poder/2017/10/192...
3,Filme 'Star Wars: Os Últimos Jedi' ganha trail...,A Disney divulgou na noite desta segunda-feira...,2017-09-10,ilustrada,,http://www1.folha.uol.com.br/ilustrada/2017/10...
4,CBSS inicia acordos com fintechs e quer 30% do...,"O CBSS, banco da holding Elopar dos sócios Bra...",2017-09-10,mercado,,http://www1.folha.uol.com.br/mercado/2017/10/1...


# First exploration

In [4]:
%%time
# Count empty values by columns
data.isnull().sum()

CPU times: user 46 ms, sys: 0 ns, total: 46 ms
Wall time: 44.5 ms


title               0
text              765
date                0
category            0
subcategory    137418
link                0
dtype: int64

In [5]:
%%time
# Count items by category
data.groupby("category").count().sort_values(by="title", ascending=False)

CPU times: user 51.8 ms, sys: 3.96 ms, total: 55.7 ms
Wall time: 53.7 ms


Unnamed: 0_level_0,title,text,date,subcategory,link
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
poder,22022,22022,22022,939,22022
colunas,21622,21619,21622,21622,21622
mercado,20970,20970,20970,0,20970
esporte,19730,19730,19730,2859,19730
mundo,17130,17130,17130,0,17130
cotidiano,16967,16967,16967,35,16967
ilustrada,16345,15617,16345,0,16345
opiniao,4525,4525,4525,0,4525
paineldoleitor,4011,4011,4011,260,4011
saopaulo,3955,3955,3955,471,3955


In [6]:
%%time
# To avoid unbalanced data, we will remove the categories with less than 1500 items
categories_mask = data["category"].value_counts() > 1500
categories_mask = categories_mask[categories_mask.values].index
# Filter data
mask = data["category"].isin(categories_mask)
data_filtered = data[mask]
temp_df = data.loc[~mask]
temp_df["category"] = "outros"

data_filtered = pd.concat([data_filtered, temp_df])
data_filtered.info()


<class 'pandas.core.frame.DataFrame'>
Index: 167053 entries, 0 to 166970
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   title        167053 non-null  object
 1   text         166288 non-null  object
 2   date         167053 non-null  object
 3   category     167053 non-null  object
 4   subcategory  29635 non-null   object
 5   link         167053 non-null  object
dtypes: object(6)
memory usage: 8.9+ MB
CPU times: user 99.7 ms, sys: 12.7 ms, total: 112 ms
Wall time: 111 ms


# Sampling

In [7]:
%%time
seed = 333
data_balanced = data_filtered.copy().groupby('category').apply(lambda x: x.sample(1500, random_state=seed))

# Store the unused data for future use
# balanced_index = set(data_balanced.unstack(level=0).index)
# balanced_categories = data_balanced.category.unique()
# mask = ~(data_filtered.index.isin(balanced_index)) & (data_filtered.category.isin(balanced_categories))
# data_unbalanced = data_filtered[mask].reset_index(drop=True)

# Balanced data
data_balanced.reset_index(drop=True, inplace=True)
data_balanced.category.value_counts()

CPU times: user 137 ms, sys: 14.6 ms, total: 151 ms
Wall time: 148 ms


category
colunas           1500
cotidiano         1500
educacao          1500
esporte           1500
ilustrada         1500
mercado           1500
mundo             1500
opiniao           1500
outros            1500
paineldoleitor    1500
poder             1500
saopaulo          1500
tec               1500
turismo           1500
tv                1500
Name: count, dtype: int64

## 1. Text Processing

In [8]:
%%time
# Combining text and title
df_balanced = data_balanced[["title", "text", "category"]].copy()
df_balanced["full_text"] = df_balanced["title"] + " " + df_balanced["text"]
# Text cleaning
df_balanced["norm_text"] = df_balanced.full_text.apply(lambda x: remove_multiple_blank_spaces(
                                                remove_punctuation(
                                                    unidecode(str(x).lower())
                                                )
                                            ).strip())
df_balanced = df_balanced[["full_text", "norm_text", "category"]]
df_balanced.head()

CPU times: user 16.3 s, sys: 78.8 ms, total: 16.4 s
Wall time: 16.4 s


Unnamed: 0,full_text,norm_text,category
0,Zuckerberg ataca isolacionismo e promete sufoc...,zuckerberg ataca isolacionismo e promete sufoc...,colunas
1,2017 será sonho ou pesadelo para Temer? BRASÍL...,sera sonho ou pesadelo para temer brasilia dep...,colunas
2,Aviso Excepcionalmente nesta quarta (8) a colu...,aviso excepcionalmente nesta quarta a coluna n...,colunas
3,Visitar hortas virou passeio Estou em Campos d...,visitar hortas virou passeio estou em campos d...,colunas
4,O fiu-fiu de um macho não opressor Sou a favor...,o fiu fiu de um macho nao opressor sou a favor...,colunas


In [9]:
%%time
# Change category to numeric
df_balanced["category"] = df_balanced.category.astype("category")
category_map = {key: value for key, value in zip(df_balanced.category, df_balanced.category.cat.codes)}
df_balanced.category = df_balanced.category.cat.codes

CPU times: user 2.69 ms, sys: 3.91 ms, total: 6.6 ms
Wall time: 5.53 ms


# Split data

In [10]:
%%time
# Split Data
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df_balanced, test_size=0.2, random_state=seed, stratify=df_balanced.category)
df_test, df_val = train_test_split(df_test, test_size=0.5, random_state=seed, stratify=df_test.category)

df_train.shape, df_test.shape, df_val.shape

CPU times: user 11.8 ms, sys: 3.98 ms, total: 15.7 ms
Wall time: 14.3 ms


((18000, 3), (2250, 3), (2250, 3))

# Stop words and vectorization


In [11]:
# ! python3 -m spacy download pt_core_news_sm
# nltk.download('stopwords')

In [12]:
%%time
# Stopwords
stopwords_nltk = stopwords.words('portuguese')
stopwords_spacy = spacy.load('pt_core_news_sm').Defaults.stop_words
both_stopwords = set(stopwords_nltk) | set(stopwords_spacy)
both_stopwords = list(map(unidecode, both_stopwords))

# Create a tfidf vectorizer
vectorizer = TfidfVectorizer(stop_words=both_stopwords, max_features=2000,
                             ngram_range=(1, 3), min_df=5, max_df=0.8, lowercase=True)

# Fit the vectorizer with our texts
vectorizer.fit(df_train.norm_text)

CPU times: user 35.1 s, sys: 1.66 s, total: 36.8 s
Wall time: 36.7 s


In [13]:
%%time
# Transform our texts into vectors
X_train = vectorizer.transform(df_train.norm_text)
X_val = vectorizer.transform(df_val.norm_text)
X_test = vectorizer.transform(df_test.norm_text)

y_train = df_train.category
y_val = df_val.category
y_test = df_test.category

CPU times: user 10.3 s, sys: 3.78 ms, total: 10.3 s
Wall time: 10.3 s


In [14]:
%%time
X_train.shape, X_val.shape, X_test.shape

CPU times: user 10 µs, sys: 1 µs, total: 11 µs
Wall time: 15.3 µs


((18000, 2000), (2250, 2000), (2250, 2000))

# N-grams

In [15]:
%%time
# Sum all the columns
X_train_sum = np.sum(X_train, axis=0)

# Sort the sum of columns
sorted_ngrams = np.asarray(np.argsort(X_train_sum)[::-1]).reshape(-1)

# Get the top 10 ngrams
top_ngrams = sorted_ngrams[:30]

# Get the names of the top 10 ngrams
feature_names = np.array(vectorizer.get_feature_names_out())

# reshape a list into a list of lists with 3 "rows"
[ " - ".join(ii) for ii in np.array_split(feature_names[top_ngrams], 3) ]

CPU times: user 17.1 ms, sys: 56 µs, total: 17.2 ms
Wall time: 15.7 ms


['seg sex - estac - pensao - regiao oeste tel - debate problemas brasileiros - mundiais refletir - mundiais refletir diversas - publicados assinatura - publicados assinatura traduzem - publicacao obedece proposito',
 'publicacao obedece - refletir diversas - brasileiros mundiais refletir - brasileiros mundiais - assinatura traduzem - br artigos publicados - assinatura traduzem opiniao - artigos publicados assinatura - jornal publicacao - obedece proposito',
 'obedece proposito estimular - jornal publicacao obedece - estimular debate problemas - refletir diversas tendencias - proposito estimular debate - opiniao jornal publicacao - br artigos - problemas brasileiros mundiais - diversas tendencias - traduzem opiniao jornal']

# Model training

In [16]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import (accuracy_score,
                             balanced_accuracy_score, classification_report, confusion_matrix, f1_score,
                             matthews_corrcoef)

from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import classification_report, accuracy_score

from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, balanced_accuracy_score,
                             classification_report, confusion_matrix, f1_score,
                             matthews_corrcoef)
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import time

from sklearn.calibration import CalibratedClassifierCV

import time
from sklearn.metrics import (f1_score, balanced_accuracy_score, accuracy_score,
                             classification_report, matthews_corrcoef,
                             confusion_matrix)


In [17]:
def train_models(X_train, y_train, X_valid, y_valid, n_jobs=-1):
    
    # Spot Check Algorithms
    models = []
    models.append(('Calibrated-LSVC', CalibratedClassifierCV(LinearSVC(random_state=seed, class_weight='balanced'))))
    models.append(('LR', LogisticRegression(random_state=seed, n_jobs=-1, class_weight='balanced')))
    models.append(('RF', RandomForestClassifier(random_state=seed, n_jobs=-1, class_weight='balanced')))
    models.append(('LGBM', LGBMClassifier(random_state=seed, n_jobs=-1, class_weight='balanced')))
    models.append(('XGB', XGBClassifier(random_state=seed, n_jobs=-1, class_weight='balanced')))
    models.append(('MLP', MLPClassifier(random_state=seed)))
    models.append(('SGD', SGDClassifier(random_state=seed, n_jobs=-1, class_weight='balanced')))
    models.append(('NB', MultinomialNB()))
    models.append(('LSVC', LinearSVC(random_state=seed, class_weight='balanced')))
    models.append(('KNN', KNeighborsClassifier(n_jobs=-1)))
    models.append(('DT', DecisionTreeClassifier(random_state=seed, class_weight='balanced')))
    
    results = []
    creports = []
    
    for name, model in models:
        start_time = time.time()

        try:
            model.fit(X_train, y_train)
            pred = model.predict(X_valid)
            
        except Exception as e:
            print(f'Error {name} - {e}')
            continue 

        f1 = f1_score(y_valid, pred, average='micro')
        bacc = balanced_accuracy_score(y_valid, pred)
        acc = accuracy_score(y_valid, pred)
        cr = classification_report(y_valid, pred)
        mcc = matthews_corrcoef(y_valid, pred)
        cm = confusion_matrix(y_valid, pred)
        creports.append([name, cr, cm])

        elapsed = time.time() - start_time
        results.append([name, f1, bacc, acc, mcc, elapsed, cm, cr])

        msg = f'Name: {name} - F1: {f1:.4f} - BACC: {bacc:.4f} - ACC: {acc:.4f} - MCC: {mcc:.4f} - Elapsed: {elapsed:.2f}s'
        print(msg)
        # print(cr)
        # print(cm)
        # print('*' * 20, '\n')

    columns = ['Model', 'F1', 'BACC', 'ACC', 'MCC', 'Total Time', 'Confusion Matrix', 'Classification Report']
    df_results = pd.DataFrame(results, columns=columns)
    df_results['Confusion Matrix'] = df_results['Confusion Matrix'].apply(lambda x: str(x))

    return df_results, creports

In [18]:
%%time
df_results, creports = train_models(X_train, y_train, X_val, y_val, n_jobs=-1)

Name: Calibrated-LSVC - F1: 0.7804 - BACC: 0.7804 - ACC: 0.7804 - MCC: 0.7649 - Elapsed: 5.38s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Name: LR - F1: 0.7813 - BACC: 0.7813 - ACC: 0.7813 - MCC: 0.7659 - Elapsed: 4.83s
Name: RF - F1: 0.7524 - BACC: 0.7524 - ACC: 0.7524 - MCC: 0.7361 - Elapsed: 8.34s
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 368841
[LightGBM] [Info] Number of data points in the train set: 18000, number of used features: 2000
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050

In [19]:
df_results.sort_values(by='F1', ascending=False)

Unnamed: 0,Model,F1,BACC,ACC,MCC,Total Time,Confusion Matrix,Classification Report
3,LGBM,0.801333,0.801333,0.801333,0.787323,115.546797,[[ 85 8 2 7 6 8 3 0 15 1 8 ...,precision recall f1-score ...
4,XGB,0.791111,0.791111,0.791111,0.776352,154.129872,[[ 77 8 2 6 6 9 5 0 13 0 8 ...,precision recall f1-score ...
1,LR,0.781333,0.781333,0.781333,0.765912,4.826545,[[ 68 7 1 6 12 12 6 1 16 0 12 ...,precision recall f1-score ...
8,LSVC,0.781333,0.781333,0.781333,0.76586,1.275591,[[ 78 4 2 3 12 12 8 1 10 0 8 ...,precision recall f1-score ...
6,SGD,0.780889,0.780889,0.780889,0.766061,0.249234,[[ 61 7 4 9 15 14 6 2 9 0 11 ...,precision recall f1-score ...
0,Calibrated-LSVC,0.780444,0.780444,0.780444,0.764929,5.378097,[[ 77 5 1 3 12 12 7 1 11 0 10 ...,precision recall f1-score ...
2,RF,0.752444,0.752444,0.752444,0.73614,8.33725,[[ 45 10 5 10 13 11 10 0 10 0 20 ...,precision recall f1-score ...
5,MLP,0.739111,0.739111,0.739111,0.720683,78.456497,[[ 87 6 1 1 10 6 4 1 21 0 5 ...,precision recall f1-score ...
7,NB,0.712889,0.712889,0.712889,0.694303,0.037134,[[ 29 7 2 10 16 20 8 4 23 0 23 ...,precision recall f1-score ...
10,DT,0.592444,0.592444,0.592444,0.563505,12.456806,[[ 38 12 5 10 15 14 9 0 9 4 10 ...,precision recall f1-score ...


# Ensemble Classifier Explanation

In [20]:
model_lr = LogisticRegression(random_state=seed, n_jobs=-1, class_weight='balanced'))
model_calibrated_lsvc = CalibratedClassifierCV(LinearSVC(random_state=seed, class_weight='balanced'))
model_lgbm = LGBMClassifier(random_state=seed, n_jobs=-1, class_weight='balanced')

estimators = [
                ('lr', model_lr),
                ('calibrated_lsvc', model_calibrated_lsvc),
                ('lgbm', model_lgbm)
            ]

model_stacked = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(random_state=seed, n_jobs=-1, class_weight='balanced'), n_jobs=2, cv=5)

model_stacked.fit(X_train, y_train)

pred = model_stacked.predict(X_val)

f1 = f1_score(y_val, pred, average='micro')
bacc = balanced_accuracy_score(y_val, pred)
acc = accuracy_score(y_val, pred)
cr = classification_report(y_val, pred)
mcc = matthews_corrcoef(y_val, pred)
cm = confusion_matrix(y_val, pred)

print(f'F1: {f1:.4f} - BACC: {bacc:.4f} - ACC: {acc:.4f} - MCC: {mcc:.4f}')
print(cr)
print(cm)

# Took 1.5 minute to run in a 48 core CPU

# 11 min xeon



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 368841
[LightGBM] [Info] Number of data points in the train set: 18000, number of used features: 2000
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050




You can set `force_col_wise=true` to remove the overhead.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 321369
[LightGBM] [Info] Total Bins 321399
[LightGBM] [Info] Number of data points in the train set: 14400, number of used features: 2000
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Number of data points in the train set: 14400, number of used features: 2000
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start trai







You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 322070
[LightGBM] [Info] Number of data points in the train set: 14400, number of used features: 2000
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -2.708050
Y

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


F1: 0.8187 - BACC: 0.8187 - ACC: 0.8187 - MCC: 0.8058
              precision    recall  f1-score   support

           0       0.66      0.67      0.66       150
           1       0.78      0.75      0.77       150
           2       0.93      0.95      0.94       150
           3       0.92      0.95      0.93       150
           4       0.70      0.81      0.75       150
           5       0.80      0.71      0.75       150
           6       0.85      0.85      0.85       150
           7       0.99      0.96      0.97       150
           8       0.49      0.49      0.49       150
           9       0.99      0.96      0.97       150
          10       0.78      0.82      0.80       150
          11       0.81      0.84      0.83       150
          12       0.86      0.87      0.86       150
          13       0.87      0.83      0.85       150
          14       0.87      0.83      0.85       150

    accuracy                           0.82      2250
   macro avg       0.82   

# Hyperparameter Optimization

In [21]:
# Performing grid search
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

scorer_mcc = make_scorer(matthews_corrcoef)

model_sgd = SGDClassifier(random_state=seed, n_jobs=-1, class_weight='balanced')

# Define parameter grid for the search
param_grid = {
    'loss': ['hinge', 'log_loss', 'squared_hinge', 'modified_huber'],
    'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0],
    'max_iter': [1000, 2000, 3000, 4000, 5000],
    'penalty': ['l2', 'l1', 'elasticnet'],
}

# Create the GridSearchCV object with the SGDClassifier and parameter grid
grid_search = GridSearchCV(model_sgd, param_grid, cv=3, scoring=scorer_mcc, n_jobs=-1)

# Perform the grid search by fitting training data
grid_search.fit(X_train, y_train)

# Print the best parameters and corresponding MCC score found by the grid search
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

# Evaluate the model with chosen parameters on the test set
best_estimator = grid_search.best_estimator_
valid_mcc = best_estimator.score(X_val, y_val)
print("Valid MCC: ", valid_mcc)

# Took 65 minutes to run in a 48 core CPU
# it performs 4 * 5 * 5 * 3 combinations of parameters, which is 300 combinations in total. Since it uses Cross Validation with 3 folds, it will train 900 models in total!!!!!

# Best parameters:  {'alpha': 0.0001, 'loss': 'hinge', 'max_iter': 1000, 'penalty': 'l1'}
# Best score:  0.8971782029568908
# Valid MCC:  0.9570773263433814

# 49 min xeon



Best parameters:  {'alpha': 0.0001, 'loss': 'hinge', 'max_iter': 1000, 'penalty': 'l2'}
Best score:  0.7616581042380561
Valid MCC:  0.7808888888888889


In [22]:
# Performing Random Search
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer

scorer_mcc = make_scorer(matthews_corrcoef)

model_sgd = SGDClassifier(random_state=seed, n_jobs=-1, class_weight='balanced')

# Define parameter grid for the search
param_dist = {
    'loss': ['hinge', 'log_loss', 'squared_hinge', 'modified_huber'],
    'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0],
    'max_iter': [1000, 2000, 3000, 4000, 5000],
    'penalty': ['l2', 'l1', 'elasticnet'],
}

# Create the RandomizedSearchCV object with the SGDClassifier and parameter distribution
random_search = RandomizedSearchCV(model_sgd, param_dist, cv=3, scoring=scorer_mcc, 
                                   n_jobs=-1, n_iter=60, random_state=seed)

# Perform the random search by fitting training data
random_search.fit(X_train, y_train)

# Print the best parameters and corresponding MCC score found by the random search
print("Best parameters: ", random_search.best_params_)
print("Best score: ", random_search.best_score_)

# Evaluate the model with chosen parameters on the test set
best_estimator = random_search.best_estimator_
valid_mcc = best_estimator.score(X_val, y_val)
print("Valid MCC: ", valid_mcc)

# This will typically run faster than GridSearchCV due to the reduced number of parameter combinations.
# Time to run: 26 minutes in a 48 core CPU
# Best parameters:  {'penalty': 'elasticnet', 'max_iter': 2000, 'loss': 'modified_huber', 'alpha': 0.0001}
# Best score:  0.9068535026549907
# Valid MCC:  0.963302752293578

# 18 min xeon



Best parameters:  {'penalty': 'l2', 'max_iter': 1000, 'loss': 'hinge', 'alpha': 0.0001}
Best score:  0.7616581042380561
Valid MCC:  0.7808888888888889


In [23]:
# Performing Bayesian Optimization for Hyperparameter Tuning
import numpy as np
from skopt import BayesSearchCV
from sklearn.metrics import make_scorer
from sklearn.linear_model import SGDClassifier

scorer_mcc = make_scorer(matthews_corrcoef)

model_sgd = SGDClassifier(random_state=seed, n_jobs=-1, class_weight='balanced')

# Define parameter search space for the optimizer
param_space = {
    'loss': ['hinge', 'log_loss', 'squared_hinge', 'modified_huber'],
    'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0],
    'max_iter': [1000, 2000, 3000, 4000, 5000],
    'penalty': ['l2', 'l1', 'elasticnet'],
}
# Create the BayesSearchCV object with the SGDClassifier and parameter distribution
bayes_search = BayesSearchCV(model_sgd, param_space, cv=3, scoring=scorer_mcc, 
                             n_jobs=-1, n_iter=30, random_state=seed)

# Perform the Bayesian optimization by fitting training data
bayes_search.fit(X_train, y_train)

# Print the best parameters and corresponding MCC score found by the Bayesian search
print("Best parameters: ", bayes_search.best_params_)
print("Best score: ", bayes_search.best_score_)

# Evaluate the model with chosen parameters on the test set
best_estimator = bayes_search.best_estimator_
valid_mcc = best_estimator.score(X_val, y_val)
print("Valid MCC: ", valid_mcc)

# Time to run: 36 minutes in a 48 core CPU
# Best parameters:  OrderedDict([('alpha', 0.0001), ('loss', 'modified_huber'), ('max_iter', 2000), ('penalty', 'elasticnet')])
# Best score:  0.9068535026549907
# Valid MCC:  0.963302752293578

ModuleNotFoundError: No module named 'skopt'