# Bibliotecas

In [1]:
import re
import pandas as pd
import seaborn as sns
import numpy as np
from unidecode import unidecode
import spacy
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings("ignore")

# Funções customizadas

In [2]:
def remove_punctuation(text):
    """Remove punctuation and numbers from text

    Args:
        text (str): A string

    Returns:
        str: string without punctuation and numbers
    """
    return re.sub(r'[^a-zA-Z\s]', ' ', text)

def remove_multiple_blank_spaces(text):
    """Remove multiple blank spaces from text

    Args:
        text (str): A string

    Returns:
        str: string with only one space between words
    """
    return re.sub(r'\s+', ' ', text)

# Carrega dado

In [3]:
# Load data
data = pd.read_csv("Input/archive.zip", low_memory=False)
data.head()

Unnamed: 0,title,text,date,category,subcategory,link
0,"Lula diz que está 'lascado', mas que ainda tem...",Com a possibilidade de uma condenação impedir ...,2017-09-10,poder,,http://www1.folha.uol.com.br/poder/2017/10/192...
1,"'Decidi ser escrava das mulheres que sofrem', ...","Para Oumou Sangaré, cantora e ativista malines...",2017-09-10,ilustrada,,http://www1.folha.uol.com.br/ilustrada/2017/10...
2,Três reportagens da Folha ganham Prêmio Petrob...,Três reportagens da Folha foram vencedoras do ...,2017-09-10,poder,,http://www1.folha.uol.com.br/poder/2017/10/192...
3,Filme 'Star Wars: Os Últimos Jedi' ganha trail...,A Disney divulgou na noite desta segunda-feira...,2017-09-10,ilustrada,,http://www1.folha.uol.com.br/ilustrada/2017/10...
4,CBSS inicia acordos com fintechs e quer 30% do...,"O CBSS, banco da holding Elopar dos sócios Bra...",2017-09-10,mercado,,http://www1.folha.uol.com.br/mercado/2017/10/1...


# First exploration

In [4]:
# Count empty values by columns
data.isnull().sum()

title               0
text              765
date                0
category            0
subcategory    137418
link                0
dtype: int64

In [37]:
# Count items by category
data.groupby("category").count().sort_values(by="title", ascending=False)

Unnamed: 0_level_0,title,text,date,subcategory,link
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
poder,22022,22022,22022,939,22022
colunas,21622,21619,21622,21622,21622
mercado,20970,20970,20970,0,20970
esporte,19730,19730,19730,2859,19730
mundo,17130,17130,17130,0,17130
cotidiano,16967,16967,16967,35,16967
ilustrada,16345,15617,16345,0,16345
opiniao,4525,4525,4525,0,4525
paineldoleitor,4011,4011,4011,260,4011
saopaulo,3955,3955,3955,471,3955


In [38]:
# To avoid unbalanced data, we will remove the categories with less than 1500 items
categories_mask = data["category"].value_counts() > 1500
categories_mask = categories_mask[categories_mask.values].index
# Filter data
mask = data["category"].isin(categories_mask)
data_filtered = data[mask]
temp_df = data.loc[~mask]
temp_df["category"] = "outros"

data_filtered = pd.concat([data_filtered, temp_df])
data_filtered.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 167053 entries, 0 to 166970
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   title        167053 non-null  object
 1   text         166288 non-null  object
 2   date         167053 non-null  object
 3   category     167053 non-null  object
 4   subcategory  29635 non-null   object
 5   link         167053 non-null  object
dtypes: object(6)
memory usage: 8.9+ MB


# Sampling

In [41]:
seed = 333
data_balanced = data_filtered.copy()#.groupby('category').apply(lambda x: x.sample(1500, random_state=seed))

# Store the unused data for future use
# balanced_index = set(data_balanced.unstack(level=0).index)
# balanced_categories = data_balanced.category.unique()
# mask = ~(data_filtered.index.isin(balanced_index)) & (data_filtered.category.isin(balanced_categories))
# data_unbalanced = data_filtered[mask].reset_index(drop=True)

# Balanced data
data_balanced.reset_index(drop=True, inplace=True)
data_balanced.category.value_counts()

poder             22022
colunas           21622
mercado           20970
esporte           19730
mundo             17130
cotidiano         16967
ilustrada         16345
outros            11353
opiniao            4525
paineldoleitor     4011
saopaulo           3955
tec                2260
tv                 2142
educacao           2118
turismo            1903
Name: category, dtype: int64

## 1. Text Processing

In [74]:
# Combining text and title
df_balanced = data_balanced[["title", "text", "category"]].copy()
df_balanced["full_text"] = df_balanced["title"] + " " + df_balanced["text"]
# Text cleaning
df_balanced["norm_text"] = df_balanced.full_text.apply(lambda x: remove_multiple_blank_spaces(
                                                remove_punctuation(
                                                    unidecode(str(x).lower())
                                                )
                                            ).strip())
df_balanced = df_balanced[["full_text", "norm_text", "category"]]
df_balanced.head()

Unnamed: 0,full_text,norm_text,category
0,"Lula diz que está 'lascado', mas que ainda tem...",lula diz que esta lascado mas que ainda tem fo...,poder
1,"'Decidi ser escrava das mulheres que sofrem', ...",decidi ser escrava das mulheres que sofrem diz...,ilustrada
2,Três reportagens da Folha ganham Prêmio Petrob...,tres reportagens da folha ganham premio petrob...,poder
3,Filme 'Star Wars: Os Últimos Jedi' ganha trail...,filme star wars os ultimos jedi ganha trailer ...,ilustrada
4,CBSS inicia acordos com fintechs e quer 30% do...,cbss inicia acordos com fintechs e quer do cre...,mercado


In [76]:
# Change category to numeric
df_balanced["category"] = df_balanced.category.astype("category")
category_map = {key: value for key, value in zip(df_balanced.category, df_balanced.category.cat.codes)}
df_balanced.category = df_balanced.category.cat.codes

# Split data

In [77]:
# Split Data
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df_balanced, test_size=0.2, random_state=314, stratify=df_balanced.category)
df_test, df_val = train_test_split(df_test, test_size=0.5, random_state=314, stratify=df_test.category)

df_train.shape, df_test.shape, df_val.shape

((133642, 3), (16705, 3), (16706, 3))

# Stop words and vectorization


In [78]:
# Stopwords
stopwords_nltk = stopwords.words('portuguese')
stopwords_spacy = spacy.load('pt_core_news_sm').Defaults.stop_words
both_stopwords = set(stopwords_nltk) | set(stopwords_spacy)
both_stopwords = list(map(unidecode, both_stopwords))

# Create a tfidf vectorizer
vectorizer = TfidfVectorizer(stop_words=both_stopwords, max_features=2000,
                             ngram_range=(1, 3), min_df=5, max_df=0.8, lowercase=True)

# Fit the vectorizer with our texts
vectorizer.fit(df_train.norm_text)

In [79]:
# Transform our texts into vectors
X_train = vectorizer.transform(df_train.norm_text)
X_val = vectorizer.transform(df_val.norm_text)
X_test = vectorizer.transform(df_test.norm_text)

y_train = df_train.category
y_val = df_val.category
y_test = df_test.category

In [80]:
X_train.shape, X_val.shape, X_test.shape

((133642, 2000), (16706, 2000), (16705, 2000))

# N-grams

In [81]:
# Sum all the columns
X_train_sum = np.sum(X_train, axis=0)

# Sort the sum of columns
sorted_ngrams = np.asarray(np.argsort(X_train_sum)[::-1]).reshape(-1)

# Get the top 10 ngrams
top_ngrams = sorted_ngrams[:30]

# Get the names of the top 10 ngrams
feature_names = np.array(vectorizer.get_feature_names_out())

# reshape a list into a list of lists with 3 "rows"
[ " - ".join(ii) for ii in np.array_split(feature_names[top_ngrams], 3) ]

['noites - seg - joesley - aereo - ex diretor - simplesmente - nesses - empreiteira - analisar - renan calheiros',
 'investigado - aceitar - casa civil - provavel - iria - expectativas - contexto - longo prazo - vir - julio',
 'ajudou - perguntas - juca - codigo - aumentou - gilmar - estimular - inacio - industrial - atuar']

# Model training

In [50]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import (accuracy_score,
                             balanced_accuracy_score, classification_report, confusion_matrix, f1_score,
                             matthews_corrcoef)

from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import classification_report, accuracy_score

from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, balanced_accuracy_score,
                             classification_report, confusion_matrix, f1_score,
                             matthews_corrcoef)
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import time

from sklearn.calibration import CalibratedClassifierCV

import time
from sklearn.metrics import (f1_score, balanced_accuracy_score, accuracy_score,
                             classification_report, matthews_corrcoef,
                             confusion_matrix)


In [51]:
def train_models(X_train, y_train, X_valid, y_valid, n_jobs=-1):
    
    # Spot Check Algorithms
    models = []
    models.append(('Calibrated-LSVC', CalibratedClassifierCV(LinearSVC(random_state=314, class_weight='balanced'))))
    models.append(('LR', LogisticRegression(random_state=314, n_jobs=-1, class_weight='balanced')))
    models.append(('RF', RandomForestClassifier(random_state=314, n_jobs=-1, class_weight='balanced')))
    models.append(('LGBM', LGBMClassifier(random_state=314, n_jobs=-1, class_weight='balanced')))
    models.append(('XGB', XGBClassifier(random_state=314, n_jobs=-1, class_weight='balanced')))
    models.append(('MLP', MLPClassifier(random_state=314)))
    models.append(('SGD', SGDClassifier(random_state=314, n_jobs=-1, class_weight='balanced')))
    models.append(('NB', MultinomialNB()))
    models.append(('LSVC', LinearSVC(random_state=314, class_weight='balanced')))
    models.append(('KNN', KNeighborsClassifier(n_jobs=-1)))
    models.append(('DT', DecisionTreeClassifier(random_state=314, class_weight='balanced')))
    
    results = []
    creports = []
    
    for name, model in models:
        start_time = time.time()

        try:
            model.fit(X_train, y_train)
            pred = model.predict(X_valid)
            
        except Exception as e:
            print(f'Error {name} - {e}')
            continue 

        f1 = f1_score(y_valid, pred, average='micro')
        bacc = balanced_accuracy_score(y_valid, pred)
        acc = accuracy_score(y_valid, pred)
        cr = classification_report(y_valid, pred)
        mcc = matthews_corrcoef(y_valid, pred)
        cm = confusion_matrix(y_valid, pred)
        creports.append([name, cr, cm])

        elapsed = time.time() - start_time
        results.append([name, f1, bacc, acc, mcc, elapsed, cm, cr])

        msg = f'Name: {name} - F1: {f1:.4f} - BACC: {bacc:.4f} - ACC: {acc:.4f} - MCC: {mcc:.4f} - Elapsed: {elapsed:.2f}s'
        print(msg)
        print(cr)
        # print(cm)
        # print('*' * 20, '\n')

    columns = ['Model', 'F1', 'BACC', 'ACC', 'MCC', 'Total Time', 'Confusion Matrix', 'Classification Report']
    df_results = pd.DataFrame(results, columns=columns)
    df_results['Confusion Matrix'] = df_results['Confusion Matrix'].apply(lambda x: str(x))

    return df_results, creports

In [52]:
df_results, creports = train_models(X_train, y_train, X_val, y_val, n_jobs=-1)

Name: Calibrated-LSVC - F1: 0.8156 - BACC: 0.7649 - ACC: 0.8156 - MCC: 0.7946 - Elapsed: 98.11s
              precision    recall  f1-score   support

           0       0.77      0.73      0.75      2162
           1       0.81      0.85      0.83      1697
           2       0.77      0.81      0.79       212
           3       0.93      0.95      0.94      1973
           4       0.78      0.86      0.82      1635
           5       0.79      0.83      0.81      2097
           6       0.84      0.90      0.87      1713
           7       0.94      0.88      0.91       452
           8       0.60      0.49      0.54      1136
           9       0.99      0.97      0.98       401
          10       0.85      0.87      0.86      2203
          11       0.77      0.67      0.71       395
          12       0.67      0.57      0.62       226
          13       0.73      0.62      0.67       190
          14       0.80      0.49      0.60       214

    accuracy                          

In [72]:
df_balanced.loc[0, "norm_text"]

'lula diz que esta lascado mas que ainda tem forca como cabo eleitoral com a possibilidade de uma condenacao impedir sua candidatura em o ex presidente luiz inacio lula da silva fez nesta segunda um discurso inflamado contra a lava jato no qual disse saber que esta lascado exigiu um pedido de desculpas do juiz sergio moro e afirmou que mesmo fora da disputa pelo planalto sera um cabo eleitoral expressivo para a sucessao de michel temer segundo o petista reu em sete acoes penais o objetivo de moro e impedir sua candidatura no ano que vem desidratando o inclusive no apoio a um nome alternativo como o do ex prefeito de sao paulo fernando haddad pt caso ele nao possa concorrer a presidencia eu sei que to lascado todo dia tem um processo eu nao quero nem que moro me absolva eu so quero que ele peca desculpas disse lula durante um seminario sobre educacao em brasilia eles investigadores chegam a dizer ah se o lula nao for candidato ele nao vai ter forca como cabo eleitoral testem completou o

In [66]:
df_results.sort_values(by='F1', ascending=False)

Unnamed: 0,Model,F1,BACC,ACC,MCC,Total Time,Confusion Matrix,Classification Report
3,LGBM,0.827248,0.826563,0.827248,0.808458,510.886407,[[1595 46 10 46 81 94 56 21 95...,precision recall f1-score ...
4,XGB,0.825632,0.779012,0.825632,0.805783,1719.913379,[[1643 45 2 58 86 104 49 12 65...,precision recall f1-score ...
5,MLP,0.820544,0.77594,0.820544,0.800284,873.692021,[[1656 38 6 34 60 83 50 29 96...,precision recall f1-score ...
0,Calibrated-LSVC,0.815575,0.764922,0.815575,0.794629,98.107174,[[1576 54 4 61 99 128 61 18 68...,precision recall f1-score ...
8,LSVC,0.808512,0.80259,0.808512,0.787885,17.190052,[[1472 56 9 68 108 114 66 40 86...,precision recall f1-score ...
1,LR,0.791213,0.805785,0.791213,0.769465,31.342459,[[1416 51 13 72 105 118 62 39 114...,precision recall f1-score ...
2,RF,0.781815,0.735253,0.781815,0.75767,875.791297,[[1264 72 8 155 115 185 98 17 43...,precision recall f1-score ...
6,SGD,0.764276,0.784948,0.764276,0.741456,2.198763,[[ 986 79 23 156 147 217 108 66 72...,precision recall f1-score ...
7,NB,0.709685,0.644454,0.709685,0.678643,0.195432,[[ 725 83 9 151 192 385 114 16 165...,precision recall f1-score ...
10,DT,0.609601,0.602375,0.609601,0.565838,668.646844,[[1028 108 15 126 134 208 119 37 126...,precision recall f1-score ...


# Ensemble Classifier Explanation

In [82]:
model_lsvc = LinearSVC(random_state=seed, class_weight='balanced')
model_calibrated_lsvc = CalibratedClassifierCV(LinearSVC(random_state=seed, class_weight='balanced'))
model_lgbm = LGBMClassifier(random_state=seed, n_jobs=-1, class_weight='balanced')

estimators = [
                ('lsvc', model_lsvc),
                ('calibrated_lsvc', model_calibrated_lsvc),
                ('lgbm', model_lgbm)
            ]

model_stacked = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(random_state=seed, n_jobs=-1, class_weight='balanced'), n_jobs=2, cv=5)

model_stacked.fit(X_train, y_train)

pred = model_stacked.predict(X_val)

f1 = f1_score(y_val, pred, average='micro')
bacc = balanced_accuracy_score(y_val, pred)
acc = accuracy_score(y_val, pred)
cr = classification_report(y_val, pred)
mcc = matthews_corrcoef(y_val, pred)
cm = confusion_matrix(y_val, pred)

print(f'F1: {f1:.4f} - BACC: {bacc:.4f} - ACC: {acc:.4f} - MCC: {mcc:.4f}')
print(cr)
print(cm)

# Took 1.5 minute to run in a 48 core CPU

F1: 0.8247 - BACC: 0.8399 - ACC: 0.8247 - MCC: 0.8064
              precision    recall  f1-score   support

           0       0.86      0.75      0.80      2162
           1       0.85      0.83      0.84      1697
           2       0.62      0.92      0.74       212
           3       0.96      0.95      0.95      1973
           4       0.84      0.83      0.84      1635
           5       0.86      0.78      0.81      2097
           6       0.89      0.88      0.88      1713
           7       0.89      0.92      0.90       452
           8       0.61      0.54      0.57      1136
           9       0.98      0.98      0.98       401
          10       0.89      0.87      0.88      2203
          11       0.64      0.82      0.72       395
          12       0.41      0.88      0.56       226
          13       0.48      0.84      0.61       190
          14       0.51      0.82      0.63       214

    accuracy                           0.82     16706
   macro avg       0.75   

# Hyperparameter Optimization

In [68]:
# Performing grid search
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

scorer_mcc = make_scorer(matthews_corrcoef)

model_sgd = SGDClassifier(random_state=314, n_jobs=-1, class_weight='balanced')

# Define parameter grid for the search
param_grid = {
    'loss': ['hinge', 'log_loss', 'squared_hinge', 'modified_huber'],
    'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0],
    'max_iter': [1000, 2000, 3000, 4000, 5000],
    'penalty': ['l2', 'l1', 'elasticnet'],
}

# Create the GridSearchCV object with the SGDClassifier and parameter grid
grid_search = GridSearchCV(model_sgd, param_grid, cv=3, scoring=scorer_mcc, n_jobs=-1)

# Perform the grid search by fitting training data
grid_search.fit(X_train, y_train)

# Print the best parameters and corresponding MCC score found by the grid search
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

# Evaluate the model with chosen parameters on the test set
best_estimator = grid_search.best_estimator_
valid_mcc = best_estimator.score(X_val, y_val)
print("Valid MCC: ", valid_mcc)

# Took 65 minutes to run in a 48 core CPU
# it performs 4 * 5 * 5 * 3 combinations of parameters, which is 300 combinations in total. Since it uses Cross Validation with 3 folds, it will train 900 models in total!!!!!

# Best parameters:  {'alpha': 0.0001, 'loss': 'hinge', 'max_iter': 1000, 'penalty': 'l1'}
# Best score:  0.8971782029568908
# Valid MCC:  0.9570773263433814

24.683333333333334

In [None]:
# Performing Random Search
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer

scorer_mcc = make_scorer(matthews_corrcoef)

model_sgd = SGDClassifier(random_state=314, n_jobs=-1, class_weight='balanced')

# Define parameter grid for the search
param_dist = {
    'loss': ['hinge', 'log_loss', 'squared_hinge', 'modified_huber'],
    'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0],
    'max_iter': [1000, 2000, 3000, 4000, 5000],
    'penalty': ['l2', 'l1', 'elasticnet'],
}

# Create the RandomizedSearchCV object with the SGDClassifier and parameter distribution
random_search = RandomizedSearchCV(model_sgd, param_dist, cv=3, scoring=scorer_mcc, 
                                   n_jobs=-1, n_iter=60, random_state=314)

# Perform the random search by fitting training data
random_search.fit(X_train, y_train)

# Print the best parameters and corresponding MCC score found by the random search
print("Best parameters: ", random_search.best_params_)
print("Best score: ", random_search.best_score_)

# Evaluate the model with chosen parameters on the test set
best_estimator = random_search.best_estimator_
valid_mcc = best_estimator.score(X_val, y_val)
print("Valid MCC: ", valid_mcc)

# This will typically run faster than GridSearchCV due to the reduced number of parameter combinations.
# Time to run: 26 minutes in a 48 core CPU
# Best parameters:  {'penalty': 'elasticnet', 'max_iter': 2000, 'loss': 'modified_huber', 'alpha': 0.0001}
# Best score:  0.9068535026549907
# Valid MCC:  0.963302752293578

In [None]:
# Performing Bayesian Optimization for Hyperparameter Tuning
import numpy as np
from skopt import BayesSearchCV
from sklearn.metrics import make_scorer
from sklearn.linear_model import SGDClassifier

scorer_mcc = make_scorer(matthews_corrcoef)

model_sgd = SGDClassifier(random_state=314, n_jobs=-1, class_weight='balanced')

# Define parameter search space for the optimizer
param_space = {
    'loss': ['hinge', 'log_loss', 'squared_hinge', 'modified_huber'],
    'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0],
    'max_iter': [1000, 2000, 3000, 4000, 5000],
    'penalty': ['l2', 'l1', 'elasticnet'],
}
# Create the BayesSearchCV object with the SGDClassifier and parameter distribution
bayes_search = BayesSearchCV(model_sgd, param_space, cv=3, scoring=scorer_mcc, 
                             n_jobs=-1, n_iter=30, random_state=314)

# Perform the Bayesian optimization by fitting training data
bayes_search.fit(X_train, y_train)

# Print the best parameters and corresponding MCC score found by the Bayesian search
print("Best parameters: ", bayes_search.best_params_)
print("Best score: ", bayes_search.best_score_)

# Evaluate the model with chosen parameters on the test set
best_estimator = bayes_search.best_estimator_
valid_mcc = best_estimator.score(X_val, y_val)
print("Valid MCC: ", valid_mcc)

# Time to run: 36 minutes in a 48 core CPU
# Best parameters:  OrderedDict([('alpha', 0.0001), ('loss', 'modified_huber'), ('max_iter', 2000), ('penalty', 'elasticnet')])
# Best score:  0.9068535026549907
# Valid MCC:  0.963302752293578