In [1]:
# Imports
import re
import pandas as pd
import seaborn as sns
import numpy as np
from unidecode import unidecode
import spacy
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings("ignore")

# Custom functions

In [2]:
def remove_punctuation(text):
    """Remove punctuation from text

    Args:
        text (str): A string

    Returns:
        str: string without punctuation
    """
    return re.sub(r'[^a-zA-Z\d\s]', ' ', text)

def remove_multiple_blank_spaces(text):
    """Remove multiple blank spaces from text

    Args:
        text (str): A string

    Returns:
        str: string with only one space between words
    """
    return re.sub(r'\s+', ' ', text)

# Load data

In [3]:
# Load data
data = pd.read_csv("Input/archive.zip", low_memory=False)
data.head()

Unnamed: 0,title,text,date,category,subcategory,link
0,"Lula diz que está 'lascado', mas que ainda tem...",Com a possibilidade de uma condenação impedir ...,2017-09-10,poder,,http://www1.folha.uol.com.br/poder/2017/10/192...
1,"'Decidi ser escrava das mulheres que sofrem', ...","Para Oumou Sangaré, cantora e ativista malines...",2017-09-10,ilustrada,,http://www1.folha.uol.com.br/ilustrada/2017/10...
2,Três reportagens da Folha ganham Prêmio Petrob...,Três reportagens da Folha foram vencedoras do ...,2017-09-10,poder,,http://www1.folha.uol.com.br/poder/2017/10/192...
3,Filme 'Star Wars: Os Últimos Jedi' ganha trail...,A Disney divulgou na noite desta segunda-feira...,2017-09-10,ilustrada,,http://www1.folha.uol.com.br/ilustrada/2017/10...
4,CBSS inicia acordos com fintechs e quer 30% do...,"O CBSS, banco da holding Elopar dos sócios Bra...",2017-09-10,mercado,,http://www1.folha.uol.com.br/mercado/2017/10/1...


# First exploration

In [4]:
# Count empty values by columns
data.isnull().sum()

title               0
text              765
date                0
category            0
subcategory    137418
link                0
dtype: int64

In [5]:
# Count items by category
data.groupby("category").count().sort_values(by="title", ascending=False)

Unnamed: 0_level_0,title,text,date,subcategory,link
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
poder,22022,22022,22022,939,22022
colunas,21622,21619,21622,21622,21622
mercado,20970,20970,20970,0,20970
esporte,19730,19730,19730,2859,19730
mundo,17130,17130,17130,0,17130
cotidiano,16967,16967,16967,35,16967
ilustrada,16345,15617,16345,0,16345
opiniao,4525,4525,4525,0,4525
paineldoleitor,4011,4011,4011,260,4011
saopaulo,3955,3955,3955,471,3955


In [6]:
# To avoid unbalanced data, we will remove the categories with less than 300 items
categories_mask = data["category"].value_counts() > 300
categories_mask = categories_mask[categories_mask.values].index
# Filter data
mask = data["category"].isin(categories_mask)
data_filtered = data[mask]
data_filtered.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 166092 entries, 0 to 167052
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   title        166092 non-null  object
 1   text         165335 non-null  object
 2   date         166092 non-null  object
 3   category     166092 non-null  object
 4   subcategory  29535 non-null   object
 5   link         166092 non-null  object
dtypes: object(6)
memory usage: 8.9+ MB


# Sampling

In [7]:
seed = 333
data_balanced = data_filtered.groupby('category').apply(lambda x: x.sample(300, random_state=seed))

# Store the unused data for future use
balanced_index = set(data_balanced.unstack(level=0).index)
balanced_categories = data_balanced.category.unique()
mask = ~(data_filtered.index.isin(balanced_index)) & (data_filtered.category.isin(balanced_categories))
data_unbalanced = data_filtered[mask].reset_index(drop=True)

# Balanced data
data_balanced.reset_index(drop=True, inplace=True)
data_balanced.category.value_counts()

ambiente              300
asmais                300
turismo               300
tec                   300
sobretudo             300
serafina              300
seminariosfolha       300
saopaulo              300
poder                 300
paineldoleitor        300
opiniao               300
mundo                 300
mercado               300
ilustrissima          300
ilustrada             300
folhinha              300
esporte               300
equilibrioesaude      300
empreendedorsocial    300
educacao              300
cotidiano             300
comida                300
colunas               300
ciencia               300
bbc                   300
tv                    300
Name: category, dtype: int64

## 1. Text Processing

In [8]:
# Combining text and title
df_balanced = data_balanced[["title", "text", "category"]].copy()
df_balanced["full_text"] = df_balanced["title"] + " " + df_balanced["text"]
# Text cleaning
df_balanced["norm_text"] = df_balanced.full_text.apply(lambda x: remove_multiple_blank_spaces(
                                                remove_punctuation(
                                                    unidecode(str(x).lower())
                                                )
                                            ))
df_balanced = df_balanced[["full_text", "norm_text", "category"]]
df_balanced.head()

Unnamed: 0,full_text,norm_text,category
0,"Após 20 anos, presença do raro gavião-real é r...",apos 20 anos presenca do raro gaviao real e re...,ambiente
1,"Aumento do nível do mar pode estar a caminho, ...",aumento do nivel do mar pode estar a caminho d...,ambiente
2,Temer revoga decreto sobre reserva mineral e a...,temer revoga decreto sobre reserva mineral e a...,ambiente
3,Poluição é uma das ameaças à beleza natural da...,poluicao e uma das ameacas a beleza natural da...,ambiente
4,Fernando de Noronha vira laboratório de negóci...,fernando de noronha vira laboratorio de negoci...,ambiente


In [9]:
# Change category to numeric
df_balanced["category"] = df_balanced.category.astype("category")
# df_balanced.category = df_balanced.category.cat.codes

# Split data

In [10]:
# Split Data
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df_balanced, test_size=0.2, random_state=314, stratify=df_balanced.category)
df_test, df_val = train_test_split(df_test, test_size=0.5, random_state=314, stratify=df_test.category)

df_train.shape, df_test.shape, df_val.shape

((6240, 3), (780, 3), (780, 3))

# Stop words and vectorization

 [ ] Conferir a questão das stopwords não funcionarem por causa da lingua

In [11]:
# Stopwords
stopwords_nltk = stopwords.words('portuguese')
# stopwords_spacy = spacy.load('pt_core_news_sm').Defaults.stop_words
# both_stopwords = set(stopwords_nltk) | set(stopwords_spacy)

# Create a tfidf vectorizer
vectorizer = TfidfVectorizer(stop_words=stopwords_nltk, max_features=2000,
                             ngram_range=(1, 2), min_df=5, max_df=0.8, lowercase=True)

# Fit the vectorizer with our texts
vectorizer.fit(df_train.norm_text)

In [12]:
# Transform our texts into vectors
X_train = vectorizer.transform(df_train.norm_text)
X_val = vectorizer.transform(df_val.norm_text)
X_test = vectorizer.transform(df_test.norm_text)

y_train = df_train.category
y_val = df_val.category
y_test = df_test.category

In [13]:
X_train.shape, X_val.shape, X_test.shape

((6240, 2000), (780, 2000), (780, 2000))

# N-grams

In [14]:
# Sum all the columns
X_train_sum = np.sum(X_train, axis=0)

# Sort the sum of columns
sorted_ngrams = np.asarray(np.argsort(X_train_sum)[::-1]).reshape(-1)

# Get the top 10 ngrams
top_ngrams = sorted_ngrams[:30]

# Get the names of the top 10 ngrams
feature_names = np.array(vectorizer.get_feature_names_out())

# reshape a list into a list of lists with 3 "rows"
[ " - ".join(ii) for ii in np.array_split(feature_names[top_ngrams], 3) ]

['acha - universidade federal - porque nao - codigo - to - passeios - nao existe - teoria - avanco - boa parte',
 'justamente - ruim - empregos - trabalhando - clara - falou - considera - longo prazo - recentes - 2004',
 'mercados - depende - estaduais - falando - tentando - 75 - pensamento - fiquei - influencia - existencia']

# Model training

In [15]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import (accuracy_score,
                             balanced_accuracy_score, classification_report, confusion_matrix, f1_score,
                             matthews_corrcoef)

from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import classification_report, accuracy_score

from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, balanced_accuracy_score,
                             classification_report, confusion_matrix, f1_score,
                             matthews_corrcoef)
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import time

from sklearn.calibration import CalibratedClassifierCV

import time
from sklearn.metrics import (f1_score, balanced_accuracy_score, accuracy_score,
                             classification_report, matthews_corrcoef,
                             confusion_matrix)


In [18]:
def train_models(X_train, y_train, X_valid, y_valid, n_jobs=-1):
    
    # Spot Check Algorithms
    models = []
    models.append(('Calibrated-LSVC', CalibratedClassifierCV(LinearSVC(random_state=314, class_weight='balanced'))))
    models.append(('LR', LogisticRegression(random_state=314, n_jobs=-1, class_weight='balanced')))
    models.append(('RF', RandomForestClassifier(random_state=314, n_jobs=-1, class_weight='balanced')))
    models.append(('LGBM', LGBMClassifier(random_state=314, n_jobs=-1, class_weight='balanced')))
    models.append(('XGB', XGBClassifier(random_state=314, n_jobs=-1, class_weight='balanced')))
    models.append(('MLP', MLPClassifier(random_state=314)))
    models.append(('SGD', SGDClassifier(random_state=314, n_jobs=-1, class_weight='balanced')))
    models.append(('NB', MultinomialNB()))
    models.append(('LSVC', LinearSVC(random_state=314, class_weight='balanced')))
    models.append(('KNN', KNeighborsClassifier(n_jobs=-1)))
    models.append(('DT', DecisionTreeClassifier(random_state=314, class_weight='balanced')))
    
    results = []
    creports = []
    
    for name, model in models:
        start_time = time.time()

        try:
            model.fit(X_train, y_train)
            pred = model.predict(X_valid)
            
        except Exception as e:
            print(f'Error {name} - {e}')
            continue 

        f1 = f1_score(y_valid, pred, average='micro')
        bacc = balanced_accuracy_score(y_valid, pred)
        acc = accuracy_score(y_valid, pred)
        cr = classification_report(y_valid, pred)
        mcc = matthews_corrcoef(y_valid, pred)
        cm = confusion_matrix(y_valid, pred)
        creports.append([name, cr, cm])

        elapsed = time.time() - start_time
        results.append([name, f1, bacc, acc, mcc, elapsed, cm, cr])

        msg = f'Name: {name} - F1: {f1:.4f} - BACC: {bacc:.4f} - ACC: {acc:.4f} - MCC: {mcc:.4f} - Elapsed: {elapsed:.2f}s'
        print(msg)
        print(cr)
        # print(cm)
        # print('*' * 20, '\n')

    columns = ['Model', 'F1', 'BACC', 'ACC', 'MCC', 'Total Time', 'Confusion Matrix', 'Classification Report']
    df_results = pd.DataFrame(results, columns=columns)
    df_results['Confusion Matrix'] = df_results['Confusion Matrix'].apply(lambda x: str(x))

    return df_results, creports

In [19]:
df_results, creports = train_models(X_train, y_train, X_val, y_val, n_jobs=-1)

Name: Calibrated-LSVC - F1: 0.6859 - BACC: 0.6859 - ACC: 0.6859 - MCC: 0.6737 - Elapsed: 3.57s
                    precision    recall  f1-score   support

          ambiente       0.76      0.73      0.75        30
            asmais       0.40      0.27      0.32        30
               bbc       0.45      0.33      0.38        30
           ciencia       0.61      0.73      0.67        30
           colunas       0.44      0.27      0.33        30
            comida       0.71      0.80      0.75        30
         cotidiano       0.67      0.67      0.67        30
          educacao       0.77      0.90      0.83        30
empreendedorsocial       0.88      0.77      0.82        30
  equilibrioesaude       0.52      0.50      0.51        30
           esporte       0.77      0.80      0.79        30
          folhinha       0.77      0.77      0.77        30
         ilustrada       0.62      0.53      0.57        30
      ilustrissima       0.71      0.73      0.72        30
    