# Pré-processamento de dados
## Importando bibliotecas

In [None]:
from itertools import product
import pandas as pd
import nltk
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ShuffleSplit, cross_validate

# Download the NLTK "stopwords" corpus; the vectorizers defined further down
# read the Portuguese stop-word list from it.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /home/madson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Carregando dados

In [2]:
# Relative path of the interim news CSV.
# NOTE(review): despite the name `output_path`, this file is read as input here.
output_path = "../data/interim/news.csv"

# Load the dataset into a DataFrame.
df = pd.read_csv(filepath_or_buffer=output_path)

In [3]:
# Article texts as a plain Python list, in row order.
corpus = df["text"].tolist()

# Encode the string labels as integers: "true" -> 1, "fake" -> 0.
# `Series.replace` from strings to ints relies on implicit dtype downcasting,
# which is deprecated in recent pandas; mapping through a lookup keeps the
# same semantics (values not in the table pass through unchanged) while
# letting pandas infer an integer dtype directly from the mapped values.
label_codes = {"true": 1, "fake": 0}
labels = df["label"].map(lambda value: label_codes.get(value, value))

In [4]:
# Settings shared by both text representations: drop the NLTK Portuguese
# stop words and cap the vocabulary at 1000 features.
vectorizer_kwargs = dict(
    stop_words=nltk.corpus.stopwords.words("portuguese"),
    max_features=1000,
)

# Text vectorizers to compare, keyed by the short name used in result labels.
vectorizers = {
    "bow": CountVectorizer(**vectorizer_kwargs),
    "tfidf": TfidfVectorizer(**vectorizer_kwargs),
}

In [5]:
# Classifiers to compare, keyed by the name used in result labels.
models = {
    'logistic regression': LogisticRegression(),
    'KNN': KNeighborsClassifier(n_neighbors=21),
    'SVC': SVC(),
    # Random forests are stochastic (bootstrap samples, feature subsampling);
    # a fixed seed keeps the reported scores reproducible across runs.
    'random forest': RandomForestClassifier(random_state=42)
}

In [6]:
# Dimensionality reduction applied to the vectorizer output.
# NOTE: despite the variable name, this is a truncated SVD rather than PCA
# (it works directly on the sparse matrices the vectorizers produce).
# Its default solver is randomized, so the seed is fixed for reproducibility.
pca = TruncatedSVD(n_components=500, random_state=42)

# Standardizes the reduced features before they reach the classifier.
scaler = StandardScaler()

In [7]:
# Cross-validate every (vectorizer, model) combination and collect the
# per-split metrics in `results`: one list per metric returned by
# `cross_validate`, plus a "model" list labelling each row.
results = {}
n_splits = 10
# A fixed random_state makes the shuffled splits reproducible and identical
# on every call, so all pipelines are scored on the same train/test
# partitions instead of each getting its own random draw.
split = ShuffleSplit(n_splits=n_splits, test_size=.2, random_state=42)
for vectorizer, model in product(vectorizers.items(), models.items()):
    vectorizer_name, vectorizer_ = vectorizer
    model_name, model_ = model
    # vectorize -> reduce dimensionality -> standardize -> classify
    pipeline = Pipeline(steps=[
        ("vectorizer", vectorizer_),
        ("pca", pca),
        ("normalize", scaler),
        ("model", model_)
    ])
    scores = cross_validate(pipeline, corpus, labels, cv=split, scoring=['accuracy', 'f1'])
    # Tag each of the n_splits rows with the combination it belongs to.
    scores['model'] = [f"{vectorizer_name}-{model_name}"] * n_splits
    # Append this combination's rows; setdefault creates each list on first use.
    for key, values in scores.items():
        results.setdefault(key, []).extend(values)

In [8]:
# Summarise the cross-validation runs: mean and standard deviation of every
# collected metric per vectorizer-model combination, transposed so that
# combinations become columns and (metric, statistic) pairs become rows.
# String aggregation names are used because passing NumPy callables such as
# np.mean / np.std to `agg` is deprecated in recent pandas; the resulting
# labels ("mean", "std") and values are the same.
df_results = (
    pd
    .DataFrame(results)
    .groupby("model")
    .agg(["mean", "std"])
    .transpose()
)

In [9]:
# Display the aggregated cross-validation summary.
# NOTE(review): in the recorded output both KNN columns sit at ~0.50 accuracy
# (chance level for a binary task) — worth investigating before drawing conclusions.
df_results

Unnamed: 0,model,bow-KNN,bow-SVC,bow-logistic regression,bow-random forest,tfidf-KNN,tfidf-SVC,tfidf-logistic regression,tfidf-random forest
fit_time,mean,5.814648,8.637069,6.292938,21.82326,5.813607,14.061045,6.06327,16.788882
fit_time,std,0.080802,0.084062,0.187033,0.333607,0.073308,0.339682,0.079474,0.287679
score_time,mean,0.924707,1.093691,0.654466,0.612344,0.920975,3.191312,0.652346,0.611891
score_time,std,0.026855,0.047469,0.023749,0.010742,0.037061,0.079363,0.017939,0.015909
test_accuracy,mean,0.5075,0.952847,0.952292,0.946319,0.500278,0.948194,0.940486,0.914097
test_accuracy,std,0.015136,0.003456,0.005689,0.005947,0.013797,0.003092,0.005498,0.007685
test_f1,mean,0.010868,0.952411,0.95161,0.947615,0.661847,0.949383,0.940806,0.912608
test_f1,std,0.004602,0.003727,0.006216,0.005971,0.012097,0.002667,0.005817,0.00797
