In [1]:
from itertools import product
import pandas as pd
import nltk
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ShuffleSplit, cross_validate

# Fetch the NLTK stop-word corpus; the vectorizers defined later read the
# Portuguese stop-word list from it.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /home/madson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Location of the interim news dataset. Despite its name, `output_path` is the
# file this notebook *reads* from.
output_path = "../data/interim/news.csv"
df = pd.read_csv(filepath_or_buffer=output_path)

In [3]:
# Article texts as a plain Python list, in row order, for the vectorizers.
corpus = df.text.to_list()

# Encode the target as integers: "true" -> 1, "fake" -> 0.
# `Series.map` is used instead of `Series.replace` because `replace` on an
# object column only yields an integer Series through implicit downcasting,
# which pandas has deprecated. Without the downcast the labels would remain
# object-typed, which is not a reliable classification target.
label_codes = {"true": 1, "fake": 0}
labels = df.label.map(label_codes)

# `map` turns any value missing from `label_codes` into NaN; fail loudly here
# rather than letting an unexpected label reach model training.
if labels.isna().any():
    unexpected = sorted(df.label[labels.isna()].astype(str).unique())
    raise ValueError(f"Unexpected label values: {unexpected}")

labels = labels.astype("int64")

In [4]:
# One entry per text representation: term counts ('bow') and TF-IDF weights
# ('tfidf'). Both are configured identically: Portuguese stop words removed
# and max_features set to 1000.
vectorizers = {
    name: vectorizer_cls(
        stop_words=nltk.corpus.stopwords.words('portuguese'),
        max_features=1000,
    )
    for name, vectorizer_cls in (
        ('bow', CountVectorizer),
        ('tfidf', TfidfVectorizer),
    )
}

In [5]:
# Classifiers to benchmark, keyed by the name that ends up in the result label.
# SVC() and RandomForestClassifier() are imported but currently left out of
# the comparison.
models = dict([
    ('logistic regression', LogisticRegression()),
    ('KNN', KNeighborsClassifier(n_neighbors=13)),
])

In [6]:
# Dimensionality reduction and feature standardisation steps reused by every
# pipeline in the benchmark.
#
# NOTE: the variable is called `pca` but the estimator is TruncatedSVD; the
# name is kept because the pipeline cell refers to it.
#
# TruncatedSVD's default solver is randomized, so `random_state` is fixed to
# make the reduced features (and therefore the scores) reproducible.
pca = TruncatedSVD(n_components=500, random_state=42)
scaler = StandardScaler()

In [8]:
# Benchmark every vectorizer/model combination with shuffled hold-out splits.
# Each entry appended to `results` is the dict returned by `cross_validate`
# (fit/score times plus accuracy and F1 per split) with an extra 'model' key
# holding the "<vectorizer>-<model>" label repeated once per split.
results = []
n_splits = 2

# The splitter takes its split count from `n_splits` so the label list built
# below always has the same length as the score arrays. `random_state` is
# fixed so the splits are reproducible and every combination is evaluated on
# the same train/test partitions.
split = ShuffleSplit(n_splits=n_splits, test_size=.2, random_state=42)

for vectorizer, model in product(vectorizers.items(), models.items()):
    vectorizer_name, vectorizer_ = vectorizer
    model_name, model_ = model
    pipeline = Pipeline(steps=[
        ("vectorizer", vectorizer_),
        ("pca", pca),
        ("normalize", scaler),
        ("model", model_)
    ])
    scores = cross_validate(pipeline, corpus, labels, cv=split, scoring=['accuracy', 'f1'])
    scores['model'] = [f"{vectorizer_name}-{model_name}"] * n_splits
    results.append(scores)

In [13]:
# Show the collected cross-validation outputs: one dict per vectorizer/model
# combination, each holding per-split timings, accuracy and F1.
results

[{'fit_time': array([11.73648429, 15.6973598 ]),
  'score_time': array([1.02295494, 1.57914925]),
  'test_accuracy': array([0.94583333, 0.95      ]),
  'test_f1': array([0.94483734, 0.94929577]),
  'model': ['bow-logistic regression', 'bow-logistic regression']},
 {'fit_time': array([15.21366167,  9.90649676]),
  'score_time': array([2.55434251, 1.34401941]),
  'test_accuracy': array([0.49930556, 0.51736111]),
  'test_f1': array([0.03480589, 0.02524544]),
  'model': ['bow-KNN', 'bow-KNN']},
 {'fit_time': array([11.42240334, 11.35403204]),
  'score_time': array([1.10866857, 1.75711107]),
  'test_accuracy': array([0.94375   , 0.93611111]),
  'test_f1': array([0.94440631, 0.93637621]),
  'model': ['tfidf-logistic regression', 'tfidf-logistic regression']},
 {'fit_time': array([11.77098441, 10.67070436]),
  'score_time': array([1.2042861, 1.2369945]),
  'test_accuracy': array([0.525     , 0.52222222]),
  'test_f1': array([0.67766258, 0.67331434]),
  'model': ['tfidf-KNN', 'tfidf-KNN']}]