In [1]:
import pandas as pd
import re
from unidecode import unidecode
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import LeaveOneOut
from sklearn.naive_bayes import GaussianNB
from tqdm import tqdm 

import random

random.seed(47)

In [2]:
df_train = pd.read_csv("../train.csv", encoding="UTF8")

# Normalize

In [3]:
def normalize_text(text):
    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'https\S+', '', text)
    text = unidecode(text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r'\d+', '', text)
    return text

## Vetorizacao

In [4]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df_train["text"]).toarray()
# X_test = vectorizer.fit_transform(df_test["text"])
y = df_train["target"].to_numpy()

## Leave One Out

In [5]:
import warnings
warnings.filterwarnings('ignore')

In [6]:
loo = LeaveOneOut()
n = loo.get_n_splits(X)

accuracy = 0
precision = 0
recall = 0
f1 = 0

print(f"Numero de etapas: {n}")
for i, (train_index, test_index) in tqdm(enumerate(loo.split(X))):
    nb = GaussianNB()
    nb.fit(X[train_index], y[train_index])
    accuracy += accuracy_score(y[test_index], nb.predict(X[test_index]))
    precision += precision_score(y[test_index], nb.predict(X[test_index]))
    recall += recall_score(y[test_index], nb.predict(X[test_index]))
    f1 += f1_score(y[test_index], nb.predict(X[test_index]))

metrics = {
    "accuracy": accuracy/n,
    "precision":precision/n,
    "recall": recall/n, 
    "f1": f1/n
}

metrics

Numero de etapas: 7613


7613it [4:28:20,  2.11s/it]


{'accuracy': 0.6060685669249967,
 'precision': 0.343360042033364,
 'recall': 0.343360042033364,
 'f1': 0.343360042033364}