In [1]:
import random
import re
import string

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, LeaveOneOut
from sklearn.naive_bayes import GaussianNB
from tqdm import tqdm
from unidecode import unidecode

# Seed Python's built-in `random` module so any use of it is repeatable.
# NOTE(review): no cell visible here draws from `random`; scikit-learn takes its
# randomness from `random_state` / NumPy instead -- confirm this seed is still needed.
random.seed(47)

In [2]:
# Load the training set; later cells read its "text" and "target" columns.
# The path is relative to the directory the notebook is run from.
df_train = pd.read_csv("../train.csv", encoding="UTF8")

## Normalize 

In [3]:
def normalize_text(text):
    """Normalize a raw text string before bag-of-words vectorization.

    Steps, in order:
      1. remove URLs: any run of non-whitespace characters that starts with
         "http" (this also covers "https"), matched case-insensitively;
      2. transliterate to ASCII with ``unidecode`` and lowercase the result;
      3. delete every character listed in ``string.punctuation``;
      4. delete every run of digits.

    Whitespace is left untouched, so removed tokens can leave consecutive
    spaces behind.

    Args:
        text: the string to normalize.

    Returns:
        The normalized string.
    """
    # A single pattern is enough: r'http\S+' already matches "https..." links,
    # so a separate https pass could never find anything.
    text = re.sub(r'http\S+', '', text, flags=re.IGNORECASE)
    # Lowercase *after* transliteration: unidecode can emit capital letters for
    # non-Latin input, which a lower() applied beforehand would miss.
    text = unidecode(text).lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r'\d+', '', text)
    return text

In [4]:
# Overwrite the raw "text" column with its normalized form (normalize_text is
# applied to every row). The raw text is not kept; reload df_train to recover it.
df_train["text"] = df_train["text"].apply(normalize_text)

## Vetorização

In [5]:
# Bag-of-words features: CountVectorizer learns the vocabulary from the
# normalized training text and produces a document-term count matrix.
vectorizer = CountVectorizer()
# .toarray() densifies the matrix so X can be row-indexed and handed to
# GaussianNB as a plain NumPy array in the cells below.
# NOTE: the dense matrix is large -- the outputs further down report shape
# (7613, 21637) with dtype int64, i.e. roughly 1.3 GB.
X = vectorizer.fit_transform(df_train["text"]).toarray()
# NOTE(review): the disabled line below would re-fit the vocabulary on the test
# text; a held-out set should reuse the fitted vocabulary via
# vectorizer.transform(...) instead. `df_test` is not defined in this notebook.
# X_test = vectorizer.fit_transform(df_test["text"])
# Labels as a NumPy array so they can be indexed with the CV split indices.
y = df_train["target"].to_numpy()

## Leave One Out

In [13]:
# Leave-one-out cross-validation of GaussianNB: one fit per row of X, each
# tested on the single held-out row.
# NOTE: this is slow -- the K-Fold cell in this notebook records ~2 s per fit,
# and this loop performs one fit per sample.
loo = LeaveOneOut()
accuracy = 0
n = loo.get_n_splits(X)

print(f"Numero de etapas: {n}")
# Iterate the splits directly (the enumerate index was never used) and pass the
# split count to tqdm: a bare generator has no length, so without `total` tqdm
# can only show a counter, not a progress bar or ETA.
for train_index, test_index in tqdm(loo.split(X), total=n):
    nb = GaussianNB()  # fresh model per split so nothing carries over
    nb.fit(X[train_index], y[train_index])
    # Each test fold holds one sample, so this adds 1.0 (hit) or 0.0 (miss).
    accuracy += accuracy_score(y[test_index], nb.predict(X[test_index]))

# Fraction of held-out samples predicted correctly; last expression so the
# notebook displays it.
accuracy/n

0.6061999211874425

## K-Fold

In [6]:
# 5-fold cross-validation of GaussianNB on the bag-of-words features.
# (KFold is imported with the other dependencies in the notebook's import cell.)
# NOTE(review): KFold does not shuffle by default, so the folds follow the row
# order of train.csv -- confirm the file is not ordered by target, or pass
# shuffle=True together with a fixed random_state.
kf = KFold(n_splits=5)
n = kf.get_n_splits(X)
accuracy = 0

print(f"Numero de etapas: {n}")
# Iterate the splits directly (the enumerate index was never used) and give
# tqdm the fold count so it reports progress out of n instead of a bare counter.
for train_index, test_index in tqdm(kf.split(X), total=n):
    nb = GaussianNB()  # fresh model per fold so nothing carries over
    nb.fit(X[train_index], y[train_index])
    accuracy += accuracy_score(y[test_index], nb.predict(X[test_index]))

# Unweighted mean of the per-fold accuracies; last expression so the notebook
# displays it.
accuracy/n

5it [00:10,  2.14s/it]


0.5925425559726765

In [21]:
# (number of documents, vocabulary size) of the dense count matrix.
X.shape

(7613, 21637)

In [28]:
# Label vector taken from the "target" column of df_train.
y

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [29]:
# Peek at one document's count vector. X was already densified into a NumPy
# array when it was built, and ndarrays have no .toarray(); indexing with a
# list keeps the 2-D (1, n_features) shape.
X[[1]]

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [30]:
# Fit a fresh GaussianNB on the whole training set. Creating the estimator here
# avoids depending on whichever `nb` the last cross-validation loop left behind.
# X is already a dense NumPy array, so it is passed as-is (no .toarray()).
nb = GaussianNB()
nb.fit(X, y)

In [33]:
# Accuracy of `nb` on (X, y) itself. When `nb` was fitted on these same rows
# this is a resubstitution score, so expect it to be optimistic compared with
# the cross-validated figures earlier in the notebook.
# X is already a dense NumPy array, so it is passed as-is (no .toarray()).
accuracy_score(y, nb.predict(X))

0.9527124655195061

In [46]:
# Sanity check on a hand-written sentence. The sentence goes through the same
# normalize_text step as the training text, so the model sees it preprocessed
# the same way. vectorizer.transform returns a sparse matrix, hence .toarray()
# before GaussianNB.predict.
nb.predict(
    vectorizer.transform(
        [normalize_text("just happened a terrible car crash")]
    ).toarray()
)

array([1], dtype=int64)