In [None]:
#Importações necessárias
import os
import tarfile
import urllib.request
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import string
import nltk
# Fetch the NLTK data packages this notebook relies on. Each call contacts the
# NLTK download server, so this cell needs network access the first time it runs.
# 'stopwords' backs the `stopwords.words('english')` lookup used when cleaning reviews.
nltk.download('stopwords')
# 'punkt' / 'punkt_tab' are presumably the tokenizer models behind `word_tokenize`;
# which of the two is actually required seems to depend on the installed NLTK
# version -- TODO confirm and drop the unused one.
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
# Download and extract the IMDB review dataset
def download_and_extract_imdb(data_dir="aclImdb"):
    """Download the IMDB review archive and unpack it, unless it is already present.

    Args:
        data_dir: Path where the extracted dataset directory is expected. If this
            path already exists, nothing is downloaded. Otherwise the archive is
            unpacked into the *parent* of ``data_dir`` (the current directory for
            the default value). The archive is expected to unpack into a single
            top-level ``aclImdb`` directory, so the last component of
            ``data_dir`` should be ``aclImdb`` for the result to land exactly at
            ``data_dir``.

    Returns:
        None. Progress messages are printed to stdout.

    Raises:
        urllib.error.URLError, OSError, tarfile.TarError: if the download or the
            extraction fails. The temporary archive file is removed in that case
            as well.
    """
    url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
    filename = "aclImdb_v1.tar.gz"

    if os.path.exists(data_dir):
        print("Dataset já disponível.")
        return

    # Unpack next to where `data_dir` should end up instead of always using the
    # current working directory; for the default argument this is still ".".
    extract_root = os.path.dirname(os.path.normpath(data_dir)) or "."
    os.makedirs(extract_root, exist_ok=True)

    try:
        print("Baixando o dataset...")
        urllib.request.urlretrieve(url, filename)
        print("Extraindo os arquivos...")
        with tarfile.open(filename, "r:gz") as tar:
            if hasattr(tarfile, "data_filter"):
                # The "data" filter refuses members that would escape the
                # destination (absolute paths, "..", unsafe links).
                tar.extractall(path=extract_root, filter="data")
            else:
                # Older Python without extraction filters: plain extraction.
                tar.extractall(path=extract_root)
    finally:
        # Never leave a (possibly partial) archive behind, even on failure.
        if os.path.exists(filename):
            os.remove(filename)

    print("Download e extração concluídos.")

# Load reviews together with their labels
def load_reviews(data_dir, label, limit=None):
    """Read and clean the ``.txt`` reviews in a directory.

    Each review is tokenized with ``word_tokenize``; purely alphabetic tokens are
    lower-cased, English stop words are dropped, and the remaining words are
    re-joined with single spaces.

    Args:
        data_dir: Directory containing one review per ``.txt`` file.
        label: Value appended to the label list once per loaded review.
        limit: Maximum number of reviews to load. ``None`` or ``0`` means no
            limit. Files are taken in sorted filename order, so the same subset
            is selected on every run, and only ``.txt`` files count towards the
            limit.

    Returns:
        A ``(texts, labels)`` tuple of equally long lists: the cleaned review
        strings and ``label`` repeated once per review.

    Raises:
        ValueError: if ``limit`` is negative.
    """
    if limit is not None and limit < 0:
        raise ValueError(f"limit must be non-negative, got {limit!r}")

    texts, labels = [], []
    stop_words = set(stopwords.words('english'))

    # os.listdir returns entries in arbitrary order; sort so that a limited load
    # always picks the same files, and filter first so that unrelated directory
    # entries do not consume the limit.
    review_files = sorted(name for name in os.listdir(data_dir) if name.endswith(".txt"))
    if limit:
        review_files = review_files[:limit]

    for filename in review_files:
        with open(os.path.join(data_dir, filename), encoding="utf-8") as f:
            text = f.read()

        words = []
        for token in word_tokenize(text):
            if not token.isalpha():
                continue
            lowered = token.lower()
            if lowered not in stop_words:
                words.append(lowered)

        texts.append(" ".join(words))
        labels.append(label)

    return texts, labels

In [None]:
download_and_extract_imdb()

# Load positive (label 1) and negative (label 0) training reviews, at most 1000 of each.
pos_texts, pos_labels = load_reviews("aclImdb/train/pos", 1, limit=1000)
neg_texts, neg_labels = load_reviews("aclImdb/train/neg", 0, limit=1000)

texts = pos_texts + neg_texts
labels = pos_labels + neg_labels
y = labels

# 6.B - Hold out 20% of the reviews BEFORE vectorizing, so that the TF-IDF
# vocabulary and IDF weights are learned from the training reviews only and no
# information from the evaluation set leaks into the features.
texts_train, texts_test, y_train, y_test = train_test_split(
    texts, y, test_size=0.2, random_state=42
)

# 6.A - Convert text to TF-IDF vectors: fit on the training split, then apply the
# fitted vocabulary unchanged to the held-out split.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)
# Full feature matrix kept under its original name in case later cells use `X`.
X = vectorizer.transform(texts)

# 6.B - Classification with logistic regression.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

print("Avaliação do modelo:")
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print(f"Acurácia: {accuracy_score(y_test, y_pred):.4f}")