# Disciplina Aprendizado Profundo

### Prof. Josenalde Barbosa - IMD/UFRN

### Aluno: Bruno Santos F. Silva (matr. 2025)

#### Objetivo: elaborar um classificador usando redes neurais profundas LSTM e PyTorch com a finalidade de identificar processos judiciais que serão deferidos, indeferidos ou deferidos parcialmente.

Obs. Atividade avaliativa final da disciplina.

## 1. Importação de Bibliotecas Essenciais

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sqlite3
import warnings
import time
from collections import Counter
warnings.filterwarnings('ignore')

# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingLR

# Processamento de texto
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import re
from sklearn.preprocessing import LabelEncoder

# M√©tricas e avalia√ß√£o
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score,
    precision_score, recall_score, f1_score, roc_auc_score,
    ConfusionMatrixDisplay
)
from sklearn.model_selection import train_test_split

# Plot configuration
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 6)

# Download the NLTK resources needed by preprocess_text, only when missing.
# Recent NLTK releases make word_tokenize look up 'punkt_tab' instead of
# 'punkt'; fetching both keeps tokenization working on old and new versions
# (previously only 'punkt' was fetched, which raises LookupError on new ones).
for resource_path, package in (
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('corpora/stopwords', 'stopwords'),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package)

# Select the compute device (GPU when available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"‚úì Device: {device}")
if torch.cuda.is_available():
    print(f"‚úì GPU: {torch.cuda.get_device_name(0)}")
    print(f"‚úì CUDA Version: {torch.version.cuda}")

print(f"‚úì PyTorch vers√£o: {torch.__version__}")
print(f"‚úì NumPy vers√£o: {np.__version__}")

## 2. Carregar e Gerar Dataset de Grande Escala (80k amostras)

In [None]:
# Parameters
DB_PATH = "/gdrive/MyDrive/Colab Notebooks/JFRN/split_dados_01.sqlite3"
TABLE_NAME = "peticoes"
DATASET_SIZE = 80000

print(f"Carregando/Gerando dataset com {DATASET_SIZE:,} amostras...")

# Try the SQLite database first; any failure falls back to synthetic data.
try:
    conn = sqlite3.connect(DB_PATH)
    try:
        # TABLE_NAME and DATASET_SIZE are the constants defined above, not
        # user input, so interpolating them into the query is acceptable here.
        query = f"SELECT * FROM {TABLE_NAME} LIMIT {DATASET_SIZE}"
        df = pd.read_sql_query(query, conn)
    finally:
        # Close the connection even when the query raises (e.g. missing
        # table); before, a failed query skipped conn.close() and leaked it.
        conn.close()
    print(f"‚úì Dados carregados do SQLite: {df.shape}")
except Exception as e:
    print(f"‚ö† Erro ao carregar SQLite: {e}")
    print(f"Gerando dataset sint√©tico com {DATASET_SIZE:,} amostras...\n")

    # Varied corpus of legal-petition sentences
    corpus_templates = [
        # Granted petitions (favourable wording)
        "Fundamentado em jurisprud√™ncia consolidada, solicita-se deferimento integral do pedido",
        "Documenta√ß√£o completa e direito inequ√≠voco justificam a concess√£o pleiteada",
        "Precedentes do STJ e STF amplaram o reconhecimento desta modalidade de direito",
        "Ampla documenta√ß√£o apresentada comprova inequivocamente o direito invocado",
        "Recurso extraordin√°rio com repercuss√£o geral reconhecida sobre mat√©ria j√° pacificada",

        # Denied petitions (unfavourable wording)
        "Faltam elementos essenciais √† constitui√ß√£o da rela√ß√£o jur√≠dica alegada",
        "Prescri√ß√£o consumada extingue o direito de a√ß√£o pelo lapso temporal",
        "Lacks documentary evidence and legal grounds for the claim presented",
        "Recurso manifestamente infundado contradiz jurisprud√™ncia consolidada",
        "Car√™ncia de legitimidade ativa e passiva invalida a demanda",

        # Partially granted petitions (mixed wording)
        "Parcialmente fundado o recurso, reconhecendo-se apenas parte da pretens√£o",
        "Parte do pleito merece acolhimento, com modula√ß√£o de efeitos",
        "Alguns pedidos prosperam, outros carecem de fundamenta√ß√£o adequada",
        "Condena√ß√£o parcial procedente quanto aos danos materiais solicitados",
        "Direito reconhecido em sua integralidade, exceto quanto √† indeniza√ß√£o por lucros cessantes",

        # Assorted neutral content
        "Conforme estatu√≠do no artigo 535 do C√≥digo de Processo Civil",
        "A legisla√ß√£o processual estabelece requisitos formais espec√≠ficos",
        "Compet√™ncia origin√°ria da Corte Superior observados os crit√©rios legais",
        "Procedimento ordin√°rio com todas as fases processuais cumpridas adequadamente",
        "Apela√ß√£o em conformidade com os prazos legalmente estabelecidos pelo c√≥digo processual",
    ]

    outcomes = ['Deferida', 'Indeferida', 'Parcialmente_Deferida']

    # Generate the synthetic samples
    texts = []
    labels = []

    np.random.seed(42)
    for _ in range(DATASET_SIZE):
        # Join 2-4 randomly chosen templates (with repetition) into one text
        num_templates = np.random.randint(2, 5)
        selected = np.random.choice(corpus_templates, num_templates, replace=True)
        text = ". ".join(selected) + "."

        # NOTE(review): the label is sampled independently of the templates
        # chosen above, so the synthetic text carries no information about
        # its label; a classifier trained on this fallback can only learn
        # the class prior (40/35/25).
        label = np.random.choice(outcomes, p=[0.40, 0.35, 0.25])

        texts.append(text)
        labels.append(label)

    df = pd.DataFrame({
        'texto_peticao': texts,
        'desfecho': labels,
        'data': pd.date_range('2015-01-01', periods=DATASET_SIZE, freq='h'),
        'valor': np.random.uniform(1000, 500000, DATASET_SIZE)
    })

    print(f"‚úì Dataset sint√©tico gerado: {df.shape}")

# Exploratory analysis: print a sample of rows and the outcome distribution.
# Relies on a 'desfecho' column; the synthetic fallback creates it.
# NOTE(review): presumably the SQLite table has the same column - confirm.
print("\n" + "="*80)
print("AN√ÅLISE EXPLORAT√ìRIA DO DATASET")
print("="*80)
print(f"\nPrimeiras amostras:")
print(df.head(10))

print(f"\nDistribui√ß√£o de desfechos:")
distribution = df['desfecho'].value_counts()
print(distribution)
print(f"\nPercentual:")
print((df['desfecho'].value_counts(normalize=True) * 100).round(2))

# Bar chart of the absolute outcome counts
fig, ax = plt.subplots(figsize=(10, 5))
distribution.plot(kind='bar', edgecolor='black', alpha=0.7, ax=ax)
ax.set_title('Distribui√ß√£o de Desfechos - Dataset Grande Escala', fontsize=14, fontweight='bold')
ax.set_xlabel('Desfecho')
ax.set_ylabel('Frequ√™ncia')
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

## 3. Pré-processamento de Texto

In [None]:
# Lazily-built Portuguese resources shared by every preprocess_text call.
_PT_RESOURCES = {}


def _portuguese_resources():
    """Return the cached (stop-word set, stemmer) pair, building it on first use."""
    if not _PT_RESOURCES:
        _PT_RESOURCES['stop_words'] = frozenset(stopwords.words('portuguese'))
        _PT_RESOURCES['stemmer'] = SnowballStemmer('portuguese')
    return _PT_RESOURCES['stop_words'], _PT_RESOURCES['stemmer']


def preprocess_text(text):
    """Normalize a raw petition text into a space-separated string of stems.

    Steps: lowercase, strip URLs, e-mail-like tokens, digits and punctuation,
    tokenize with NLTK (Portuguese), drop Portuguese stop words and tokens of
    two characters or fewer, then stem the remaining tokens.

    Args:
        text: Raw text; any value is converted with ``str``. Missing values
            (as detected by ``pd.isna``) yield an empty string.

    Returns:
        The processed tokens joined by single spaces ("" for missing input).
    """
    if pd.isna(text):
        return ""

    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', ' ', text)

    tokens = word_tokenize(text, language='portuguese')

    # The stop-word list and the stemmer are identical for every document, so
    # they are built once and reused instead of being re-created per call
    # (this function is applied to every row of the dataset).
    stop_words, stemmer = _portuguese_resources()

    return ' '.join(
        stemmer.stem(word)
        for word in tokens
        if word not in stop_words and len(word) > 2
    )

# Run preprocess_text over every petition and time it.
print("Pr√©-processando textos...")
start_time = time.time()

df['texto_processado'] = df['texto_peticao'].apply(preprocess_text)

elapsed = time.time() - start_time
print(f"‚úì Pr√©-processamento conclu√≠do em {elapsed:.2f}s")
print(f"  - Taxa: {len(df)/elapsed:.0f} textos/s")

# Show the first 80 characters of the first text before and after processing
print(f"\nExemplo antes: {df['texto_peticao'].iloc[0][:80]}...")
print(f"Exemplo depois: {df['texto_processado'].iloc[0][:80]}...")

# Length statistics, measured in whitespace-separated tokens per processed text
text_lengths = df['texto_processado'].str.split().str.len()
print(f"\nEstat√≠sticas de comprimento:")
print(f"  - M√©dio: {text_lengths.mean():.0f} palavras")
print(f"  - Min: {text_lengths.min()}, Max: {text_lengths.max()}")
print(f"  - Mediana: {text_lengths.median():.0f}, Std: {text_lengths.std():.0f}")

# Histogram of the processed text lengths
fig, ax = plt.subplots(figsize=(12, 5))
ax.hist(text_lengths, bins=50, edgecolor='black', alpha=0.7)
ax.set_xlabel('N√∫mero de Palavras')
ax.set_ylabel('Frequ√™ncia')
ax.set_title('Distribui√ß√£o de Comprimento dos Textos Pr√©-processados')
ax.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Tokenization settings
VOCAB_SIZE = 10000
MAX_SEQUENCE_LENGTH = 300

print("\n" + "="*80)
print("TOKENIZA√á√ÉO E PREPARA√á√ÉO DE SEQU√äNCIAS")
print("="*80)

# Count every token of the processed corpus
print("\nConstruindo vocabul√°rio...")
word_counts = Counter(
    token
    for document in df['texto_processado']
    for token in document.split()
)

# Keep the VOCAB_SIZE - 1 most frequent tokens, numbered from 1 by rank.
# Index 0 is given to '<UNK>'; text_to_sequence also pads with 0, so unknown
# words and padding share the same index.
vocabulary = {}
for rank, (token, _frequency) in enumerate(word_counts.most_common(VOCAB_SIZE - 1), start=1):
    vocabulary[token] = rank
vocabulary['<UNK>'] = 0

print(f"‚úì Vocabul√°rio constru√≠do: {len(vocabulary)} palavras")

# Converter textos para sequ√™ncias de √≠ndices
def text_to_sequence(text, vocab, max_len):
    """Map a whitespace-tokenized text to a list of exactly ``max_len`` ids.

    Tokens missing from ``vocab`` map to 0. Longer texts are cut to the first
    ``max_len`` ids; shorter ones are right-padded with 0.
    """
    token_ids = [vocab.get(token, 0) for token in text.split()][:max_len]
    # A non-positive repeat count yields no padding when already at max_len.
    return token_ids + [0] * (max_len - len(token_ids))

# Convert every processed text into a fixed-length row of vocabulary ids
print("\nConvertendo textos para sequ√™ncias...")
X = np.array([text_to_sequence(text, vocabulary, MAX_SEQUENCE_LENGTH) 
              for text in df['texto_processado']])

print(f"‚úì Sequ√™ncias criadas: {X.shape}")

# Encode the outcome strings as integer class ids
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['desfecho'])

print(f"‚úì Labels codificados: {label_encoder.classes_}")
print(f"  - Mapeamento: {dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))}")

# Split the data: 70% train, 15% validation, 15% test (stratified by label).
# First hold out 15% of everything as the test set...
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y
)

# ...then take 0.15/0.85 of the remaining 85% so that validation is also
# 15% of the full dataset.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.15/(0.85), random_state=42, stratify=y_temp
)

print(f"\nDivis√£o do dataset:")
print(f"  - Treino: {X_train.shape[0]:,} amostras ({X_train.shape[0]/len(X)*100:.1f}%)")
print(f"  - Valida√ß√£o: {X_val.shape[0]:,} amostras ({X_val.shape[0]/len(X)*100:.1f}%)")
print(f"  - Teste: {X_test.shape[0]:,} amostras ({X_test.shape[0]/len(X)*100:.1f}%)")

## 4. Dataset e DataLoader em PyTorch

In [None]:
class PeticionDataset(Dataset):
    """Map-style dataset pairing token-id sequences with class ids.

    Both inputs are converted to ``torch.LongTensor`` once, at construction,
    and stored as ``self.X`` and ``self.y``.
    """

    def __init__(self, X, y):
        features = torch.LongTensor(X)
        targets = torch.LongTensor(y)
        self.X, self.y = features, targets

    def __len__(self):
        # Number of samples = length of the first dimension of the features
        n_samples = len(self.X)
        return n_samples

    def __getitem__(self, idx):
        sample = self.X[idx]
        target = self.y[idx]
        return sample, target

# Wrap each split in a Dataset
train_dataset = PeticionDataset(X_train, y_train)
val_dataset = PeticionDataset(X_val, y_val)
test_dataset = PeticionDataset(X_test, y_test)

# Loader hyperparameters
BATCH_SIZE = 128
NUM_WORKERS = 4

# Options common to the three loaders; memory is pinned only when CUDA is
# available.
_loader_kwargs = dict(
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=torch.cuda.is_available(),
)

# Only the training loader shuffles its samples
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **_loader_kwargs)

print("="*80)
print("DATALOADERS CRIADOS")
print("="*80)
print(f"‚úì Train DataLoader: {len(train_loader)} batches")
print(f"‚úì Val DataLoader: {len(val_loader)} batches")
print(f"‚úì Test DataLoader: {len(test_loader)} batches")

# Pull a single training batch as a shape sanity check
X_sample, y_sample = next(iter(train_loader))
print(f"\nSample batch shape:")
print(f"  - X: {X_sample.shape}")
print(f"  - y: {y_sample.shape}")
print(f"‚úì DataLoaders prontos para treinamento!")

## 5. Arquitetura LSTM Bidirecional em PyTorch

### Características:
- **Embedding**: Converte índices em vetores densos (150D)
- **LSTM Bidirecional**: 2 camadas, 256 unidades por direção
- **Attention Mechanism**: Para focar em partes relevantes do texto
- **Regularização**: Dropout (0.3-0.4), LayerNorm
- **Camadas Densas**: 256 → 128 → num_classes

In [None]:
class Attention(nn.Module):
    """Attention pooling over the time steps of a bidirectional LSTM output.

    Args:
        hidden_dim: Hidden size of one LSTM direction; the input features are
            expected to have size ``hidden_dim * 2``.
    """
    def __init__(self, hidden_dim):
        super().__init__()
        self.attention = nn.Linear(hidden_dim * 2, 1)  # *2: bidirectional

    def forward(self, lstm_output, mask=None):
        """Pool ``lstm_output`` into one vector per sequence.

        Args:
            lstm_output: Tensor of shape (batch, seq_len, hidden_dim * 2).
            mask: Optional tensor of shape (batch, seq_len), truthy at
                positions that may receive attention. ``None`` (the default)
                attends over every position, as before.

        Returns:
            Tuple ``(attended_output, attention_weights)`` with shapes
            (batch, hidden_dim * 2) and (batch, seq_len, 1).
        """
        scores = torch.tanh(self.attention(lstm_output))  # (batch, seq_len, 1)

        if mask is not None:
            # Push masked positions to the most negative finite value so they
            # get zero weight after softmax. A finite value (not -inf) keeps
            # fully-masked rows well defined: they fall back to uniform
            # weights instead of producing NaN.
            blocked = ~mask.to(torch.bool).unsqueeze(-1)
            scores = scores.masked_fill(blocked, torch.finfo(scores.dtype).min)

        attention_weights = F.softmax(scores, dim=1)  # (batch, seq_len, 1)

        # Weighted sum over the time dimension
        attended_output = torch.sum(attention_weights * lstm_output, dim=1)  # (batch, hidden_dim * 2)
        return attended_output, attention_weights

class BidirectionalLSTM(nn.Module):
    """Bidirectional LSTM with attention pooling for petition classification.

    Pipeline: embedding -> stacked bidirectional LSTM -> attention pooling ->
    LayerNorm -> two (Dropout, Linear, LayerNorm, ReLU) stages -> Dropout ->
    output Linear producing unnormalized class scores (logits).

    Args:
        vocab_size: Number of rows of the embedding table.
        num_classes: Number of output classes.
        embedding_dim: Size of each token embedding.
        hidden_dim: Hidden size of one LSTM direction.
        num_layers: Number of stacked LSTM layers.
    """
    def __init__(self, vocab_size, num_classes, embedding_dim=150, hidden_dim=256, num_layers=2):
        super().__init__()

        # Embedding layer; index 0 is the padding index
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # Bidirectional LSTM (inter-layer dropout only applies with >1 layer)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            bidirectional=True,
            batch_first=True,
            dropout=0.3 if num_layers > 1 else 0
        )

        # Attention pooling
        self.attention = Attention(hidden_dim)

        # Layer normalization of the pooled vector
        self.layer_norm = nn.LayerNorm(hidden_dim * 2)

        # Dense head
        self.dropout1 = nn.Dropout(0.4)
        self.fc1 = nn.Linear(hidden_dim * 2, hidden_dim)
        self.layer_norm1 = nn.LayerNorm(hidden_dim)

        self.dropout2 = nn.Dropout(0.4)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.layer_norm2 = nn.LayerNorm(hidden_dim // 2)

        self.dropout3 = nn.Dropout(0.3)
        self.fc_out = nn.Linear(hidden_dim // 2, num_classes)

    def forward(self, x):
        """Return class logits of shape (batch, num_classes).

        Args:
            x: Integer tensor of token ids with shape (batch, seq_len).
        """
        # True at real tokens, False at padding. Without this mask the
        # attention softmax also spread weight over the padded tail of
        # every sequence.
        token_mask = x != self.embedding.padding_idx

        # Embedding: (batch, seq_len) -> (batch, seq_len, embed_dim)
        embedded = self.embedding(x)

        # LSTM: lstm_out is (batch, seq_len, hidden_dim * 2)
        lstm_out, _ = self.lstm(embedded)

        # Attention pooling restricted to non-padding positions
        attended_output, _ = self.attention(lstm_out, token_mask)  # (batch, hidden_dim * 2)

        # Layer normalization
        attended_output = self.layer_norm(attended_output)

        # Dense head
        x = self.dropout1(attended_output)
        x = self.fc1(x)
        x = self.layer_norm1(x)
        x = F.relu(x)

        x = self.dropout2(x)
        x = self.fc2(x)
        x = self.layer_norm2(x)
        x = F.relu(x)

        x = self.dropout3(x)
        x = self.fc_out(x)

        return x

# Instantiate the model and move it to the selected device
print("\n" + "="*80)
print("CONSTRUINDO MODELO LSTM BIDIRECIONAL")
print("="*80)

num_classes = len(label_encoder.classes_)
# The embedding table gets VOCAB_SIZE + 1 rows; the vocabulary built earlier
# assigns ids 0 .. VOCAB_SIZE - 1, so every id fits.
model = BidirectionalLSTM(
    vocab_size=VOCAB_SIZE + 1,
    num_classes=num_classes,
    embedding_dim=150,
    hidden_dim=256,
    num_layers=2
).to(device)

# Model summary
print(f"\n‚úì Modelo criado com sucesso!")
print(f"\nArquitetura:")
print(model)

# Count total and trainable parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"\nPar√¢metros do modelo:")
print(f"  - Total: {total_params:,}")
print(f"  - Trein√°veis: {trainable_params:,}")

## 6. Compilar e Configurar Treinamento

In [None]:
print("\n" + "="*80)
print("CONFIGURANDO TREINAMENTO")
print("="*80)

# Hyperparameters
LEARNING_RATE = 0.001
EPOCHS = 30
PATIENCE = 5
WEIGHT_DECAY = 1e-5
GRADIENT_CLIP = 1.0

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

# Scheduler: halve the learning rate after 3 epochs without a lower
# validation loss, never going below 1e-7.
# The `verbose` argument is deprecated in PyTorch, so it is no longer passed;
# the current learning rate can be read from optimizer.param_groups[0]['lr'].
scheduler = ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=3,
    min_lr=1e-7
)

# Early-stopping state
best_val_loss = float('inf')
patience_counter = 0

# Per-epoch training history
history = {
    'train_loss': [],
    'val_loss': [],
    'train_acc': [],
    'val_acc': []
}

print(f"\n‚úì Configura√ß√µes:")
print(f"  - Learning Rate: {LEARNING_RATE}")
print(f"  - Epochs: {EPOCHS}")
print(f"  - Batch Size: {BATCH_SIZE}")
print(f"  - Optimizer: AdamW (weight_decay={WEIGHT_DECAY})")
print(f"  - Loss: CrossEntropyLoss")
print(f"  - Gradient Clipping: {GRADIENT_CLIP}")
print(f"  - Early Stopping Patience: {PATIENCE} √©pocas")
print(f"  - Device: {device}")

## 7. Funções de Treinamento e Avaliação

In [None]:
def train_epoch(model, train_loader, criterion, optimizer, device, gradient_clip):
    """Train ``model`` for one pass over ``train_loader``.

    Uses the module-level ``tqdm`` name for the progress bar, so ``tqdm``
    must have been imported before this function is called.

    Args:
        model: Network to train; switched to training mode.
        train_loader: Iterable of ``(inputs, labels)`` batches; must support
            ``len()``.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer over the model parameters.
        device: Device the batches are moved to.
        gradient_clip: Maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Tuple ``(avg_loss, accuracy)``: the mean of the per-batch losses and
        the fraction of correctly classified samples.
    """
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    pbar = tqdm(train_loader, desc="Treinamento")
    for inputs, labels in pbar:
        inputs, labels = inputs.to(device), labels.to(device)

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()

        # Clip gradients before the update step
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=gradient_clip)

        optimizer.step()

        # Running statistics
        batch_loss = loss.item()
        total_loss += batch_loss
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        # The format spec must live inside an f-string; the previous
        # `{'loss': loss.item():.4f}` was a SyntaxError.
        pbar.set_postfix({'loss': f"{batch_loss:.4f}"}, refresh=True)

    avg_loss = total_loss / len(train_loader)
    accuracy = correct / total

    return avg_loss, accuracy

def validate(model, val_loader, criterion, device):
    """Evaluate ``model`` on ``val_loader`` without computing gradients.

    The model is put in evaluation mode. Returns ``(avg_loss, accuracy)``:
    the mean of the per-batch losses and the fraction of samples whose
    highest-scoring class equals the label.
    """
    model.eval()
    loss_sum = 0
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for batch_inputs, batch_labels in val_loader:
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)

            logits = model(batch_inputs)
            loss_sum += criterion(logits, batch_labels).item()

            hits = logits.argmax(dim=1) == batch_labels
            n_correct += hits.sum().item()
            n_seen += batch_labels.size(0)

    return loss_sum / len(val_loader), n_correct / n_seen

# Import tqdm for the progress bar. train_epoch (defined above) looks the
# name up only when it is called, so importing it here, after the
# definition, still works.
from tqdm.notebook import tqdm

print("‚úì Fun√ß√µes de treinamento definidas")

## 8. Treinar Modelo (pode levar 45-90 minutos em CPU, 10-20 minutos em GPU)

In [None]:
print("\n" + "="*80)
print("TREINANDO MODELO LSTM")
print("="*80)

start_training = time.time()

# One iteration per epoch: train, validate, record, adapt LR, early-stop.
for epoch in range(EPOCHS):
    print(f"\n{'='*80}")
    print(f"√âpoca {epoch + 1}/{EPOCHS}")
    print(f"{'='*80}")
    
    # Train for one epoch
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device, GRADIENT_CLIP)
    
    # Evaluate on the validation split
    val_loss, val_acc = validate(model, val_loader, criterion, device)
    
    # Record the history
    history['train_loss'].append(train_loss)
    history['train_acc'].append(train_acc)
    history['val_loss'].append(val_loss)
    history['val_acc'].append(val_acc)
    
    # Print the epoch metrics
    print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
    print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")
    
    # Let the scheduler react to the validation loss
    scheduler.step(val_loss)
    
    # Early stopping: checkpoint on a strictly lower validation loss,
    # otherwise count the epoch against the patience budget.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        print(f"‚úì Melhor modelo salvo! Val Loss: {val_loss:.4f}")
        torch.save(model.state_dict(), 'best_lstm_model.pth')
    else:
        patience_counter += 1
        print(f"Sem melhora. Paci√™ncia: {patience_counter}/{PATIENCE}")
        
        if patience_counter >= PATIENCE:
            print(f"\n‚ö† Early stopping acionado na √©poca {epoch + 1}")
            break

elapsed_training = time.time() - start_training
print(f"\n‚úì Treinamento conclu√≠do em {elapsed_training/60:.2f} minutos")

# Restore the weights saved at the lowest validation loss
model.load_state_dict(torch.load('best_lstm_model.pth'))
print("‚úì Melhor modelo carregado para avalia√ß√£o")

## 9. Avaliar no Conjunto de Teste

In [None]:
print("\n" + "="*80)
print("AVALIA√á√ÉO NO CONJUNTO DE TESTE")
print("="*80)

# Collect predictions, class probabilities and true labels over the test set.
# test_loader does not shuffle, so row i of these arrays matches X_test[i].
model.eval()
y_pred_list = []
y_prob_list = []
y_true_list = []

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        
        outputs = model(inputs)
        probs = F.softmax(outputs, dim=1)
        
        _, predicted = torch.max(outputs, 1)
        
        y_pred_list.extend(predicted.cpu().numpy())
        y_prob_list.extend(probs.cpu().numpy())
        y_true_list.extend(labels.numpy())

y_pred = np.array(y_pred_list)
y_prob = np.array(y_prob_list)
y_true = np.array(y_true_list)

# Aggregate metrics (precision/recall/F1 are support-weighted averages)
test_accuracy = accuracy_score(y_true, y_pred)
test_precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
test_recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
test_f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)

print(f"\nM√©tricas no Conjunto de Teste:")
print(f"  - Acur√°cia: {test_accuracy:.4f}")
print(f"  - Precis√£o (ponderada): {test_precision:.4f}")
print(f"  - Recall (ponderado): {test_recall:.4f}")
print(f"  - F1-Score (ponderado): {test_f1:.4f}")

# Per-class classification report
print(f"\n{'='*80}")
print("RELAT√ìRIO DE CLASSIFICA√á√ÉO DETALHADO")
print(f"{'='*80}")
print(classification_report(y_true, y_pred, target_names=label_encoder.classes_, digits=4))

# Confusion matrix
cm = confusion_matrix(y_true, y_pred)
print(f"\nMatriz de Confus√£o:")
print(cm)

## 10. Realizar Predições em Novos Dados

In [None]:
def predict_petition(text, model, vocabulary, label_encoder, device, max_len=MAX_SEQUENCE_LENGTH):
    """Classify a single raw petition text.

    The text goes through ``preprocess_text`` and ``text_to_sequence`` and is
    then scored by ``model`` (set to evaluation mode) as a batch of one.

    Returns:
        Dict with keys ``'texto'`` (the input, cut to 100 characters plus
        ``'...'`` when longer), ``'desfecho_predito'`` (decoded label),
        ``'confianca'`` (probability of the predicted class) and
        ``'probabilidades'`` (class name -> probability).
    """
    token_ids = text_to_sequence(preprocess_text(text), vocabulary, max_len)
    batch = torch.LongTensor([token_ids]).to(device)

    model.eval()
    with torch.no_grad():
        class_probs = F.softmax(model(batch), dim=1)
        best_idx = torch.argmax(class_probs, dim=1).item()
        best_prob = class_probs[0, best_idx].item()

    # Map the class id back to its original label string
    decoded_label = label_encoder.inverse_transform([best_idx])[0]

    if len(text) > 100:
        shown_text = text[:100] + '...'
    else:
        shown_text = text

    per_class = {}
    for position, class_name in enumerate(label_encoder.classes_):
        per_class[class_name] = float(class_probs[0, position].item())

    return {
        'texto': shown_text,
        'desfecho_predito': decoded_label,
        'confianca': best_prob,
        'probabilidades': per_class,
    }

print("\n" + "="*80)
print("PREDI√á√ïES EM NOVOS DADOS")
print("="*80)

# Show predictions for a few randomly chosen test samples
print("\nExemplos de predi√ß√µes do conjunto de teste:\n")

indices = np.random.choice(len(X_test), min(5, len(X_test)), replace=False)
for i, idx in enumerate(indices):
    # Recover the original text by finding a row of X equal to the test row.
    # Only the first match is used, so if several texts encode to the same
    # sequence the text shown may belong to a different sample.
    idx_original = np.where((X == X_test[idx]).all(axis=1))[0]
    if len(idx_original) > 0:
        texto_original = df['texto_peticao'].iloc[idx_original[0]]
    else:
        texto_original = "Texto n√£o encontrado"
    
    resultado_real = label_encoder.classes_[y_true[idx]]
    resultado_predito = label_encoder.classes_[y_pred[idx]]
    confianca = y_prob[idx, y_pred[idx]]
    
    print(f"Exemplo {i+1}:")
    print(f"  Texto: {texto_original[:80]}...")
    print(f"  Resultado real: {resultado_real}")
    print(f"  Resultado predito: {resultado_predito}")
    print(f"  Confian√ßa: {confianca:.4f}")
    print(f"  Status: {'‚úì Acerto' if resultado_real == resultado_predito else '‚úó Erro'}")
    print()

# Run predict_petition on hand-written example texts
print("\nPredi√ß√µes em novos textos de exemplo:\n")

novos_textos = [
    "Recurso extraordin√°rio fundamentado em viola√ß√£o de direito constitucional com ampla jurisprud√™ncia consolidada",
    "Peti√ß√£o inicial carente de elementos essenciais e faltando documenta√ß√£o b√°sica",
    "Apela√ß√£o questionando parte da senten√ßa anterior com argumentos moderados"
]

for texto in novos_textos:
    resultado = predict_petition(texto, model, vocabulary, label_encoder, device)
    print(f"Texto: {resultado['texto']}")
    print(f"Desfecho predito: {resultado['desfecho_predito']}")
    print(f"Confian√ßa: {resultado['confianca']:.4f}")
    print(f"Probabilidades por classe:")
    for classe, prob in resultado['probabilidades'].items():
        print(f"  - {classe}: {prob:.4f}")
    print()

## 11. Visualizações e Análises

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from sklearn.preprocessing import label_binarize

print("\n" + "="*80)
print("VISUALIZA√á√ïES E AN√ÅLISES")
print("="*80)

# 1. Training-history curves.
# Each entry: (history key suffix, panel title, y-axis label).
history_panels = [
    ('loss', 'Evolu√ß√£o da Loss', 'Loss'),
    ('acc', 'Evolu√ß√£o da Acur√°cia', 'Acur√°cia'),
    ('f1', 'Evolu√ß√£o do F1-Score', 'F1-Score'),
]

# Draw only the metrics that were actually recorded. The training loop stores
# loss and accuracy but no F1, so indexing history['train_f1'] unconditionally
# raised KeyError; the F1 panel now appears only if those keys exist.
available_panels = [
    panel for panel in history_panels
    if f'train_{panel[0]}' in history and f'val_{panel[0]}' in history
]

plt.figure(figsize=(15, 5))

for position, (metric, title, y_label) in enumerate(available_panels, start=1):
    plt.subplot(1, len(available_panels), position)
    plt.plot(history[f'train_{metric}'], label='Treino', marker='o')
    plt.plot(history[f'val_{metric}'], label='Valida√ß√£o', marker='o')
    plt.title(title)
    plt.xlabel('√âpoca')
    plt.ylabel(y_label)
    plt.legend()
    plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# 2. Confusion matrix heatmap (rows: true class, columns: predicted class)
cm = confusion_matrix(y_true, y_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_encoder.classes_,
            yticklabels=label_encoder.classes_)
plt.title('Matriz de Confus√£o - LSTM Bidirecional com Aten√ß√£o')
plt.xlabel('Predito')
plt.ylabel('Real')
plt.xticks(rotation=45)
plt.yticks(rotation=45)
plt.tight_layout()
plt.show()

# 3. Detailed classification report
print("\nRelat√≥rio de Classifica√ß√£o Detalhado:")
print("-" * 60)
print(classification_report(y_true, y_pred, target_names=label_encoder.classes_))

# 4. One-vs-rest ROC curves (only when there are more than two classes)
if len(label_encoder.classes_) > 2:
    # Binarize the labels: one indicator column per class
    y_true_bin = label_binarize(y_true, classes=range(len(label_encoder.classes_)))
    y_prob_bin = y_prob

    # Compute the ROC curve and its AUC for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    
    for i in range(len(label_encoder.classes_)):
        fpr[i], tpr[i], _ = roc_curve(y_true_bin[:, i], y_prob_bin[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Plot the ROC curves; zip stops at the shorter sequence, so at most
    # len(colors) classes are drawn
    plt.figure(figsize=(10, 8))
    colors = ['blue', 'red', 'green', 'orange', 'purple', 'brown', 'pink', 'gray', 'olive', 'cyan']
    
    for i, color in zip(range(len(label_encoder.classes_)), colors):
        plt.plot(fpr[i], tpr[i], color=color, lw=2,
                 label=f'{label_encoder.classes_[i]} (AUC = {roc_auc[i]:.2f})')

    # Diagonal reference line
    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Taxa de Falsos Positivos')
    plt.ylabel('Taxa de Verdadeiros Positivos')
    plt.title('Curvas ROC - Multiclasse')
    plt.legend(loc="lower right")
    plt.grid(True, alpha=0.3)
    plt.show()

    # Unweighted mean of the per-class AUCs
    auc_mean = np.mean(list(roc_auc.values()))
    print(f"\nAUC M√©dio: {auc_mean:.4f}")

# 5. Histogram of the predicted probability of each class over the test set,
# laid out on a grid with two rows
plt.figure(figsize=(12, 6))
for i, classe in enumerate(label_encoder.classes_):
    plt.subplot(2, (len(label_encoder.classes_)+1)//2, i+1)
    plt.hist(y_prob[:, i], bins=20, alpha=0.7, edgecolor='black')
    plt.title(f'Distribui√ß√£o de Probabilidades\n{classe}')
    plt.xlabel('Probabilidade')
    plt.ylabel('Frequ√™ncia')
    plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# 6. Error analysis
print("\n" + "="*80)
print("AN√ÅLISE DE ERROS")
print("="*80)

# Indices of the misclassified test samples
erros_idx = np.where(y_true != y_pred)[0]
print(f"\nTotal de erros: {len(erros_idx)} de {len(y_true)} ({len(erros_idx)/len(y_true)*100:.2f}%)")

if len(erros_idx) > 0:
    print("\nExemplos de erros de classifica√ß√£o:")
    print("-" * 60)
    
    # Show up to five misclassified examples
    for i in range(min(5, len(erros_idx))):
        idx = erros_idx[i]
        # Recover the original text through the first row of X that equals
        # the test row; with duplicated sequences this may be another sample.
        idx_original = np.where((X == X_test[idx]).all(axis=1))[0]
        if len(idx_original) > 0:
            texto_original = df['texto_peticao'].iloc[idx_original[0]]
        else:
            texto_original = "Texto n√£o encontrado"
        
        real = label_encoder.classes_[y_true[idx]]
        predito = label_encoder.classes_[y_pred[idx]]
        conf = y_prob[idx, y_pred[idx]]
        
        print(f"Erro {i+1}:")
        print(f"  Texto: {texto_original[:100]}...")
        print(f"  Real: {real} | Predito: {predito} | Confian√ßa: {conf:.4f}")
        print()

# 7. Per-class statistics
print("\nEstat√≠sticas por Classe:")
print("-" * 40)
for i, classe in enumerate(label_encoder.classes_):
    total_classe = np.sum(y_true == i)      # samples whose true class is i
    preditos_classe = np.sum(y_pred == i)   # samples predicted as class i
    acertos_classe = np.sum((y_true == i) & (y_pred == i))

    # Precision = hits / predicted-as-class; recall = hits / true-class.
    # The value previously printed under the precision label was
    # hits / true-class, i.e. recall; both are now reported correctly.
    precisao_classe = acertos_classe / preditos_classe if preditos_classe > 0 else 0
    recall_classe = acertos_classe / total_classe if total_classe > 0 else 0

    print(f"{classe}:")
    print(f"  Total de exemplos: {total_classe}")
    print(f"  Acertos: {acertos_classe}")
    print(f"  Precis√£o: {precisao_classe:.4f}")
    print(f"  Recall: {recall_classe:.4f}")
    print()

## 12. Salvamento do Modelo e Resumo Final

In [None]:
import os
import json
from datetime import datetime

print("\n" + "="*80)
print("SALVAMENTO DO MODELO E RESUMO FINAL")
print("="*80)

# This cell reports several values that no earlier cell defines (they raised
# NameError). Derive them from objects that do exist so that the saved
# metadata describes the model that was actually trained.
EMBEDDING_DIM = model.embedding.embedding_dim
HIDDEN_DIM = model.lstm.hidden_size
NUM_LAYERS = model.lstm.num_layers
# The network uses different dropout rates per stage, so record all of them.
DROPOUT = {
    'lstm': model.lstm.dropout,
    'dense': [model.dropout1.p, model.dropout2.p, model.dropout3.p],
}
MAX_EPOCHS = EPOCHS
training_time = elapsed_training / 60  # minutes
# The checkpoint is written on a strictly lower validation loss, so the best
# epoch is the first epoch reaching the minimum (1-based).
best_epoch = int(np.argmin(history['val_loss'])) + 1
test_f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0)
test_f1_weighted = f1_score(y_true, y_pred, average='weighted', zero_division=0)

# Create the output directory
model_dir = 'modelos_lstm'
os.makedirs(model_dir, exist_ok=True)

# Timestamp used as a unique identifier
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
model_name = f"lstm_bidirectional_attention_{timestamp}"

# Save the PyTorch checkpoint
model_path = os.path.join(model_dir, f"{model_name}.pth")
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'scheduler_state_dict': scheduler.state_dict(),
    'epoch': len(history['train_loss']),
    'history': history,
    'config': {
        # Store the real embedding-table size (the model was built with
        # VOCAB_SIZE + 1 rows) so the state dict can be loaded into a model
        # rebuilt from this config.
        'vocab_size': model.embedding.num_embeddings,
        'embedding_dim': EMBEDDING_DIM,
        'hidden_dim': HIDDEN_DIM,
        'num_layers': NUM_LAYERS,
        'num_classes': len(label_encoder.classes_),
        'dropout': DROPOUT,
        'max_sequence_length': MAX_SEQUENCE_LENGTH,
        'batch_size': BATCH_SIZE,
        'learning_rate': LEARNING_RATE
    }
}, model_path)

print(f"Modelo salvo em: {model_path}")

# Save the vocabulary and the label encoder
vocab_path = os.path.join(model_dir, f"{model_name}_vocab.json")
with open(vocab_path, 'w', encoding='utf-8') as f:
    json.dump({
        'word_to_idx': vocabulary,
        'idx_to_word': {v: k for k, v in vocabulary.items()},
        'vocab_size': len(vocabulary)
    }, f, ensure_ascii=False, indent=2)

label_encoder_path = os.path.join(model_dir, f"{model_name}_labels.pkl")
import pickle
# NOTE: pickle files must only be loaded back from trusted sources.
with open(label_encoder_path, 'wb') as f:
    pickle.dump(label_encoder, f)

print(f"Vocabul√°rio salvo em: {vocab_path}")
print(f"Label encoder salvo em: {label_encoder_path}")

# Save the final metrics
metrics_path = os.path.join(model_dir, f"{model_name}_metrics.json")
final_metrics = {
    'timestamp': timestamp,
    'model_type': 'LSTM Bidirecional com Aten√ß√£o',
    'dataset_size': len(df),
    'train_size': len(X_train),
    'val_size': len(X_val),
    'test_size': len(X_test),
    'vocabulary_size': len(vocabulary),
    'max_sequence_length': MAX_SEQUENCE_LENGTH,
    'best_epoch': best_epoch,
    'training_time_minutes': training_time,
    'final_train_loss': history['train_loss'][-1],
    'final_val_loss': history['val_loss'][-1],
    'final_train_acc': history['train_acc'][-1],
    'final_val_acc': history['val_acc'][-1],
    # Per-epoch F1 is not recorded by the training loop; store null unless
    # the history actually contains it (direct indexing raised KeyError).
    'final_train_f1': history['train_f1'][-1] if history.get('train_f1') else None,
    'final_val_f1': history['val_f1'][-1] if history.get('val_f1') else None,
    'test_accuracy': test_accuracy,
    'test_f1_macro': test_f1_macro,
    'test_f1_weighted': test_f1_weighted,
    'classes': list(label_encoder.classes_),
    'architecture': {
        'embedding_dim': EMBEDDING_DIM,
        'hidden_dim': HIDDEN_DIM,
        'num_layers': NUM_LAYERS,
        'bidirectional': True,
        'attention': True,
        'dropout': DROPOUT
    },
    'training_config': {
        'batch_size': BATCH_SIZE,
        'learning_rate': LEARNING_RATE,
        'weight_decay': WEIGHT_DECAY,
        'patience': PATIENCE,
        'max_epochs': MAX_EPOCHS,
        'gradient_clip': GRADIENT_CLIP
    }
}

with open(metrics_path, 'w', encoding='utf-8') as f:
    json.dump(final_metrics, f, ensure_ascii=False, indent=2)

print(f"M√©tricas salvas em: {metrics_path}")

# Final summary
print("\n" + "="*80)
print("RESUMO FINAL - CLASSIFICADOR LSTM PARA PETI√á√ïES")
print("="*80)

print(f"""
üìä DESEMPENHO GERAL:
   ‚Ä¢ Acur√°cia no teste: {test_accuracy:.4f}
   ‚Ä¢ F1-Score Macro: {test_f1_macro:.4f}
   ‚Ä¢ F1-Score Ponderado: {test_f1_weighted:.4f}
   ‚Ä¢ Melhor √©poca: {best_epoch}
   ‚Ä¢ Tempo de treinamento: {training_time:.1f} minutos

üèóÔ∏è ARQUITETURA:
   ‚Ä¢ Modelo: LSTM Bidirecional com Mecanismo de Aten√ß√£o
   ‚Ä¢ Camadas LSTM: {NUM_LAYERS}
   ‚Ä¢ Unidades ocultas: {HIDDEN_DIM} (bidirecional = {HIDDEN_DIM*2} total)
   ‚Ä¢ Embedding: {EMBEDDING_DIM} dimens√µes
   ‚Ä¢ Dropout: {DROPOUT}
   ‚Ä¢ Classes: {len(label_encoder.classes_)}

üìà DADOS:
   ‚Ä¢ Tamanho do dataset: {len(df):,} amostras
   ‚Ä¢ Vocabul√°rio: {len(vocabulary):,} palavras
   ‚Ä¢ Comprimento m√°ximo: {MAX_SEQUENCE_LENGTH} tokens
   ‚Ä¢ Divis√£o: {len(X_train)} treino, {len(X_val)} valida√ß√£o, {len(X_test)} teste

üîß CONFIGURA√á√ÉO DE TREINAMENTO:
   ‚Ä¢ Otimizador: AdamW (LR={LEARNING_RATE}, Weight Decay={WEIGHT_DECAY})
   ‚Ä¢ Batch Size: {BATCH_SIZE}
   ‚Ä¢ √âpocas m√°ximas: {MAX_EPOCHS}
   ‚Ä¢ Early Stopping: Paci√™ncia de {PATIENCE} √©pocas
   ‚Ä¢ Gradient Clipping: {GRADIENT_CLIP}

üíæ ARQUIVOS SALVOS:
   ‚Ä¢ Modelo: {model_path}
   ‚Ä¢ Vocabul√°rio: {vocab_path}
   ‚Ä¢ Label Encoder: {label_encoder_path}
   ‚Ä¢ M√©tricas: {metrics_path}
""")

print("\n‚úÖ IMPLEMENTA√á√ÉO CONCLU√çDA COM SUCESSO!")
print("O modelo LSTM bidirecional com aten√ß√£o est√° pronto para classifica√ß√£o de peti√ß√µes.")
print("Use a fun√ß√£o predict_petition() para fazer predi√ß√µes em novos textos.")