# 📊 Dataset et Preprocessing

Préparer Tiny Shakespeare pour l'entraînement

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy's global random generator so the random draws made later in
# this notebook are reproducible from run to run.
np.random.seed(42)

## Chargement du Dataset

In [None]:
# Load the text: read the whole corpus file into memory as a single string.
with open('../data/tiny_shakespeare.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Report the corpus size in characters and preview its first 200 characters.
print(f"Taille du dataset: {len(text):,} caract√®res")
print(f"\nPremiers 200 caract√®res:\n{text[:200]}")

## Tokenization

In [None]:
# Build a simple character-level vocabulary: one entry per distinct
# character of the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)

print(f"Vocabulaire: {vocab_size} caract√®res uniques")
print(f"Vocabulaire: {''.join(chars)}")

# Lookup tables in both directions (character <-> integer id); a character's
# id is its position in `chars`.
char_to_idx = dict(zip(chars, range(vocab_size)))
idx_to_char = dict(enumerate(chars))

def encode(text):
    """Return the list of vocabulary ids for the characters of ``text``.

    Raises ``KeyError`` if a character is missing from ``char_to_idx``.
    """
    return list(map(char_to_idx.__getitem__, text))

def decode(ids):
    """Return the string spelled by the vocabulary ids in ``ids``.

    Raises ``KeyError`` if an id is missing from ``idx_to_char``.
    """
    lookup = idx_to_char.__getitem__
    return ''.join(map(lookup, ids))

# Test: round-trip a sample string through encode() and decode(); the decoded
# text should be identical to the original sample.
sample = "Hello World"
encoded = encode(sample)
decoded = decode(encoded)

# Show the sample, its token ids, and the reconstructed string side by side.
print(f"\nOriginal: {sample}")
print(f"Encoded: {encoded}")
print(f"Decoded: {decoded}")

## Création des Séquences d'Entraînement

In [None]:
# Encode the entire corpus as an array of 32-bit integer token ids.
data = np.asarray(encode(text), dtype=np.int32)
print(f"Dataset encod√©: {len(data):,} tokens")

# Train/validation split: the first 90% of the tokens are used for training,
# the remaining tail is held out for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

print(f"Train: {len(train_data):,} tokens")
print(f"Val: {len(val_data):,} tokens")

## Générateur de Batches

In [None]:
def get_batch(data, block_size, batch_size):
    """
    Sample a random batch of next-token training sequences.

    Each row of ``x`` is a window of ``block_size`` consecutive tokens taken
    from ``data`` at a random offset. The matching row of ``y`` is the same
    window shifted one position to the right, so ``y[b, t]`` is the token
    that follows ``x[b, t]`` in ``data``.

    Start offsets are drawn from NumPy's global random generator, so results
    are reproducible after ``np.random.seed``.

    Args:
        data: Full dataset to sample from (a 1-D array of token ids in this
            notebook).
        block_size: Length of each sequence (context size).
        batch_size: Number of sequences per batch.

    Returns:
        x: Input sequences, shape (batch_size, block_size).
        y: Target sequences, shape (batch_size, block_size).

    Raises:
        ValueError: If ``data`` does not contain more than ``block_size``
            tokens, so no window plus its shifted target fits.
    """
    n_tokens = len(data)
    if n_tokens <= block_size:
        # Fail early with an actionable message instead of the opaque
        # "low >= high" error np.random.randint would raise below.
        raise ValueError(
            "data must contain more than block_size tokens to build targets "
            f"(got len(data)={n_tokens}, block_size={block_size})"
        )

    # Random starting indices. The upper bound is exclusive, so even the
    # largest start leaves room for the one-token lookahead needed by y.
    ix = np.random.randint(0, n_tokens - block_size, batch_size)

    # Inputs are the windows themselves; targets are the windows shifted by one.
    x = np.stack([data[i:i+block_size] for i in ix])
    y = np.stack([data[i+1:i+block_size+1] for i in ix])

    return x, y

# Smoke test: draw one small batch from the training split.
block_size, batch_size = 8, 4
xb, yb = get_batch(train_data, block_size, batch_size)

print(f"Input batch shape: {xb.shape}")
print(f"Target batch shape: {yb.shape}")

# Walk through the first sequence of the batch: every prefix of the input row
# is a context, and its target is the token that comes right after it.
print(f"\nExemple de s√©quence:")
first_x, first_y = xb[0], yb[0]
for i, target in enumerate(first_y):
    context = first_x[:i+1]
    print(f"Context: {decode(context.tolist())} -> Target: {decode([target])}")

## Visualisation des Statistiques

In [None]:
# Count how many times each token id occurs in the full dataset.
token_counts = np.bincount(data, minlength=vocab_size)

# Bar chart with one bar per token id, labelled with the matching character.
plt.figure(figsize=(12, 4))
plt.bar(range(vocab_size), token_counts)
plt.title('Distribution des Tokens dans le Dataset')
plt.xlabel('Token ID')
plt.ylabel('Fr√©quence')
plt.xticks(range(vocab_size), chars, rotation=90)
plt.tight_layout()
plt.show()

# The ten most frequent tokens, most frequent first, with their share of the
# dataset as a percentage.
top_indices = np.argsort(token_counts)[::-1][:10]
print("\nTop 10 caract√®res les plus fr√©quents:")
for idx, count in zip(top_indices, token_counts[top_indices]):
    char = chars[idx]
    print(f"  '{char}': {count:,} ({100*count/len(data):.2f}%)")