In [10]:
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader

from collections import Counter
import pandas as pd

In [11]:
def load_file(path):
    """Charge un fichier texte au format 'phrase;label'"""
    texts, emotions = [], []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                text, label = line.split(";")
                texts.append(text)
                emotions.append(label)
    return texts, emotions


In [12]:
def tokenizer(phrase):
    """Découpe une phrase en tokens (espace par défaut)"""
    return phrase.split()

def tokenizer_list(phrases):
    """Tokenise une liste de phrases"""
    tokens = []
    for phrase in phrases:
        tokens.extend(tokenizer(phrase))
    return tokens

In [13]:
class MyVocab(dict):
    """Vocabulaire personnalisé qui gère les OOV avec <unk>"""
    def __init__(self, mapping, unk_token="<unk>"):
        super().__init__(mapping)
        self.unk_token = unk_token
        self.unk_idx = mapping[unk_token]

    def __getitem__(self, token):
        return super().get(token, self.unk_idx)

    def copy(self):
        return MyVocab(dict(self), unk_token=self.unk_token)

def build_vocab(tokens, min_freq=1, specials=["<pad>", "<unk>"]):
    """Construit un vocabulaire à partir de tokens avec un seuil de fréquence"""
    counter = Counter(tokens)
    vocab = {tok: idx for idx, tok in enumerate(specials)}
    i = len(specials)
    for tok, freq in counter.items():
        if freq >= min_freq:
            vocab[tok] = i
            i += 1
    return MyVocab(vocab, unk_token="<unk>")

def rebuild_vocab(vocab_clean, unk_token="<unk>"):
    """Re-indexe le vocabulaire après nettoyage"""
    new_mapping = {word: idx for idx, word in enumerate(vocab_clean.keys())}
    return MyVocab(new_mapping, unk_token=unk_token)

In [14]:
def encode_phrases(phrases, vocab, pad_token="<pad>"):
    """Encode une liste de phrases en indices avec padding"""
    encoded = [[vocab[token] for token in tokenizer(p)] for p in phrases]
    max_len = max(len(seq) for seq in encoded)
    pad_idx = vocab[pad_token]
    tensor = torch.LongTensor([
        seq + [pad_idx] * (max_len - len(seq)) for seq in encoded
    ])
    return tensor

In [15]:
def analyze_frequencies(tensor_data, vocab):
    """Analyse statistique des fréquences des tokens"""
    from collections import Counter
    import pandas as pd

    # Aplatir tous les tokens dans une liste
    all_tokens = tensor_data.view(-1).tolist()
    counter = Counter(all_tokens)

    # Retirer le token <pad> (index 0)
    if vocab["<pad>"] in counter:
        del counter[vocab["<pad>"]]

    # Taille du vocab
    print("Taille vocabulaire:", len(vocab))

    # Moyenne d’occurrences par mot
    total = sum(counter.values())
    ratio = total / len(vocab)
    print("Moyenne occurrences/word:", ratio)

    # Stats détaillées (20e, 80e percentile, etc.)
    df = pd.Series(counter.values())
    print(df.describe(percentiles=[.2, .8]))

    return counter, df

In [16]:
def build_dataloader(tensor_data, batch_size=20, shuffle=True):
    return DataLoader(tensor_data, batch_size=batch_size, shuffle=shuffle)

def batch_to_onehot(batch, vocab_size):
    """Convertit un batch en one-hot"""
    return F.one_hot(batch, num_classes=vocab_size)

In [22]:
# 1. Charger les données
text, emotion = load_file("dataset/train.txt")

# 2. Construire vocabulaire
tokens = tokenizer_list(text)
vocab = build_vocab(tokens)


# 3. Encoder les phrases
tensor_phrases = encode_phrases(text, vocab)

# 4. Analyse statistique
counter, df_stats = analyze_frequencies(tensor_phrases, vocab)

# ==> d’après les résultats tu choisis min_freq=6
vocab_clean = build_vocab(tokens, min_freq=6)
tensor_phrases_clean = encode_phrases(text, vocab_clean)

loader = build_dataloader(tensor_phrases_clean, batch_size=20)

for batch in loader:
    print("Batch shape:", batch.shape)
    onehot = batch_to_onehot(batch, vocab_size=len(vocab_clean))
    print("One-hot shape:", onehot.shape)


Taille vocabulaire: 15214
Moyenne occurrences/word: 20.156500591560405
count    15212.000000
mean        20.159151
std        294.261709
min          1.000000
20%          1.000000
50%          1.000000
80%          6.000000
max      25859.000000
dtype: float64
Batch shape: torch.Size([20, 66])
One-hot shape: torch.Size([20, 66, 3060])
Batch shape: torch.Size([20, 66])
One-hot shape: torch.Size([20, 66, 3060])
Batch shape: torch.Size([20, 66])
One-hot shape: torch.Size([20, 66, 3060])
Batch shape: torch.Size([20, 66])
One-hot shape: torch.Size([20, 66, 3060])
Batch shape: torch.Size([20, 66])
One-hot shape: torch.Size([20, 66, 3060])
Batch shape: torch.Size([20, 66])
One-hot shape: torch.Size([20, 66, 3060])
Batch shape: torch.Size([20, 66])
One-hot shape: torch.Size([20, 66, 3060])
Batch shape: torch.Size([20, 66])
One-hot shape: torch.Size([20, 66, 3060])
Batch shape: torch.Size([20, 66])
One-hot shape: torch.Size([20, 66, 3060])
Batch shape: torch.Size([20, 66])
One-hot shape: torch