## Le Modèle GloVe

In [151]:
import numpy as np
import re
from nltk.tokenize import word_tokenize
from collections import Counter
import nltk

# Téléchargement du tokenizer
nltk.download('punkt')

# Phrase en arabe
sentences = """الذكاء الاصطناعي هو مجال يهتم بتطوير الأنظمة التي يمكنها أداء المهام التي تتطلب عادةً الذكاء البشري.
تتضمن هذه المهام مثل التعرف على الصور، معالجة اللغة الطبيعية، والتنبؤ. يعمل الذكاء الاصطناعي على تحسين الكفاءة وجودة العمل في مختلف الصناعات."""

# Nettoyage et tokenisation
sentences_cleaned = re.sub('[^\u0621-\u064A\s]', ' ', sentences)
words = word_tokenize(sentences_cleaned)

# Créer le vocabulaire
vocab = set(words)
vocab_size = len(vocab)
word_to_ix = {word: i for i, word in enumerate(vocab)}
ix_to_word = {i: word for i, word in enumerate(vocab)}

# Taille du contexte
context_size = 2

# Créer les paires de co-occurrence
co_occurrence_matrix = np.zeros((vocab_size, vocab_size))

for i in range(context_size, len(words) - context_size):
    target = words[i]
    target_idx = word_to_ix[target]
    context = [words[i - j] for j in range(1, context_size + 1)] + [words[i + j] for j in range(1, context_size + 1)]
    for ctx_word in context:
        context_idx = word_to_ix[ctx_word]
        co_occurrence_matrix[target_idx, context_idx] += 1

# GloVe - Initialisation des vecteurs
embed_dim = 10  # Dimension des vecteurs d'embedding
W = np.random.rand(vocab_size, embed_dim)  # Embedding des mots
b = np.random.rand(vocab_size)  # Bias pour chaque mot
b_context = np.random.rand(vocab_size)  # Bias pour chaque mot de contexte

# Fonction de perte pour GloVe
def glove_loss(co_occurrence_matrix, W, b, b_context, alpha=0.75, x_max=100, learning_rate=0.01):
    loss = 0
    for i in range(vocab_size):
        for j in range(vocab_size):
            if co_occurrence_matrix[i, j] > 0:
                # Poids de l'erreur
                weight = (co_occurrence_matrix[i, j] / x_max) ** alpha if co_occurrence_matrix[i, j] < x_max else 1
                # Calcul de l'erreur pour cette paire
                cost = weight * (np.dot(W[i], W[j]) + b[i] + b_context[j] - np.log(co_occurrence_matrix[i, j])) ** 2
                loss += cost
    return loss

# Mise à jour des embeddings avec la rétropropagation
def update_embeddings(co_occurrence_matrix, W, b, b_context, learning_rate=0.01):
    global vocab_size
    for i in range(vocab_size):
        for j in range(vocab_size):
            if co_occurrence_matrix[i, j] > 0:
                # Poids de l'erreur
                weight = (co_occurrence_matrix[i, j] / 100) ** 0.75 if co_occurrence_matrix[i, j] < 100 else 1
                # Calcul des gradients
                gradient_W_i = weight * (np.dot(W[i], W[j]) + b[i] + b_context[j] - np.log(co_occurrence_matrix[i, j])) * W[j]
                gradient_W_j = weight * (np.dot(W[i], W[j]) + b[i] + b_context[j] - np.log(co_occurrence_matrix[i, j])) * W[i]
                gradient_b_i = weight * (np.dot(W[i], W[j]) + b[i] + b_context[j] - np.log(co_occurrence_matrix[i, j]))
                gradient_b_context_j = weight * (np.dot(W[i], W[j]) + b[i] + b_context[j] - np.log(co_occurrence_matrix[i, j]))

                # Mise à jour des embeddings
                W[i] -= learning_rate * gradient_W_i
                W[j] -= learning_rate * gradient_W_j
                b[i] -= learning_rate * gradient_b_i
                b_context[j] -= learning_rate * gradient_b_context_j
    return W, b, b_context

# Entraînement du modèle GloVe
epochs = 50
for epoch in range(epochs):
    loss = glove_loss(co_occurrence_matrix, W, b, b_context)
    W, b, b_context = update_embeddings(co_occurrence_matrix, W, b, b_context)
    
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Epoch 0, Loss: 61.78592035714478
Epoch 10, Loss: 42.33745461131349
Epoch 20, Loss: 30.833511790879324
Epoch 30, Loss: 23.423527102372766
Epoch 40, Loss: 18.3555061088821


In [152]:
from scipy.spatial.distance import cosine

# Fonction pour calculer la similarité cosinus entre deux vecteurs
def cosine_similarity(vec1, vec2):
    return 1 - cosine(vec1, vec2)

# Tester la similarité entre deux mots
def test_similarity(word1, word2, word_to_ix, ix_to_word, embeddings):
    idx1 = word_to_ix[word1]
    idx2 = word_to_ix[word2]
    similarity = cosine_similarity(embeddings[idx1], embeddings[idx2])
    print(f"Similarité cosinus entre '{word1}' et '{word2}': {similarity}")

# Tester l'analogie "roi" - "homme" = "reine" - "femme"
def test_analogy(word1, word2, word3, word_to_ix, ix_to_word, embeddings):
    # Embeddings des mots
    emb1 = embeddings[word_to_ix[word1]]
    emb2 = embeddings[word_to_ix[word2]]
    emb3 = embeddings[word_to_ix[word3]]
    
    # Calcul de la direction de l'analogie
    analogy_vector = emb1 - emb2 + emb3

    # Trouver le mot le plus proche de la direction de l'analogie
    similarities = []
    for i in range(len(word_to_ix)):
        word = ix_to_word[i]
        similarity = cosine_similarity(analogy_vector, embeddings[i])
        similarities.append((word, similarity))
    
    # Trier les mots en fonction de la similarité
    similarities = sorted(similarities, key=lambda x: x[1], reverse=True)

    # Afficher les 5 mots les plus similaires
    print(f"Les mots les plus similaires à l'analogie '{word1} - {word2} + {word3}':")
    for word, similarity in similarities[:5]:
        print(f"Mot: {word}, Similarité: {similarity}")

# Phase de test
print("Test de similarité entre 'الذكاء' et 'الاصطناعي':")
test_similarity('الذكاء', 'الاصطناعي', word_to_ix, ix_to_word, W)

print("\nTest d'analogie 'الذكاء' - 'الاصطناعي' + 'يعمل':")
test_analogy('الذكاء', 'الاصطناعي', 'يعمل', word_to_ix, ix_to_word, W)


Test de similarité entre 'الذكاء' et 'الاصطناعي':
Similarité cosinus entre 'الذكاء' et 'الاصطناعي': 0.3590775039820202

Test d'analogie 'الذكاء' - 'الاصطناعي' + 'يعمل':
Les mots les plus similaires à l'analogie 'الذكاء - الاصطناعي + يعمل':
Mot: والتنبؤ, Similarité: 0.7383184306079075
Mot: الكفاءة, Similarité: 0.7197392514055865
Mot: الذكاء, Similarité: 0.7050008422013012
Mot: هذه, Similarité: 0.6789067174990286
Mot: معالجة, Similarité: 0.6575608758963057
