## Le Modèle CSkip-Gram

In [118]:
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## dataset

In [156]:
sentences = """الذكاء الاصطناعي هو مجال يهتم بتطوير الأنظمة التي يمكنها أداء المهام التي تتطلب عادةً الذكاء البشري.
تتضمن هذه المهام مثل التعرف على الصور، معالجة اللغة الطبيعية، والتنبؤ. يعمل الذكاء الاصطناعي على تحسين الكفاءة وجودة العمل في مختلف الصناعات."""

### Clean Data

In [120]:
# remove special characters
sentences = re.sub('[^\u0621-\u064A\s]', ' ', sentences) 

custom_stop_words = [
    "هذا", "في", "على", "هو", "هي", "من", "ما", "إلى", "أن", "عن", "و", "لا"
]
words = word_tokenize(sentences)
filtered_words = [word for word in words if word not in custom_stop_words]
# remove 1 letter words
sentences = re.sub(r'(?:^| )\w(?:$| )', ' ', sentences).strip()
sentences = " ".join(filtered_words)

In [121]:
sentences

'الذكاء الاصطناعي مجال يهتم بتطوير الأنظمة التي يمكنها أداء المهام التي تتطلب عادة الذكاء البشري تتضمن هذه المهام مثل التعرف الصور معالجة اللغة الطبيعية والتنبؤ يعمل الذكاء الاصطناعي تحسين الكفاءة وجودة العمل مختلف الصناعات'

In [122]:
sentences

'الذكاء الاصطناعي مجال يهتم بتطوير الأنظمة التي يمكنها أداء المهام التي تتطلب عادة الذكاء البشري تتضمن هذه المهام مثل التعرف الصور معالجة اللغة الطبيعية والتنبؤ يعمل الذكاء الاصطناعي تحسين الكفاءة وجودة العمل مختلف الصناعات'

## vocabulary

In [123]:
words = sentences.split()
vocab = set(words)

In [157]:
#tokens = ' '.join(vocab)
vocab = word_tokenize(sentences)
print(vocab)

['الذكاء', 'الاصطناعي', 'هو', 'مجال', 'يهتم', 'بتطوير', 'الأنظمة', 'التي', 'يمكنها', 'أداء', 'المهام', 'التي', 'تتطلب', 'عادةً', 'الذكاء', 'البشري', '.', 'تتضمن', 'هذه', 'المهام', 'مثل', 'التعرف', 'على', 'الصور،', 'معالجة', 'اللغة', 'الطبيعية،', 'والتنبؤ', '.', 'يعمل', 'الذكاء', 'الاصطناعي', 'على', 'تحسين', 'الكفاءة', 'وجودة', 'العمل', 'في', 'مختلف', 'الصناعات', '.']


### Dictionaries

In [125]:
id_to_word = {i:x for (i, x) in enumerate(vocab)}
word_to_id = {x:i for (i, x) in enumerate(vocab)}

### Skip-Gram Paires

In [126]:
def generate_training_data(vocab, word_to_id, window_size):
    X, Y = [], []

    for i in range(len(vocab)):
        nbr_inds = list(range(max(0, i - window_size), i)) + \
                   list(range(i + 1, min(len(vocab), i + window_size + 1)))
        for j in nbr_inds:
            X.append(word_to_id[vocab[i]])
            Y.append(word_to_id[vocab[j]])
            
    return np.array(X), np.array(Y)


### Embeddings

In [127]:
def expand_dims(x, y):
    x = np.expand_dims(x, axis=0)
    y = np.expand_dims(y, axis=0)
    return x, y

In [128]:
x, y = generate_training_data(vocab, word_to_id, 3)
x, y = expand_dims(x, y)

In [129]:
# generated training data
x, y

(array([[ 0,  0,  0,  1,  1,  1,  1,  2,  2,  2,  2,  2,  3,  3,  3,  3,
          3,  3,  4,  4,  4,  4,  4,  4,  5,  5,  5,  5,  5,  5,  6,  6,
          6,  6,  6,  6,  7,  7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  8,
          9,  9,  9,  9,  9,  9, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11,
         11, 11, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 14, 14,
         14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16,
         17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19,
         19, 19, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 22, 22,
         22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24,
         25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 27, 27, 27, 27, 28,
         28, 28]]),
 array([[ 1,  2,  3,  0,  2,  3,  4,  0,  1,  3,  4,  5,  0,  1,  2,  4,
          5,  6,  1,  2,  3,  5,  6,  7,  2,  3,  4,  6,  7,  8,  3,  4,
          5,  7,  8,  9,  4,  5,  6,  8,  9, 10,  5,  6,  7,  9, 10, 11,
          6,  7,  8, 10, 11, 12

### Forward function

In [130]:
def init_parameters(vocab_size, emb_size):
    wrd_emb = np.random.randn(vocab_size, emb_size) * 0.01
    w = np.random.randn(vocab_size, emb_size) * 0.01
    
    return wrd_emb, w


In [131]:
def softmax(z):
    return np.divide(np.exp(z), np.sum(np.exp(z), axis=0, keepdims=True) + 0.001)

In [132]:
def forward(inds, params):
    wrd_emb, w = params
    word_vec = wrd_emb[inds.flatten(), :].T
    z = np.dot(w, word_vec)
    out = softmax(z)
    
    cache = inds, word_vec, w, z
    
    return out, cache

### Cost Function

In [133]:
def cross_entropy(y, y_hat):
    m = y.shape[1]
    cost = -(1 / m) * np.sum(np.sum(y_hat * np.log(y + 0.001), axis=0, keepdims=True), axis=1)
    return cost

### Backward function

In [134]:
def dsoftmax(y, out):
    dl_dz = out - y
    
    return dl_dz

In [135]:
def backward(y, out, cache):
    inds, word_vec, w, z = cache
    wrd_emb, w = params
    
    dl_dz = dsoftmax(y, out)
    # deviding by the word_vec length to find the average
    dl_dw = (1/word_vec.shape[1]) * np.dot(dl_dz, word_vec.T)
    dl_dword_vec = np.dot(w.T, dl_dz)
    
    return dl_dz, dl_dw, dl_dword_vec

In [136]:
def update(params, cache, grads, lr=0.03):
    inds, word_vec, w, z = cache
    wrd_emb, w = params
    dl_dz, dl_dw, dl_dword_vec = grads
    
    wrd_emb[inds.flatten(), :] -= dl_dword_vec.T * lr
    w -= dl_dw * lr
    
    return wrd_emb, w

## training

In [137]:
vocab_size = len(id_to_word)

m = y.shape[1]
y_one_hot = np.zeros((vocab_size, m))
y_one_hot[y.flatten(), np.arange(m)] = 1

y = y_one_hot

In [138]:
batch_size=256
embed_size = 50

params = init_parameters(vocab_size, 50)

costs = []

for epoch in range(5000):
    epoch_cost = 0
    
    batch_inds = list(range(0, x.shape[1], batch_size))
    np.random.shuffle(batch_inds)
    
    for i in batch_inds:
        x_batch = x[:, i:i+batch_size]
        y_batch = y[:, i:i+batch_size]
        
        pred, cache = forward(x_batch, params)
        grads = backward(y_batch, pred, cache)
        params = update(params, cache, grads, 0.03)
        cost = cross_entropy(pred, y_batch)
        
        epoch_cost += np.squeeze(cost)
        
    costs.append(epoch_cost)
    
    if(epoch % 250 == 0):
        print("Cost after epoch {}: {}".format(epoch, epoch_cost))

Cost after epoch 0: 3.3387545731034867
Cost after epoch 250: 3.324491434962342
Cost after epoch 500: 3.1907104986550623
Cost after epoch 750: 2.6007879630678463
Cost after epoch 1000: 2.195101479381379
Cost after epoch 1250: 2.063383359524632
Cost after epoch 1500: 2.0151714848607685
Cost after epoch 1750: 2.00231486554247
Cost after epoch 2000: 1.9944340741314
Cost after epoch 2250: 1.9830613630793088
Cost after epoch 2500: 1.9719825976193845
Cost after epoch 2750: 1.9636453984583393
Cost after epoch 3000: 1.957210720130514
Cost after epoch 3250: 1.9515369745540534
Cost after epoch 3500: 1.9464036370883289
Cost after epoch 3750: 1.945096073589045
Cost after epoch 4000: 1.9485926019161899
Cost after epoch 4250: 1.9536560378262717
Cost after epoch 4500: 1.9598958417104668
Cost after epoch 4750: 1.9674673852935638


### Predict function

In [None]:
x_test = np.arange(vocab_size)
x_test = np.expand_dims(x_test, axis=0)
softmax_test, _ = forward(x_test, params)
top_sorted_inds = np.argsort(softmax_test, axis=0)[-4:,:]

In [150]:
input_word = 'الذكاء'
input_ind = word_to_id[input_word]
output_words = [id_to_word[output_ind] for output_ind in top_sorted_inds[::-1, input_ind]]
print("{}'s skip-grams: {}".format(input_word, output_words))

الذكاء's skip-grams: ['التعرف', 'البشري', 'معالجة', 'عادة']


## Le Modèle GloVe

In [151]:
import numpy as np
import re
from nltk.tokenize import word_tokenize
from collections import Counter
import nltk

# Téléchargement du tokenizer
nltk.download('punkt')

# Phrase en arabe
sentences = """الذكاء الاصطناعي هو مجال يهتم بتطوير الأنظمة التي يمكنها أداء المهام التي تتطلب عادةً الذكاء البشري.
تتضمن هذه المهام مثل التعرف على الصور، معالجة اللغة الطبيعية، والتنبؤ. يعمل الذكاء الاصطناعي على تحسين الكفاءة وجودة العمل في مختلف الصناعات."""

# Nettoyage et tokenisation
sentences_cleaned = re.sub('[^\u0621-\u064A\s]', ' ', sentences)
words = word_tokenize(sentences_cleaned)

# Créer le vocabulaire
vocab = set(words)
vocab_size = len(vocab)
word_to_ix = {word: i for i, word in enumerate(vocab)}
ix_to_word = {i: word for i, word in enumerate(vocab)}

# Taille du contexte
context_size = 2

# Créer les paires de co-occurrence
co_occurrence_matrix = np.zeros((vocab_size, vocab_size))

for i in range(context_size, len(words) - context_size):
    target = words[i]
    target_idx = word_to_ix[target]
    context = [words[i - j] for j in range(1, context_size + 1)] + [words[i + j] for j in range(1, context_size + 1)]
    for ctx_word in context:
        context_idx = word_to_ix[ctx_word]
        co_occurrence_matrix[target_idx, context_idx] += 1

# GloVe - Initialisation des vecteurs
embed_dim = 10  # Dimension des vecteurs d'embedding
W = np.random.rand(vocab_size, embed_dim)  # Embedding des mots
b = np.random.rand(vocab_size)  # Bias pour chaque mot
b_context = np.random.rand(vocab_size)  # Bias pour chaque mot de contexte

# Fonction de perte pour GloVe
def glove_loss(co_occurrence_matrix, W, b, b_context, alpha=0.75, x_max=100, learning_rate=0.01):
    loss = 0
    for i in range(vocab_size):
        for j in range(vocab_size):
            if co_occurrence_matrix[i, j] > 0:
                # Poids de l'erreur
                weight = (co_occurrence_matrix[i, j] / x_max) ** alpha if co_occurrence_matrix[i, j] < x_max else 1
                # Calcul de l'erreur pour cette paire
                cost = weight * (np.dot(W[i], W[j]) + b[i] + b_context[j] - np.log(co_occurrence_matrix[i, j])) ** 2
                loss += cost
    return loss

# Mise à jour des embeddings avec la rétropropagation
def update_embeddings(co_occurrence_matrix, W, b, b_context, learning_rate=0.01):
    global vocab_size
    for i in range(vocab_size):
        for j in range(vocab_size):
            if co_occurrence_matrix[i, j] > 0:
                # Poids de l'erreur
                weight = (co_occurrence_matrix[i, j] / 100) ** 0.75 if co_occurrence_matrix[i, j] < 100 else 1
                # Calcul des gradients
                gradient_W_i = weight * (np.dot(W[i], W[j]) + b[i] + b_context[j] - np.log(co_occurrence_matrix[i, j])) * W[j]
                gradient_W_j = weight * (np.dot(W[i], W[j]) + b[i] + b_context[j] - np.log(co_occurrence_matrix[i, j])) * W[i]
                gradient_b_i = weight * (np.dot(W[i], W[j]) + b[i] + b_context[j] - np.log(co_occurrence_matrix[i, j]))
                gradient_b_context_j = weight * (np.dot(W[i], W[j]) + b[i] + b_context[j] - np.log(co_occurrence_matrix[i, j]))

                # Mise à jour des embeddings
                W[i] -= learning_rate * gradient_W_i
                W[j] -= learning_rate * gradient_W_j
                b[i] -= learning_rate * gradient_b_i
                b_context[j] -= learning_rate * gradient_b_context_j
    return W, b, b_context

# Entraînement du modèle GloVe
epochs = 50
for epoch in range(epochs):
    loss = glove_loss(co_occurrence_matrix, W, b, b_context)
    W, b, b_context = update_embeddings(co_occurrence_matrix, W, b, b_context)
    
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Epoch 0, Loss: 61.78592035714478
Epoch 10, Loss: 42.33745461131349
Epoch 20, Loss: 30.833511790879324
Epoch 30, Loss: 23.423527102372766
Epoch 40, Loss: 18.3555061088821


In [152]:
from scipy.spatial.distance import cosine

# Fonction pour calculer la similarité cosinus entre deux vecteurs
def cosine_similarity(vec1, vec2):
    return 1 - cosine(vec1, vec2)

# Tester la similarité entre deux mots
def test_similarity(word1, word2, word_to_ix, ix_to_word, embeddings):
    idx1 = word_to_ix[word1]
    idx2 = word_to_ix[word2]
    similarity = cosine_similarity(embeddings[idx1], embeddings[idx2])
    print(f"Similarité cosinus entre '{word1}' et '{word2}': {similarity}")

# Tester l'analogie "roi" - "homme" = "reine" - "femme"
def test_analogy(word1, word2, word3, word_to_ix, ix_to_word, embeddings):
    # Embeddings des mots
    emb1 = embeddings[word_to_ix[word1]]
    emb2 = embeddings[word_to_ix[word2]]
    emb3 = embeddings[word_to_ix[word3]]
    
    # Calcul de la direction de l'analogie
    analogy_vector = emb1 - emb2 + emb3

    # Trouver le mot le plus proche de la direction de l'analogie
    similarities = []
    for i in range(len(word_to_ix)):
        word = ix_to_word[i]
        similarity = cosine_similarity(analogy_vector, embeddings[i])
        similarities.append((word, similarity))
    
    # Trier les mots en fonction de la similarité
    similarities = sorted(similarities, key=lambda x: x[1], reverse=True)

    # Afficher les 5 mots les plus similaires
    print(f"Les mots les plus similaires à l'analogie '{word1} - {word2} + {word3}':")
    for word, similarity in similarities[:5]:
        print(f"Mot: {word}, Similarité: {similarity}")

# Phase de test
print("Test de similarité entre 'الذكاء' et 'الاصطناعي':")
test_similarity('الذكاء', 'الاصطناعي', word_to_ix, ix_to_word, W)

print("\nTest d'analogie 'الذكاء' - 'الاصطناعي' + 'يعمل':")
test_analogy('الذكاء', 'الاصطناعي', 'يعمل', word_to_ix, ix_to_word, W)


Test de similarité entre 'الذكاء' et 'الاصطناعي':
Similarité cosinus entre 'الذكاء' et 'الاصطناعي': 0.3590775039820202

Test d'analogie 'الذكاء' - 'الاصطناعي' + 'يعمل':
Les mots les plus similaires à l'analogie 'الذكاء - الاصطناعي + يعمل':
Mot: والتنبؤ, Similarité: 0.7383184306079075
Mot: الكفاءة, Similarité: 0.7197392514055865
Mot: الذكاء, Similarité: 0.7050008422013012
Mot: هذه, Similarité: 0.6789067174990286
Mot: معالجة, Similarité: 0.6575608758963057
