## TP N°2

##### Ce travail est fait par AFKIR Mohamed et AKKOUH Lokmane

In [1]:
import numpy as np
import pandas as pd
import csv
import conllu
import re
from collections import defaultdict
import nltk
import json
import string
from help.emo_unicode import UNICODE_EMOJI_ALIAS as EMOTICONS ,UNICODE_EMOJI as UNICODE_EMO
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lokma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## I- Première Partie

#### La méthode read_file(infile)

In [2]:
def read_file(infile) -> str:
    with open(infile, 'r', encoding='utf-8') as file:
        corpus = file.read()
    return corpus

#### La méthode vocabulary_size(infile) 

In [3]:
def vocabulary_size(file) -> int:
    words = read_file(file).split(' ')
    return len(set(words)) # La longueur de vocabulaire compris les symboles <s> et </s> car seront eventuellement pris en compte dans les calculs des probabilitées

In [4]:
# print(vocabulary_size("generated_files/ngramv1_train_processed_bigram.txt"))

#### La méthode len_of_sentence(sentence)

In [5]:
def len_of_sentence(sentence, corpus) -> int:
    sentence = prepare_sentence(sentence, corpus)
    return len(sentence)

In [6]:
# print(len_of_sentence("hey, how are u !!", "data/ngramv1.train")) #output = 7

#### La méthode read_proba_file()

In [7]:
def read_proba_file(file):
    with open(file, 'r') as f:
        data = json.load(f)
    return data

In [8]:
# print(read_proba_file("generated_files/ngramv1_train_bigram_probas.json"))

#### La méthode removes_urls_and_emails(text)

In [9]:
def remove_urls_and_emails(text):
  url_pattern = r'(?:https?://)?(?:www\.)?\S+\.([A-Za-z]|[A-Za-z0-9._%+-/])*'
  clean_text = re.sub(url_pattern, '', text)  
  email_pattern = r'\b[A-Za-z0-9._%+-]+@([A-Za-z]|[A-Za-z0-9._%+-/])*'
  clean_text = re.sub(email_pattern, '', clean_text)  
  return clean_text


#### La méthode convert_emoticons(text)

In [10]:
def convert_emoticons(comment):
    for emot1 in EMOTICONS:
        pattern = re.escape(emot1)
        comment = re.sub(pattern, ' emo_'+EMOTICONS[emot1][1:-1]+'_emo', comment)
    for emot2 in UNICODE_EMO:
        pattern = re.escape(emot2[0])
        comment = re.sub(pattern, ' emo_'+re.sub(',','middcommamidd',re.sub(' ','_',emot2[1]))+'_emo', comment)
    return str(comment)


#### La methode convert_text_2_emoji()

In [11]:
def convert_text_2_emoji(comment):
    for emot1 in EMOTICONS:
        pattern = re.escape(emot1)
        comment = re.sub(' emo_'+EMOTICONS[emot1][1:-1]+'_emo',pattern, comment)
    for emot2 in UNICODE_EMO:
        pattern = re.escape(emot2[0])
        comment = re.sub(' emo_'+re.sub(',','middcommamidd',re.sub(' ','_',emot2[1]).lower())+'_emo',pattern, comment)
    return str(comment)


#### La methode separate_emojies 

In [12]:
def separate_emojies(comment):
    for emot1 in EMOTICONS:
        pattern = re.escape(emot1)
        comment = re.sub(pattern, ' '+emot1+' ', comment)
    for emot2 in UNICODE_EMO:
        pattern = re.escape(emot2[0])
        comment = re.sub(pattern, ' '+str(emot2[0])+' ', comment)
    return str(comment)


#### La méthode add_space()

In [13]:
def add_space(text):
  """Adds spaces around punctuations (except ', -, _, #') in a text.

  Args:
      text: The input text string.

  Returns:
      The modified text with spaces around punctuations.
  """
  text=convert_emoticons(text)
  text = text.lower()
  sentences = text.split("\n")
  punctuations = set(string.punctuation)
  updated_sentences = []

  for sentence in sentences:
    updated_sentence = ''
    for char in sentence:
      if char in punctuations and char not in ["'", "_", "-", "#"]:
        updated_sentence += ' ' + char + ' '
      else:
        updated_sentence += char
    updated_sentence = re.sub(r'#(\d+)', lambda match: '# ' + match.group(1), updated_sentence)
    updated_sentences.append(updated_sentence)

  updated_text = '\n'.join(updated_sentences)
  updated_text=convert_text_2_emoji(updated_text)
  return updated_text


#### La méthode prepare_sentence(infile) 

In [14]:
def prepare_sentence(sentence, corpus, ngram_size = 2) -> str:
    sentence = sentence.lower()
    sentence = add_space(sentence) 
    sentence = remove_urls_and_emails(sentence)
    sentence = separate_emojies(sentence)
    
    corpus = read_file(corpus)    
    if ngram_size == 2:
        sentence = ['<s> ' + sentence + ' </s>' ]
    elif ngram_size == 3:
        sentence = ['<s> <s> ' + sentence + ' </s>' ] # Pour le tri-gramme on ajoute deux <s><s> car on voit deux mots avant le mot cible

    word_counts = defaultdict(int)
    sentence = sentence[0].split()

    for word in sentence:
        word_counts[word] += 1
    for i in  range(len(sentence)):
        if (sentence[i] not in corpus):
            sentence[i] = '<UNK>'
    return ' '.join(sentence).strip().split(" ")

In [15]:
# print(prepare_sentence("i am ahmed .", "generated_files/part1/ngramv1_test_processed_bigram.txt", ngram_size = 3))

#### La méthode prepare_data(infile, ngram_size=2)

In [16]:
def prepare_data(text,  ngram_size = 2, unk_threshold = 3 ) -> str:
    """ 
    This method takes as input a text file representing the corpus 
    normalizes the text, and adds start  and end sentence tokens 
     Remember that you have to add a special '<s>' token to the beginning
    and '</s>' token to the end of each sentence to correctly estimate the bigram
    probabilities. 
    
    Remember that you have to add a special '<s><s>' token to the beginning
    and '</s>' token to the end of each sentence to correctly estimate the trigram
    probabilities. 
    
    This method also handles out-of-vocabulary words (tokens).
    To achieve this, the method searches for words that appear less than N times in the training data and replaces them with <UNK>.
    
    Parameters
    les_noms_des_tagsles_noms_des_tagsles_noms_des_tagsles_noms_des_tagsles_noms_des_tags
    infile : str 
        File path to the training corpus.
    
    ngram_size : int
        specifying which model to use. 
        Use this variable in an if/else statement. (n=2 for bigram and n=3 for trigram)
    
    Returns
    les_noms_des_tagsles_noms_des_tagsles_noms_des_tags-
    the preprocessed corpus
    
    """
    text = add_space(text) 
    text = remove_urls_and_emails(text)
    text = separate_emojies(text)
    
    # text = convert_emoticons(text)
    sentences = text.split("\n")

    for i in range(len(sentences)):
        if ngram_size == 2:
            sentences[i] = ['<s> ' + sentences[i] + ' </s>' ]
        elif ngram_size == 3:
            sentences[i] = ['<s> <s> ' + sentences[i] + ' </s>' ] # Pour le tri-gramme on ajoute deux <s><s> car on voit deux mots avant le mot cible
    sentences=' '.join( [word for sublist in sentences for word in sublist])

    word_counts = defaultdict(int) # Pour compter les occurrences des mots
    sentences = sentences.strip().split() 
    for word in sentences:
        word_counts[word] += 1
    for i, sentence in enumerate(sentences):
        tokens = sentence.split()
        sentences[i] = ' '.join(['<UNK>' if word_counts[word] < unk_threshold else word for word in tokens])
    return ' '.join(sentences)
    

In [17]:
# # Les fichiers de partie 1
# processed_text = prepare_data(read_file("data/part1/ngramv1.train"), ngram_size = 2, unk_threshold=3)
# with open("generated_files/part1/ngramv1_train_processed_bigram.txt", "w", encoding="utf-8") as file:
#     file.write(processed_text)

# processed_text = prepare_data(read_file("data/part1/ngramv1.train"), ngram_size = 3, unk_threshold=3)
# with open("generated_files/part1/ngramv1_train_processed_trigram.txt", "w") as file:
#     file.write(processed_text)


# # Les fichiers de parie 2

# processed_text = prepare_data(read_file('data/part2/big_data.txt'), ngram_size = 3, unk_threshold=3)
# with open("generated_files/part2/big_data_processed_trigram.txt", "w", encoding="utf-8") as file:
#     file.write(processed_text)

# processed_text = prepare_data(read_file('data/part2/big_data.txt'), ngram_size = 2, unk_threshold=3)
# with open("generated_files/part2/big_data_processed_bigram.txt", "w", encoding="utf-8") as file:
#     file.write(processed_text)

#### La méthode Méthode train(ngram_size=2, infile)

In [18]:
def train(infile, ngram_size = 2, k = 0.01) -> dict:
    """his method takes as input a text processed file representing the corpus
       In addition of ngram_size and smoothinh factor k
       Returns: a dictionnary of probability
    """
    
    data = read_file(infile)
    data = data.strip().split(" </s> ")
    processed_data = []
    for line in data[:-1]:
        processed_data.append(line + " </s>")  
    processed_data.append(data[-1]) 
    
    #Initialiser les dictionnaires de probabilités
    bigram_counts = defaultdict(int) 
    trigram_counts = defaultdict(int)
    trigram_probs = defaultdict(int)
    bigram_probs = defaultdict(int)
    
    # Initialiser un dictionnaire pour compter de mots uniques
    global word_counts 
    word_counts = defaultdict(int)
    
    # Pour calculer les occurrences des couple de mots
    global word_counts1 
    word_counts1 = defaultdict(int) 

    # Récuperer la taille de vocabulaire
    vocab_size = vocabulary_size(infile)
    if ngram_size == 2:
    # calculez les probabilités de bigrammes avec le lissage add-k
        for sentence in processed_data:  
            words = sentence.strip().split(" ")  
            for i in range(len(words) - 1):
                pre_word, word = words[i], words[i + 1]
                bigram_counts[(pre_word, word)] += 1
            for w in words:
                word_counts[w] += 1

        for (pre_word, word) in list(bigram_counts.keys()):
            count = bigram_counts[(pre_word, word)]
            bigram_probs[(pre_word, word)] = (count + k) / (word_counts[pre_word] + vocab_size * k)

        for (pre_word, word) in list(bigram_counts.keys()):
                        prob = bigram_probs[(pre_word, word)]
                        bigram_probs[(pre_word, word)] = np.log(prob) 
        return bigram_probs
        
    elif ngram_size == 3:
        for sentence in processed_data:
            words = sentence.strip().split(" ")
            for i in range(len(words) - 2):
                pre_pre_word, pre_word, word = words[i], words[i + 1], words[i + 2]
                trigram_counts[(pre_pre_word, pre_word, word)] += 1

            for i in range(len(words)-1):
                word_counts1[(words[i], words[i+1])] += 1
        
        # Calculer les probabilités du trigramme avec le lissage add-k
        for (pre_pre_word, pre_word, word) in list(trigram_counts.keys()):
            count = trigram_counts[(pre_pre_word, pre_word, word)]
            trigram_probs[(pre_pre_word, pre_word, word)] = (count + k) / (word_counts1[(pre_pre_word, pre_word)] + vocab_size * k)

        # Convertir les probabilités en log de probabilités
        for (pre_pre_word, pre_word, word) in list(trigram_counts.keys()):
            prob =  trigram_probs[(pre_pre_word, pre_word, word)]
            trigram_probs[(pre_pre_word, pre_word, word)] = np.log(prob) 
        return trigram_probs




In [19]:
# # Les fichiers de partie 1
# result1  = train("generated_files/part1/ngramv1_train_processed_trigram.txt", ngram_size = 3, k=0.01)
# # Convert tuple keys to string keys ( car on peut pas stocker les les tuples commes des clés dans un fichier .JSON)
# result_str_keys = {str(key): value for key, value in result1.items()}
# with open("generated_files/part1/ngramv1_train_trigram_probas.json", "w") as json_file:
#     json.dump(result_str_keys, json_file)


# result1  = train("generated_files/part1/ngramv1_train_processed_bigram.txt", ngram_size = 2, k=0.01)
# # Convert tuple keys to string keys
# result_str_keys2 = {str(key): value for key, value in result1.items()}
# with open("generated_files/part1/ngramv1_train_bigram_probas.json", "w") as json_file:
#     json.dump(result_str_keys2, json_file)



# # Les fichiers de partie 2
# result1  = train("generated_files/part2/big_data_processed_trigram.txt", ngram_size = 3, k=0.01)
# # Convert tuple keys to string keys
# result_str_keys = {str(key): value for key, value in result1.items()}
# with open("generated_files/part2/big_data_trigram.json", "w") as json_file:
#     json.dump(result_str_keys, json_file)

# result1  = train("generated_files/part2/big_data_processed_bigram.txt", ngram_size = 2, k=0.01)
# result_str_keys = {str(key): value for key, value in result1.items()}
# with open("generated_files/part2/big_data_bigram.json", "w") as json_file:
#     json.dump(result_str_keys, json_file)


#### La méthode predict_ngram(sentence, ngram_size = 2)

In [20]:
def predict_ngram(sentence, file, proba_file, ngram_size = 3, k = 0.01) -> float:
    """
    Predicts the probability of a sentence using bigram or trigram model.

    This function takes a sentence as input, preprocesses it using the prepare_sentence function,
    and then calculates the probability of the sentence using the bigram or trigram probabilities
    depending on the ngram_size parameter.

    Args:
        sentence: The sentence to predict the probability for (string).
        file: file to get probabities.
        ngram_size: Specifies the model to use (2 for bigram, 3 for trigram).
        k : smoothing parameter.

    Returns:
        float: The log probability of the sentence.
    """
    sentence  = prepare_sentence(sentence, file, ngram_size)
    vocab_size = vocabulary_size(file)
     
    proba = 0.0
    probas = read_proba_file(proba_file)

    if ngram_size ==  2:
        for j in range(len(sentence) - 1):
            out = 0
            pre_word, word = sentence[j], sentence[j+1] 
            for i in probas.keys():
                l = i[2:-2]
                l = l.split("', '")                
                if (pre_word, word) == (l[0],l[1]):
                    proba += probas[i]
                    out = 1
            if out == 1:
                continue
            else:
                proba += np.log(k / (word_counts[pre_word] + vocab_size * k))

            

    elif ngram_size == 3:
        for j in range(len(sentence) - 2):
            pre_pre_word, pre_word, word = sentence[j], sentence[j+1], sentence[j+2]  
            out = 0
            for i in probas.keys():
                l = i[2:-2]
                l = l.split("', '")                
                if (pre_pre_word,pre_word, word) == (l[0],l[1],l[2]):
                    proba += probas[i]
                    out = 1
            if out == 1:
                continue
            else:
                proba += np.log(k / (word_counts1[(pre_pre_word, pre_word)] + vocab_size * k))                
    return proba


In [21]:
# print(predict_ngram("i am ahmed ", "generated_files/part1/ngramv1_train_processed_bigram.txt","generated_files/part1/ngramv1_train_trigram_probas.json", ngram_size = 3))

#### La méthode test_perplexity(test_file, ngram_size = 2)

In [22]:
def test_perplexity(test_file, corpus, proba_file, ngram_size = 2) -> float:
    """Calculate the perplexity of the trained LM on a test corpus.

        This seems complicated, but is actually quite simple. 

        First we need to calculate the total probability of the test corpus. 
        We can do this by summing the log probabiities of each sentence in the corpus.
        
        Then we need to normalize (e.g., divide) this summed log probability by the 
        total number of tokens in the test corpus. The one tricky bit here is we need
        to augment this count of the total number of tokens by one for each sentence,
        since we're including the sentence-end token in these probability estimates.

        Finally, to convert this result back to a perplexity, we need to multiply it
        by negative one, and exponentiate it - e.g., if we have the result of the above
        in a variable called 'val', we will return math.exp(val). 

        Parameters
        les_noms_des_tagsles_noms_des_tagsles_noms_des_tags-
        test_file : str
            File path to a test corpus.
            (assumed pre-tokenized, whitespace-separated, one line per sentence)

        ngram_size : int
            specifying which model to use. 
            Use this variable in an if/else statement. (n=2 for bigram and n=3 for trigram)

        Returns
        les_noms_des_tagsles_noms_des_tagsles_noms_des_tags-
        float  
            The perplexity of the corpus (normalized total log probability).
        """
    text = read_file(test_file)
    sentences = text.split("\n")
    tokens = prepare_data(text, ngram_size)
    tokens = tokens.split(" ")
    
    num_tokens = sum(1 for token in tokens if token != "<s>") # Nombre des tokens ( y compris le nombre de </s>)
    
    total_perplexity = 0.0
    for sentence in sentences:
        log_sentence_proba = -predict_ngram(sentence, corpus, proba_file, ngram_size)
        total_perplexity += log_sentence_proba
    return np.exp(total_perplexity / num_tokens)

In [23]:
# print(test_perplexity("data/part1/ngramv1.test", "generated_files/part1/ngramv1_train_processed_trigram.txt", "generated_files/part1/ngramv1_train_trigram_probas.json", ngram_size = 3))

## I- Deuxième Partie

#### La méthode Méthode generateText() 

In [24]:
global corpus
corpus = "data/part2/big_data.txt" # On déclare à ce moment le corpus utilisé une fois pour toute
    

In [25]:
def generateText(proba_file, ngram_size = 2) -> str:
    """
    Generates text using the trained bigram and trigram models.
    
    This method starts with a seed sentence (<s>) and iteratively samples the next word based on the probabilities
    learned from the n-gram model. The process continues until the end-of-sentence token (</s>) is generated.
    
    Args:
      proba_file : File that contains the ngram probabilities
      ngram_size: Integer specifying the n-gram model to use (2 for bigram, 3 for trigram).
      
    
    Returns:
      str: The generated text.
    """
    

    sentence = ["<s>"] * (ngram_size - 1)
    probas = read_proba_file(proba_file)
    while sentence[-1] != "</s>":
      possible_words = []
      probs = []
      for tuple in probas.keys():
        words = tuple[2:-2]
        words = words.split("', '")      
        if words[ : ngram_size - 1] == sentence[-ngram_size + 1:]:
          if (words[- 1] in ['<UNK>']):
              pass
          else:
              probs.append(probas[tuple])
              possible_words.append((probas[(tuple)], words[-1]))
      

      if (len(sentence)==(ngram_size-1)): # pour choisir un mot aleatoire au debut
          probs = sorted(probs, reverse = True)[:20] # On donne la chance aux vingt  tokens les plus probables d'etre au debut de la phrase
          f_proba=(np.random.choice(probs))
          for p in possible_words :
            if p[0] == f_proba :
              sugg = p[1]
          sentence.append(sugg)
          continue
      try:
          probs = sorted(probs, reverse = True)[:5]
          probability = np.random.choice(probs, size = 1)
          for p in possible_words :
              if p[0] == probability :
                  sugg = p[1]
          sentence.append(sugg)
      except:
        sentence.append('.')
    return " ".join(sentence[ngram_size-1:-1])  # Supprimer les jetons de début et de fin de phrase



In [26]:
# for i in range(5)=:
print(generateText("generated_files/part2/big_data_bigram.json", ngram_size = 2))
print("___")
# for i in range(5):
print(generateText("generated_files/part2/big_data_trigram.json", ngram_size = 3))

just saw your heart attack on a lot . you are so i have been a lot , but i love that i have the world .
___
thanks to for a great day to my new favorite song ! barbie . . i love the new one in the morning show a fake id , his downfall ? wearing a " closer " then what ? !


#### La méthode autoComplete(text)

In [27]:
def autoComplete(sentence, proba_file,ngram_size = 2) -> str:
    """
    Predicts next token based on previous ones.
    
    Args:
      sentence: represents previous tokens
      proba_file : File that contains the ngram probabilities
    
    Returns:
      str: The predicted token plus previous sentence.
    """
    
    sentence = sentence.lower()
    probas = read_proba_file(proba_file)
    possible_words = []
    probs = []
    sentence = prepare_sentence(sentence,corpus)[1:-1]
    
    for tupl in probas.keys():
        words = tupl[2:-2]
        words = words.split("', '")
        for i in range(ngram_size - 1):
            if words[ : ngram_size - 1] == sentence[-ngram_size + 1:]:
                if (words[- 1] == '<UNK>' or words[- 1] == '</s>' ):
                    pass
                    
                else:
                    probs.append(probas[tupl])
                    possible_words.append((probas[(tupl)], words[-1]))
    if probs:
        probs = sorted(probs, reverse = True)[:3]
        f_proba=(np.random.choice(probs))
        for p in possible_words : 
            if p[0] == f_proba :
                suggestion = p[1]
        sentence.append(suggestion)  
        
    return " ".join(sentence)

In [28]:
# print(autoComplete("he is here ",proba_file='generated_files/part2/big_data_bigram.json'))

#### La méthode correction()

In [29]:
from help.NLP_utils import candidates, get_vocabulary

In [30]:
def correction(twput, proba_file, ngram_size = 2) -> str :
    """
    Takes a text and corrects it.
    
    Args:
      twput : Text with probable uncorrected tokens
      proba_file : File that contains the ngram probabilities
      ngram_size: Integer specifying the n-gram model to use (2 for bigram, 3 for trigram).
      
    
    Returns:
      str: The generated text.
    """    
    probas = read_proba_file(proba_file)
    C_words_List=get_vocabulary(read_file(corpus))                    
    C_words=read_file(corpus)
    probs=[]                                       
    suggestions=[]  
    twput = twput.lower()
    twput = re.sub(r'[^\w]',' ', twput) 
    twput = re.findall(r'\w+', twput) 
    for i in range(ngram_size - 1):
        twput.insert(i, "<s>")
    twput.append("</s>") # "I like football" les_noms_des_tags-> ['<s>', '<s>', 'I', 'like', 'football', '</s>'] if ngram_size = 3
    for i in range(len(twput)):
        probs=[]                                        #list of probabilities
        suggestions=[]                                  #list of suggestions that are in our vocabulary and are provided by candidates
        possible_words = []                             #list of words returned by candidates cotaining possible words after min edit
        if twput[i] in ["<s>", "</s>"] or twput[i] in C_words_List :
            pass
        else:
            possible_words.append(candidates(twput[i],corpus=C_words))
            for sugg in possible_words[0]:
                for tuple in probas.keys() :
                    words = tuple[2:-2]
                    words = words.split("', '")
                    if ((words[-1]==sugg) and (words[ : ngram_size - 1]==twput[i-ngram_size+1:i])):
                        probs.append(probas[tuple])
                        suggestions.append((probas[(tuple)], words[-1]))
            if probs:
                probs = sorted(probs, reverse = True)
                for p in suggestions : 
                    if p[0] == probs[0] :
                        suggestion = p[1]
                        twput[i]=suggestion
            else:
                twput[i]=np.random.choice(possible_words[0])
                    
        
    
    return " ".join(twput[ngram_size-1:-1])

In [31]:
# print(correction("helo my frind ", proba_file='generated_files/part2/big_data_bigram.json'))

## III- Troisième Partie

[https://github.com/qanastek/ANTILLES/tree/main/ANTILLES](https://github.com/qanastek/ANTILLES/tree/main/ANTILLES)


#### La méthode conllu_to_csv()

In [32]:
def conllu_to_csv(conllu_file, csv_file) -> None:
    """
    This method takes a CoNLL-U format file and transform it to a csv one.
    """
    with open(conllu_file, 'r', encoding='utf-8') as conllu, open(csv_file, 'w', newline='', encoding='utf-8') as csvout:
        csv_writer = csv.writer(csvout)
        csv_writer.writerow(['FORM', 'UPOS'])  # Only write FORM and UPOS headers

        for line in conllu:
            line = line.strip()
            if line.startswith('#') or line == '':
                continue

            parts = line.split('\t')
            FORM = parts[1]  # FORM field
            upos = parts[3]  # UPOS field

            csv_writer.writerow([FORM,upos])

In [33]:
conllu_to_csv('data/part3/dev.conllu', 'data/part3/dev.csv')
conllu_to_csv('data/part3/train.conllu', 'data/part3/train.csv')
conllu_to_csv('data/part3/test.conllu', 'data/part3/test.csv')


#### La méthode transition_probability()

Cette méthode sert à construire la matrice de transition du modèle de POS

In [34]:
def transition_probability(train_file):
    df = pd.read_csv(train_file)
    unique_upos_values = df['UPOS'].unique()
    unique_upos_values = np.concatenate([["les_noms_des_tags"], unique_upos_values])
    new_df = pd.DataFrame(columns=unique_upos_values)
    # matrix columns construction
    for upos_value in unique_upos_values:
        if upos_value == "les_noms_des_tags":
            new_df[upos_value] = unique_upos_values
        else:
            new_df[upos_value] = 0.0
    new_df.iloc[0, 0] = '<s>'

    # Count the number of sentences
    num_points = df['FORM'].str.count('\.').sum()

    for col in new_df.columns[1:]:  # Exclude the first column 'les_noms_des_tags'
        count = 0
        if col == df.iloc[0,1]:
            count += 1
            num_points +=1
        for i in range(len(df) - 1):
            if df['FORM'][i] == "." and df['UPOS'][i + 1] == col:
                count += 1
            
        new_df.loc[0, col] = count / num_points
    
    columns= new_df.columns[1:]
    
    for i in range(1,len(columns)+1):
        for j in range(0,len(columns)):
            count = 0
            total_occurrences = df['UPOS'].value_counts()[columns[j]]
            if total_occurrences == 0:  # Handle division by zero
                new_df.iloc[j, i] = 0

            else:
                for k in range(len(df)-1):
                    if df['UPOS'][k] == columns[j] and df['UPOS'][k+1] == columns[i-1]:
                        count += 1
                new_df.iloc[j+1, i] = count / total_occurrences
               
 
    print(new_df) 
            
    return new_df


  num_points = df['FORM'].str.count('\.').sum()


#### La méthode emission_probability()

In [36]:
def emission_probability(train_file):
  """
  Calculates emission probabilities for each word-tag pair in a sentence.

  Args:
      train_file: CSV file containing the training data.

  Returns:
      pd.DataFrame: A DataFrame holding the emission probabilities for the corpus.
  """

  df = pd.read_csv(train_file)
  unique_words = df['FORM'].unique()
  unique_word = np.concatenate([["les_noms_des_tags"], unique_words])
  unique_upos_values = df['UPOS'].unique()

  emission_df = pd.DataFrame(columns=unique_word)
  emission_df = emission_df[['les_noms_des_tags'] + list(unique_words)]
  emission_df["les_noms_des_tags"] = unique_upos_values
  emission_df.infer_objects(copy=False)
  emission_df.fillna(0, inplace=True)
  tags = {key: 0 for key in emission_df["les_noms_des_tags"].tolist()}

  for i in range(len(df)):
      tag = df['UPOS'][i]
      token = df['FORM'][i]
      tags[tag] += 1
      emission_df.loc[emission_df['les_noms_des_tags'] == tag, token] += 1
  for i, tag in enumerate(unique_upos_values):
      emission_df.iloc[i, 1:] = emission_df.iloc[i, 1:] / tags[tag]
  return emission_df

#### La méthode viterbi()

Cette méthode implémente l'algorithme de Viterbi

In [124]:
def viterbi(sentence,em_m,tr_m):
    sentence=sentence.split()
    sentence.insert(0,'<s>')
    tags=[]    
    vit_m=pd.DataFrame(columns=sentence[::])  # viterbi matrix
    # vit_m.insert(0, 'les_noms_des_tags', tr_m.columns[1:])      # inserting a column of tags 
    row = tr_m.loc[tr_m['les_noms_des_tags'] == '<s>']          # getting the raw where 'les_noms_des_tags'is '<s>' from emition matrix                        
    vit_m.insert(0, 'les_noms_des_tags', tr_m.columns[1:])      # inserting a column of tags
    vit_m['<s>']=row.values.tolist()[0][1:]                    # getting values of tags /<s>
    sentence=sentence[1:]                          # removing <s> and </s>
    tags.append('<s>')
    for j in range(1,len(sentence)+1):
        i=0
        found=False
        for c in em_m.columns:                                  # loop on emition columns that are 'words'
            if sentence[j-1]==c:
                found=True
                em=em_m[['les_noms_des_tags',sentence[j-1]]]        # getting the column of the current word from sentence
                row = tr_m.loc[tr_m['les_noms_des_tags'] ==tags[j-1] ]      # getting the row where the tag is the previos tag from sentence (already saved in tags)
                for c2 in row.columns[1:]:  
                    word_tag_proba=em.loc[em['les_noms_des_tags']==c2].values.tolist()[0][1]        # probability of current_word/tag 
                    tag_from_em=em.loc[em['les_noms_des_tags']==c2].values.tolist()[0][0]           # the tag
                    max_previous_column=max(vit_m.iloc[:,j])
                    if c2==tag_from_em:
                        new_value= row[c2].values.tolist()[0]*word_tag_proba*max_previous_column
                        row_index = np.where(vit_m['les_noms_des_tags'] == c2)[0][0]    # index of current row
                        vit_m.iloc[row_index, j+1] = new_value  # in the vit matrix we multipy the probs of current_word/tag with tag/tag
                max_value_row_index=np.where(vit_m.iloc[:,j+1] ==max(vit_m.iloc[:,j+1] ))[0][0]
                tags.append(vit_m.iloc[max_value_row_index,0])
                i+=1
                break
            
        if found==False:
            for c2 in row.columns[1:]:
                max_previous_column=max(vit_m.iloc[:,j]) 
                new_value= row[c2].values.tolist()[0]*max_previous_column
                row_index = np.where(vit_m['les_noms_des_tags'] == c2)[0][0]    # index of current row
                vit_m.iloc[row_index, j+1] = new_value  # in the vit matrix we multipy the probs of current_word/tag with tag/tag
                max_value_row_index=np.where(vit_m.iloc[:,j+1] ==max(vit_m.iloc[:,j+1] ))[0][0]
            tags.append(vit_m.iloc[max_value_row_index,0])
            i+=1
    return tags[1:]


#### La méthode test_viterbi()

Cette méthode sert à tester l'algorithme de Viterbi sur les données de validation premièrement et par quite sur les données de test

In [131]:
def test_viterbi(test_file,em_m,tr_m):
    data = pd.read_csv(test_file)
    tags=data['UPOS'].tolist()
    tags = [item for item in tags if item != 'YPFOR']
    words = data['FORM']
    words = " ".join(words).lower()
    count=0
    sentences=words.split(" . ")
    pos=[]
    for sentence in sentences:   
        pos.append(viterbi(sentence,em_m=em_m,tr_m=tr_m))
    pos=[item for sublist in pos for item in sublist]
    for i in range(len(tags)):
            if pos[i]==tags[i]:
                count+=1
    return count/len(tags)
    

In [132]:

em_m=pd.read_csv('generated_files/part3/emission_sample.csv')
tr_m=pd.read_csv('generated_files/part3/transition_sample.csv')

print(test_viterbi('data/part3/train_sample.csv',em_m=em_m,tr_m=tr_m))

0.7118126272912424


#### Veuillez noter, que le modèle a été entraîné uniquement sur les données de développement, car nous n'avons pas pu construire la matrice d'émission pour les données d'entraînement. Cela a pris beaucoup de temps. Nous avons ensuite testé le modèle sur les données de test, où il a obtenu un score de 47%. Ceci est logique car il n'a pas été entraîné sur un grand ensemble de données.