In [None]:
import spacy
import re
from spacy.tokens import Doc
import json
# Reference annotations read from the Sequoia test file. The file is opened
# in a context manager so the handle is closed right after parsing, and the
# encoding is explicit so accented French tokens decode the same way on
# every platform.
with open("sequoia.test.json", "r", encoding="utf-8") as gold_file:
    goalpos = json.load(gold_file)

# Load spaCy's small French pipeline.
nlp = spacy.load("fr_core_news_sm")

# Function provided by the teacher for the exercise
def predict_pos(sentence, model):
    """Tag a whitespace-tokenized sentence with part-of-speech labels.

    The pipeline's tokenizer is replaced by one that only splits the text on
    whitespace, so the tokens are exactly ``sentence.split()``. The
    replacement is assigned to ``model.tokenizer`` and stays in place after
    the call returns.

    Args:
        sentence (str): Sentence to tag; tokens must be separated by spaces.
        model (spacy.language.Language): Loaded spaCy pipeline (this notebook
            uses ``fr_core_news_sm``).

    Returns:
        list[str]: The ``pos_`` label of every token, in sentence order.

    Example:
        >>> predict_pos("Je mange une pomme", nlp)
        ['PRON', 'VERB', 'DET', 'NOUN']
    """
    def whitespace_tokenizer(text):
        # Build the Doc directly from the space-separated words.
        return Doc(model.vocab, text.split())

    model.tokenizer = whitespace_tokenizer

    tags = []
    for token in model(sentence):
        tags.append(token.pos_)
    return tags
    

# Quick manual check of predict_pos on a single French sentence.
print("Test function predict_pos")
sentence = "Je mange une pomme"
pos_tags = predict_pos(sentence, nlp)
print("Phrase :", sentence)
print("Tags prédits :", pos_tags)

# Answer to question 3 of the lab assignment (TP)
def predict_pos_list(list_sentence, model):
    """Tag several sentences and return one list of POS labels per sentence.

    ``list_sentence`` can take two forms:

    * a single string containing several sentences: it is split on runs of
      ``.``, ``!`` or ``?`` (the punctuation itself is discarded) and empty
      pieces are dropped before tagging;
    * an iterable of already-segmented sentences: each one is stripped and
      tagged as-is, so the result has exactly one entry per input sentence
      and stays aligned with the input.

    Args:
        list_sentence (str | Iterable[str]): Text to segment, or sentences
            that are already segmented.
        model (spacy.language.Language): Loaded spaCy pipeline, passed on to
            ``predict_pos``.

    Returns:
        list[list[str]]: One list of POS labels per sentence.
    """
    if isinstance(list_sentence, str):
        pieces = (piece.strip() for piece in re.split(r'[.!?]+', list_sentence))
        # Splitting leaves empty pieces (e.g. after the final punctuation).
        sentences = [piece for piece in pieces if piece != ""]
    else:
        # Already segmented: keep every entry to preserve the alignment.
        sentences = [piece.strip() for piece in list_sentence]
    return [predict_pos(piece, model) for piece in sentences]

# Quick manual check of predict_pos_list on a two-sentence string; the
# sentence-final "." and "!" are removed by the split before tagging.
print("Test function predict_pos_list:")

texte = "Je mange une pomme. Tu bois du café !"
print(predict_pos_list(texte, nlp))


def sentence_accuracy(poslist, dico):
    """Compare the predicted POS tags of one sentence with its gold tags.

    Tags are compared position by position. The sentence counts as correct
    only when the prediction has the same length as the gold sequence and
    every tag matches; a prediction with surplus tags is therefore never
    reported as perfect.

    Args:
        poslist (list[str]): POS labels predicted by the tagger
            (e.g. ``['PRON', 'VERB', 'DET', 'NOUN']``).
        dico (dict): Gold sentence of the form
            ``{'tokens': [...], 'pos': [...]}``; only ``'pos'`` is read.

    Returns:
        tuple[list[int], int]:
            - vector: one entry per predicted tag, ``1`` when it equals the
              gold tag at the same position, ``0`` otherwise (including
              predicted tags that have no gold counterpart).
            - accuracy: ``1`` if the whole sentence is tagged correctly,
              else ``0``.

    Example:
        >>> gold = {'tokens': ['cela', 'signifie', 'que'],
        ...         'pos': ['PRON', 'VERB', 'SCONJ']}
        >>> sentence_accuracy(['PRON', 'VERB', 'ADV'], gold)
        ([1, 1, 0], 0)
    """
    goldpos = dico.get("pos")
    # Compare the positions that both sequences share.
    vector = [
        1 if predicted == gold else 0
        for predicted, gold in zip(poslist, goldpos)
    ]
    # Predicted tags beyond the end of the gold sequence are errors.
    vector.extend([0] * (len(poslist) - len(goldpos)))

    same_length = len(poslist) == len(goldpos)
    accuracy = 1 if same_length and all(vector) else 0
    return vector, accuracy
# Quick manual check of sentence_accuracy: the third predicted tag (ADV)
# differs from the gold tag (SCONJ).
print("Test function  sentence_accuracy:")
pos_pred = ['PRON', 'VERB', 'ADV']
gold_data = {'tokens': ['cela', 'signifie', 'que'],'pos': ['PRON', 'VERB', 'SCONJ']}
print(sentence_accuracy(pos_pred, gold_data))

def micro_word_accuracy(poslist, dico):
    """Count the correctly predicted POS tags of one sentence.

    The count is the sum of the binary match vector produced by
    ``sentence_accuracy()`` for the same arguments.

    Parameters
    ----------
    poslist : list[str]
        Predicted POS tags (e.g. ['PRON', 'VERB', 'DET', 'NOUN']).
    dico : dict
        Gold sentence of the form {"tokens": [...], "pos": [...]}.

    Returns
    -------
    int
        Number of positions where the predicted tag equals the gold tag.

    Example
    -------
    >>> pos_pred = ['PRON', 'VERB', 'ADV']
    >>> gold_data = {'tokens': ['cela', 'signifie', 'que'],
    ...              'pos': ['PRON', 'VERB', 'SCONJ']}
    >>> micro_word_accuracy(pos_pred, gold_data)
    2
    """
    match_flags, _ = sentence_accuracy(poslist, dico)
    correct = 0
    for flag in match_flags:
        correct += flag
    return int(correct)


def macro_word_accuracy(poslist, dico, dico_poslabels):
    """Accumulate, per gold POS label, how many tags were predicted correctly.

    Every label found in the gold sentence gets an entry in
    ``dico_poslabels`` (created at 0 if missing); the entry is incremented
    each time the prediction at that position matches, according to the
    vector returned by ``sentence_accuracy()``. Gold positions that have no
    prediction only make sure the label's entry exists.

    Parameters
    ----------
    poslist : list[str]
        Predicted POS tags for one sentence.
    dico : dict
        Gold sentence of the form {"tokens": [...], "pos": [...]}.
    dico_poslabels : dict
        Running counts of correct predictions per label. Updated in place
        and also returned.

    Returns
    -------
    dict
        The same ``dico_poslabels`` object after the update.

    Example
    -------
    >>> pos_pred = ['PRON', 'VERB', 'ADV']
    >>> gold_data = {'tokens': ['cela', 'signifie', 'que'],
    ...              'pos': ['PRON', 'VERB', 'SCONJ']}
    >>> macro_word_accuracy(pos_pred, gold_data, {})
    {'PRON': 1, 'VERB': 1, 'SCONJ': 0}
    """
    match_flags = sentence_accuracy(poslist, dico)[0]
    for index, label in enumerate(dico.get("pos")):
        # Register the label even when it was never tagged correctly.
        dico_poslabels.setdefault(label, 0)
        if index < len(match_flags) and match_flags[index] == 1:
            dico_poslabels[label] = dico_poslabels[label] + 1
    return dico_poslabels
        

def postagger_accuracy(poslist_list, dico):
    """Evaluate a POS tagger on a corpus at sentence, word and label level.

    Each predicted sentence ``poslist_list[i]`` is scored against the gold
    sentence ``dico[i]`` with ``sentence_accuracy()``,
    ``micro_word_accuracy()`` and ``macro_word_accuracy()``. Gold sentences
    beyond ``len(poslist_list)`` are not visited; having more predictions
    than gold sentences raises ``IndexError``.

    Args
    ----------
    poslist_list : list[list[str]]
        Predicted POS tags, one list per sentence.
    dico : list[dict]
        Gold corpus, one ``{'tokens': [...], 'pos': [...]}`` dict per
        sentence.

    Returns
    -------
    tuple
        (perfect_sentences, word_accuracy_percentage, correct_per_label)
        - perfect_sentences (int): number of sentences tagged without error
        - word_accuracy_percentage (float): correct tags divided by the
          number of gold tags of the visited sentences, times 100; ``0.0``
          when there are no gold tags at all
        - correct_per_label (dict): number of correct predictions per gold
          label (a count, not a rate)

    Example
    -------
    >>> pos_pred = [
    ...     ['PRON', 'VERB', 'DET', 'NOUN'],
    ...     ['PRON', 'AUX', 'VERB', 'NOUN']
    ... ]
    >>> gold_data = [
    ...     {'tokens': ['Je', 'mange', 'une', 'pomme'], 'pos': ['PRON', 'VERB', 'DET', 'NOUN']},
    ...     {'tokens': ['Tu', 'as', 'mangé', 'pain'], 'pos': ['PRON', 'AUX', 'VERB', 'NOUN']}
    ... ]
    >>> postagger_accuracy(pos_pred, gold_data)
    (2, 100.0, {'PRON': 2, 'VERB': 2, 'DET': 1, 'NOUN': 2, 'AUX': 1})
    """
    perfect_sentences = 0
    correct_tags = 0
    total_gold_tags = 0
    correctpos = {}

    for index, predicted in enumerate(poslist_list):
        gold_sentence = dico[index]
        perfect_sentences += sentence_accuracy(predicted, gold_sentence)[1]
        correct_tags += micro_word_accuracy(predicted, gold_sentence)
        total_gold_tags += len(gold_sentence.get("pos"))
        # Adds missing labels and bumps the counters of correct ones.
        correctpos = macro_word_accuracy(predicted, gold_sentence, correctpos)

    # Nothing to score (e.g. empty corpus): avoid dividing by zero.
    if total_gold_tags == 0:
        return perfect_sentences, 0.0, correctpos

    proportion = (correct_tags / total_gold_tags) * 100
    return perfect_sentences, proportion, correctpos
"""The Three following lines aren't usefull they are here to test postagger_accuracy quickly on the following arguments """
# Two fully correct sentences, handy for trying postagger_accuracy by hand.
pos_pred = [
    ['PRON', 'VERB', 'DET', 'NOUN'],
    ['PRON', 'AUX', 'VERB', 'NOUN'],
]
gold_data = [
    {
        'tokens': ['Je', 'mange', 'une', 'pomme'],
        'pos': ['PRON', 'VERB', 'DET', 'NOUN'],
    },
    {
        'tokens': ['Tu', 'as', 'mangé', 'pain'],
        'pos': ['PRON', 'AUX', 'VERB', 'NOUN'],
    },
]
# postagger_accuracy(pos_pred, gold_data)
        
def word_error_rate(poslist_list, dico):
    """Compute the word error rate (WER) of a POS tagger on a corpus.

    The rate is the share of gold tags that were not predicted correctly,
    expressed as a percentage and truncated to an integer. Each predicted
    sentence ``poslist_list[i]`` is scored against ``dico[i]`` with
    ``micro_word_accuracy()``; gold sentences beyond ``len(poslist_list)``
    are not visited.

    Parameters
    ----------
    poslist_list : list[list[str]]
        Predicted POS tags, one list per sentence.
    dico : list[dict]
        Gold corpus, one ``{'tokens': [...], 'pos': [...]}`` dict per
        sentence.

    Returns
    -------
    int
        Percentage of incorrect tags, truncated toward zero. ``0`` when
        there are no gold tags at all.

    Example
    -------
    >>> pos_pred = [
    ...     ['PRON', 'VERB', 'DET', 'NOUN'],
    ...     ['PRON', 'AUX', 'VERB', 'NOUN']
    ... ]
    >>> gold_data = [
    ...     {'tokens': ['Je', 'mange', 'une', 'pomme'], 'pos': ['PRON', 'VERB', 'DET', 'NOUN']},
    ...     {'tokens': ['Tu', 'as', 'mangé', 'pain'], 'pos': ['PRON', 'AUX', 'VERB', 'NOUN']}
    ... ]
    >>> word_error_rate(pos_pred, gold_data)
    0
    """
    correct_tags = 0
    total_gold_tags = 0
    for index, predicted in enumerate(poslist_list):
        gold_sentence = dico[index]
        correct_tags += micro_word_accuracy(predicted, gold_sentence)
        total_gold_tags += len(gold_sentence.get("pos"))

    # Nothing to score (e.g. empty corpus): no errors, and no division by zero.
    if total_gold_tags == 0:
        return 0

    error_percentage = ((total_gold_tags - correct_tags) / total_gold_tags) * 100
    return int(error_percentage)

def corpus_stats(filepath):
    """Count the sentences and tokens of a JSON corpus and print both totals.

    The file must contain a list of sentence dictionaries. The number of
    sentences is the length of that list; the number of words is the total
    length of the ``'tokens'`` lists. Other keys (such as ``'pos'``) are
    ignored, so extra metadata in an entry does not distort the count.
    Entries without a ``'tokens'`` key contribute zero words.

    Args:
        filepath (str): Path to the JSON corpus file.

    Returns:
        tuple[int, int]: ``(number_of_sentences, number_of_words)``.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        corpus = json.load(f)

    # One list entry per sentence.
    nb_lignes = len(corpus)
    nb_mots = sum(len(entry.get("tokens", [])) for entry in corpus)

    print(f"Nombre de phrases : {nb_lignes}")
    print(f"Nombre total de mots : {nb_mots}")

    return nb_lignes, nb_mots

def corpus_tokens_only(filepath):
    """Return the whole corpus as one space-separated string of tokens.

    The file must contain a list of sentence dictionaries. The tokens of
    each sentence are joined with single spaces, and the sentences are then
    joined with a single space as well, so the last token of one sentence is
    never glued to the first token of the next. Entries without a
    ``'tokens'`` key are skipped.

    Args:
        filepath (str): Path to the JSON corpus file.

    Returns:
        str: All tokens of the corpus, in order, separated by spaces.

    Example:
        >>> text = corpus_tokens_only("data/sequoia_sample.json")
        >>> print(text[:40])
        Je mange une pomme . Tu adores le chocol
    """
    with open(filepath, "r", encoding="utf-8") as f:
        corpus = json.load(f)

    sentences = [
        " ".join(entry["tokens"]) for entry in corpus if "tokens" in entry
    ]
    return " ".join(sentences)
       
                
# Whole corpus as a single string of tokens (kept for inspection).
corpusb = corpus_tokens_only("sequoia.test.json")
# Tag every gold sentence on its own tokens. Re-segmenting the concatenated
# corpus on ".", "!" and "?" would drop the punctuation tokens and move the
# sentence boundaries, so the predictions would no longer line up with the
# gold tags and the error rate would be inflated.
# NOTE(review): assumes no gold token contains whitespace, since predict_pos
# splits on spaces; confirm against the corpus.
lpos = [predict_pos(" ".join(entry["tokens"]), nlp) for entry in goalpos]
print("Test de WER sur le courpus Sequoia:")
print(word_error_rate(lpos, goalpos))

Test function predict_pos
Phrase : Je mange une pomme
Tags prédits : ['PRON', 'VERB', 'DET', 'NOUN']
Test function predict_pos_list:
[['PRON', 'VERB', 'DET', 'NOUN'], ['DET', 'NOUN', 'ADP', 'NOUN']]
Test function  sentence_accuracy:
([1, 1, 0], 0)
 test de WER sur le courpus sequoia:
77
