In [1]:
def compare_date_components(day_doc, month_doc, year_doc, day_req, month_req, year_req):
    """Compare two date components, handling None values in the request."""
    # Convert all to int for doc, but req can be None
    if year_req is not None:
        if int(year_doc) < int(year_req):
            return -1
        elif int(year_doc) > int(year_req):
            return 1
    if month_req is not None:
        if int(month_doc) < int(month_req):
            return -1
        elif int(month_doc) > int(month_req):
            return 1
    if day_req is not None:
        if int(day_doc) < int(day_req):
            return -1
        elif int(day_doc) > int(day_req):
            return 1
    return 0

def compare_dates(date_doc, date_requete):
    """
    Compare une date de document à une requête de date, en tenant compte des champs manquants (None).
    """
    # Split the document date into components
    day_doc, month_doc, year_doc = date_doc.split('/')
    day_doc, month_doc, year_doc = int(day_doc), int(month_doc), int(year_doc)

    # If no restriction, always True
    if all(date_requete.get(k) is None for k in ["début", "fin", "précis", "not"]):
        return True

    # Handle "not"
    not_req = date_requete.get("not")
    if not_req is not None:
        jn, mn, an = not_req.get("j"), not_req.get("m"), not_req.get("a")
        match = True
        if an is not None and int(year_doc) != int(an):
            match = False
        if mn is not None and int(month_doc) != int(mn):
            match = False
        if jn is not None and int(day_doc) != int(jn):
            match = False
        if match:
            return False

    # Handle "précis"
    precis = date_requete.get("précis")
    if precis is not None:
        jp, mp, ap = precis.get("j"), precis.get("m"), precis.get("a")
        match = True
        if ap is not None and int(year_doc) != int(ap):
            match = False
        if mp is not None and int(month_doc) != int(mp):
            match = False
        if jp is not None and int(day_doc) != int(jp):
            match = False
        return match

    res = True
    # Handle "début"
    debut = date_requete.get("début")
    if debut is not None:
        jd, md, ad = debut.get("j"), debut.get("m"), debut.get("a")
        cmp = compare_date_components(day_doc, month_doc, year_doc,
                                      int(jd) if jd is not None else None,
                                      int(md) if md is not None else None,
                                      int(ad) if ad is not None else None)
        if cmp < 0:
            res = False

    # Handle "fin"
    fin = date_requete.get("fin")
    if fin is not None:
        jf, mf, af = fin.get("j"), fin.get("m"), fin.get("a")
        cmp = compare_date_components(day_doc, month_doc, year_doc,
                                      int(jf) if jf is not None else None,
                                      int(mf) if mf is not None else None,
                                      int(af) if af is not None else None)
        if cmp > 0:
            res = False

    return res

In [32]:
import pandas as pd
import ast
#rubrique et dates dans others.csv
# mots_clés dans text_title.csv

'''
    resultats = {
        "return": None,
        "mots_cles": {"yes": [], "no": None},
        "operateurs_mots_cles": None,
        "rubrique": None,
        "operateurs_rubrique": None,
        "dates": {"debut": None, "fin": None, "précis": None, "not": None},
        "titre": None,
        "operateurs_titre": None,
        "images": None
    }

'''
def recherche_documents(resultats, index_inverse_texte, index_inverse_date, index_inverse_rubrique, index_inverse_titre, index_inverse_image):
    """
    Fonction qui recherche les documents correspondant, dans lemmes_path, aux résultats de la requete (resultats).
    :param resultats: dictonnaires contenant les résultats de la requête
    :param index_inverse_mots_cles: fichier csv index inversé des mots-clés
    :param index_inverse_dates_rubriques: fichier csv index inversé des dates et rubriques
    :return: Liste de documents correspondant aux résultats de la requête
    """
    docs_cherches = []

    #partie mots-clés (sans les not)
    mots_cles = resultats["mots_cles"]
    if len(mots_cles["yes"]) != 0:
        dict_match_key = {}
        index_texte = pd.read_csv(index_inverse_texte, sep= "\t")
        for mot in mots_cles["yes"]:
            print("mot:", mot)
            # Si le mot contient un espace, on split pour traiter séparément chaque sous-mot
            sous_mots = mot.split() if " " in mot else [mot]
            
            docs_qui_matchs = index_texte.loc[index_texte["mot"] == mot, "docs"].values
            if len(docs_qui_matchs) == 0:
                continue
            else:
                docs_qui_matchs = docs_qui_matchs[0]
                dict_match_key[mot] = docs_qui_matchs
            #print(f"docs_qui_matchs: {docs_qui_matchs}")
        
            
        if len(dict_match_key.keys())== 1:
            docs_key = list(set(e.strip() for e in ast.literal_eval(list(dict_match_key.values())[0])))
        else:
            if resultats["operateurs_mots_cles"] == "ou":
                '''
                for l in dict_match_key.values():
                    #print("l:", l)
                    #print(e.strip() for e in ast.literal_eval(l))
                '''
                docs_key = list(set().union(*(set(e.strip() for e in ast.literal_eval(l)) for l in dict_match_key.values())))
            else:
                '''
                for l in dict_match_key.values():
                    #print("l:", l)
                '''
                docs_key = list(set().intersection(*(set(e.strip() for e in ast.literal_eval(l)) for l in dict_match_key.values())))

        print("docs_key:", docs_key)
        if len(docs_cherches) == 0:
            docs_cherches = docs_key
        else:
            docs_cherches = list(set(e.strip() for e in docs_cherches) & set(e.strip() for e in docs_key))

    print("doc_cherches post mots_clés:", docs_cherches)
    #partie titre
    titres = resultats["titre"]
    if titres != None:
        dict_match_titre = {}
        index_titre = pd.read_csv(index_inverse_titre, sep='\t')
        if not isinstance(titres, list):
            titres = [titres]
        for titre in titres:
            for tr in [" ", '"']:
                while tr in titre:
                    titre = titre.replace(tr, '')
            docs_qui_matchs = index_titre.loc[index_titre["mot"] == titre, "docs"].values[0]
            dict_match_titre[titre] = docs_qui_matchs

        if len(dict_match_titre.keys()) == 1:
            docs_titre = list(set(e.strip() for e in ast.literal_eval(list(dict_match_titre.values())[0])))
        else:
            if resultats["operateurs_titre"] == "ou":
                docs_titre = list(set().union(*(set(e.strip() for e in ast.literal_eval(l)) for l in list(dict_match_titre.values()))))

            else:
                docs_titre = list(set.intersection(*(set(e.strip() for e in ast.literal_eval(l)) for l in dict_match_titre.values())))

        
        if len(docs_cherches) == 0:
            docs_cherches = docs_titre
        else:
            docs_cherches = list(set(e.strip() for e in docs_cherches) & set(e.strip() for e in docs_titre))
    print("doc_cherches post titre:", docs_cherches)

    #partie rubrique
    rubriques = resultats["rubrique"]
    if rubriques != None:
        print(rubriques)
        dict_match_rubrique = {}
        index_rubrique = pd.read_csv(index_inverse_rubrique, sep='\t')
        if not isinstance(rubriques, list):
            liste = [rubriques.strip()]
            rubriques = liste
        for rubrique in rubriques:
            rubrique = rubrique.strip()
            docs_matchs = index_rubrique.loc[index_rubrique["mot"] == rubrique, "docs"].values
            if len(docs_matchs) > 0:
                docs_qui_matchs = docs_matchs[0]
                dict_match_rubrique[rubrique] = docs_qui_matchs
            else:
                print(f"⚠️ Rubrique introuvable : {rubrique}")
            
        if len(dict_match_rubrique.keys()) == 1:
            docs_rubrique = list(set(e.strip() for e in ast.literal_eval(list(dict_match_rubrique.values())[0])))
        else:
            #dict_match_rubrique[rubrique] = docs_qui_matchs
            if resultats["operateurs_rubrique"] == "ou":
                docs_rubrique = list(set().union(*(set(e.strip() for e in ast.literal_eval(l)) for l in list(dict_match_rubrique.values()))))
            else:
                docs_rubrique = list(set().intersection(*(set(e.strip() for e in ast.literal_eval(l)) for l in list(dict_match_rubrique.values()))))
        print("docs_rubrique:", docs_rubrique)
        if len(docs_cherches) == 0:
            docs_cherches = docs_rubrique
        else:
            docs_cherches = list(set(docs_cherches) & set(docs_rubrique))
    print("doc_cherches post rubrique:", docs_cherches)

    #partie images
    image = resultats["images"]
    if image != None:
        index_image = pd.read_csv(index_inverse_image, sep='\t')
        docs_match_image = index_image.loc[index_image["mot"] == "yes", "docs"].values[0]
        if len(docs_cherches) == 0:
            docs_cherches = docs_match_image
        else:
            docs_cherches = list(set(docs_cherches) & set(docs_match_image))
    print("doc_cherches post images:", docs_cherches)

    #partie date
    date = resultats["dates"]
    if date["début"] != None or date["fin"] != None or date["précis"] != None or date["not"] != None:
        index_date = pd.read_csv(index_inverse_date, sep='\t')
        docs_dates = []
        for _, row in index_date.iterrows():
            #print("dates ____________________")
            date_doc_comp = row["mot"]
            
            if compare_dates(date_doc=date_doc_comp, date_requete= date):
                docs_qui_matchs = ast.literal_eval(row["docs"])
                docs_dates = []
                if len(docs_dates) == 0:
                    docs_dates = docs_qui_matchs 
                else:
                    docs_dates = list(set(e.strip() for e in docs_dates) | set(e.strip() for e in docs_qui_matchs))
                #print(docs_dates)
        if docs_dates:
            if len(docs_cherches) == 0:
                docs_cherches = docs_dates
            else:
                docs_cherches = list(set(e.strip() for e in docs_cherches) & set(e.strip() for e in docs_dates))
    print("doc_cherches post dates:", docs_cherches)
    
    #partie not
    mot_cles_not = mots_cles["no"]
    if mot_cles_not != None:
        index_texte = pd.read_csv(index_inverse_texte, sep= "\t")
        docs_qui_matchs = index_texte.loc[index_texte["mot"] == mot_cles_not, "docs"].values[0]
        
        docs_cherches = list(set(e.strip() for e in docs_cherches) - set(e.strip() for e in docs_qui_matchs))

    print("doc_cherches post not:", docs_cherches)

    #partie return_value
    if resultats["return"] != "rubriques":
        return docs_cherches
    else:
        rubriques_cherches = []
        index_rubrique = pd.read_csv(index_inverse_rubrique, sep='\t')
        for _, rubrique in index_rubrique.iterrows():
            for doc in docs_cherches:
                if doc in ast.literal_eval(rubrique["docs"]):
                    rubriques_cherches.append(rubrique["mot"])
        return set(rubriques_cherches)


In [33]:
from td5 import *
from td6 import *

if __name__ == "__main__":
    # extraction des informations de la requete
    requete = input("Entrez votre requête en langage naturel : ")
    print("Requête initiale :", requete)
    print("---------------------------------------------")

    resultat = traiter_requete(requete)

    # correction orthographique des mots clés
    lemmes_path = "lemmes_lower.csv"
    if resultat["mots_cles"]["no"] != None:
        print("mots b4 correction: ",resultat["mots_cles"]["no"])
        resultat["mots_cles"]["no"] = correction_orthographique(resultat["mots_cles"]["no"], lemmes_path)
    
    for i in range(len(resultat["mots_cles"]["yes"])):
        print("mots b4 correction: ", resultat["mots_cles"]["yes"][i])
        resultat["mots_cles"]["yes"][i] = correction_orthographique(resultat["mots_cles"]["yes"][i], lemmes_path)
    
    print("Requête corrigée :")
    print(resultat)
    print("-------------------------------------------")

    index_inverse_texte = "../TD4/reverse_index_texte.csv"
    index_inverse_date = "../TD4/reverse_index_date.csv"
    index_inverse_rubrique = "../TD4/reverse_index_rubrique.csv"
    index_inverse_titre = "../TD4/reverse_index_titre.csv"
    index_inverse_image = "../TD4/reverse_index_image.csv"

    # recherche des documents pertinents
    documents = recherche_documents(resultat, index_inverse_texte, index_inverse_date, index_inverse_rubrique, index_inverse_titre, index_inverse_image)
    print(documents)


Requête initiale : Je voudrais les articles qui parlent d’airbus ou du projet Taxibot.
---------------------------------------------
mots b4 correction:   airbus 
mots b4 correction:  projet taxibot
Requête corrigée :
{'return': 'articles', 'mots_cles': {'yes': ['airbus', 'permettre'], 'no': None}, 'operateurs_mots_cles': 'ou', 'rubrique': None, 'operateurs_rubrique': None, 'dates': {'début': None, 'fin': None, 'précis': None, 'not': None}, 'titre': None, 'operateurs_titre': None, 'images': None}
-------------------------------------------
mot: airbus
mot: permettre
docs_key: ['72394', '72634', '76207', '75065', '73190', '68390', '69534', '75461', '72635', '73691', '67941', '67555', '73880', '70749', '76206', '72396', '68273', '70744', '73185', '72113', '73434', '73875', '67553', '68882', '67068', '70161', '75071', '70914', '74455', '75067', '69811', '72117', '71841', '73186', '75796', '69813', '67554', '72395', '75789', '75063', '72934', '76510', '71361', '72631', '71360', '71357', '6