# ***Projetc Hotel*** 

In [None]:
import xml.etree.ElementTree as ET
from collections import defaultdict
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer



# ***Collection KSA***

In [2]:
import re
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Activer ou désactiver l'arrêt des mots et le stemming
stop_words_enabled = True
stemming_enabled = True

# Charger les listes d'arrêt des mots pour différentes langues
stop_words = set(stopwords.words("english"))

# Initialiser le stemmer Snowball pour l'anglais
stemmer = SnowballStemmer("english")

# Charger le contenu du fichier texte
input_file_path = "KSA_DATA4.txt"  # Remplacez par le chemin correct vers votre fichier texte

# Initialiser un dictionnaire pour l'index inversé
inverted_index = defaultdict(list)

# Lire le fichier ligne par ligne
with open(input_file_path, "r", encoding="utf-8") as file:
    for line in file:
        # Séparer la ligne en ses composants en utilisant la première virgule pour l'ID
        parts = line.strip().split(",", 1)  # Séparer uniquement après la première virgule
        
        if len(parts) < 2:  # S'assurer qu'il y a bien au moins un ID et une chaîne pour le nom
            print(f"Line skipped due to incorrect format: {line.strip()}")
            continue  # Ignorer les lignes mal formées
        
        doc_id = parts[0]  # Première partie est l'ID
        remaining_text = parts[1]  # Le reste est le nom, la ville et le type de chambre
        
        # Séparer le reste du texte en "name", "room_type", "customer_reviews" avec une expression régulière plus robuste
        remaining_parts = re.split(r',\s*', remaining_text)
        
        if len(remaining_parts) < 3:  # Vérifier qu'il y a bien 3 éléments restants
            print(f"Line skipped due to missing components: {line.strip()}")
            continue  # Si non, ignorer la ligne
        
        # Gestion dynamique du nombre de composants
        if len(remaining_parts) > 3:
            name = remaining_parts[0]
            room_type = remaining_parts[1]
            customer_reviews = ", ".join(remaining_parts[2:])  # Joindre les éléments supplémentaires en un seul champ
        else:
            name, room_type, customer_reviews = remaining_parts

        # Combiner le contenu de la propriété pour traitement
        document_content = f"{name} {room_type} {customer_reviews}"

        # Prétraitement du texte : suppression de la ponctuation, conversion en minuscules, etc.
        document_content_cleaned = re.sub(r'[^\w\s]', '', document_content)  # Pour conserver la version nettoyée
        document_content_cleaned = document_content_cleaned.lower()

        # Tokenisation du texte brut (d'origine) et du texte nettoyé
        original_tokens = document_content.split()  # Liste des mots dans le texte d'origine
        cleaned_tokens = document_content_cleaned.split()  # Liste des mots nettoyés
        print("Original tokens:", original_tokens)
        print("Cleaned tokens:", cleaned_tokens)

        # Arrêt des mots
        if stop_words_enabled:
            cleaned_tokens = [word for word in cleaned_tokens if word not in stop_words]

        # Stemming
        if stemming_enabled:
            cleaned_tokens = [stemmer.stem(word) for word in cleaned_tokens]
            print("Cleaned tokens après stemming:", cleaned_tokens)
            print("================================================================================================================")

        # Créer l'index inversé en utilisant les positions dans le texte d'origine
        for position, original_token in enumerate(original_tokens, start=1):
            stemmed_original_token = stemmer.stem(original_token.lower())  # Appliquer le stemming sur le token original
            
            # Ajouter la position de chaque occurrence du token dans l'index inversé
            for cleaned_token in cleaned_tokens:
                if stemmed_original_token == cleaned_token:
                    inverted_index[cleaned_token].append((doc_id, position))

# Écrire le résultat dans un fichier texte
output_file_path = "inverted_indexTF_filtered.txt"
with open(output_file_path, "w", encoding="utf-8") as output_file:
    for term, positions in inverted_index.items():
        # Utilisation d'un set pour éviter les doublons dans les positions
        unique_positions = list(set(positions))
        output_file.write(f"{term}, {unique_positions}\n")

print(f"Résultats sauvegardés dans {output_file_path}")


Original tokens: ['Copper', 'Crown', 'Furnished', 'Apartments', 'Deluxe', 'Room', '(2', 'Adults', '+', '1', 'Child)', '169', 'reviews']
Cleaned tokens: ['copper', 'crown', 'furnished', 'apartments', 'deluxe', 'room', '2', 'adults', '1', 'child', '169', 'reviews']
Cleaned tokens après stemming: ['copper', 'crown', 'furnish', 'apart', 'delux', 'room', '2', 'adult', '1', 'child', '169', 'review']
Original tokens: ['Four', 'Points', 'by', 'Sheraton', 'Makkah', 'Al', 'Naseem', 'Superior', 'Twin', 'Room', '9418', 'reviews']
Cleaned tokens: ['four', 'points', 'by', 'sheraton', 'makkah', 'al', 'naseem', 'superior', 'twin', 'room', '9418', 'reviews']
Cleaned tokens après stemming: ['four', 'point', 'sheraton', 'makkah', 'al', 'naseem', 'superior', 'twin', 'room', '9418', 'review']
Original tokens: ['Corp', 'Inn', 'Deira.', 'Grand', 'Room', '397', 'reviews']
Cleaned tokens: ['corp', 'inn', 'deira', 'grand', 'room', '397', 'reviews']
Cleaned tokens après stemming: ['corp', 'inn', 'deira', 'grand'

___
# ***Partie 2 : RUNNING SEARCH ON INDEX***
___

## ***1. Chargement de l'index depuis un fichier***

In [5]:
from collections import defaultdict

def load_index_from_file(file_path):
    """
    Charge l'index positionnel depuis un fichier texte et le structure correctement
    pour une utilisation dans les fonctions de recherche.
    """
    positional_index = defaultdict(lambda: {"postings": defaultdict(list)})

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                # Diviser la ligne en terme et postings
                parts = line.split(', ', 1)  # Diviser une seule fois après la première virgule
                # if len(parts) != 2:
                #     print(f"Ligne ignorée (format incorrect) : {line.strip()}")
                #     continue

                term = parts[0].strip()  # Le terme
                postings_raw = parts[1].strip().strip("[]\n").split('), ')  # Traiter les postings
                
                for post in postings_raw:
                    if post:
                        try:
                            doc_id, position = post.strip("()").split(', ')
                            positional_index[term]["postings"][doc_id].append(int(position))
                        except ValueError:
                            print(f"Post ignoré (format incorrect) : {post}")
                            continue
    except FileNotFoundError:
        print(f"Le fichier {file_path} est introuvable.")
    except Exception as e:
        print(f"Une erreur est survenue : {e}")

    return positional_index


index = load_index_from_file('inverted_indexTF_filtered.txt')

for term, postings in index.items():
    print(f"{term}: {postings}\n")
    print("=======================================================================================================================")


copper: {'postings': defaultdict(<class 'list'>, {"'0'": [1]})}

crown: {'postings': defaultdict(<class 'list'>, {"'139'": [1], "'121'": [1], "'0'": [2], "'12'": [1], "'355'": [2], "'116'": [1], "'27'": [1], "'164'": [1]})}

furnish: {'postings': defaultdict(<class 'list'>, {"'780'": [2], "'802'": [4], "'730'": [2], "'232'": [3], "'614'": [3], "'526'": [3], "'740'": [2], "'502'": [3], "'430'": [3], "'793'": [3], "'611'": [4], "'662'": [3], "'640'": [2], "'594'": [4], "'300'": [4], "'705'": [2], "'386'": [4], "'684'": [2], "'694'": [3], "'663'": [3], "'544'": [6], "'500'": [3], "'498'": [3], "'89'": [3], "'455'": [3], "'219'": [4], "'718'": [3], "'379'": [1], "'511'": [3], "'792'": [3], "'643'": [3], "'543'": [6], "'494'": [3], "'627'": [4], "'686'": [4], "'729'": [4], "'236'": [6], "'709'": [3], "'743'": [4], "'748'": [4], "'690'": [3], "'569'": [4], "'803'": [3], "'414'": [2], "'437'": [3], "'689'": [3], "'410'": [4], "'615'": [3], "'565'": [3], "'752'": [2], "'486'": [5], "'619'": [4

# ***2. Recherche booléenne avec les opérateurs AND, OR, AND NOT , NOT Sur la collection 1***

In [11]:
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
import re

# Initialisation des stop words et du stemmer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_query(query):
    """
    Prétraite une requête en supprimant les stop words et en appliquant le stemming.
    """
    terms = query.lower().split()
    filtered_terms = [stemmer.stem(term) for term in terms if term not in stop_words]
    return filtered_terms

def boolean_search_and(terms, index):
    """
    Recherche booléenne avec l'opérateur AND.
    """
    if not terms:
        return set()
    result_docs = set(index.get(terms[0], {}).get('postings', {}).keys())
    for term in terms[1:]:
        result_docs.intersection_update(index.get(term, {}).get('postings', {}).keys())
    return result_docs

def boolean_search_or(terms, index):
    """
    Recherche booléenne avec l'opérateur OR.
    """
    result_docs = set()
    for term in terms:
        result_docs.update(index.get(term, {}).get('postings', {}).keys())
    return result_docs

def boolean_search_not(all_documents, terms, index):
    """
    Recherche booléenne avec l'opérateur NOT.
    """
    result_docs = set(all_documents)
    for term in terms:
        result_docs.difference_update(index.get(term, {}).get('postings', {}).keys())
    return result_docs

def evaluate_boolean_query(query, index, all_documents):
    """
    Évalue une requête booléenne complexe en respectant les priorités des opérateurs.
    """
    # Prétraiter la requête pour séparer les opérateurs et les termes
    query = query.lower()
    tokens = re.findall(r'\w+|and|or|not|\(|\)', query)
    
    # Appliquer le stemming sur les termes, ignorer les opérateurs et parenthèses
    processed_tokens = []
    for token in tokens:
        if token in {"and", "or", "not", "(", ")"}:
            processed_tokens.append(token)
        elif token not in stop_words:
            processed_tokens.append(stemmer.stem(token))

    # Utiliser une pile pour gérer les priorités et parenthèses
    def evaluate_stack(operands, operators):
        op = operators.pop()
        if op == "and":
            right = operands.pop()
            left = operands.pop()
            operands.append(left & right)
        elif op == "or":
            right = operands.pop()
            left = operands.pop()
            operands.append(left | right)
        elif op == "not":
            right = operands.pop()
            operands.append(all_documents - right)

    operands = []
    operators = []
    precedence = {"not": 3, "and": 2, "or": 1}

    for token in processed_tokens:
        if token == "(":
            operators.append(token)
        elif token == ")":
            while operators and operators[-1] != "(":
                evaluate_stack(operands, operators)
            operators.pop()  # Enlever '('
        elif token in precedence:
            while (operators and operators[-1] != "(" and
                   precedence[operators[-1]] >= precedence[token]):
                evaluate_stack(operands, operators)
            operators.append(token)
        else:
            # Ajouter les documents correspondants au terme dans la pile des opérandes
            operands.append(set(index.get(token, {}).get('postings', {}).keys()))

    # Évaluer le reste des opérateurs
    while operators:
        evaluate_stack(operands, operators)

    return sorted(operands.pop() if operands else [])

def load_index_from_file(file_path):
    """
    Charge un index inversé à partir d'un fichier.
    """
    positional_index = defaultdict(lambda: {"postings": defaultdict(list)})
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                parts = line.split(', ', 1)
                if len(parts) != 2:
                    continue
                term = parts[0].strip()
                postings_raw = parts[1].strip().strip("[]\n").split('), ')
                for post in postings_raw:
                    if post:
                        try:
                            doc_id, position = post.strip("()").split(', ')
                            positional_index[term]["postings"][doc_id].append(int(position))
                        except ValueError:
                            continue
    except FileNotFoundError:
        print(f"Le fichier {file_path} est introuvable.")
    return positional_index

def main():
    """
    Point d'entrée principal pour le programme.
    """
    index = load_index_from_file('inverted_indexTF_filtered.txt')
    all_documents = set(doc_id for term_data in index.values() for doc_id in term_data["postings"].keys())

    print("Options :")
    print("1. Charger les requêtes depuis un fichier")
    print("2. Saisir les requêtes manuellement")
    choice = input("Choisissez une option (1 ou 2) : ").strip()

    if choice == "1":
        query_file = 'queries.2.project.ksa.txt'
        try:
            with open(query_file, 'r', encoding='utf-8') as file:
                queries = [line.strip() for line in file if line.strip()]
            with open('results.boolean.txt', 'w', encoding='utf-8') as output_file:
                for query_id, query in enumerate(queries, start=1):
                    result = evaluate_boolean_query(query, index, all_documents)
                    output_file.write(f"{query_id}, {', '.join(result)}\n")
            print("Résultats écrits dans 'results.boolean.txt'.")
        except FileNotFoundError:
            print(f"Le fichier {query_file} est introuvable.")
    elif choice == "2":
        print("Entrez vos requêtes (une par une) et tapez 'STOP' pour terminer.")
        while True:
            user_query = input("Requête : ").strip()
            if user_query.lower() == "stop":
                break
            result = evaluate_boolean_query(user_query, index, all_documents)
            print(f"Résultats : {', '.join(result)}")
    else:
        print("Choix invalide.")

if __name__ == "__main__":
    main()


# q1     Naseem                ==>   docs :   421 , 1 , 170 , 768 , 772
# q2     gosaibi OR Umm        ==>   docs :   3 , 4
# q3     Makkah AND Naseem     ==>   docs :   1 
# q4     Rawasi                ==>   docs :   23 , 750 
# q5     Classics              ==>   docs :   53 , 312 , 160 , 289 ,16 , 163 ,30 ,441 ,56 , 45 , 155
# q6     Naseem AND NOT Makkah   ===>  docs : '170', '421', '768', '772'


Options :
1. Charger les requêtes depuis un fichier
2. Saisir les requêtes manuellement


Choisissez une option (1 ou 2) :  2


Entrez vos requêtes (une par une) et tapez 'STOP' pour terminer.


Requête :  Makkah AND NOT Naseem


Résultats : '208', '215', '312', '359', '36', '376', '38', '429', '431', '44', '444', '445', '447', '45', '46', '47', '48', '51', '53', '55', '56', '59', '61', '62', '63', '68', '683', '70', '83'


Requête :  Naseem AND NOT Makkah 


Résultats : '170', '421', '768', '772'


Requête :  stop


# ***Proximity search / phrase search on collection 1***

In [13]:
from collections import defaultdict
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import re

# Initialiser le stemmer et les stopwords
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Fonction pour charger l'index depuis un fichier
def load_index_from_file(file_path):
    positional_index = defaultdict(lambda: {"postings": defaultdict(list)})
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                parts = line.split(', ', 1)
                if len(parts) != 2:
                    print(f"Ligne ignorée (format incorrect) : {line.strip()}")
                    continue
                term = parts[0].strip()
                postings_raw = parts[1].strip().strip("[]\n").split('), ')
                for post in postings_raw:
                    if post:
                        try:
                            doc_id, position = post.strip("()").split(', ')
                            positional_index[term]["postings"][doc_id].append(int(position))
                        except ValueError:
                            print(f"Post ignoré (format incorrect) : {post}")
                            continue
    except FileNotFoundError:
        print(f"Le fichier {file_path} est introuvable.")
    except Exception as e:
        print(f"Une erreur est survenue : {e}")
    return positional_index

# Fonction de pré-traitement pour les termes individuels
def preprocess_term(input_data):
    input_data = input_data.lower()
    input_data = re.sub(r'[^\w\s]', '', input_data)
    if input_data in stop_words:
        return None
    return stemmer.stem(input_data)

# Recherche de proximité exacte
def proximity_search_exact(query_terms, proximity_distance, index):
    query_terms = [preprocess_term(term) for term in query_terms]
    query_terms = [term for term in query_terms if term is not None]
    docs_with_proximity = []
    for term in query_terms:
        if term not in index:
            return []
    first_term = query_terms[0]
    for doc_id in index[first_term]['postings']:
        positions_first_term = index[first_term]['postings'][doc_id]
        all_terms_match = True
        for i in range(1, len(query_terms)):
            next_term = query_terms[i]
            if doc_id not in index[next_term]['postings']:
                all_terms_match = False
                break
            positions_next_term = index[next_term]['postings'][doc_id]
            exact_distance_found = False
            for pos1 in positions_first_term:
                for pos2 in positions_next_term:
                    if abs(pos1 - pos2) == proximity_distance:
                        exact_distance_found = True
                        break
                if exact_distance_found:
                    break
            if not exact_distance_found:
                all_terms_match = False
                break
        if all_terms_match:
            docs_with_proximity.append(doc_id)
    return list(set(docs_with_proximity))

# Recherche de phrases avec distance de 1 entre mots consécutifs
def phrase_search(phrase, index):
    terms = phrase.split()
    terms = [preprocess_term(term) for term in terms]
    terms = [term for term in terms if term is not None]
    if not terms or any(term not in index for term in terms):
        return []
    first_term = terms[0]
    candidate_docs = set(index[first_term]['postings'].keys())
    for i in range(1, len(terms)):
        next_term = terms[i]
        candidate_docs = candidate_docs.intersection(set(index[next_term]['postings'].keys()))
        if not candidate_docs:
            return []
    results = []
    for doc_id in candidate_docs:
        positions_first = index[terms[0]]['postings'][doc_id]
        match_found = True
        for i in range(1, len(terms)):
            positions_next = index[terms[i]]['postings'][doc_id]
            match_found = any(pos2 - pos1 == 1 for pos1 in positions_first for pos2 in positions_next)
            if not match_found:
                break
            positions_first = positions_next
        if match_found:
            results.append(doc_id)
    return results

# Recherche combinée
def proximity_search_combined(user_query, index, proximity_distance=1):
    """
    Recherche combinée interprétant une requête avec des opérateurs AND, OR et des phrases exactes.
    """
    terms = re.findall(r'\".*?\"|\S+', user_query)  # Diviser par termes ou phrases entre guillemets
    results = set()

    for term in terms:
        if term.upper() == "AND":
            continue
        elif term.upper() == "OR":
            results.update(next_results)
        elif term.startswith('"') and term.endswith('"'):  # Phrase exacte
            phrase = term.strip('"')
            next_results = set(phrase_search(phrase, index))
            results = results.intersection(next_results) if results else next_results
        else:  # Terme unique
            processed_term = preprocess_term(term)
            if processed_term and processed_term in index:
                next_results = set(index[processed_term]['postings'].keys())
                results = results.intersection(next_results) if results else next_results

    return list(results)

# Fonction pour exécuter une requête depuis un fichier et sauvegarder les résultats
def execute_query_from_file(file_path, index, output_file):
    with open(file_path, 'r', encoding='utf-8') as file:
        queries = file.readlines()
    with open(output_file, 'w', encoding='utf-8') as out_file:
        for query in queries:
            query = query.strip()
            if query:
                print(f"Exécution de la requête : {query}")
                results = proximity_search_combined(query, index)
                out_file.write(f"Résultats pour '{query}': {results}\n")

# Fonction principale pour gérer les options utilisateur
def main():
    index = load_index_from_file('inverted_indexTF_filtered.txt')
    print("Choisissez une option :")
    print("1. Recherche simple")
    print("2. Recherche de proximité exacte")
    print("4. Recherche de phrase exacte")
    print("5. Exécuter des requêtes depuis un fichier .txt")
    choice = input("Votre choix (1/2/4/5) : ").strip()

    if choice == "1":
        user_query = input("Entrez votre requête : ")
        proximity_distance = 1
        result = proximity_search_exact(user_query.split(), proximity_distance, index)
        print(f"Résultats pour la recherche simple : {result}")
    elif choice == "2":
        user_query = input("Entrez une requête de proximité (termes séparés par des espaces) : ")
        proximity_distance = int(input("Entrez la distance EXACTE souhaitée : "))
        result = proximity_search_exact(user_query.split(), proximity_distance, index)
        print(f"Résultats pour la recherche de proximité exacte : {result}")
    elif choice == "4":
        phrase = input("Entrez une phrase exacte (entre guillemets) : ").strip('"')
        result = phrase_search(phrase, index)
        print(f"Résultats pour la recherche de phrase exacte : {result}")
    elif choice == "5":
        file_path = input("Entrez le chemin du fichier de requêtes (.txt) : ").strip()
        output_file = input("Entrez le chemin du fichier de sortie pour les résultats : ").strip()
        execute_query_from_file(file_path, index, output_file)
        print(f"Résultats enregistrés dans '{output_file}'.")
    else:
        print("Option invalide. Veuillez réessayer.")

# Exécuter le programme principal
if __name__ == "__main__":
    main()


# choix 1 :    "Qura Hotel"                                                      ===> docs: 4  
# choix 2 :     #6(Western junior)                                               ===> docs: 5  
# choix 2 :     #1(397,reviews)                                                  ===> docs: 2
# choix 4 :   "Makarem Umm Al Qura Hotel, Deluxe Twin Room,  6398 reviews"       ===> docs: 4
# choix 4 :   "Riyadh Marriott Hotel, Guest Room"                                ===> docs: 14


Choisissez une option :
1. Recherche simple
2. Recherche de proximité exacte
4. Recherche de phrase exacte
5. Exécuter des requêtes depuis un fichier .txt


Votre choix (1/2/4/5) :  4
Entrez une phrase exacte (entre guillemets) :  "Riyadh Marriott Hotel, Guest Room"


Résultats pour la recherche de phrase exacte : ["'14'"]



# ***Build TF-IDF Module***

In [15]:
import math
from collections import defaultdict


def compute_tfidf(term, doc_id, index, N):
    """
    Compute the TF-IDF score for a term in a specific document.
    """
    if term not in index:
        return 0
    
    # Term frequency in document (number of occurrences)
    tf = len(index[term]['postings'][doc_id])
    
    # Document frequency
    df = len(index[term]['postings'])
    
    # Compute TF-IDF
    tfidf = ((1+tf) * math.log(N / df)) if df > 0 else 0
    return tfidf

# exemple
# like: {'postings': defaultdict(<class 'list'>, {"'1'": [1, 3], "'2'": [1], "'3'": [2], "'4'": [2], "'5'": [1]})}



def rank_documents(query, index, N):
    """
    Rank documents for a given query using TF-IDF.
    """
    # Preprocess the query terms
    query_terms = [preprocess_term(term) for term in query.split()]
    query_terms = [term for term in query_terms if term is not None]  # Filter out None

    scores = defaultdict(float)  # Store scores for each document
    
    # Compute scores for all documents containing at least one query term
    for term in query_terms:
        if term in index:
            for doc_id in index[term]['postings']:
                scores[doc_id] += compute_tfidf(term, doc_id, index, N)
    
    # Sort documents by score (descending)
    ranked_docs = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    return ranked_docs


Run Queries

In [17]:
def process_queries(query_file, index, N, output_file):
    """
    Process queries from the query file and write ranked results to output_file.
    """
    with open(query_file, 'r') as qfile, open(output_file, 'w') as ofile:
        for line in qfile:
            query_id, query = line.strip().split(' ', 1)
            ranked_docs = rank_documents(query, index, N)
            
            # Write results to file in the specified format
            for doc_id, score in ranked_docs:
                ofile.write(f"{query_id},{doc_id},{score:.4f}\n")


Step 4: Main Program

In [19]:
# Fonction pour charger l'index depuis un fichier
def load_index_from_file(file_path):
    """
    Charge l'index positionnel depuis un fichier texte et le structure correctement
    pour une utilisation dans les fonctions de recherche.
    """
    positional_index = defaultdict(lambda: {"postings": defaultdict(list)})

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                # Diviser la ligne en terme et postings
                parts = line.split(', ', 1)  # Diviser une seule fois après la première virgule
                if len(parts) != 2:
                    print(f"Ligne ignorée (format incorrect) : {line.strip()}")
                    continue

                term = parts[0].strip()  # Le terme
                postings_raw = parts[1].strip().strip("[]\n").split('), ')  # Traiter les postings
                
                for post in postings_raw:
                    if post:
                        try:
                            doc_id, position = post.strip("()").split(', ')
                            positional_index[term]["postings"][doc_id].append(int(position))
                        except ValueError:
                            print(f"Post ignoré (format incorrect) : {post}")
                            continue
    except FileNotFoundError:
        print(f"")
    except Exception as e:
        print(f"Une erreur est survenue : {e}")

    return positional_index

# Charger l'index depuis le fichier

# ==========================================================================
def load_document_content(doc_id, content_file="KSA_DATA4.txt"):
    """
    Charge le contenu d'un document en fonction de son doc_id à partir d'un fichier.
    """
    try:
        with open(content_file, 'r', encoding='utf-8') as file:
            for line in file:
                parts = line.strip().split(',', 1)  # Diviser doc_id et contenu
                if parts[0] == doc_id:
                    return parts[1]  # Retourner le contenu du document
    except FileNotFoundError:
        print(f"")
    return "Contenu non trouvé."




def display_documents_with_content(ranked_docs, content_file="KSA_DATA4.txt"):
    """
    Affiche les documents recommandés avec leur contenu.
    """
    print("\nDocuments recommandés :")
    for doc_id, score in ranked_docs[:5]:  # Limiter aux 5 premiers documents
        print(f"\nDocument ID: {doc_id} avec un score de {score:.4f}")
        content = load_document_content(doc_id, content_file)



# ===================================================================

if __name__ == "__main__":
    # Charger l'index
    index = load_index_from_file('inverted_indexTF_filtered.txt')
    
    # Nombre total de documents
    N = len(set(doc_id for postings in index.values() for doc_id in postings['postings']))
    
    query_file = 'queries.ranked.txt'  # File containing the 10 queries
    output_file = 'results.ranked.txt'  # Results file
    # Run ranked retrieval
    # process_queries(query_file, index, N)
    process_queries(query_file, index, N, output_file)  
    print(f"Results written to {output_file}")

    
    # Demander à l'utilisateur d'entrer une requête
    user_query = input("Entrez votre requête : ")
    
    
    # Récupérer les documents classés
    ranked_docs = rank_documents(user_query, index, N)

    # Afficher les résultats
    # display_results(user_query, ranked_docs)
    
    # Afficher le contenu des 5 premiers documents recommandés
    display_documents_with_content(ranked_docs)


# une requete pour le test:   Alreem Hotel in jeddah

Results written to results.ranked.txt


Entrez votre requête :  Alreem Hotel in jeddah



Documents recommandés :

Document ID: '29' avec un score de 15.1431

Document ID: '440' avec un score de 7.2407

Document ID: '369' avec un score de 7.2407

Document ID: '289' avec un score de 7.2407

Document ID: '590' avec un score de 7.2407


______________________________________________________
# Hotel Recommender System

## Reading the CSV File

In [21]:
import pandas as pd
df = pd.read_csv('./hotels_saudi.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Name,City,Price,Star_Rating,Property_Demand,Property_id,Customers_Rating,Customers_Review,Type_of_room,...,Canelation,Max_persons,Bed_type,Tax,Review_title,Credit_card,Breakfst_included,Longitude_x,Latitude_y,Link
0,1,Copper Crown Furnished Apartments,Khamis Mushayt,SAR 195,5,Only 2 rooms like this left on our site,5326174,9.0,169 reviews,Deluxe Room (2 Adults + 1 Child),...,FREE cancellation,Max persons: 2,1 bed\n(1 extra-large double),includes taxes and charges,Superb,Reservation possible without a credit card,,42.801402,18.242741,https://www.booking.com/hotel/sa/kwbr-krwn-lls...
1,3,Four Points by Sheraton Makkah Al Naseem,Makkah,SAR 225,5,Only 5 rooms like this left on our site,3889445,8.7,"9,418 reviews",Superior Twin Room,...,FREE cancellation,Max persons: 2,2 beds\n(2 singles),+SAR 23 taxes and charges,Fabulous,,,39.874312,21.38081,https://www.booking.com/hotel/sa/four-points-b...
2,7,Corp Inn Deira.,Riyadh,SAR 312,5,,254592,7.0,397 reviews,Grand Room,...,FREE cancellation,Max persons: 2,1 bed\n(1 large double),+SAR 47 taxes and charges,Good,Reservation possible without a credit card,,46.716504,24.634347,https://www.booking.com/hotel/sa/corp-city-cen...
3,8,Al Gosaibi Hotel,"Al Yarmouk, Al Khobar",SAR 340,5,,352664,8.1,"2,517 reviews",Superior Twin Room,...,FREE cancellation,Max persons: 2,2 beds\n(2 singles),+SAR 35 taxes and charges,Very good,,,50.220731,26.306971,https://www.booking.com/hotel/sa/al-gosaibi.en...
4,9,Makarem Umm Al Qura Hotel,"Ajyad, Makkah",SAR 350,5,,247347,7.7,"6,398 reviews",Deluxe Twin Room,...,,Max persons: 2,2 beds\n(2 singles),+SAR 36 taxes and charges,Good,,,39.830442,21.402693,https://www.booking.com/hotel/sa/umm-alqura-ma...


## Display a summary of a DataFrame

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 804 entries, 0 to 803
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            804 non-null    int64  
 1   Name                  804 non-null    object 
 2   City                  804 non-null    object 
 3   Price                 804 non-null    object 
 4   Star_Rating           804 non-null    int64  
 5   Property_Demand       576 non-null    object 
 6   Property_id           804 non-null    int64  
 7   Customers_Rating      804 non-null    float64
 8   Customers_Review      774 non-null    object 
 9   Type_of_room          804 non-null    object 
 10  reservations_Payment  569 non-null    object 
 11  Canelation            663 non-null    object 
 12  Max_persons           804 non-null    object 
 13  Bed_type              790 non-null    object 
 14  Tax                   804 non-null    object 
 15  Review_title          8

## Data preprocessing

In [27]:
# Specify the columns to keep
columns_to_keep = ['Name', 'City', 'Star_Rating', 'Type_of_room', 'Review_title', 'Longitude_x', 'Latitude_y']

# Keep only the specified columns
df = df[columns_to_keep]

df['Star_Rating'] = df['Star_Rating'].astype('str')
df['Type_of_room'] = df['Type_of_room'].astype('str')
df['Review_title'] = df['Review_title'].astype('str')

# Create the new 'new_features' column by combining the specified columns
df['new_features'] = df[['City', 'Star_Rating', 'Type_of_room','Review_title']].apply(lambda x: ' '.join(x), axis=1)

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

# Ensure you have the NLTK stopwords dataset
#nltk.download('stopwords')

# Download and get the list of stop words from NLTK
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Preprocessing
def process_text(text):
    # Tokenize the text into words
    words = word_tokenize(text)
    # Filter out the stop words
    filtered_words = [stemmer.stem(word) for word in words if word.lower() not in stop_words and word not in string.punctuation]
    # Join the remaining words back into a string
    return ' '.join(filtered_words)


# Apply the process_text function to the 'new_features' column
df['new_features'] = df['new_features'].apply(process_text)

## Applies the TF-IDF (Term Frequency-Inverse Document Frequency) technique to convert text data into a numerical format

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Create an instance of TfidfVectorizer
vectorizer = TfidfVectorizer()

# Transform the 'new_features' column into sparse matrix of TF-IDF values
word_matrix = vectorizer.fit_transform(df['new_features'])


# Convert the sparse matrix to a dense format and create a DataFrame
dense_matrix = word_matrix.toarray()
column_names = vectorizer.get_feature_names_out()

# Create the DataFrame with the proper row and column labels
word_matrix_df = pd.DataFrame(dense_matrix, columns=column_names)
word_matrix_df.iloc[50:60, 50:60]

Unnamed: 0,diplomat,doubl,duplex,economi,except,execut,extern,fabul,fairmont,famili
50,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
52,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.311346,0.0,0.0
53,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54,0.0,0.321322,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
55,0.0,0.274737,0.0,0.0,0.0,0.0,0.0,0.431827,0.0,0.0
56,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
57,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
58,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
59,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.608469,0.0


## Computes and displays the cosine similarity matrix for the documents

In [31]:
from sklearn.metrics.pairwise import cosine_similarity
sim = cosine_similarity(word_matrix)
# Create a DataFrame for the cosine similarity matrix
cosine_sim_df = pd.DataFrame(sim, index=df['new_features'], columns=df['new_features'])

## **get_Recommendation** function is a hotel recommendation system.       
## uses cosine similarity between hotels to recommend similar ones based on (City, Star_Rating)

In [33]:
def get_Recommendation(title, df, sim, count=10):
    # Trouver l'index de l'hôtel correspondant au titre donné
    index = df.index[df['Name'].str.lower() == title.lower()]
    if len(index) == 0:
        return []
    if index[0] >= len(sim):
        return []
    
    # Extraire les similarités pour l'hôtel donné
    similarities = list(enumerate(sim[index[0]]))
    
    # Trier les hôtels en fonction des similarités décroissantes
    recommendations = sorted(similarities, key=lambda x: x[1], reverse=True)
    
    # Exclure l'hôtel actuel (l'index[0] correspond à l'hôtel de l'utilisateur)
    top_recs = [rec for rec in recommendations if rec[0] != index[0]]
    
    # Limiter le nombre de recommandations
    top_recs = top_recs[:count]
    
    titles = []
    
    # Collecter les informations des hôtels recommandés
    for rec in top_recs:
        if rec[0] < len(df):
            title = df.iloc[rec[0]]['Name']
            city = df.iloc[rec[0]]['City']
            rating = df.iloc[rec[0]]['Star_Rating']
            similarity_value = rec[1]  # Similarité entre l'hôtel demandé et l'hôtel recommandé
            titles.append(title)
            titles.append(city)
            titles.append(rating)
            titles.append(f"Similarity: {similarity_value:.4f}")
            titles.append('========================')
    
    return titles

## Save data (including the cosine similarity matrix and hotel information) into JSON files

In [35]:
import json

# Save the similarity matrix as a JSON file
with open('similarity_matrix.json', 'w') as f:
    json.dump(sim.tolist(), f)

# Specify the columns to save
columns_to_save = ['Name', 'City', 'Star_Rating', 'Type_of_room', 'Review_title', 'Longitude_x', 'Latitude_y']

# Select only the required columns
df_to_save = df[columns_to_save]

# Save the data to a JSON file
df_to_save.to_json('hotels_with_additional_info.json', orient='records', lines=False)


# Display the first rows of the similarity matrix
print(sim[:5])  

[[1.         0.03017541 0.02865604 ... 0.         0.09010918 0.21696354]
 [0.03017541 1.         0.04827249 ... 0.         0.22624196 0.        ]
 [0.02865604 0.04827249 1.         ... 0.16344316 0.         0.        ]
 [0.02132752 0.35394998 0.06366297 ... 0.         0.         0.        ]
 [0.12849128 0.41799178 0.08694865 ... 0.         0.14652636 0.        ]]


In [37]:
get_Recommendation(title ='Makarem Ajyad Makkah Hotel',df = df, sim = sim, count = 10)

['Lamar Ajyad Hotel 2 - Tower B',
 ' Ajyad, Makkah',
 '3',
 'Similarity: 0.8597',
 'Al Safwah Royale Orchid Hotel',
 ' Ajyad, Makkah',
 '5',
 'Similarity: 0.8449',
 'Al Shohada Hotel',
 ' Ajyad, Makkah',
 '5',
 'Similarity: 0.7483',
 'Elaf Al Salam',
 ' Ajyad, Makkah',
 '3',
 'Similarity: 0.7483',
 'Lamar Ajyad Hotel',
 ' Ajyad, Makkah',
 '3',
 'Similarity: 0.7483',
 'Rayanat Ajyad Hotel - Makkah',
 ' Ajyad, Makkah',
 '3',
 'Similarity: 0.7483',
 'Al Battal Hotel',
 ' Ajyad, Makkah',
 '2',
 'Similarity: 0.7483',
 'Ruba Al Hijaz Hotel',
 ' Makkah',
 '3',
 'Similarity: 0.7057',
 'Lamar Al Bait Hotel',
 ' Ajyad, Makkah',
 '3',
 'Similarity: 0.6972',
 'Makarem Umm Al Qura Hotel',
 ' Ajyad, Makkah',
 '5',
 'Similarity: 0.6852',