### Imports

In [28]:
from pathlib import Path
from sqlutils import sqlutils
import pandas as pd
import string
import matplotlib.pyplot as plt
import requests
import spacy
import nltk
import numpy as np
from sklearn.decomposition import PCA
import plotly.express as px

### Récupération des avis par restaurants

In [29]:
# Déterminer le chemin du script
script_path = Path().resolve()

# Select des avis depuis la base de données et depuis le chemin du script
bdd = sqlutils(script_path / "../../app/data/friands.db")
query = f"SELECT avis.id_restaurant, avis.contenu_avis, restaurants.nom FROM avis JOIN restaurants ON avis.id_restaurant = restaurants.id_restaurant"
success, t_avis = bdd.select(query)

if not success:
    print("Erreur lors de l'extraction des avis depuis la base de données")
    print(t_avis)
else:
    print(
        f"Extraction de {len(t_avis)} enregistrements depuis la base de données réussie"
    )

# # Insérer les champs extraits de la base de données dans un dataframe
df = pd.DataFrame(
    t_avis,
    columns=[
        "id_restaurant",
        "contenu_avis",
        "nom",
    ],
)

Extraction de 4723 enregistrements depuis la base de données réussie


### Fonctions de traitement du texte

In [30]:
# Table de transcription de la ponctuation et des chiffres
translation_table = str.maketrans(
    {char: " " for char in string.punctuation + string.digits}
)


def clean_text(text, stopwords=set()) -> str:
    """
    Clean the given text by converting to lowercase, removing ponctuation and numbers, and removing specified stopwords.

    Args:
        text (str): The input text to be cleaned.

    Returns:
        str: The cleaned text.
    """
    text = text.lower().translate(translation_table)
    return " ".join(mot for mot in text.split() if mot not in stopwords)

In [31]:
# Récupération d'une liste de mots vides
url = "https://raw.githubusercontent.com/stopwords-iso/stopwords-fr/master/stopwords-fr.txt"
response = requests.get(url)
if response.status_code == 200:
    mots_vides_github = response.text.splitlines()
else:
    print("Erreur lors de la récupération des mots vides depuis l'URL")

# Nettoyage des mots vides :
#     - suppression des caractères diacritiques
#     - conversion en minuscules
#     - suppression des chiffres
mots_vides_github = [clean_text(mot) for mot in mots_vides_github]

### Nettoyer les avis

In [32]:
# Application de la fonction de nettoyage sur le contenu des avis
df["avis_clean"] = df["contenu_avis"].apply(lambda x: clean_text(x, mots_vides_github))

### Lemmatisation

In [33]:
# Chargement du modèle français
nlp = spacy.load("fr_core_news_sm")


# Fonction de lemmatisation
def lemmatize_text(text, nlp):
    # Lemmatisation
    doc = nlp(text)
    lemmatized = [token.lemma_ for token in doc]

    return " ".join(lemmatized)

In [34]:
# Application de la fonction de lemmatisation sur le contenu des avis
df[f"avis_lemmatized"] = df["avis_clean"].apply(lambda avis: lemmatize_text(avis, nlp))

### Retirer les stop words en français

In [35]:
# retirer les stop words avec nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

stop_words = stopwords.words("french")


# Fonction pour retirer les stop words
def remove_stopwords(text, stop_words):
    return " ".join(mot for mot in text.split() if mot not in stop_words)


# Application de la fonction de suppression des stop words sur le contenu des avis
df["avis_no_stopwords"] = df["avis_lemmatized"].apply(
    lambda avis: remove_stopwords(avis, stop_words)
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\beranger\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Word2Vec

In [36]:
import gensim
from gensim.models import Word2Vec


# Convert text data to list of words for each review
corpus_liste = [review.split() for review in df["avis_no_stopwords"]]
model = Word2Vec(corpus_liste, vector_size=3, window=3, min_count=1, epochs=10, seed=1)

# Train Word2Vec model on the corpus
model.build_vocab(corpus_liste)
model.train(corpus_liste, total_examples=model.corpus_count, epochs=model.epochs)

(1262103, 1422060)

In [37]:
# def plot_restaurant_similarities():
#     # Get average vector for each restaurant
#     restaurant_vectors = {}

#     # Calculate average vector for each restaurant's reviews
#     for restaurant in df["nom"].unique():
#         restaurant_reviews = df[df["nom"] == restaurant]["avis_no_stopwords"]
#         vectors = []

#         for review in restaurant_reviews:
#             words = review.split()
#             word_vectors = []
#             for word in words:
#                 if word in model.wv:
#                     word_vectors.append(model.wv[word])
#             if word_vectors:
#                 vectors.append(np.mean(word_vectors, axis=0))

#         if vectors:
#             restaurant_vectors[restaurant] = np.mean(vectors, axis=0)

#     # Create scatter plot
#     plt.figure(figsize=(12, 8))
#     colors = plt.cm.rainbow(np.linspace(0, 1, len(restaurant_vectors)))

#     for (restaurant, vector), color in zip(restaurant_vectors.items(), colors):
#         plt.scatter(vector[0], vector[1], c=[color], label=restaurant, s=100)

#     plt.title("Restaurant Similarity Based on Reviews")
#     plt.xlabel("Dimension 1")
#     plt.ylabel("Dimension 2")
#     plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
#     plt.tight_layout()
#     plt.show()


# # Call the function
# plot_restaurant_similarities()

In [38]:
# def plot_restaurant_similarities(themes_keywords):
#     """
#     Creates a scatter plot of restaurant similarities based on predefined themes

#     Args:
#         themes_keywords (dict): Dictionary where keys are theme names and values are lists of keywords
#     """
#     # Train Word2Vec model with higher dimensions for better representation
#     global model
#     model = Word2Vec(
#         corpus_liste, vector_size=100, window=5, min_count=1, epochs=10, seed=1
#     )
#     model.build_vocab(corpus_liste)
#     model.train(corpus_liste, total_examples=model.corpus_count, epochs=model.epochs)

#     # Get average vector for each restaurant
#     restaurant_vectors = {}

#     # Calculate theme vectors (average of keyword vectors)
#     theme_vectors = {}
#     for theme, keywords in themes_keywords.items():
#         theme_word_vectors = []
#         for keyword in keywords:
#             if keyword in model.wv:
#                 theme_word_vectors.append(model.wv[keyword])
#         if theme_word_vectors:
#             theme_vectors[theme] = np.mean(theme_word_vectors, axis=0)

#     # Calculate average vector for each restaurant's reviews
#     for restaurant in df["nom"].unique():
#         restaurant_reviews = df[df["nom"] == restaurant]["avis_no_stopwords"]
#         theme_scores = np.zeros(len(themes_keywords))

#         for review in restaurant_reviews:
#             words = review.split()
#             for word in words:
#                 if word in model.wv:
#                     # Calculate similarity with each theme
#                     for theme_idx, theme in enumerate(themes_keywords.keys()):
#                         if theme in theme_vectors:
#                             similarity = np.dot(model.wv[word], theme_vectors[theme])
#                             theme_scores[theme_idx] += similarity

#         if np.any(theme_scores):
#             restaurant_vectors[restaurant] = theme_scores / len(restaurant_reviews)

#     # Create scatter plot
#     plt.figure(figsize=(12, 8))
#     colors = plt.cm.rainbow(np.linspace(0, 1, len(restaurant_vectors)))

#     for (restaurant, vector), color in zip(restaurant_vectors.items(), colors):
#         plt.scatter(vector[0], vector[1], c=[color], label=restaurant, s=100)

#     # Set labels based on themes
#     plt.title("Restaurant Similarity Based on Themes")
#     plt.xlabel(list(themes_keywords.keys())[0])
#     plt.ylabel(list(themes_keywords.keys())[1])
#     plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
#     plt.tight_layout()
#     plt.show()


# # Example usage:
# themes = {
#     "Qualité": ["délicieux", "excellent", "savoureux", "bon", "qualité"],
#     "Service": ["service", "accueil", "sympathique", "agréable", "attentif"],
# }

# themes = {
#     "Prix": ["cher", "onereux", "couteux"],
#     "Qualité": ["delicieux", "bon", "savoureux", "qualite"],
# }
# plot_restaurant_similarities(themes)

In [39]:
# themes = {
#     "Qualité des plats": [
#         "Saveur",
#         "fraîcheur",
#         "présentation",
#         "ingrédients",
#         "goût",
#         "cuisson",
#         "assaisonnement",
#         "texture",
#         "originalité",
#         "authenticité",
#         "Arôme",
#         "équilibre",
#         "créativité",
#         "finesse",
#         "gourmandise",
#         "raffinement",
#         "harmonie",
#         "délicatesse",
#         "sophistication",
#         "innovation",
#         "Consistance",
#         "parfum",
#         "épices",
#         "tradition",
#         "modernité",
#         "simplicité",
#         "complexité",
#         "subtilité",
#         "richesse",
#         "légèreté",
#     ],
#     "Prix": [
#         "Abordable",
#         "raisonnable",
#         "économique",
#         "cher",
#         "coûteux",
#         "onéreux",
#         "bon marché",
#         "accessible",
#         "modéré",
#         "excessif",
#         "Valeur",
#         "rapport qualité-prix",
#         "budget",
#         "tarif",
#         "menu",
#         "addition",
#         "facture",
#         "dépense",
#         "économie",
#         "luxe",
#         "Promotion",
#         "réduction",
#         "offre",
#         "prix fixe",
#         "formule",
#         "forfait",
#         "gratuit",
#         "surtaxe",
#         "majoration",
#         "compétitif",
#     ],
#     "Ambiance": [
#         "Convivial",
#         "chaleureux",
#         "intime",
#         "décontracté",
#         "élégant",
#         "raffiné",
#         "cosy",
#         "accueillant",
#         "festif",
#         "romantique",
#         "Décor",
#         "lumière",
#         "musique",
#         "confort",
#         "design",
#         "atmosphère",
#         "ambiance",
#         "cadre",
#         "style",
#         "thème",
#         "Calme",
#         "animé",
#         "bruyant",
#         "serein",
#         "branché",
#         "traditionnel",
#         "moderne",
#         "vintage",
#         "rustique",
#         "sophistiqué",
#     ],
# }

In [40]:
def plot_restaurant_similarities():
    # Get average vector for each restaurant
    restaurant_vectors = {}
    for restaurant in df["nom"].unique():
        words_lists = df[df["nom"] == restaurant]["avis_no_stopwords"]
        vectors = []
        for review in words_lists:
            word_vectors = [
                model.wv[word] for word in review.split() if word in model.wv
            ]
            if word_vectors:
                vectors.append(np.mean(word_vectors, axis=0))
        if vectors:
            restaurant_vectors[restaurant] = np.mean(vectors, axis=0)

    # Reduce vectors to 3D
    names = list(restaurant_vectors.keys())
    vecs = list(restaurant_vectors.values())
    pca = PCA(n_components=3)
    reduced = pca.fit_transform(vecs)

    df_plot = pd.DataFrame(
        {
            "Restaurant": names,
            "Dim1": reduced[:, 0],
            "Dim2": reduced[:, 1],
            "Dim3": reduced[:, 2],
        }
    )

    # Assign colors to each restaurant
    unique_restaurants = df_plot["Restaurant"].unique()
    colors = plt.cm.rainbow(np.linspace(0, 1, len(unique_restaurants)))
    color_map = {
        restaurant: color for restaurant, color in zip(unique_restaurants, colors)
    }
    df_plot["Color"] = df_plot["Restaurant"].map(color_map)

    fig = px.scatter_3d(
        df_plot,
        x="Dim1",
        y="Dim2",
        z="Dim3",
        text="Restaurant",
        color="Restaurant",
        title="Restaurant Similarity (3D)",
    )

    # Update layout for larger size
    fig.update_layout(
        width=1000,  # Width in pixels
        height=800,  # Height in pixels
        scene=dict(aspectmode="cube", aspectratio=dict(x=1, y=1, z=1)),
        margin=dict(l=0, r=0, b=0, t=30),  # Reduce margins
    )

    fig.update_traces(textposition="top center")
    fig.show()


plot_restaurant_similarities()

end_time = time()
print(f"Temps d'exécution : {end_time - start_time:.2f} secondes")

Temps d'exécution : 532.27 secondes


In [41]:
# def plot_restaurant_similarities():
#     # Get average vector for each restaurant
#     restaurant_vectors = {}
#     for restaurant in df["nom"].unique():
#         words_lists = df[df["nom"] == restaurant]["avis_no_stopwords"]
#         vectors = []
#         for review in words_lists:
#             word_vectors = [
#                 model.wv[word] for word in review.split() if word in model.wv
#             ]
#             if word_vectors:
#                 vectors.append(np.mean(word_vectors, axis=0))
#         if vectors:
#             restaurant_vectors[restaurant] = np.mean(vectors, axis=0)

#     # Reduce vectors to 3D
#     names = list(restaurant_vectors.keys())
#     vecs = list(restaurant_vectors.values())
#     pca = PCA(n_components=3)
#     reduced = pca.fit_transform(vecs)

#     df_plot = pd.DataFrame(
#         {
#             "Restaurant": names,
#             "Dim1": reduced[:, 0],
#             "Dim2": reduced[:, 1],
#             "Dim3": reduced[:, 2],
#         }
#     )

#     # Extract top 3 keywords for each dimension
#     components = pca.components_
#     keywords = []
#     for i in range(3):
#         component = components[i]
#         indices = np.argsort(component)[-3:]
#         keywords.append([model.wv.index_to_key[idx] for idx in indices])

#     # Assign colors to each restaurant
#     unique_restaurants = df_plot["Restaurant"].unique()
#     colors = plt.cm.rainbow(np.linspace(0, 1, len(unique_restaurants)))
#     color_map = {
#         restaurant: color for restaurant, color in zip(unique_restaurants, colors)
#     }
#     df_plot["Color"] = df_plot["Restaurant"].map(color_map)

#     fig = px.scatter_3d(
#         df_plot,
#         x="Dim1",
#         y="Dim2",
#         z="Dim3",
#         text="Restaurant",
#         color="Restaurant",
#         title="Restaurant Similarity (3D)",
#     )

#     # Update layout for larger size and add keywords to axis labels
#     fig.update_layout(
#         width=1000,  # Width in pixels
#         height=800,  # Height in pixels
#         scene=dict(
#             aspectmode="cube",
#             aspectratio=dict(x=1, y=1, z=1),
#             xaxis_title=f"Dim1: {', '.join(keywords[0])}",
#             yaxis_title=f"Dim2: {', '.join(keywords[1])}",
#             zaxis_title=f"Dim3: {', '.join(keywords[2])}",
#         ),
#         margin=dict(l=0, r=0, b=0, t=30),  # Reduce margins
#     )

#     fig.update_traces(textposition="top center")
#     fig.show()


# plot_restaurant_similarities()

In [42]:
# def plot_restaurant_similarities_by_keywords(dim1_keyword, dim2_keyword, dim3_keyword):
#     """
#     Creates a 3D scatter plot of restaurant similarities based on similarity to specified keywords

#     Args:
#         dim1_keyword (str): Keyword for X axis
#         dim2_keyword (str): Keyword for Y axis
#         dim3_keyword (str): Keyword for Z axis
#     """
#     # Get average vector for each restaurant and calculate similarity with keywords
#     restaurant_vectors = {}

#     for restaurant in df["nom"].unique():
#         words_lists = df[df["nom"] == restaurant]["avis_no_stopwords"]
#         keyword_scores = np.zeros(3)
#         review_count = 0

#         for review in words_lists:
#             word_vectors = [
#                 model.wv[word] for word in review.split() if word in model.wv
#             ]
#             if word_vectors:
#                 review_vector = np.mean(word_vectors, axis=0)
#                 # Calculate similarity with each keyword
#                 if dim1_keyword in model.wv:
#                     keyword_scores[0] += np.dot(review_vector, model.wv[dim1_keyword])
#                 if dim2_keyword in model.wv:
#                     keyword_scores[1] += np.dot(review_vector, model.wv[dim2_keyword])
#                 if dim3_keyword in model.wv:
#                     keyword_scores[2] += np.dot(review_vector, model.wv[dim3_keyword])
#                 review_count += 1

#         if review_count > 0:
#             restaurant_vectors[restaurant] = keyword_scores / review_count

#     # Create DataFrame for plotting
#     df_plot = pd.DataFrame(
#         {
#             "Restaurant": list(restaurant_vectors.keys()),
#             "Dim1": [v[0] for v in restaurant_vectors.values()],
#             "Dim2": [v[1] for v in restaurant_vectors.values()],
#             "Dim3": [v[2] for v in restaurant_vectors.values()],
#         }
#     )

#     # Create 3D scatter plot
#     fig = px.scatter_3d(
#         df_plot,
#         x="Dim1",
#         y="Dim2",
#         z="Dim3",
#         text="Restaurant",
#         color="Restaurant",
#         title=f"Restaurant Similarity based on keywords: {dim1_keyword}, {dim2_keyword}, {dim3_keyword}",
#     )

#     # Update layout
#     fig.update_layout(
#         width=1000,
#         height=800,
#         scene=dict(
#             aspectmode="cube",
#             aspectratio=dict(x=1, y=1, z=1),
#             xaxis_title=f"Similarity to '{dim1_keyword}'",
#             yaxis_title=f"Similarity to '{dim2_keyword}'",
#             zaxis_title=f"Similarity to '{dim3_keyword}'",
#         ),
#         margin=dict(l=0, r=0, b=0, t=30),
#     )

#     fig.update_traces(textposition="top center")
#     fig.show()


# # Example usage:
# plot_restaurant_similarities_by_keywords("prix", "ambiance", "qualité")

In [43]:
# def plot_restaurant_similarities_by_themes(themes_keywords):
#     """
#     Creates a 3D scatter plot of restaurant similarities based on similarity to sets of keywords

#     Args:
#         themes_keywords (dict): Dictionary where keys are theme names and values are lists of keywords
#     """
#     # Get average vector for each restaurant and calculate similarity with keyword sets
#     restaurant_vectors = {}

#     for restaurant in df["nom"].unique():
#         words_lists = df[df["nom"] == restaurant]["avis_no_stopwords"]
#         keyword_scores = np.zeros(len(themes_keywords))
#         review_count = 0

#         for review in words_lists:
#             word_vectors = [
#                 model.wv[word] for word in review.split() if word in model.wv
#             ]
#             if word_vectors:
#                 review_vector = np.mean(word_vectors, axis=0)

#                 # Calculate similarity with each keyword set
#                 for dim_idx, (theme, keyword_set) in enumerate(themes_keywords.items()):
#                     set_scores = []
#                     for keyword in keyword_set:
#                         if keyword in model.wv:
#                             similarity = np.dot(review_vector, model.wv[keyword])
#                             set_scores.append(similarity)
#                     if set_scores:
#                         keyword_scores[dim_idx] += np.mean(set_scores)
#                 review_count += 1

#         if review_count > 0:
#             restaurant_vectors[restaurant] = keyword_scores / review_count

#     # Create DataFrame for plotting
#     df_plot = pd.DataFrame(
#         {
#             "Restaurant": list(restaurant_vectors.keys()),
#             "Dim1": [v[0] for v in restaurant_vectors.values()],
#             "Dim2": [v[1] for v in restaurant_vectors.values()],
#             "Dim3": [v[2] for v in restaurant_vectors.values()],
#         }
#     )

#     # Create 3D scatter plot
#     fig = px.scatter_3d(
#         df_plot,
#         x="Dim1",
#         y="Dim2",
#         z="Dim3",
#         text="Restaurant",
#         color="Restaurant",
#         title="Restaurant Similarity based on themes",
#     )

#     # Update layout
#     fig.update_layout(
#         width=1000,
#         height=800,
#         scene=dict(
#             aspectmode="cube",
#             aspectratio=dict(x=1, y=1, z=1),
#             xaxis_title=f"Similarity to '{list(themes_keywords.keys())[0]}'",
#             yaxis_title=f"Similarity to '{list(themes_keywords.keys())[1]}'",
#             zaxis_title=f"Similarity to '{list(themes_keywords.keys())[2]}'",
#         ),
#         margin=dict(l=0, r=0, b=0, t=30),
#     )

#     fig.update_traces(textposition="top center")
#     fig.show()


# # Example usage:
# plot_restaurant_similarities_by_themes(themes)