In [25]:
# Install the packages this notebook imports into the current runtime.
# numpy/scipy/gensim are upgraded together; pandas is pinned to 2.2.2.
# NOTE(review): dash, pyLDAvis, dash-bootstrap-components and compress_fasttext are
# installed without a version pin, so results may change between runs.
!pip install --no-cache-dir --upgrade numpy scipy gensim
!pip install --no-cache-dir pandas==2.2.2
!pip install dash
!pip install pyLDAvis
!pip install dash-bootstrap-components
!pip install compress_fasttext

Collecting numpy
  Downloading numpy-2.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting scipy
  Downloading scipy-1.15.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m


In [1]:
from google.colab import drive
import os

# Mount Google Drive so the data files used by the later cells can be read.
drive.mount('/content/drive')

# Folder that holds the models and CSV files; the later cells open files relative to it.
# Exactly one assignment must be active. Alternative locations, kept for reference:
# folder_path = '/content/drive/MyDrive/ML Applications/recipes/'
# folder_path = '/content/drive/MyDrive/recipes/'
# folder_path = "/content/drive/My Drive/MLA Project/recipes"
folder_path = "/content/drive/My Drive/3º uni/Machine Learning Applications/recipes/"
os.chdir(folder_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from dash import Dash, html, dcc, callback_context, no_update
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
from gensim.models.ldamodel import LdaModel
from gensim.corpora import Dictionary, MmCorpus
import tempfile
import os
import plotly.express as px
import plotly.graph_objects as go
import dash_bootstrap_components as dbc
from dash.dependencies import Input, Output, State, ALL, MATCH
import ast
import numpy as np
import pandas as pd
import uuid

#### Page 1 LDA Topic Modeling

In [3]:
# Load model and data
# The trained LDA model, the bag-of-words corpus and the dictionary are read from
# the current working directory.
ldag = LdaModel.load("lda_recipe_grouped_reviews.gensim")
num_topics = ldag.num_topics
corpus_bow_recipes = MmCorpus("recipes_bow.mm")
D_recipes = Dictionary.load("recipes_dictionary.dict")

# LDA visual
# Prepared pyLDAvis data; it is rendered to HTML in a later cell.
vis_data = gensimvis.prepare(ldag, corpus_bow_recipes, D_recipes)

In [4]:
import pandas as pd
# Tables used to turn corpus positions into recipe titles and cuisines.
# NOTE(review): get_recipe_name_cuisine addresses grouped_reviews_df by corpus position,
# so its row order presumably matches recipes_bow.mm — TODO confirm.
grouped_reviews_df = pd.read_csv("recipe_reviews_grouped.csv")
recipes_df = pd.read_csv('recipe_df.csv')
# Filled by a later cell: topic index -> list of html.P elements for its most relevant recipes.
relevant_allRecipes_html = {}

def get_relevant_topics(topicid, topn=10):
    """Return the corpus positions of the documents most associated with a topic.

    Every document of ``corpus_bow_recipes`` is passed through ``ldag`` and its
    weight for ``topicid`` is read; a topic that is absent from a document's
    distribution counts as weight 0.

    Args:
        topicid: Index of the topic, ``0 <= topicid < ldag.num_topics``.
        topn: Maximum number of document positions to return (default 10).

    Returns:
        A list of at most ``topn`` document positions, ordered from the highest
        to the lowest topic weight. Documents with equal weight keep their
        corpus order.

    Raises:
        IndexError: If ``topicid`` is not a valid topic index.
    """
    # Explicit range check: an unknown id would otherwise score 0 for every
    # document and silently return the first documents of the corpus.
    if not 0 <= topicid < ldag.num_topics:
        raise IndexError(f"topic id {topicid} is out of range (0..{ldag.num_topics - 1})")

    # Compute topic weight for each document and sort by relevance
    sorted_docs = sorted(
        ((i, dict(ldag[doc]).get(topicid, 0)) for i, doc in enumerate(corpus_bow_recipes)),
        key=lambda x: x[1],
        reverse=True
    )

    # Extract top document IDs
    return [doc_id for doc_id, _ in sorted_docs[:topn]]

def get_recipe_name_cuisine(recipe_ids):
    """Map row positions of ``grouped_reviews_df`` to ``(title, cuisine)`` pairs.

    Args:
        recipe_ids: Iterable of integer row positions in ``grouped_reviews_df``.

    Returns:
        A list with one ``(title, cuisine)`` tuple per position, in input order.
        The title is the first ``recipe`` value and the cuisine the first
        ``Cuisine`` value (from ``recipes_df``) whose ``recipe_id`` matches the row.

    Raises:
        IndexError: If a position is out of bounds or no matching row exists.
    """
    def _describe(position):
        # Positional row -> recipe id -> first matching title / cuisine
        rid = grouped_reviews_df.iloc[position]['recipe_id']
        name = grouped_reviews_df.loc[grouped_reviews_df['recipe_id'] == rid, 'recipe'].values[0]
        kind = recipes_df.loc[recipes_df['recipe_id'] == rid, 'Cuisine'].values[0]
        return (name, kind)

    return [_describe(position) for position in recipe_ids]

In [5]:
# Pre-render, for every topic, its most relevant recipes as "<title> - <cuisine>"
# paragraphs so the dropdown callback only has to look the list up.
for topic_idx in range(num_topics):
    top_positions = get_relevant_topics(topic_idx)
    relevant_allRecipes_html[topic_idx] = [
        html.P(f"{title} - {cuisine}", className="card-text")
        for title, cuisine in get_recipe_name_cuisine(top_positions)
    ]

In [6]:
# Render the prepared LDA visualisation directly to an HTML string.
# pyLDAvis.prepared_data_to_html returns the markup, so no temporary file has to be
# created, reopened by name while still open, and deleted afterwards.
lda_vis_markup = pyLDAvis.prepared_data_to_html(vis_data)

# Wrap the markup in a minimal page. The CSS scales the visualisation to 70% and
# enlarges the wrapper by the inverse factor; the result is used as the iframe srcDoc.
html_content = f"""<html><head><style>
                            html, body {{
                                margin: 0;
                                padding: 0;
                                overflow: hidden;
                                height: 100vh;
                            }}
                            .scaled-wrapper {{
                                transform: scale(0.7);
                                transform-origin: top left;
                                width: calc(100% / 0.7);
                                height: calc(100% / 0.7);
                            }}
                        </style></head><body><div class="scaled-wrapper">{lda_vis_markup}</div></body></html>"""

In [7]:
# Layout of the "LDA Topics" tab: the pyLDAvis view on the left, a topic picker and
# the pre-computed list of relevant recipes on the right.
tab1 = dbc.Row(
[
    dbc.Col([
        # The pyLDAvis page is embedded through an iframe (srcDoc holds the full HTML)
        html.Div(children=[
                html.Iframe(
                    srcDoc=html_content,
                    style={"width": "838px", "height": "544px", "border": "none"}
                )
            ], id = "lda_vis", style={
                "display": "inline-block",
                "width": "fit-content"
            })
    ]),
    dbc.Col([
        dbc.Stack(['Relevant recipes for ',
                    dbc.DropdownMenu(
                    label="choose topic",
                    size="sm",
                    # One item per topic, with ids "relevant-topic-1" .. "relevant-topic-<num_topics>"
                    children=[dbc.DropdownMenuItem(f"Topic {i+1}", id=f"relevant-topic-{i+1}", style={"fontSize": "0.7rem"}, n_clicks=0) for i in range(num_topics)],
                    # Dash style dictionaries use camelCased keys, as everywhere else in this file
                    style={"display": "inline-block", "marginRight": "10px", "fontSize": "0.7rem"},
                    id="relevant-recipes-dd"
        )], direction='horizontal', gap=3),
        html.Hr(),
        dbc.Card([
                dbc.CardHeader("Results"),
                # Filled by the dropdown callback with the selected topic's recipes
                dbc.CardBody(id="relevant-recipes-results"),
            ],
            style={"width": "100%"},
            id="relevant-recipes-card"
        )
    ], style={"fontSize": "0.8rem"})
])

#### Page 2 Embedding Space Visualization

Note: this tab of the dashboard takes a while to load, so please be patient and avoid clicking the same button repeatedly.

In [8]:
from gensim.models import KeyedVectors
from sklearn.decomposition import PCA

# Word vectors saved earlier; mmap='r' presumably memory-maps the arrays read-only — TODO confirm.
g_fasttext_wv = KeyedVectors.load("model_fastText_grouped.wordvectors", mmap='r')
# Full vocabulary and its vectors, in the same order.
allWords = list(g_fasttext_wv.index_to_key)
vectors = np.array([g_fasttext_wv[word] for word in allWords])
# Fit a 3-component PCA on the whole vocabulary; the fitted `pca` is reused to
# project other vectors for the 3D plot.
pca = PCA(n_components=3)
reduced = pca.fit_transform(vectors)

In [9]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code; only load files you produced yourself.
with open('grouped_ngrams.pkl', 'rb') as f:
    grouped_ngrams = pickle.load(f)

# Look up a vector for every n-gram and project it with the PCA fitted on the vocabulary.
ngram_vecs = np.array([g_fasttext_wv[ngram] for ngram in grouped_ngrams])
ngram_red = pca.transform(ngram_vecs)

In [10]:
# Layout of the "Embedding Space" tab: the 3D scatter plot on the left and the
# controls (similar-word search, n-gram display, word analogies) on the right.
tab2 = dbc.Row([
    dbc.Col([
        dcc.Graph(
        id='3d-word-vectors',
        figure=go.Figure(),  # Empty placeholder; the figure callback supplies the 3D plot
        style={'height': '80vh'}  # Make sure the plot is sufficiently large
    )
    ], width=8),
    dbc.Col([
        # Similar-word search: text box plus trigger button
        dbc.Card([
            dbc.CardHeader('Look for similar words'),
            dbc.CardBody([
                dbc.InputGroup([
                    dbc.Input(id="textbox-similar-vec", placeholder="Search similar words"),
                    dbc.Button("Find similar", id="btn-similar-vec", n_clicks=0),
                ], size="sm")
            ])
        ], class_name="mb-2"),
        # Button that plots all corpus n-grams
        dbc.Stack([
            'Corpus Ngrams',
            dbc.Button("Show", id="ngram-btn-show", n_clicks=0, size='sm', style={"fontSize": "0.8rem"}, color='success')
        ], class_name="mb-2", direction="horizontal", gap=2),
        # Word analogies: one default input per group, plus inputs added dynamically.
        # The dict ids ('type' + 'group') let the same callbacks serve both groups.
        dbc.Card([
            dbc.CardHeader('Word analogies'),
            dbc.CardBody([
                dbc.Col([
                    'Positive terms',
                    dbc.InputGroup([
                        dbc.Input(id="textbox-word-pos-default", placeholder="Positive word"),
                        dbc.Button("+", id={'type': 'add-button', 'group': 'pos'})
                    ], size="sm", class_name="mt-1"),
                    # Container for the extra inputs and the store holding their ids
                    html.Div(id={'type': 'words-container', 'group': 'pos'}),
                    dcc.Store(id={'type': 'ids', 'group': 'pos'}, data=[])
                ], class_name="mb-2"),
                dbc.Col([
                    'Negative terms',
                    dbc.InputGroup([
                        dbc.Input(id="textbox-word-neg-default", placeholder="Negative word"),
                        dbc.Button("+", id={'type': 'add-button', 'group': 'neg'})
                    ], size="sm", class_name="mt-1"),
                    # Container for the extra inputs and the store holding their ids
                    html.Div(id={'type': 'words-container', 'group': 'neg'}),
                    dcc.Store(id={'type': 'ids', 'group': 'neg'}, data=[])
                ], class_name="mb-2")
            ]),
            dbc.CardFooter([
                    dbc.Button("Compute", id="compute-btn", size='sm', style={"fontSize": "0.8rem"}, color='warning')
            ], style={"display": "flex", "justifyContent": "flex-end"})
        ])
    ], style={"fontSize": "0.8rem"})
])

#### Page 3 Recommender System

In [11]:
# Per-review embeddings. The 'embedding' column is text in the CSV and is parsed
# back into Python objects with ast.literal_eval.
reviews_df = pd.read_csv('recipe_reviews_embeddings.csv')
reviews_df['embedding'] = reviews_df['embedding'].apply(ast.literal_eval)

# Per-recipe embeddings. This table gets its own name on purpose: the topic-page helper
# get_recipe_name_cuisine reads `grouped_reviews_df` by row position, so rebinding that
# name here would make re-running the topic cells use the wrong table.
recipe_embeddings_df = pd.read_csv('recipes_embeddings.csv')
recipe_embeddings_df['embedding'] = recipe_embeddings_df['embedding'].apply(ast.literal_eval)
recipe_embeddings_df = recipe_embeddings_df[['recipe_id', 'recipe', 'filtered_nltk_lemmas', 'embedding']]
# Keep a single row per recipe title
recipe_embeddings_df = recipe_embeddings_df.drop_duplicates(subset='recipe', keep='first')

# Only recipes with at least `min_reviews` reviews take part in the recommender
min_reviews = 5
recipes_min_reviews = reviews_df['recipe_id'].value_counts()
recipes_min_reviews = recipes_min_reviews[recipes_min_reviews >= min_reviews].index
filtered_recipe_df = recipe_embeddings_df[recipe_embeddings_df['recipe_id'].isin(recipes_min_reviews)]

In [12]:
# Quick check of the columns available to the recommender
filtered_recipe_df.columns

Index(['recipe_id', 'recipe', 'filtered_nltk_lemmas', 'embedding'], dtype='object')

In [13]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the recipe embeddings; row and column order
# follow the row order of filtered_recipe_df.
similarity_matrix = cosine_similarity(list(filtered_recipe_df['embedding']))

In [14]:
# NOTE(review): pickle.load can execute arbitrary code; only load files you produced yourself.
with open("Top_50.pkl", 'rb') as f:
    top_50 = pickle.load(f)

# The 15 first entries of the ranking are offered to the user as checklist options
top_15 = top_50.iloc[:15]
most_pop = top_15['Title'].values.tolist()

# Translation between recipe ids and row positions of filtered_recipe_df, which are
# also the row/column positions of similarity_matrix.
recipe_id_to_index = {r_id: idx for idx, r_id in enumerate(filtered_recipe_df['recipe_id'])}
# Built directly from the rows instead of inverting recipe_id_to_index: inverting would
# drop positions whenever a recipe_id occurs on more than one row, and the recommender
# would then fail with KeyError when looking up such a position.
index_to_recipe_id = dict(enumerate(filtered_recipe_df['recipe_id']))

In [15]:
# Layout of the "Recommender System" tab: a checklist of popular recipes on the
# left, a thin vertical divider, and the recommendation list on the right.
tab3 = dbc.Row([
    dbc.Col([
        html.Div("Select some dishes you may like:"),
        html.Hr(),
        # One checkbox per title in most_pop; the title is both label and value
        dbc.Checklist(
            options=[{'label': recipe, 'value': recipe} for recipe in most_pop],
            id='most-pop-list',
            inline=False
        ),
        html.Div(
            dbc.Button("Get recommendations", id='recommend-btn', color='primary', className='mt-2', size='sm'), style={"display": "flex", "justifyContent": "flex-end"}
        )
    ], width=5),
    # Vertical divider drawn with a left border
    dbc.Col([html.Div(style={'borderLeft': '1px solid #ccc', 'height': '100%', 'margin': '0 10px'})], style={"maxWidth": "20px", "padding": "0"}, width="auto"),
    dbc.Col([
        'Based on your selection, we have some recommendations.',
        html.Hr(),
        # Filled by the recommendation callback with html.Li items
        html.Ul(id='recommend-out')
    ])
])

## Main code to deploy the dashboard

In [16]:
# Create Dash app
# suppress_callback_exceptions=True because the tab contents (and therefore most
# callback inputs/outputs) are not part of the initial layout.
app = Dash(external_stylesheets=[dbc.themes.BOOTSTRAP], suppress_callback_exceptions=True)
app.title = "Dashboard"

# tab_id -> pre-built layout shown in the card body
tabs = {'tab-1': tab1, 'tab-2': tab2, 'tab-3': tab3}

# main dashboard layout
app.layout = html.Div([
    dbc.Card([
        dbc.CardHeader(
            dbc.Tabs(
                [
                    dbc.Tab(label="LDA Topics", tab_id="tab-1"),
                    dbc.Tab(label="Embedding Space", tab_id="tab-2"),
                    dbc.Tab(label="Recommender System", tab_id="tab-3"),
                ],
                id="card-tabs",
                active_tab="tab-1",
            )
        ),
        # Replaced by the layout of the active tab
        dbc.CardBody(id="card-content"),
    ])
], style={"width": "100%", "height": "100vh", "fontSize": "0.8rem"})

# callback functions
@app.callback(Output("card-content", "children"), [Input("card-tabs", "active_tab")])
def tab_content(active_tab):
    """Return the pre-built layout that belongs to the selected tab.

    Args:
        active_tab: The ``tab_id`` of the active tab; must be a key of ``tabs``.

    Returns:
        The layout registered in ``tabs`` under ``active_tab``.

    Raises:
        KeyError: If ``active_tab`` is not a key of ``tabs``.
    """
    selected_layout = tabs[active_tab]
    return selected_layout

@app.callback(
    Output("relevant-recipes-dd", "label"),
    [Output(f"relevant-topic-{i+1}", "n_clicks") for i in range(num_topics)],
    Output("relevant-recipes-results", "children"),
    [Input(f"relevant-topic-{i+1}", "n_clicks") for i in range(num_topics)]
)
def update_dropdown_label(*args):
    """Show the chosen topic in the dropdown label and list its relevant recipes.

    Args:
        *args: The ``n_clicks`` value of every topic menu item.

    Returns:
        A tuple made of the dropdown label, one ``0`` per menu item (resetting
        every click counter) and the content of the results card. When nothing
        triggered the callback, the label is "choose topic" and the results
        are empty.
    """
    cleared_clicks = [0] * len(args)

    fired = callback_context.triggered
    if not fired:
        return ("choose topic", *cleared_clicks, "")

    # The component id is the part of "component.property" before the dot
    source_id = fired[0]["prop_id"].split(".")[0]

    # Ids look like "relevant-topic-<n>" with n starting at 1; anything else falls back to topic 0
    topic_num = int(source_id.split("-")[-1]) - 1 if "relevant-topic-" in source_id else 0

    return (f"Topic {topic_num + 1}", *cleared_clicks, relevant_allRecipes_html[topic_num])

def _embedding_figure(words, coords, plot_title, colors, link_to_last=False):
    """Build the 3D scatter figure shown in the embedding tab.

    Args:
        words: Labels, one per point.
        coords: Array-like with one row per word and the three PCA coordinates as columns.
        plot_title: Title of the figure.
        colors: A single colour or a list with one colour per point.
        link_to_last: When True, a grey line is drawn from every other point to the last one.

    Returns:
        A ``go.Figure`` with the labelled markers (and the optional lines).
    """
    coords = np.asarray(coords)
    traces = [
        go.Scatter3d(
            x=coords[:, 0], y=coords[:, 1], z=coords[:, 2],
            mode='markers+text',
            marker=dict(size=5, color=colors, opacity=0.6),
            text=words,
            textposition="top center",
        )
    ]

    if link_to_last:
        result_x, result_y, result_z = coords[-1]
        traces.extend(
            go.Scatter3d(
                x=[coords[i, 0], result_x],
                y=[coords[i, 1], result_y],
                z=[coords[i, 2], result_z],
                mode='lines',
                line=dict(color='gray', width=2),
                showlegend=False
            )
            for i in range(len(words) - 1)
        )

    layout = go.Layout(
        title=plot_title,
        scene=dict(
            xaxis_title='PC1',
            yaxis_title='PC2',
            zaxis_title='PC3'
        ),
        margin=dict(l=0, r=0, b=0, t=40),
        dragmode='orbit'
    )
    return go.Figure(data=traces, layout=layout)


@app.callback(
    Output('3d-word-vectors', 'figure'),
    [Input('btn-similar-vec', 'n_clicks'),
     Input('ngram-btn-show', 'n_clicks'),
     Input('compute-btn', 'n_clicks')],
    [State('textbox-similar-vec', 'value'),
     State('textbox-word-pos-default', 'value'),
    State('textbox-word-neg-default', 'value'),
     State({'type': 'input', 'group': 'pos', 'index': ALL}, 'value'),
     State({'type': 'input', 'group': 'neg', 'index': ALL}, 'value')],
    # prevent_initial_call=True
)
def unified_callback(n_clicks_similar, n_clicks_ngram, n_clicks_compute, input_word, pos_default, neg_default, pos_values, neg_values):
    """Update the 3D embedding plot according to the button that was clicked.

    * "Find similar": the typed word and its 50 most similar words.
    * "Show": every corpus n-gram (coordinates pre-computed in ``ngram_red``).
    * "Compute": the positive/negative terms plus the single best analogy result,
      which is drawn in red and linked to every input term.
    * Anything else (initial call, or a button clicked with empty inputs):
      a random sample of up to 300 vocabulary words.

    Args:
        n_clicks_similar, n_clicks_ngram, n_clicks_compute: Click counters of the
            three buttons; only used as triggers.
        input_word: Content of the similar-word text box.
        pos_default, neg_default: Content of the two always-present analogy inputs.
        pos_values, neg_values: Content of the dynamically added analogy inputs.

    Returns:
        The ``go.Figure`` to display. If a typed word is rejected by the word
        vectors with ``KeyError``, an empty figure whose title reports the problem.
    """
    trigger = callback_context.triggered_id  # Determine which button was clicked
    # Merge default and dynamically added inputs, dropping empty boxes
    pos_values = [w for w in ([pos_default] + pos_values) if w]
    neg_values = [w for w in ([neg_default] + neg_values) if w]

    try:
        if trigger == 'btn-similar-vec' and input_word:
            similar_words = g_fasttext_wv.most_similar(input_word, topn=50)
            words = [input_word] + [word for word, _ in similar_words]
            coords = pca.transform([g_fasttext_wv[word] for word in words])
            return _embedding_figure(words, coords, f"50 similar words to {input_word}", "blue")

        if trigger == 'ngram-btn-show':
            return _embedding_figure(grouped_ngrams, ngram_red, "Showing all Ngrams", "blue")

        if trigger == 'compute-btn' and (pos_values or neg_values):
            result_word, _similarity = g_fasttext_wv.most_similar(positive = pos_values, negative = neg_values, topn = 1)[0]
            words = pos_values + neg_values + [result_word]
            coords = pca.transform([g_fasttext_wv[word] for word in words])
            # Only mention the negative part when there are negative terms, so the
            # title never ends up with a dangling " - "
            expression = ' + '.join(pos_values)
            if neg_values:
                expression += ' - ' + ' - '.join(neg_values)
            plot_title = f"{expression.strip()} ≈ {result_word}"
            colors = ['red' if word == result_word else 'blue' for word in words]
            return _embedding_figure(words, coords, plot_title, colors, link_to_last=True)
    except KeyError as err:
        # User-typed word that the word vectors cannot resolve: report it in the
        # plot instead of failing the callback
        reason = err.args[0] if err.args else err
        return go.Figure(layout=go.Layout(title=f"Word not found in the embedding vocabulary: {reason}"))

    # Default view: random sample of the vocabulary (never more than it contains)
    sample_size = min(300, len(allWords))
    subset_indices = np.random.choice(len(allWords), size=sample_size, replace=False)
    words = [allWords[i] for i in subset_indices]
    coords = pca.transform([g_fasttext_wv[word] for word in words])
    return _embedding_figure(words, coords, f"Showing {sample_size} randomly selected embeddings", "blue")

@app.callback(
    Output({'type': 'words-container', 'group': MATCH}, 'children'),
    Output({'type': 'ids', 'group': MATCH}, 'data'),
    Input({'type': 'add-button', 'group': MATCH}, 'n_clicks'),
    State({'type': 'ids', 'group': MATCH}, 'data'),
    prevent_initial_call=True
)
def add_input(n_clicks, ids):
    """Append one extra word input to the group whose "+" button was clicked.

    Args:
        n_clicks: Click counter of the "+" button; only used as trigger.
        ids: Ids of the extra inputs currently present in the group.

    Returns:
        ``(children, ids)``: the re-generated input rows and the updated id
        list, or ``(no_update, no_update)`` once the group already has three
        extra inputs.
    """
    # At most three extra inputs per group
    if not len(ids) < 3:
        return no_update, no_update

    ids.append(str(uuid.uuid4()))
    group_name = callback_context.triggered_id['group']

    rows = []
    for uid in ids:
        rows.append(generate_input_group(group_name, uid))
    return rows, ids

def generate_input_group(group, uid):
    """Build one removable word-input row for the analogy card.

    Args:
        group: ``"pos"`` for a positive term; any other value is treated as negative.
        uid: Unique id of the row, shared by the input, its "-" button and the
            enclosing input group.

    Returns:
        A ``dbc.InputGroup`` containing the text input and the remove button.
    """
    if group == "pos":
        ph_text = "Positive word"
    else:
        ph_text = "Negative word"

    def _component_id(kind):
        # Dict id used by the pattern-matching callbacks
        return {'type': kind, 'group': group, 'index': uid}

    word_box = dbc.Input(id=_component_id('input'), placeholder=ph_text, size="sm")
    remove_btn = dbc.Button("-", id=_component_id('remove-button'), size="sm", color="danger")
    return dbc.InputGroup([word_box, remove_btn], class_name="mt-1", id=_component_id('group'))

@app.callback(
    Output({'type': 'words-container', 'group': MATCH}, 'children', allow_duplicate=True),
    Output({'type': 'ids', 'group': MATCH}, 'data', allow_duplicate=True),
    Input({'type': 'remove-button', 'group': MATCH, 'index': ALL}, 'n_clicks'),
    State({'type': 'ids', 'group': MATCH}, 'data'),
    prevent_initial_call='initial_duplicate'
)
def remove_input(n_clicks_list, ids):
    """Remove the extra word input whose "-" button was clicked.

    Args:
        n_clicks_list: Click counters of all "-" buttons of the group.
        ids: Ids of the extra inputs currently present in the group.

    Returns:
        ``(children, ids)`` without the removed row, or
        ``(no_update, no_update)`` when the trigger is not a real click on a
        known row.
    """
    source = callback_context.triggered_id
    if not source or 'index' not in source:
        return no_update, no_update

    removed_uid = source['index']
    if removed_uid not in ids:
        return no_update, no_update

    # The callback also fires when a row is created (counter not yet a positive int)
    clicks = n_clicks_list[ids.index(removed_uid)]
    if not (isinstance(clicks, int) and clicks > 0):
        return no_update, no_update

    remaining = [uid for uid in ids if uid != removed_uid]
    group_name = source['group']
    return [generate_input_group(group_name, uid) for uid in remaining], remaining

@app.callback(
    Output('recommend-out', 'children'),
    Input('recommend-btn', 'n_clicks'),
    State('most-pop-list', 'value'),
    prevent_initial_call=True
)
def generate_recommendations(n_clicks, selected):
    """List recipes similar to the dishes ticked in the checklist.

    For every selected title that is present in ``recipe_id_to_index``, the
    three most similar other recipes (by ``similarity_matrix``) are collected.

    Args:
        n_clicks: Click counter of the button; only used as trigger.
        selected: Titles ticked in the checklist, or ``None`` when nothing has
            been ticked yet.

    Returns:
        A list of ``html.Li`` items, one per distinct recommended title, in the
        order they were found. Empty when nothing is selected.
    """
    # The checklist value is None until the user ticks something; isin(None) would raise
    if not selected:
        return []

    recipes_liked = top_15[top_15["Title"].isin(selected)]["recipe_id"].tolist()
    result_content = []
    seen_titles = set()  # the same recipe can be close to several liked dishes

    for recipe_id in recipes_liked:
        # Popular recipes that did not pass the minimum-review filter have no embedding row
        if recipe_id not in recipe_id_to_index:
            continue

        idx = recipe_id_to_index[recipe_id]

        # Most similar first, skipping the recipe itself
        similar_indices = np.argsort(similarity_matrix[idx])[::-1]
        similar_indices = [i for i in similar_indices if i != idx][:3]

        for sim_idx in similar_indices:
            similar_recipe_id = index_to_recipe_id[sim_idx]
            similar_title = filtered_recipe_df.loc[filtered_recipe_df['recipe_id'] == similar_recipe_id, 'recipe'].values[0]
            if similar_title in seen_titles:
                continue
            seen_titles.add(similar_title)
            result_content.append(html.Li(similar_title))
    return result_content


# Run the app
# Starts the Dash server with debug=True when this code is executed as the main module.
if __name__ == "__main__":
    app.run(debug=True)

<IPython.core.display.Javascript object>