## Final Group Project NLP - Part 2

* Ahmed Mohamed Elghamry Shehata
* Ahmed Mahmoud Abdelmoneim Abdelhamid
* Noureldin Mohamed Abdelsalm Mohamed Hamedo
* Sergio Rodrigo Fernandez Testa

## Downloading the Data

In [None]:
# Download the dataset archive from Google Drive using its file ID (gdown CLI).
!gdown 1lnoaa6tE2gGDQEEz0DW2hvOnjIMK9oTo

In [None]:
# Extract the downloaded archive. -o overwrites existing files without
# prompting, so re-running the notebook (Restart & Run All) does not stall on
# unzip's interactive "replace?" question when ./data already exists.
!unzip -o ./receipeData.zip

## Installation

In [None]:
# Use the %pip magic so the packages are installed into the environment of the
# running kernel (a bare `!pip` can resolve to a different interpreter).
# NOTE(review): force-reinstalling numpy after it has already been imported
# usually requires a kernel restart before the new build is used — confirm.
# TODO: pin exact versions once a known-good combination is established.
%pip install --upgrade --force-reinstall numpy gensim
%pip install transformers datasets sentencepiece

## Imports

In [None]:
import pandas as pd

## Loading the Data

In [None]:
# Load the recipe table, using the CSV's first column as the row index.
df = pd.read_csv(
    "./data/dataset.csv",
    index_col=0,
)
df  # bare expression so the notebook renders a preview of the frame

Unnamed: 0,title,ingredients,directions,link,source,NER
0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu..."
1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""beef"", ""chicken breasts"", ""cream of mushroom..."
2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar..."
3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[""chicken"", ""chicken gravy"", ""cream of mushroo..."
4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""peanut butter"", ""graham cracker crumbs"", ""bu..."
...,...,...,...,...,...,...
2231137,Sunny's Fake Crepes,"[""1/2 cup chocolate hazelnut spread (recommend...","[""Spread hazelnut spread on 1 side of each tor...",www.foodnetwork.com/recipes/sunny-anderson/sun...,Recipes1M,"[""chocolate hazelnut spread"", ""tortillas"", ""bu..."
2231138,Devil Eggs,"[""1 dozen eggs"", ""1 paprika"", ""1 salt and pepp...","[""Boil eggs on medium for 30mins."", ""Then cool...",cookpad.com/us/recipes/355411-devil-eggs,Recipes1M,"[""eggs"", ""paprika"", ""salt"", ""choice"", ""miracle..."
2231139,Extremely Easy and Quick - Namul Daikon Salad,"[""150 grams Daikon radish"", ""1 tbsp Sesame oil...","[""Julienne the daikon and squeeze out the exce...",cookpad.com/us/recipes/153324-extremely-easy-a...,Recipes1M,"[""radish"", ""Sesame oil"", ""White sesame seeds"",..."
2231140,Pan-Roasted Pork Chops With Apple Fritters,"[""1 cup apple cider"", ""6 tablespoons sugar"", ""...","[""In a large bowl, mix the apple cider with 4 ...",cooking.nytimes.com/recipes/1015164,Recipes1M,"[""apple cider"", ""sugar"", ""kosher salt"", ""bay l..."


In [None]:
# Convert every DataFrame row into a plain dict keyed by column name.
recipes = df.to_dict("records")
recipes[:2]  # preview the first two records

[{'title': 'No-Bake Nut Cookies',
  'ingredients': '["1 c. firmly packed brown sugar", "1/2 c. evaporated milk", "1/2 tsp. vanilla", "1/2 c. broken nuts (pecans)", "2 Tbsp. butter or margarine", "3 1/2 c. bite size shredded rice biscuits"]',
  'directions': '["In a heavy 2-quart saucepan, mix brown sugar, nuts, evaporated milk and butter or margarine.", "Stir over medium heat until mixture bubbles all over top.", "Boil and stir 5 minutes more. Take off heat.", "Stir in vanilla and cereal; mix well.", "Using 2 teaspoons, drop and shape into 30 clusters on wax paper.", "Let stand until firm, about 30 minutes."]',
  'link': 'www.cookbooks.com/Recipe-Details.aspx?id=44874',
  'source': 'Gathered',
  'NER': '["brown sugar", "milk", "vanilla", "nuts", "butter", "bite size shredded rice biscuits"]'},
 {'title': "Jewell Ball'S Chicken",
  'ingredients': '["1 small jar chipped beef, cut up", "4 boned chicken breasts", "1 can cream of mushroom soup", "1 carton sour cream"]',
  'directions': '["P

## Word2Vec

In [None]:
import json
from gensim.utils import simple_preprocess

# Build the Word2Vec training corpus from the NER column.
# Each cell holds a JSON-encoded list of ingredient names. Every name is
# tokenised on its own with simple_preprocess (deacc=True strips accent marks)
# and becomes one "sentence"; names that produce no tokens are dropped.
# NOTE(review): because each ingredient name is a separate sentence, Word2Vec
# only sees word co-occurrence inside a name (e.g. "brown"/"sugar"), not across
# the ingredients of a recipe — confirm this is the intended training signal.
tokenised_names = (
    simple_preprocess(name, deacc=True)
    for ner_cell in df["NER"]
    for name in json.loads(ner_cell)
)
corpus = [name_tokens for name_tokens in tokenised_names if name_tokens]

print(corpus[:3])


[['brown', 'sugar'], ['milk'], ['vanilla']]


In [None]:
# Train a skip-gram Word2Vec model on the ingredient corpus and save it to disk.
from gensim.models import Word2Vec

w2v_model = Word2Vec(
    sentences=corpus,
    vector_size=100,  # dimensionality of each word vector
    window=5,         # max distance between target and context token
    min_count=2,      # ignore tokens that occur fewer than 2 times
    workers=4,        # number of training threads
    sg=1,             # 1 = skip-gram (0 would select CBOW)
    # Fixed seed so repeated runs start from the same initial state.
    # Per the gensim docs, fully deterministic training additionally needs
    # workers=1 and a fixed PYTHONHASHSEED; multi-threaded runs can still
    # differ slightly because of thread scheduling.
    seed=42,
)

# Persist the trained model so later steps can reload it instead of retraining.
w2v_model.save("./data/embeddings.model")


In [4]:
# Sanity-check the embeddings: nearest neighbours of one word and the
# similarity score of one word pair.
# NOTE: the examples are bakery-related.
keyed_vectors = w2v_model.wv
print(keyed_vectors.most_similar("sugar"))
print(keyed_vectors.similarity("milk", "cream"))

[('suger', 0.8257318139076233), ('usgar', 0.7939395308494568), ('sugah', 0.7874869704246521), ('sigar', 0.7562432885169983), ('domino', 0.7555842399597168), ('surgar', 0.7548619508743286), ('swerve', 0.732900857925415), ('colonial', 0.7252411246299744), ('sguar', 0.7230448126792908), ('suar', 0.703374445438385)]
0.52810544


In [7]:
# 3D interactive plot of the word embeddings, reduced to 3 dimensions with PCA.
import plotly.express as px
from sklearn.decomposition import PCA

# Words drawn in red; every other point is blue.
highlight_words = {"pork", "bacon", "cake"}

# Top 100 most frequent words (the first 100 vocabulary entries) ...
words = list(w2v_model.wv.index_to_key)[:100]
# ... plus any highlight word that is in the vocabulary but did not make that
# cut. Without this, such a word would silently be missing from the figure and
# nothing would be highlighted. sorted() keeps the point order stable.
words += sorted(
    w for w in highlight_words if w in w2v_model.wv and w not in words
)
vectors = [w2v_model.wv[word] for word in words]

# Project the word vectors onto their first three principal components.
pca = PCA(n_components=3)
coords = pca.fit_transform(vectors)

# One row per word: 3D coordinates, the label, and the colour group.
df_plot = pd.DataFrame(coords, columns=["x", "y", "z"])
df_plot["word"] = words
df_plot["color"] = (
    df_plot["word"].isin(highlight_words).map({True: "red", False: "blue"})
)

# Plot
fig = px.scatter_3d(
    df_plot,
    x="x", y="y", z="z",
    text="word",
    color="color",
    # Map the group labels to the literal colours of the same name.
    color_discrete_map={"red": "red", "blue": "blue"},
    title="Word2Vec Embedding (3D PCA)",
    width=800, height=600
)

fig.update_traces(marker=dict(size=5))
fig.show()


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Compare how close "pork" is to "bacon" versus "cake" in the embedding space.

# Make sure every probe word is in the vocabulary before indexing the model:
# looking up a missing key raises KeyError, so the comparison only runs when
# all words are present.
missing_words = [
    word for word in ["pork", "bacon", "cake"] if word not in w2v_model.wv
]
for word in missing_words:
    print(f"'{word}' not in vocabulary")

if not missing_words:
    # Extract the vectors
    vec_pork = w2v_model.wv["pork"]
    vec_bacon = w2v_model.wv["bacon"]
    vec_cake = w2v_model.wv["cake"]

    # Compute the similarities; cosine_similarity expects 2D inputs and returns
    # a 1x1 matrix here, hence the [[...]] wrapping and the [0][0] indexing.
    sim_pork_bacon = cosine_similarity([vec_pork], [vec_bacon])[0][0]
    sim_pork_cake = cosine_similarity([vec_pork], [vec_cake])[0][0]

    print(f"Similarity (pork vs bacon): {sim_pork_bacon:.4f}")
    print(f"Similarity (pork vs cake):  {sim_pork_cake:.4f}")

    # Verdict
    if sim_pork_bacon > sim_pork_cake:
        print("✅ Pork is semantically closer to bacon than to cake.")
    else:
        print("⚠️ Unexpected result: pork is closer to cake.")


Similarity (pork vs bacon): 0.4290
Similarity (pork vs cake):  0.1966
✅ Pork is semantically closer to bacon than to cake.
