# 04. Linking Reddit and BBC GoodFood

In [1]:
import requests               
import pandas as pd          
from scrapy import Selector 
import pprint 
from tqdm import tqdm
import spacy
import sys

# Import our own modules
sys.path.append("../scripts/")
import chadtools

In [None]:
bbclink = 'https://www.bbcgoodfood.com/search?q='
response = requests.get(bbclink)
sel = Selector(text=response.text)
links = []

recipecards = sel.css('main div.search-results div.card__section.card__content a ::attr(href)').getall()
links.extend(recipecards)  
prefix = "https://www.bbcgoodfood.com/recipes/"
final_links = [prefix + item for item in links] 
print(final_links)

In [17]:
base_url = 'https://www.bbcgoodfood.com/search?q=&limit=1000&page='

links = []
for i in tqdm(range(1, 11)):
    response = requests.get(base_url + str(i))
    sel = Selector(text = response.text)
    recipecards = sel.css('main div.search-results div.card__section.card__content a ::attr(href)').getall()
    links.extend(recipecards)  

prefix = "https://www.bbcgoodfood.com/recipes/"
final_links = [prefix + item for item in links] 

len(final_links)

100%|██████████| 10/10 [01:33<00:00,  9.38s/it]


10011

### Create a DataFrame containing the nutritional data of all the recipes on BBCGoodFood 

In [45]:
session = requests.Session()

def get_nutrition(url):
    response2 = session.get(url)
    sel2 = Selector(text=response2.text)
    table = sel2.css('table.key-value-blocks.hidden-print.mt-xxs')
    bbc = {}
    bbc['name'] = sel2.css('h1.heading-1 ::text').get()
    bbc['calories'] = table.css('td.key-value-blocks__value ::text').get()
    bbc['salt'] = table.xpath('.//*[contains(text(), "salt")]/..').css('td.key-value-blocks__value ::text').get()
    bbc['fat'] = table.xpath('.//*[contains(text(), "fat")]/..').css('td.key-value-blocks__value ::text').get()
    return bbc

bbc = [get_nutrition(url) for url in tqdm(final_links)]

df_bbc = pd.DataFrame(bbc)
df_bbc.head()

100%|██████████| 10011/10011 [2:44:43<00:00,  1.01it/s]


Unnamed: 0,name,calories,salt,fat
0,Chicken & chorizo jambalaya,445,1.2,10
1,Lemon drizzle cake,399,0.3,21
2,Chilli con carne recipe,387,2.32,17
3,Best ever chocolate brownies recipe,150,0.1,9
4,Creamy courgette lasagne,405,1.36,21


### Save the data as a CSV file 

In [47]:
df_bbc.to_csv('../data/bbc_data.csv', index=False)

In [27]:
df_filtered = pd.read_json('../data/cleaned_posts.json', orient='records')
df_filtered.tail()

Unnamed: 0,id,title,score,num_comments,ingredient_comment,created_utc,upvote_ratio,link_flair_text,author,url,comment_link,permalink
995,jhi1cz,Vegetarian Pumpkin Meatballs with a Heart of C...,57,6,\n\nHere another recipe with pumpkin!! A deli...,1603581735000,0.899902,Fruit\Vegetarian,italian_cook,https://www.reddit.com/gallery/jhi1cz,https://oauth.reddit.com/r/recipes/comments/jh...,https://reddit.com/r/recipes/comments/jhi1cz/v...
996,jcgb7j,Bitter gourd yogurt curry....with no bitternes...,7,6,Recipe.....\n\n[Short Video](https://youtu.be/...,1602879492000,0.649902,Fruit\Vegetarian,PassionateHobbies,https://i.redd.it/bpootodgbit51.jpg,https://oauth.reddit.com/r/recipes/comments/jc...,https://reddit.com/r/recipes/comments/jcgb7j/b...
997,iz12pg,Ottolenghi's Baked Orzo w/Mozzarella,22,5,Ingredients:\n\n* 7 Tablespoons olive oil\n* ...,1600970345000,0.830078,Fruit\Vegetarian,BrinaElka,https://i.redd.it/l7osuhkcm4p51.jpg,https://oauth.reddit.com/r/recipes/comments/iz...,https://reddit.com/r/recipes/comments/iz12pg/o...
998,iw3wli,Mushroom Barley Stew with Crispy Oyster Mushrooms,2694,41,**Recipe here originally:** [**Easy Mushroom B...,1600565227000,0.97998,Fruit\Vegetarian,BushyEyes,https://i.redd.it/511qxuct57o51.jpg,https://oauth.reddit.com/r/recipes/comments/iw...,https://reddit.com/r/recipes/comments/iw3wli/m...
999,isunwt,Easy Tomato Risotto with Parmesan,1800,35,**Recipe here originally:** [**Easy Tomato Ris...,1600122747000,0.990234,Fruit\Vegetarian,BushyEyes,https://i.redd.it/0qb76yy3m6n51.jpg,https://oauth.reddit.com/r/recipes/comments/is...,https://reddit.com/r/recipes/comments/isunwt/e...


In [8]:
df_bbc = pd.read_csv('../data/bbc_data.csv')
df_bbc.head()

Unnamed: 0,name,calories,salt,fat
0,Chicken & chorizo jambalaya,445.0,1.2,10.0
1,Lemon drizzle cake,399.0,0.3,21.0
2,Chilli con carne recipe,387.0,2.32,17.0
3,Best ever chocolate brownies recipe,150.0,0.1,9.0
4,Creamy courgette lasagne,405.0,1.36,21.0


In [26]:
len(df_filtered)

1000

In [42]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

df_reddit_test = df_filtered.head(15).copy()
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
similarity_threshold = 0.75
merge_columns = ['name', 'calories', 'fat', 'salt']

# Encode titles using Sentence Transformer
embeddings_reddit = model.encode(df_reddit_test['title'].tolist())
embeddings_bbc = model.encode(df_bbc['name'].tolist())

# Calculate cosine similarity matrix
cosine_sim_matrix = cosine_similarity(embeddings_reddit, embeddings_bbc)

# Find the indices and values of the maximum similarity in each row
max_similarity_indices = cosine_sim_matrix.argmax(axis=1)
max_similarity_values = cosine_sim_matrix.max(axis=1)

# Mask indices where the similarity is below the threshold
mask = max_similarity_values > similarity_threshold

# Merge columns based on the max similarity
df_reddit_test.loc[mask, merge_columns] = df_bbc.iloc[max_similarity_indices[mask]][merge_columns].values

df_reddit_test

  df_reddit_test.loc[mask, merge_columns] = df_bbc.iloc[max_similarity_indices[mask]][merge_columns].values


Unnamed: 0,id,title,score,num_comments,ingredient_comment,created_utc,upvote_ratio,link_flair_text,author,url,comment_link,permalink,name,calories,fat,salt
0,19d0wfc,Buffalo Chicken Tenders,202,12,**Recipe here originally:** [**Buffalo Chicken...,1705944195000,0.959961,Recipe,BushyEyes,https://i.redd.it/qtwisr8gz0ec1.jpeg,https://oauth.reddit.com/r/recipes/comments/19...,https://reddit.com/r/recipes/comments/19d0wfc/...,Buffalo chicken,520.0,35.0,7.4
1,1998zka,Prawn Katsu Baos,272,11,This one is high impact and a showstopper for ...,1705528588000,0.950195,Recipe,TheLuckiestDragon,https://i.redd.it/q81uyef4o2dc1.jpeg,https://oauth.reddit.com/r/recipes/comments/19...,https://reddit.com/r/recipes/comments/1998zka/...,Prawn katsu burgers,1070.0,74.0,2.8
2,18zcqmd,Cinnamon Rolls,256,21,"# Homemade Cinnamon Rolls\n\nFor full recipe, ...",1704476711000,0.959961,Recipe,pangibear,https://i.redd.it/7uef78dbsnac1.jpeg,https://oauth.reddit.com/r/recipes/comments/18...,https://reddit.com/r/recipes/comments/18zcqmd/...,Cinnamon rolls,415.0,18.0,0.9
3,18vf164,Mushroom-Taleggio Risotto,184,6,**Recipe here originally:** [**Mushroom-Talegg...,1704050722000,0.97998,Recipe,BushyEyes,https://i.redd.it/qc5akriilo9c1.jpeg,https://oauth.reddit.com/r/recipes/comments/18...,https://reddit.com/r/recipes/comments/18vf164/...,Mushroom risotto,445.0,17.0,1.45
4,18v7m3w,Cinnamon Oatmeal Chocolate Chip Cookies (Recipe),211,16,[RECIPE LINK](https://www.sarahfreia.com/blog/...,1704028771000,0.950195,Recipe,sarahfreia,https://i.redd.it/aki9a36yrm9c1.jpeg,https://oauth.reddit.com/r/recipes/comments/18...,https://reddit.com/r/recipes/comments/18v7m3w/...,,,,
5,18tse01,Turmeric Chicken Soup,138,9,**Recipe here originally:** [**Turmeric Chicke...,1703870926000,0.939941,Recipe,BushyEyes,https://i.redd.it/ssrzjdpzq99c1.jpeg,https://oauth.reddit.com/r/recipes/comments/18...,https://reddit.com/r/recipes/comments/18tse01/...,Turmeric tea,10.0,0.5,0.01
6,18s22fp,Bindaetteok (Korean Mung Bean Pancakes) - My F...,73,12,[Video recipe + additional tips here](https://...,1703687727000,0.990234,Recipe,stewonitwastaken,https://i.redd.it/5hhtoit1mu8c1.jpeg,https://oauth.reddit.com/r/recipes/comments/18...,https://reddit.com/r/recipes/comments/18s22fp/...,,,,
7,18okl8i,"Duck Breast with Sweet Potato Puree, Veggies a...",245,28,For full recipe and more visit: [https://www.m...,1703267200000,0.970215,Recipe,butchec,https://i.redd.it/p5hc39smvv7c1.jpeg,https://oauth.reddit.com/r/recipes/comments/18...,https://reddit.com/r/recipes/comments/18okl8i/...,Duck breasts with redcurrant & onion relish,435.0,31.0,1.02
8,18lvqpm,Scallops and Bacon with a Smoked Mackerel Beur...,295,22,For full recipe and more visit: [https://www.m...,1702968953000,0.970215,Recipe,butchec,https://i.redd.it/cgz77a8k877c1.jpeg,https://oauth.reddit.com/r/recipes/comments/18...,https://reddit.com/r/recipes/comments/18lvqpm/...,,,,
9,18lcd2k,Sprinkle Sugar Cookies,123,8,"# Sprinkle Sugar Cookies\n\nFor full recipe, t...",1702915781000,0.970215,Recipe,pangibear,https://i.redd.it/ys56hnjvu27c1.jpeg,https://oauth.reddit.com/r/recipes/comments/18...,https://reddit.com/r/recipes/comments/18lcd2k/...,Sugar cookies,130.0,5.0,0.03


In [47]:
selected_columns = ['title', 'name', 'calories', 'fat', 'salt'] 
df_selected = df_reddit_test[selected_columns]
df_selected

Unnamed: 0,title,name,calories,fat,salt
0,Buffalo Chicken Tenders,Buffalo chicken,520.0,35.0,7.4
1,Prawn Katsu Baos,Prawn katsu burgers,1070.0,74.0,2.8
2,Cinnamon Rolls,Cinnamon rolls,415.0,18.0,0.9
3,Mushroom-Taleggio Risotto,Mushroom risotto,445.0,17.0,1.45
4,Cinnamon Oatmeal Chocolate Chip Cookies (Recipe),,,,
5,Turmeric Chicken Soup,Turmeric tea,10.0,0.5,0.01
6,Bindaetteok (Korean Mung Bean Pancakes) - My F...,,,,
7,"Duck Breast with Sweet Potato Puree, Veggies a...",Duck breasts with redcurrant & onion relish,435.0,31.0,1.02
8,Scallops and Bacon with a Smoked Mackerel Beur...,,,,
9,Sprinkle Sugar Cookies,Sugar cookies,130.0,5.0,0.03


In [43]:
df_reddit_test.to_csv('../data/merged_data_test.csv', index=False)

In [21]:
title1 = "Creamy Lemon-Basil Chicken Pasta"
title2 = "Quick &amp; Easy Nut Brittle"
from sentence_transformers import SentenceTransformer
sentences = [title1, title2]

model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
embeddings = model.encode(sentences)
print(embeddings)


[[-0.02184611  0.00591048  0.00187744 ... -0.00569096 -0.02545492
  -0.01958655]
 [-0.06401303  0.01236008  0.00038102 ...  0.00542595 -0.02119795
  -0.02506197]]


In [22]:
from sklearn.metrics.pairwise import cosine_similarity
embedding1 = embeddings[0].reshape(1, -1)
embedding2 = embeddings[1].reshape(1, -1)

# Calculate cosine similarity
similarity_score = cosine_similarity(embedding1, embedding2)[0][0]
print(f"Cosine Similarity: {similarity_score}")

Cosine Similarity: 0.0866456851363182
