In [1]:
import json
import os
import re
from urllib.parse import quote

import pandas as pd
import requests
from bs4 import BeautifulSoup
from openai import OpenAI

In [2]:
# Load sheet metadata from sources.json.
# The loading loop reads the "sheet_id", "sheet_name" and "name" keys of every entry.
# The encoding is explicit so that non-ASCII names are decoded the same way on
# every platform instead of depending on the locale default.
with open("sources.json", "r", encoding="utf-8") as f:
    sources = json.load(f)

# Container for DataFrames
all_dfs = []

In [3]:
# Loop through each source and load the data.
# The container is reset here so that re-running this cell does not append the
# same sheets a second time.
all_dfs = []

for source in sources:
    sheet_id = source["sheet_id"]
    sheet_name = source["sheet_name"]
    name = source["name"]

    # Construct the CSV export URL. The sheet name is percent-encoded because
    # tab names may contain spaces or non-ASCII characters, which are not valid
    # in a URL. "%" is kept as-is so names that are already encoded in
    # sources.json are not encoded twice.
    encoded_sheet_name = quote(str(sheet_name), safe="%")
    url = (
        f"https://docs.google.com/spreadsheets/d/{sheet_id}"
        f"/gviz/tq?tqx=out:csv&sheet={encoded_sheet_name}"
    )

    # Read CSV from the URL; a source that fails is reported and skipped so the
    # remaining sources are still loaded.
    try:
        df = pd.read_csv(url)
        df['source'] = name  # Add a column to identify where it came from
        all_dfs.append(df)
        print(f"Successfully Loaded: {name}")
    except Exception as e:
        print(f"Failed to load data {name}: {e}")

# pd.concat raises an opaque "No objects to concatenate" on an empty list, so
# report the real problem instead.
if not all_dfs:
    raise ValueError("No source could be loaded; check sources.json and the sheet sharing settings.")

# Concatenate all DataFrames
combined_df = pd.concat(all_dfs, ignore_index=True)

Successfully Loaded: Amin


In [4]:
# Preview the first rows of the concatenated sheet data
combined_df.head()

Unnamed: 0,Food_Name,Province,Meal_Type,Website,Pictures_Website,is_approved,source
0,بریانی,Isfahan,Food,https://chishi.ir/2768-beryani-esfehan/,,True,Amin
1,خورشت ماست,Isfahan,"Appetizer, Dessert",https://namnak.com/%D8%AE%D9%88%D8%B1%D8%B4-%D...,,True,Amin
2,دوغ و گوشفیل,Isfahan,"Drinks, Appetizer",https://fa.wikipedia.org/wiki/%DA%AF%D9%88%D8%...,,True,Amin
3,قیمه ریزه,Isfahan,Food,https://panamag.ir/616-gheyme-rize/,,True,Amin
4,گز,Isfahan,"Appetizer, Dessert",https://namnak.com/%D8%B7%D8%B1%D8%B2-%D8%AA%D...,,True,Amin


In [5]:
# Work on a copy of the combined data that carries a 1-based "id" column
df = combined_df.assign(id=combined_df.index + 1)

# Strip and lower-case every column name to avoid casing issues
df = df.rename(columns=lambda col: col.strip().lower())

# Both columns are required by the steps below, so stop early if one is missing
if not {"is_approved", "website"}.issubset(df.columns):
    raise ValueError("Your DataFrame must contain 'is_approved' and 'website' columns.")

# Keep the approved records in a separate, independent DataFrame
# (element-wise comparison against True, not an identity check)
approved_df = df.loc[df["is_approved"] == True].copy()
approved_df

Unnamed: 0,food_name,province,meal_type,website,pictures_website,is_approved,source,id
0,بریانی,Isfahan,Food,https://chishi.ir/2768-beryani-esfehan/,,True,Amin,1
1,خورشت ماست,Isfahan,"Appetizer, Dessert",https://namnak.com/%D8%AE%D9%88%D8%B1%D8%B4-%D...,,True,Amin,2
2,دوغ و گوشفیل,Isfahan,"Drinks, Appetizer",https://fa.wikipedia.org/wiki/%DA%AF%D9%88%D8%...,,True,Amin,3
3,قیمه ریزه,Isfahan,Food,https://panamag.ir/616-gheyme-rize/,,True,Amin,4
4,گز,Isfahan,"Appetizer, Dessert",https://namnak.com/%D8%B7%D8%B1%D8%B2-%D8%AA%D...,,True,Amin,5
5,پولکی,Isfahan,Dessert,https://namnak.com/%D9%BE%D9%88%D9%84%DA%A9%DB...,,True,Amin,6
6,گوشت و لوبیا اصفهانی,Isfahan,Food,https://www.beytoote.com/cookery/ghazaha/pork1...,,True,Amin,7
15,آبگوشت یخنی نخود,Fars,Food,https://chishi.ir/32587-abgousht-yakhni-nokhod/,,True,Amin,16


In [6]:
# Make sure the 'foods' directory exists; exist_ok=True makes this cell safe to re-run
os.makedirs("foods", exist_ok=True)

# Save the HTML content to a file named {id}.html inside 'foods' directory
def save_as_file(html_content: str, row_id) -> None:
    """Write one page's HTML to ``foods/{row_id}.html`` as UTF-8.

    Args:
        html_content: HTML text to store.
        row_id: Identifier used as the file name (without the extension).

    An existing file with the same id is overwritten.
    """
    # Create the directory on demand so the function does not depend on
    # another cell having been executed first.
    os.makedirs("foods", exist_ok=True)

    file_path = os.path.join("foods", f"{row_id}.html")
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(html_content)

In [7]:

# Download the page of every approved record, store it on disk and keep it in memory.
html_records = []

for _, row in approved_df.iterrows():
    site_url = row["website"]
    row_id = row.get("id")  # make sure your dataframe has an 'id' column
    print(row_id)

    try:
        response = requests.get(site_url, timeout=5)
        # Treat 4xx/5xx answers as failures so that error pages are not saved
        # and later processed as if they were recipe pages.
        response.raise_for_status()

        # When the server sends no charset, requests assumes ISO-8859-1 for
        # text responses, which garbles Persian text. Fall back to the encoding
        # detected from the content in that case.
        if response.encoding is None or response.encoding.lower() == "iso-8859-1":
            response.encoding = response.apparent_encoding

        html_content = response.text
        print(f"(website)->{site_url} — HTML length: {len(html_content)}")

        save_as_file(html_content=html_content, row_id=row_id)

        html_records.append({
            "id": row_id,
            "html": html_content
        })

    except requests.RequestException as e:
        # HTTPError raised by raise_for_status() is a RequestException as well
        print(f"Failed to fetch {site_url}: {e}")


1
(website)->https://chishi.ir/2768-beryani-esfehan/ — HTML length: 88003
2
(website)->https://namnak.com/%D8%AE%D9%88%D8%B1%D8%B4-%D9%85%D8%A7%D8%B3%D8%AA-%D8%A7%D8%B5%D9%81%D9%87%D8%A7%D9%86%DB%8C.p14793 — HTML length: 117558
3
(website)->https://fa.wikipedia.org/wiki/%DA%AF%D9%88%D8%B4%E2%80%8C%D9%81%DB%8C%D9%84#:~:text=%D9%BE%DB%8C%D9%88%D9%86%D8%AF%20%D8%A8%D9%87%20%D8%A8%DB%8C%D8%B1%D9%88%D9%86-,%D8%AF%D9%88%D8%BA%20%D9%88%20%DA%AF%D9%88%D8%B4%D9%81%DB%8C%D9%84%20%D8%A7%D8%B5%D9%81%D9%87%D8%A7%D9%86,%D8%AA%D8%A7%20%D8%A8%D9%87%20%D8%A7%D9%85%D8%B1%D9%88%D8%B2%20%D8%A7%D8%AF%D8%A7%D9%85%D9%87%20%D8%AF%D8%A7%D8%B1%D8%AF. — HTML length: 131388
4
(website)->https://panamag.ir/616-gheyme-rize/ — HTML length: 62232
5
(website)->https://namnak.com/%D8%B7%D8%B1%D8%B2-%D8%AA%D9%87%DB%8C%D9%87-%DA%AF%D8%B2.p75656 — HTML length: 126223
6
(website)->https://namnak.com/%D9%BE%D9%88%D9%84%DA%A9%DB%8C-%DA%A9%D9%86%D8%AC%D8%AF%DB%8C.p15152 — HTML length: 108856
7
(website)->https://www.beytoote.

In [8]:
# Read the extraction prompt text; the page HTML is appended to it later
with open("data_extraction_prompt.txt", encoding="utf-8") as file:
    base_prompt = file.read()

In [9]:
# Parse the HTML of the first fetched record
soup = BeautifulSoup(html_records[0]["html"], "html.parser")

# Keep only the <body> element; use an empty string when the page has none
if soup.body:
    body_content = str(soup.body)
else:
    body_content = ""

# The prompt sent to the model is the base prompt, a blank line, then the body
full_prompt = "\n\n".join((base_prompt, body_content))

In [10]:
# Inspect the extracted <body> markup
body_content

'<body><div class="wrapper-outer"><main id="main"><aside id="slide-out" style="display : none"><div class="search-mobile"><form action="https://www.google.com/search" id="searchform-mobile" method="get" target="_blank">\n<button class="search-button" type="submit" value="جستجو">جستجو کن !</button>\n<input name="domains" type="hidden" value="https://chishi.ir/"/>\n<input name="sitesearch" type="hidden" value="https://chishi.ir/"/>\n<input id="s-mobile" name="q" onblur="if (this.value == \'\') {this.value = \'جستجو\';}" onfocus="if (this.value == \'جستجو\') {this.value = \'\';}" title="جستجو" type="text" value="جستجو"/></form></div><div class="social-icons"></div><div id="mobile-menu"></div></aside><div class="boxed-all" id="wrapper"><div class="inner-wrapper"><header class="header"><div class="header-content"><div class="slide-out-open" id="slide-out-open"><span></span></div><div class="logo"><h2>\n<a href="https://chishi.ir/" title="سایت آموزشی چی شی"><img alt="سایت آموزشی چی شی" heigh

In [11]:
# Inspect the final prompt (base prompt followed by the page body)
full_prompt

'You are a data extraction assistant. The following HTML content is from a Persian-language food website. Your task is to extract structured information about a Persian food item and return it in valid JSON format using the schema below.\n\nSchema:\n{\n    "title": "",\n    "location": {\n        "province": "",\n        "city": "",\n        "coordinates": {\n            "latitude": 0.0,\n            "longtitude": 0.0\n        }\n    },\n    "ingredients": [\n        {\n            "name": "",\n            "amount": 0,\n            "unit": ""\n        }\n    ],\n    "instructions": [\n        ""\n    ],\n    "meal_type": [\n        ""\n    ],\n    "occasion": [\n        ""\n    ]\n}\n\nImportant Notes:\n- Do not guess or hallucinate. If some information is not present, leave the field as an empty string or empty array as appropriate.\n- For coordinates, leave as `0.0` if not provided.\n- For units, use standard ones like "grams", "tablespoons", etc.\n- For `meal_type` and `occasion`, u

In [12]:
# Make sure Outputs directory exists; exist_ok=True makes this cell safe to re-run
os.makedirs("Outputs", exist_ok=True)

def save_json(data, filename):
    """Save extracted data as pretty-printed UTF-8 JSON in ``Outputs/{filename}.json``.

    Args:
        data: A JSON document as ``str``/``bytes``, or an already parsed
            JSON-serializable object (dict, list, ...).
        filename: File name without the ``.json`` extension, e.g. a record id.

    Raises:
        json.JSONDecodeError: If ``data`` is text that is not valid JSON. In
            that case no file is created or overwritten.
    """
    # Parse before touching the file system: opening the target in "w" mode
    # first would leave an empty file behind whenever the text is invalid.
    if isinstance(data, (str, bytes, bytearray)):
        parsed = json.loads(data)
    else:
        parsed = data

    # Create the directory on demand so the function does not depend on
    # another cell having been executed first.
    os.makedirs("Outputs", exist_ok=True)
    output_filename = os.path.join("Outputs", f"{filename}.json")

    # ensure_ascii=False keeps Persian text readable in the saved file
    with open(output_filename, "w", encoding="utf-8") as f:
        json.dump(parsed, f, ensure_ascii=False, indent=4)

In [None]:
# Create the OpenAI client. The key is read from the OPENAI_API_KEY environment
# variable so that no secret is stored in the notebook; a missing variable
# raises KeyError here, before any request is made.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Send the full prompt to the API
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "user", "content": full_prompt}
    ],
    temperature=0.2
)

# Get the extracted JSON text from the response. Fall back to an empty string
# in case the message carries no text content, so later string handling does
# not fail on None.
raw_response = response.choices[0].message.content or ""

In [None]:
# Remove an optional Markdown code fence (```json ... ```) around the answer.
# str.strip() with a multi-character argument strips any of those characters
# from both ends rather than the literal prefix/suffix, so the fence is removed
# with anchored patterns instead.
cleaned_response = re.sub(r"^\s*```(?:json)?\s*|\s*```\s*$", "", raw_response).strip()

# Parse to validate the JSON, then save it to a file
try:
    json_data = json.loads(cleaned_response)
except json.JSONDecodeError as e:
    print(f"Error parsing JSON: {e}")
else:
    # The prompt was built from html_records[0], so the output is stored under
    # that record's id (the loop variable left over from fetching refers to the
    # last row that was processed, not to this record).
    save_json(data=cleaned_response, filename=html_records[0]["id"])