In [1]:
!pip install rdflib pandas tqdm

Collecting rdflib
  Downloading rdflib-7.5.0-py3-none-any.whl.metadata (12 kB)
Downloading rdflib-7.5.0-py3-none-any.whl (587 kB)
   ---------------------------------------- 0.0/587.2 kB ? eta -:--:--
   ---------------------------------------- 0.0/587.2 kB ? eta -:--:--
   ----------------- ---------------------- 262.1/587.2 kB ? eta -:--:--
   ----------------- ---------------------- 262.1/587.2 kB ? eta -:--:--
   ----------------- ---------------------- 262.1/587.2 kB ? eta -:--:--
   -------------------------------------- 587.2/587.2 kB 442.3 kB/s eta 0:00:00
Installing collected packages: rdflib
Successfully installed rdflib-7.5.0



[notice] A new release of pip is available: 25.0.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Task 2.1 
Convert the Recipes, Restaurants, and Nutrition datasets into RDF triples. Assign unique URIs using the base namespace http://kg-course.io/food-nutrition/ and add rdf:type statements to all entities using the schema from Task 1.

In [17]:
import pandas as pd
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD
from tqdm import tqdm
import re

Read only the first 10,000 rows of each dataset to keep the runtime and memory footprint of the row-by-row RDF conversion manageable.

In [18]:
# Upper bound on the number of rows read from each CSV. The conversion cells
# iterate row by row, so this cap keeps the notebook quick to re-run.
# Set to None to load the complete files.
MAX_ROWS = 10000

recipes_df = pd.read_csv("data/cleaned_recipes.csv", nrows=MAX_ROWS)
restaurants_df = pd.read_csv("data/cleaned_restaurants.csv", nrows=MAX_ROWS)
nutrition_df = pd.read_csv("data/cleaned_nutrition.csv", nrows=MAX_ROWS)

In [None]:
# Define namespaces
# BASE is the root under which entity URIs are minted (see make_uri);
# SCHEMA supplies the classes and properties used in the triples.
BASE = Namespace("http://kg-course.io/food-nutrition/")
SCHEMA = Namespace("http://kg-course.io/food-nutrition/schema/")

g = Graph()
g.bind("base", BASE)
# A fresh rdflib Graph already binds the prefix "schema" to schema.org.
# Without replace=True rdflib keeps that binding and registers this namespace
# under a numbered prefix (e.g. "schema1") in the serialized Turtle.
g.bind("schema", SCHEMA, replace=True)

# Load the vocabulary triples into the same graph as the instance data.
g.parse("vocabulary.ttl", format="ttl")  # parse the schema vocabulary

# Helper function to create unique URIs
def make_uri(entity_type, identifier):
    """Mint the URI ``BASE + "<entity_type>/<slug>"`` for one entity.

    Args:
        entity_type: Path segment naming the kind of entity (e.g. "recipe").
        identifier: Any value; it is converted with ``str()`` before use.

    Returns:
        A ``URIRef`` inside the ``BASE`` namespace.

    Note:
        Every run of non-word characters in the identifier is collapsed into a
        single underscore, so identifiers that differ only in punctuation or
        whitespace (e.g. "a b" and "a-b") yield the same URI.
    """
    slug = re.sub(r'\W+', '_', str(identifier))
    local_name = f"{entity_type}/{slug}"
    return URIRef(BASE[local_name])


In [None]:
# Convert recipes to RDF
# One entry per exported column:
# (source column, predicate, cast applied to the cell or None, literal datatype)
recipe_fields = (
    ("Name", SCHEMA.name, None, XSD.string),
    ("RecipeCategory", SCHEMA.category, None, XSD.string),  # category (cuisine equivalent)
    ("CookTime", SCHEMA.cookTime, float, XSD.float),
    ("PrepTime", SCHEMA.prepTime, float, XSD.float),
    ("DatePublished", SCHEMA.datePublished, None, XSD.dateTime),
)

for _, row in tqdm(recipes_df.iterrows(), total=len(recipes_df)):
    recipe_uri = make_uri("recipe", row["RecipeId"])

    # Every recipe is typed, even when all of its optional fields are missing.
    g.add((recipe_uri, RDF.type, SCHEMA.Recipe))

    for column, predicate, cast, datatype in recipe_fields:
        cell = row[column]
        if pd.isna(cell):
            # Missing cells produce no triple at all.
            continue
        value = cell if cast is None else cast(cell)
        g.add((recipe_uri, predicate, Literal(value, datatype=datatype)))


100%|██████████| 10000/10000 [00:03<00:00, 2510.18it/s]


In [21]:
# Preview the first five restaurant rows; the column labels shown here are the
# keys the restaurant conversion has to use.
restaurants_df.head(5)

Unnamed: 0,Restaurant ID,Restaurant Name,Country,City,Address,Locality Verbose,Longitude,Latitude,Cuisines,Currency,Has Table booking,Has Online delivery,Aggregate rating,Rating color,Rating text,Votes,Average cost of two in USD
0,1600219,12212,India,Nashik,"Shop 10, Ramrajya Building 7, Samarth Nagar, N...","College Road, Nashik",73.754636,20.00669,Fast Food,Indian Rupees(Rs.),0,0,3.5,3.7,3,80,4.664
1,17057397,'Ohana,USA,Orlando,"1600 Seven Seas Drive, Lake Buena Vista, FL 32830","Disney World Area, Orlando",-81.585226,28.405437,Hawaiian,Dollar($),0,0,4.5,4.65,5,1151,45.0
2,18222559,{Niche} - Cafe & Bar,India,New Delhi,"2nd & 3rd Floor, M-16, M Block, Outer Circle, ...","Connaught Place, New Delhi",77.222507,28.631516,"North Indian, Chinese, Italian, Continental",Indian Rupees(Rs.),1,0,4.1,4.25,4,492,17.49
3,113702,@Mango,India,Ahmedabad,"Opposite Sindhu Bhawan, Bodakdev, Ahmedabad","Bodakdev, Ahmedabad",72.501764,23.040163,"North Indian, Continental, Mexican, Italian",Indian Rupees(Rs.),0,0,4.1,4.25,4,769,9.328
4,3100446,#45,India,Mangalore,"Ground Floor, Trinity Commercial Complex, Near...","Attavar, Mangalore",0.0,0.0,Cafe,Indian Rupees(Rs.),0,0,3.6,3.7,3,209,6.996


In [None]:
# Convert restaurants to RDF
# The keys must match the CSV header exactly as displayed by
# restaurants_df.head(): "Restaurant ID", "Restaurant Name", "Cuisines", "City".
# Keys such as "RestaurantId", "Name" or "Cuisine" are not columns of this
# frame, and indexing a row with them raises KeyError.
for _, row in tqdm(restaurants_df.iterrows(), total=len(restaurants_df)):

    restaurant_uri = make_uri("restaurant", row["Restaurant ID"])

    # rdf:type
    g.add((restaurant_uri, RDF.type, SCHEMA.Restaurant))

    # Name
    if pd.notna(row["Restaurant Name"]):
        g.add((restaurant_uri, SCHEMA.name,
               Literal(row["Restaurant Name"], datatype=XSD.string)))

    # Cuisine — the cell may hold a comma-separated list
    # (e.g. "North Indian, Chinese, Italian"); it is stored as one literal.
    if "Cuisines" in row and pd.notna(row["Cuisines"]):
        g.add((restaurant_uri, SCHEMA.cuisine,
               Literal(row["Cuisines"], datatype=XSD.string)))

    # City
    if "City" in row and pd.notna(row["City"]):
        g.add((restaurant_uri, SCHEMA.city,
               Literal(row["City"], datatype=XSD.string)))

In [14]:
# Create a nutrition lookup dictionary for efficient access
# Normalize function
def normalize(text):
    """Return ``text`` as a string with surrounding whitespace removed and lower-cased.

    Non-string input is converted with ``str()`` first, so ``None`` becomes
    ``"none"`` and a float NaN becomes ``"nan"``.
    """
    as_text = str(text)
    trimmed = as_text.strip()
    return trimmed.lower()

# Build nutrition lookup by recipe name
nutrition_lookup = {}

for row in nutrition_df.itertuples():
    # normalize() stringifies its input, so a missing Name would be stored
    # under the key "nan" and could then be matched by any recipe whose own
    # name is missing. Leave such rows out of the lookup.
    if pd.isna(row.Name):
        continue
    # Names need not be unique: a later row with the same normalized name
    # replaces the earlier entry.
    nutrition_lookup[normalize(row.Name)] = row

In [15]:
# Add calories and cuisine information to recipes
for row in tqdm(recipes_df.itertuples(), total=len(recipes_df)):

    recipe_uri = make_uri("recipe", row.RecipeId)

    # rdf:type
    g.add((recipe_uri, RDF.type, SCHEMA.Recipe))

    # A recipe without a name gets no name triple and cannot be joined:
    # normalize() would turn the missing value into the string "nan".
    if pd.isna(row.Name):
        continue

    recipe_name = normalize(row.Name)

    # name
    g.add((recipe_uri, SCHEMA.name,
           Literal(row.Name, datatype=XSD.string)))

    # Join to the nutrition table on the normalized recipe name
    nutri = nutrition_lookup.get(recipe_name)
    if nutri is None:
        continue

    # Cuisine (from RecipeCategory)
    # NOTE(review): the recipe conversion cells export RecipeCategory as
    # SCHEMA.category, this cell as SCHEMA.cuisine — confirm which property
    # the vocabulary intends.
    if pd.notna(nutri.RecipeCategory):
        g.add((recipe_uri, SCHEMA.cuisine,
               Literal(nutri.RecipeCategory, datatype=XSD.string)))

    # Calories
    if pd.notna(nutri.Calories):
        g.add((recipe_uri, SCHEMA.calories,
               Literal(float(nutri.Calories), datatype=XSD.float)))

100%|██████████| 10000/10000 [00:01<00:00, 8595.74it/s]


In [16]:
# Convert recipes to RDF (same export as the first recipe conversion cell).
# The graph is a set of triples, so re-adding identical triples is harmless,
# provided the literals are built exactly the same way in both cells.
for _, row in tqdm(recipes_df.iterrows(), total=len(recipes_df)):

    recipe_uri = make_uri("recipe", row["RecipeId"])

    # rdf:type
    g.add((recipe_uri, RDF.type, SCHEMA.Recipe))

    # Name — guarded so that a missing value is never exported as a literal
    if pd.notna(row["Name"]):
        g.add((recipe_uri, SCHEMA.name,
               Literal(row["Name"], datatype=XSD.string)))

    # Use RecipeCategory as cuisine/category
    if pd.notna(row["RecipeCategory"]):
        g.add((recipe_uri, SCHEMA.category,
               Literal(row["RecipeCategory"], datatype=XSD.string)))

    # CookTime — float() keeps the lexical form identical to the first
    # conversion cell; a different form would be stored as a second triple.
    if pd.notna(row["CookTime"]):
        g.add((recipe_uri, SCHEMA.cookTime,
               Literal(float(row["CookTime"]), datatype=XSD.float)))

    # PrepTime
    if pd.notna(row["PrepTime"]):
        g.add((recipe_uri, SCHEMA.prepTime,
               Literal(float(row["PrepTime"]), datatype=XSD.float)))

    # DatePublished
    if pd.notna(row["DatePublished"]):
        g.add((recipe_uri, SCHEMA.datePublished,
               Literal(row["DatePublished"], datatype=XSD.dateTime)))

100%|██████████| 10000/10000 [00:03<00:00, 3299.62it/s]


In [None]:
# Convert restaurants to RDF.
# Keys match the CSV header exactly as displayed by restaurants_df.head()
# ("Restaurant ID", "Restaurant Name", "Cuisines", "City"). Lowercase keys
# such as "restaurant_id" are not columns of this frame and raise KeyError.
restaurant_fields = (
    ("Restaurant Name", SCHEMA.name),
    ("Cuisines", SCHEMA.cuisine),  # may be a comma-separated list; kept as one literal
    ("City", SCHEMA.city),
)

for _, row in tqdm(restaurants_df.iterrows(), total=len(restaurants_df)):
    restaurant_uri = make_uri("restaurant", row["Restaurant ID"])

    g.add((restaurant_uri, RDF.type, SCHEMA.Restaurant))

    for column, predicate in restaurant_fields:
        # Skip missing cells so NaN is never written out as the literal "nan".
        if pd.notna(row[column]):
            g.add((restaurant_uri, predicate,
                   Literal(row[column], datatype=XSD.string)))


In [None]:
# Convert nutrition rows to RDF: one Nutrition node per row, linked from the
# recipe that shares its id.
# NOTE(review): other cells read nutrition_df through PascalCase columns
# (Name, RecipeCategory, Calories). Confirm that "recipe_id", "protein", "fat"
# and "carbs" really are columns of this frame — otherwise the lookups below
# raise KeyError.
nutrient_fields = (
    ("protein", SCHEMA.protein),
    ("fat", SCHEMA.fat),
    ("carbs", SCHEMA.carbs),
)

for _, row in tqdm(nutrition_df.iterrows(), total=len(nutrition_df)):
    # Both URIs are derived from the id; without one there is nothing to mint.
    if pd.isna(row["recipe_id"]):
        continue

    recipe_uri = make_uri("recipe", row["recipe_id"])
    nutrition_uri = make_uri("nutrition", row["recipe_id"])

    g.add((nutrition_uri, RDF.type, SCHEMA.Nutrition))

    for column, predicate in nutrient_fields:
        # A missing value would otherwise be exported as "nan"^^xsd:float,
        # which is not a valid xsd:float lexical form.
        if pd.notna(row[column]):
            g.add((nutrition_uri, predicate,
                   Literal(float(row[column]), datatype=XSD.float)))

    # Link Recipe → Nutrition (the recipe is the subject of hasNutrition)
    g.add((recipe_uri, SCHEMA.hasNutrition, nutrition_uri))


In [None]:
# Write the whole graph (vocabulary plus generated instance triples) to a Turtle file.
g.serialize(destination="KEN4256-structured-KG-Team6.ttl", format="ttl")