In [None]:
!pip install rdflib pandas tqdm

# Fill the Knowledge Graph with structured data

Convert the Recipes, Restaurants, and Nutrition datasets into RDF triples. Assign unique URIs using the base namespace http://kg-course.io/food-nutrition/ and add rdf:type statements to all entities using the schema from Task 1.

In [None]:
import pandas as pd
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD, SDO
from tqdm import tqdm
import re

Read only the first 10,000 rows of each dataset to keep the computation manageable and avoid scalability issues.

In [None]:
# Every dataset is truncated to the same number of leading rows.
ROW_LIMIT = 10000

recipes_df = pd.read_csv("data/cleaned_recipes.csv", nrows=ROW_LIMIT)
restaurants_df = pd.read_csv("data/cleaned_restaurants.csv", nrows=ROW_LIMIT)
nutrition_df = pd.read_csv("data/cleaned_nutrition.csv", nrows=ROW_LIMIT)

In [None]:
# Namespaces: BASE is the root for entity URIs, KGS is the root for the
# project-specific schema terms.
BASE = Namespace("http://kg-course.io/food-nutrition/")
KGS = Namespace("http://kg-course.io/food-nutrition/schema/")

# Create the graph and register short prefixes for both namespaces.
g = Graph()
for prefix, namespace in (("base", BASE), ("kgs", KGS)):
    g.bind(prefix, namespace)

# Load the schema vocabulary into the same graph that will receive the data triples.
g.parse("vocabulary.ttl", format="ttl")

# Helper function to create unique URIs
def make_uri(entity_type, identifier):
    """Build the URI of an entity under the BASE namespace.

    Args:
        entity_type: Path segment naming the kind of entity (e.g. "recipe").
        identifier: Value identifying the entity; it is converted with ``str()``
            and every run of non-word characters is collapsed into a single "_".

    Returns:
        URIRef of the form ``<BASE><entity_type>/<sanitized identifier>``.
        Identifiers that differ only in non-word characters map to the same URI.
    """
    sanitized = re.compile(r'\W+').sub('_', str(identifier))
    local_path = "{}/{}".format(entity_type, sanitized)
    return URIRef(BASE[local_path])


## Convert recipes, restaurants, and nutrition information into RDF triples and add them to the graph

In [None]:
# Convert recipes to RDF

# Text columns copied onto the recipe as xsd:string literals.
recipe_text_properties = [
    ("Name", SDO.name),
    ("RecipeCategory", SDO.recipeCategory),  # Category (Cuisine equivalent)
    ("Keywords", SDO.keywords),
    ("RecipeIngredientParts", SDO.recipeIngredient),
    ("RecipeInstructions", SDO.recipeInstructions),
]

# Numeric time columns stored as xsd:decimal.
# NOTE(review): the unit of CookTime/PrepTime is not visible here — confirm against the cleaning step.
recipe_time_properties = [
    ("CookTime", SDO.cookTime),
    ("PrepTime", SDO.prepTime),
]

for _, row in tqdm(recipes_df.iterrows(), total=len(recipes_df)):

    recipe_uri = make_uri("recipe", row["RecipeId"])

    # rdf:type
    g.add((recipe_uri, RDF.type, SDO.Recipe))

    # Missing (NaN) cells are skipped so no placeholder literals enter the graph.
    for column, predicate in recipe_text_properties:
        if pd.notna(row[column]):
            g.add((recipe_uri, predicate, Literal(row[column], datatype=XSD.string)))

    for column, predicate in recipe_time_properties:
        if pd.notna(row[column]):
            g.add((recipe_uri, predicate, Literal(float(row[column]), datatype=XSD.decimal)))

    # NOTE(review): assumes DatePublished is already a valid xsd:date lexical form (YYYY-MM-DD) — confirm.
    if pd.notna(row["DatePublished"]):
        g.add((recipe_uri, SDO.datePublished, Literal(row["DatePublished"], datatype=XSD.date)))

    # Images: the cell is split on ", " and each non-empty part becomes its own
    # schema:image literal typed as xsd:anyURI.
    if pd.notna(row["Images"]):
        for image in row["Images"].split(', '):
            image = image.strip()
            # split() never yields None, so emptiness is the only thing to check;
            # blank fragments (e.g. from a trailing separator) are dropped.
            if image:
                g.add((recipe_uri, SDO.image, Literal(image, datatype=XSD.anyURI)))



In [None]:
# Convert restaurants to RDF

# Spellings accepted for a textual yes/no flag (compared case-insensitively).
_TRUE_FLAGS = {"yes", "y", "true", "t", "1"}
_FALSE_FLAGS = {"no", "n", "false", "f", "0"}


def _parse_flag(value):
    """Interpret a yes/no cell as a boolean.

    ``bool()`` on any non-empty string is True (so "No" would become True),
    therefore strings are matched against known spellings instead.

    Args:
        value: A non-missing cell value (string, bool or number).

    Returns:
        True or False, or None when a string is not a recognized flag spelling.
    """
    if isinstance(value, str):
        normalized = value.strip().lower()
        if normalized in _TRUE_FLAGS:
            return True
        if normalized in _FALSE_FLAGS:
            return False
        return None
    # Booleans and numbers (e.g. 0/1) keep their ordinary truthiness.
    return bool(value)


# Text columns copied onto the restaurant as xsd:string literals.
restaurant_text_properties = [
    ("Restaurant Name", SDO.name),
    ("Cuisines", SDO.servesCuisine),
    ("Locality Verbose", SDO.addressLocality),
    ("Currency", SDO.currenciesAccepted),
]

# Numeric columns stored as xsd:decimal.
restaurant_decimal_properties = [
    ("Average cost of two in USD", KGS.averageCostOfTwo),
    ("Aggregate rating", SDO.starRating),
]

for _, row in tqdm(restaurants_df.iterrows(), total=len(restaurants_df)):

    restaurant_uri = make_uri("restaurant", row["Restaurant ID"])

    # rdf:type
    g.add((restaurant_uri, RDF.type, SDO.Restaurant))

    # Missing (NaN) cells are skipped so no placeholder literals enter the graph.
    for column, predicate in restaurant_text_properties:
        if pd.notna(row[column]):
            g.add((restaurant_uri, predicate, Literal(row[column], datatype=XSD.string)))

    # hasOnlineDelivery: emitted only when the cell can be read as a definite yes/no.
    if pd.notna(row["Has Online delivery"]):
        has_online_delivery = _parse_flag(row["Has Online delivery"])
        if has_online_delivery is not None:
            g.add((restaurant_uri, KGS.hasOnlineDelivery, Literal(has_online_delivery, datatype=XSD.boolean)))

    for column, predicate in restaurant_decimal_properties:
        if pd.notna(row[column]):
            g.add((restaurant_uri, predicate, Literal(float(row[column]), datatype=XSD.decimal)))


In [None]:
# Convert nutrition records to RDF and link them to the recipes that share their name.

# Nutrient columns stored as xsd:decimal on the nutrition node.
nutrition_properties = [
    ("Calories", SDO.calories),
    ("FatContent", SDO.fatContent),
    ("SaturatedFatContent", SDO.saturatedFatContent),
    ("CholesterolContent", SDO.cholesterolContent),
    ("SodiumContent", SDO.sodiumContent),
    ("CarbohydrateContent", SDO.carbohydrateContent),
    ("FiberContent", SDO.fiberContent),
    ("SugarContent", SDO.sugarContent),
    ("ProteinContent", SDO.proteinContent),
]

# Index recipe ids by name once, rather than re-filtering recipes_df for every
# nutrition row. Recipes without a name are left out: a missing name never
# compares equal to anything, so they could not be matched anyway.
recipe_ids_by_name = {}
for recipe_name, recipe_id in zip(recipes_df["Name"], recipes_df["RecipeId"]):
    if pd.notna(recipe_name):
        recipe_ids_by_name.setdefault(recipe_name, []).append(recipe_id)

for i, row in tqdm(nutrition_df.iterrows(), total=len(nutrition_df)):
    # The DataFrame index label serves as the nutrition record's identifier.
    nutrition_uri = make_uri("nutrition", i)
    g.add((nutrition_uri, RDF.type, SDO.NutritionInformation))

    for column, predicate in nutrition_properties:
        value = row[column]
        # Skip missing values: "nan" is not a valid xsd:decimal lexical form.
        if pd.notna(value):
            g.add((nutrition_uri, predicate, Literal(value, datatype=XSD.decimal)))

    # Link Nutrition → Recipe: every recipe with exactly the same Name points to this node.
    for recipe_id in recipe_ids_by_name.get(row["Name"], ()):
        recipe_uri = make_uri("recipe", recipe_id)
        g.add((recipe_uri, KGS.hasNutrition, nutrition_uri))


In [None]:
# Write the graph to disk in Turtle format.
g.serialize(format="ttl", destination="KEN4256-structured-KG-Team6.ttl")

In [None]:
# Show the graph size and a short Turtle preview; printing the entire
# serialization would flood the notebook output and bloat the saved file.
PREVIEW_LINES = 40

turtle_text = g.serialize(format="ttl")
if isinstance(turtle_text, bytes):
    # Older rdflib releases return bytes from serialize(); newer ones return str.
    turtle_text = turtle_text.decode("utf-8")

print(f"{len(g)} triples in graph; first {PREVIEW_LINES} lines of the Turtle serialization:")
print("\n".join(turtle_text.splitlines()[:PREVIEW_LINES]))