In [1]:
!pip install rdflib pandas tqdm




[notice] A new release of pip is available: 25.0.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Task 2.1 
Convert the Recipes, Restaurants, and Nutrition datasets into RDF triples. Assign unique URIs using the base namespace http://kg-course.io/food-nutrition/ and add rdf:type statements to all entities using the schema from Task 1.

In [2]:
import pandas as pd
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD
from tqdm import tqdm
import re

In [3]:
# Hint 1: Read the first 10000 rows of each dataset only to manage computational complexity and avoid scalability issues
recipes_df = pd.read_csv("data/cleaned_recipes.csv", nrows=10000)
restaurants_df = pd.read_csv("data/cleaned_restaurants.csv", nrows=10000)
nutrition_df = pd.read_csv("data/cleaned_nutrition.csv", nrows=10000)

In [4]:
from rdflib import Graph, Namespace
from rdflib.namespace import RDF, RDFS, XSD

g = Graph()

# Load schema from Task 1
g.parse("vocabulary.ttl", format="turtle")

# Namespaces
BASE = Namespace("http://kg-course.io/food-nutrition/")
EX = Namespace("http://kg-course.io/food-nutrition/schema/")
SCHEMA = Namespace("https://schema.org/")

g.bind("base", BASE)
g.bind("ex", EX)
g.bind("schema", SCHEMA)

In [6]:
# Hint 2: Link recipe data and restaurant data by string match on keywords and cuisine
# Let's create a mapping of keywords and cuisines to recipe and restaurant URIs for efficient linking\

recipe_cache = {}
restaurant_cache = {}
nutrition_cache = {}

In [7]:
# Helper function to create unique URIs
def make_uri(entity_type, identifier):
    safe_id = re.sub(r'\W+', '_', str(identifier))
    return URIRef(BASE[f"{entity_type}/{safe_id}"])

In [8]:
# Recipe loop with the entity cache
for _, row in tqdm(recipes_df.iterrows(), total=len(recipes_df)):

    recipe_id = row["RecipeId"]
    
    if recipe_id in recipe_cache:
        recipe_uri = recipe_cache[recipe_id]
    else:
        recipe_uri = make_uri("recipe", recipe_id)
        recipe_cache[recipe_id] = recipe_uri
        g.add((recipe_uri, RDF.type, SCHEMA.Recipe))

    # Name
    if pd.notna(row["Name"]):
        g.set((recipe_uri, SCHEMA.name,
               Literal(row["Name"], datatype=XSD.string)))

    # Category
    if pd.notna(row["RecipeCategory"]):
        g.set((recipe_uri, SCHEMA.recipeCategory,
               Literal(row["RecipeCategory"], datatype=XSD.string)))

    # CookTime
    if pd.notna(row["CookTime"]):
        g.set((recipe_uri, SCHEMA.cookTime,
               Literal(float(row["CookTime"]), datatype=XSD.decimal)))

    # PrepTime
    if pd.notna(row["PrepTime"]):
        g.set((recipe_uri, SCHEMA.prepTime,
               Literal(float(row["PrepTime"]), datatype=XSD.decimal)))

  0%|          | 0/10000 [00:00<?, ?it/s]

100%|██████████| 10000/10000 [00:05<00:00, 1913.23it/s]


In [9]:
# Nutrition loop with the entity cache
for _, row in tqdm(nutrition_df.iterrows(), total=len(nutrition_df)):

    recipe_name = row["Name"]

    if recipe_name in nutrition_cache:
        nutrition_uri = nutrition_cache[recipe_name]
    else:
        nutrition_uri = make_uri("nutrition", recipe_name)
        nutrition_cache[recipe_name] = nutrition_uri
        g.add((nutrition_uri, RDF.type, SCHEMA.NutritionInformation))

    recipe_uri = make_uri("recipe", recipe_name)

    # Link only once
    g.add((recipe_uri, EX.hasNutrition, nutrition_uri))

    if pd.notna(row["Calories"]):
        g.set((nutrition_uri, SCHEMA.calories,
               Literal(float(row["Calories"]), datatype=XSD.decimal)))

    if pd.notna(row["FatContent"]):
        g.set((nutrition_uri, SCHEMA.fatContent,
               Literal(float(row["FatContent"]), datatype=XSD.decimal)))

    if pd.notna(row["ProteinContent"]):
        g.set((nutrition_uri, SCHEMA.proteinContent,
               Literal(float(row["ProteinContent"]), datatype=XSD.decimal)))

100%|██████████| 10000/10000 [00:03<00:00, 2638.83it/s]


In [10]:
# Restaurant loop with the entity cache
for _, row in tqdm(restaurants_df.iterrows(), total=len(restaurants_df)):

    rest_id = row["Restaurant ID"]

    if rest_id in restaurant_cache:
        restaurant_uri = restaurant_cache[rest_id]
    else:
        restaurant_uri = make_uri("restaurant", rest_id)
        restaurant_cache[rest_id] = restaurant_uri
        g.add((restaurant_uri, RDF.type, SCHEMA.Restaurant))

    if pd.notna(row["Restaurant Name"]):
        g.set((restaurant_uri, SCHEMA.name,
               Literal(row["Restaurant Name"], datatype=XSD.string)))

    if pd.notna(row["Country"]):
        g.set((restaurant_uri, SCHEMA.addressCountry,
               Literal(row["Country"], datatype=XSD.string)))

    if pd.notna(row["City"]):
        g.set((restaurant_uri, SCHEMA.addressLocality,
               Literal(row["City"], datatype=XSD.string)))

    if pd.notna(row["Cuisines"]):
        g.set((restaurant_uri, SCHEMA.servesCuisine,
               Literal(row["Cuisines"], datatype=XSD.string)))

    if pd.notna(row["Aggregate rating"]):
        g.set((restaurant_uri, SCHEMA.starRating,
               Literal(float(row["Aggregate rating"]), datatype=XSD.decimal)))

    if pd.notna(row["Average cost of two in USD"]):
        g.set((restaurant_uri, EX.averageCostOfTwo,
               Literal(float(row["Average cost of two in USD"]), datatype=XSD.decimal)))

100%|██████████| 9550/9550 [00:04<00:00, 2045.79it/s]


In [11]:
# Helper functions for normalization and parsing R list strings
#def normalize(text):
#    return str(text).strip().lower()

#def parse_r_list(r_string):
#    if pd.isna(r_string):
#        return []
#    cleaned = re.sub(r'^c\(|\)$', '', r_string)
#    return re.findall(r'"(.*?)"', cleaned)

# Build restaurant Cuisine Index'
#restaurant_by_cuisine = {}

#for _, row in restaurants_df.iterrows():
    
#    restaurant_uri = make_uri("restaurant", row["Restaurant ID"])
    
#    if pd.notna(row["Cuisines"]):
#        cuisines = row["Cuisines"].split(",")
        
#        for cuisine in cuisines:
#            cuisine_norm = normalize(cuisine)
#            restaurant_by_cuisine.setdefault(cuisine_norm, []).append(restaurant_uri)
# Link recipes to restaurants
#for _, row in tqdm(recipes_df.iterrows(), total=len(recipes_df)):
    
#    recipe_uri = make_uri("recipe", row["RecipeId"])
    
    # Collect possible cuisine keywords from recipe
#    candidate_terms = []
    
    # From RecipeCategory
#    if pd.notna(row["RecipeCategory"]):
#        candidate_terms.append(normalize(row["RecipeCategory"]))
    
    # From Keywords (R-style list)
#    if pd.notna(row["Keywords"]):
#        keywords = parse_r_list(row["Keywords"])
#        candidate_terms.extend([normalize(k) for k in keywords])
    
    # Match against restaurant cuisines
#    for term in candidate_terms:
#        if term in restaurant_by_cuisine:
            
#            for restaurant_uri in restaurant_by_cuisine[term]:
                
                # Link both directions (optional but recommended)
#                g.add((recipe_uri, SCHEMA.servedBy, restaurant_uri))
#                g.add((restaurant_uri, SCHEMA.servesRecipe, recipe_uri))

In [12]:
# Show number of triples
print(f"Total triples in graph: {len(g)}")

Total triples in graph: 165214


In [13]:
# Print first 25 triples
for i, triple in enumerate(g):
    print(triple)
    if i >= 24:
        break

(rdflib.term.URIRef('http://kg-course.io/food-nutrition/recipe/7261'), rdflib.term.URIRef('https://schema.org/cookTime'), rdflib.term.Literal('0.0', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#decimal')))
(rdflib.term.URIRef('http://kg-course.io/food-nutrition/recipe/Cajun_Deep_Fried_Turkey'), rdflib.term.URIRef('http://kg-course.io/food-nutrition/schema/hasNutrition'), rdflib.term.URIRef('http://kg-course.io/food-nutrition/nutrition/Cajun_Deep_Fried_Turkey'))
(rdflib.term.URIRef('http://kg-course.io/food-nutrition/restaurant/18407918'), rdflib.term.URIRef('https://schema.org/addressLocality'), rdflib.term.Literal('Bangalore', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string')))
(rdflib.term.URIRef('http://kg-course.io/food-nutrition/restaurant/18481310'), rdflib.term.URIRef('https://schema.org/starRating'), rdflib.term.Literal('0.0', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#decimal')))
(rdflib.term.URIRef('http://kg-course.io/

In [14]:
# Display as Turtle
#print(g.serialize(format="turtle"))
# Save the graph to a Turtle file
g.serialize(destination="knowledge_graph.ttl", format="turtle")

print("Knowledge graph saved as knowledge_graph.ttl")

Knowledge graph saved as knowledge_graph.ttl


# Generate visualization

In [15]:
!pip install scipy




[notice] A new release of pip is available: 25.0.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [16]:
import rdflib
from rdflib.extras.external_graph_libs import rdflib_to_networkx_digraph
from rdflib.extras.external_graph_libs import rdflib_to_networkx_multidigraph

In [17]:
# load graph
g = rdflib.Graph()
g.parse('knowledge_graph.ttl', format='ttl')

<Graph identifier=Ncd7296c0a84b4e7abfa9e1e41a473649 (<class 'rdflib.graph.Graph'>)>

In [18]:
mdg = rdflib_to_networkx_multidigraph(g)
dg = rdflib_to_networkx_digraph(g)

In [20]:
mdg.number_of_nodes()
#g.order()

65230

In [22]:
mdg.number_of_edges() # also 
#g.size()

165214

In [23]:
#@title Graph visualization

import networkx as nx
import matplotlib.pyplot as plt

def visualize_small_graph(g):

    # Convert RDF graph to NetworkX graph
    nx_graph = nx.Graph()

    for s, p, o in g:
        nx_graph.add_node(s)
        nx_graph.add_node(o)
        if p.find('#') >= 0:
            label_=str(p.split('#')[-1])
        else:
            label_=str(p.split('/')[-1])
        nx_graph.add_edge(s, o, label=label_)

    # Visualization using Matplotlib
    pos = nx.spring_layout(nx_graph)

    labels = dict({})
    for node in nx_graph.nodes():
        if node.find('#') >= 0:
            labels[node] = str(node.split('#')[-1])
        else:
            labels[node] =  str(node.split('/')[-1])

    edge_labels = {(edge[0], edge[1]): edge[2]['label'] for edge in nx_graph.edges(data=True)}

    nx.draw(nx_graph, pos, with_labels=True, labels=labels, node_size=700, node_color='skyblue', font_size=8)
    nx.draw_networkx_edge_labels(nx_graph, pos, edge_labels=edge_labels, font_color='red')

    # Show the plot
    plt.show()



In [None]:
visualize_small_graph(g)