In [15]:
import mysql.connector
from neo4j import GraphDatabase
from tqdm import tqdm
import re
import nltk
nltk.download("omw-1.4")
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")
nltk.download("stopwords")
nltk.download("wordnet")

from nltk.corpus import stopwords, wordnet
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package omw-1.4 to /home/faculty/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /home/faculty/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/faculty/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/faculty/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/faculty/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [16]:
def clean_ingredient(ingredient):
    # Remove any text within parentheses
    cleaned_ingredient = re.sub(r'\([^)]*\)', '', ingredient)
    
    # Tokenize the string into words
    words = nltk.word_tokenize(cleaned_ingredient)

    # Filter out stopwords
    filtered_words = [word for word in words if word.lower() not in stop_words]

    # Use NLTK's POS tagger to identify and extract nouns
    nouns = [word for word, pos in nltk.pos_tag(filtered_words) if pos in ['NN', 'NNS']]

    # Check if the extracted noun is a food item in WordNet
    food_nouns = []
    for noun in nouns:
        synsets = wordnet.synsets(noun, pos=wordnet.NOUN)
        food_related = any("food" in ss.lexname() for ss in synsets)
        if food_related:
            food_nouns.append(noun)
    
    # Join the food nouns with spaces and return the result
    return ' '.join(food_nouns)

In [17]:
# Connect to the MySQL database
cnx = mysql.connector.connect(
    host='',
    user='',
    password='',
    database=''
)

In [18]:
# Neo4j connection
neo4j_connection = GraphDatabase.driver(
    uri="",
    auth=("neo4j", "")
)

In [19]:
def create_recipe_nodes(tx, recipes_data):
    query = """
        UNWIND $recipes_data AS data
        MERGE (r:Recipe {id: data.recipe_id})
        ON CREATE SET r.title = data.title, r.image_url = data.image_url, r.category = data.category, r.cuisine = data.cuisine, r.cook_time_minutes = data.cook_time_minutes, r.total_time_minutes = data.total_time_minutes, r.yields = data.yields, r.created_at = data.created_at, r.updated_at = data.updated_at
    """
    tx.run(query, recipes_data=recipes_data)

In [20]:
def create_ingredient_nodes_and_relationships(tx, recipe_ingredients_data):
    query = """
        UNWIND $recipe_ingredients_data AS data
        MATCH (r:Recipe {id: data.recipe_id})
        MERGE (i:Ingredient {id: data.ingredient_id}) ON CREATE SET i.name = data.ingredient_name, i:Food
        MERGE (r)-[ri:REQUIRES_INGREDIENT]->(i) ON CREATE SET ri.quantity = data.quantity, ri.unit = data.unit
    """
    tx.run(query, recipe_ingredients_data=recipe_ingredients_data)

In [21]:
# Create a Neo4j session
neo4j_session = neo4j_connection.session()

In [22]:
# Create a cursor to execute MySQL queries
cursor = cnx.cursor()

In [23]:
# Query to create recipe nodes
cursor.execute("SELECT id, title, image_url, category, cuisine, cook_time_minutes, total_time_minutes, yields, created_at, updated_at FROM recipe")
recipes = cursor.fetchall()

In [24]:
# Create a list of recipes for batch insertion
recipes_list = []
for recipe in tqdm(recipes, desc="Creating Recipe Nodes"):
    recipe_id, title, image_url, category, cuisine, cook_time_minutes, total_time_minutes, yields, created_at, updated_at = recipe
    recipes_list.append({"recipe_id": recipe_id, "title": title, "image_url": image_url, "category": category, "cuisine": cuisine, "cook_time_minutes": cook_time_minutes, "total_time_minutes": total_time_minutes, "yields": yields, "created_at": created_at, "updated_at": updated_at})

# Batch insert recipes
neo4j_session.run("""
    UNWIND $recipes_list AS recipe
    MERGE (r:Recipe {id: recipe.recipe_id})
    ON CREATE SET r += recipe
""", {"recipes_list": recipes_list})

Creating Recipe Nodes: 100%|██████████| 559/559 [00:00<00:00, 666084.07it/s]


<neo4j._sync.work.result.Result at 0x7f1b27e39190>

In [25]:
# Query to create ingredient nodes and relationships with recipe nodes
cursor.execute("SELECT recipe_id, ingredient_id, quantity, unit FROM recipe_ingredient")
recipe_ingredients = cursor.fetchall()

# Create a list of recipe_ingredients for batch insertion
recipe_ingredients_list = []
for recipe_ingredient in tqdm(recipe_ingredients, desc="Creating ingredients list"):
    recipe_id, ingredient_id, quantity, unit = recipe_ingredient
    cursor.execute("SELECT name FROM ingredient WHERE id = %s", (ingredient_id,))
    ingredient_name = cursor.fetchone()[0]
    
    # Clean the ingredient name using the clean_ingredient function
    cleaned_ingredient_name = clean_ingredient(ingredient_name)

    recipe_ingredients_list.append({"recipe_id": recipe_id, "ingredient_id": ingredient_id, "ingredient_name": cleaned_ingredient_name, "quantity": quantity, "unit": unit})

# Batch insert ingredients and relationships
neo4j_session.run("""
    UNWIND $recipe_ingredients_list AS ri
    MATCH (r:Recipe {id: ri.recipe_id})
    MERGE (i:Ingredient {id: ri.ingredient_id})
    ON CREATE SET i.name = ri.ingredient_name
    MERGE (r)-[rel:REQUIRES_INGREDIENT]->(i)
    ON CREATE SET rel.quantity = ri.quantity, rel.unit = ri.unit
""", {"recipe_ingredients_list": recipe_ingredients_list})


Creating ingredients list: 100%|██████████| 5783/5783 [09:37<00:00, 10.01it/s]


<neo4j._sync.work.result.Result at 0x7f1b27edce50>

In [26]:
# Query to create cuisine nodes and relationships with recipe nodes
cursor.execute("SELECT DISTINCT cuisine FROM recipe WHERE cuisine IS NOT NULL")
cuisines = cursor.fetchall()

# Create a list of cuisines for batch insertion
cuisines_list = []
for cuisine in tqdm(cuisines, desc="Creating Cuisine Nodes"):
    (cuisine_name,) = cuisine
    cuisines_list.append({"cuisine_name": cuisine_name})

# Batch insert cuisines
neo4j_session.run("""
    UNWIND $cuisines_list AS cuisine
    MERGE (c:Cuisine {name: cuisine.cuisine_name})
""", {"cuisines_list": cuisines_list})

# Update the recipe creation query to include the relationship with the cuisine nodes (exclude recipes with NULL cuisine)
neo4j_session.run("""
    UNWIND $recipes_list AS recipe
    MERGE (r:Recipe {id: recipe.recipe_id})
    ON CREATE SET r += recipe
    WITH r, recipe
    WHERE recipe.cuisine IS NOT NULL
    MATCH (c:Cuisine {name: recipe.cuisine})
    MERGE (r)-[:BELONGS_TO]->(c)
    WITH r, recipe
    WHERE recipe.category IS NOT NULL
    MATCH (cat:Category {name: recipe.category})
    MERGE (r)-[:HAS_CATEGORY]->(cat)
""", {"recipes_list": recipes_list})

Creating Cuisine Nodes: 100%|██████████| 7/7 [00:00<00:00, 132252.83it/s]


<neo4j._sync.work.result.Result at 0x7f1b27e5ce20>

In [27]:
# Query to create category nodes and relationships with recipe nodes
cursor.execute("SELECT DISTINCT category FROM recipe WHERE category IS NOT NULL")
categories = cursor.fetchall()

# Create a list of categories for batch insertion
categories_list = []
for category in tqdm(categories, desc="Creating Category Nodes"):
    (category_name,) = category
    categories_list.append({"category_name": category_name})

# Batch insert categories
neo4j_session.run("""
    UNWIND $categories_list AS category
    MERGE (c:Category {name: category.category_name})
""", {"categories_list": categories_list})

# Update the recipe creation query to include the relationship with the category nodes
neo4j_session.run("""
    UNWIND $recipes_list AS recipe
    MERGE (r:Recipe {id: recipe.recipe_id})
    ON CREATE SET r += recipe
    WITH r, recipe
    WHERE recipe.cuisine IS NOT NULL
    MATCH (c:Cuisine {name: recipe.cuisine})
    MERGE (r)-[:BELONGS_TO]->(c)
    WITH r, recipe
    WHERE recipe.category IS NOT NULL
    MATCH (cat:Category {name: recipe.category})
    MERGE (r)-[:HAS_CATEGORY]->(cat)
""", {"recipes_list": recipes_list})

Creating Category Nodes: 100%|██████████| 10/10 [00:00<00:00, 171196.08it/s]


<neo4j._sync.work.result.Result at 0x7f1b20eb34f0>

In [28]:
# Close the Neo4j session
neo4j_session.close()

# Close the MySQL cursor and connection
cursor.close()
cnx.close()