In [1]:
from dotenv import load_dotenv
import polars as pl
from tqdm import tqdm
from neo4j import GraphDatabase
from neo4j.exceptions import DatabaseError 
import os
import uuid

In [2]:
# Load environment variables (python-dotenv; typically from a local .env file)
load_dotenv()

# Neo4j connection settings, read from the environment
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
NEO4J_URI = os.getenv("NEO4J_URI")

# Fail fast with a clear message: os.getenv returns None for an unset variable,
# and the driver cannot be created without a URI.
if not NEO4J_URI:
    raise RuntimeError(
        "NEO4J_URI is not set; define it in the environment or in a .env file"
    )

In [3]:
# Connect to Neo4j
driver = GraphDatabase.driver(uri=NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

# Creating the driver does not open a connection by itself; verify now so an
# unreachable server or bad credentials fail here instead of mid-upload.
driver.verify_connectivity()

In [4]:
# Import the dataframe from the processing pipeline.
# The location can be overridden with the CHUNKS_PARQUET_PATH environment
# variable; the default is the original local path.
CHUNKS_PARQUET_PATH = os.getenv(
    "CHUNKS_PARQUET_PATH",
    "/Users/borja/Documents/Somniumrema/projects/genai/grag/pipeline_outcomes/chunks_df.parquet",
)
chunks_df = pl.read_parquet(CHUNKS_PARQUET_PATH)

# Select only the columns needed. An existing 'chunk_id' column is kept:
# selecting just the seven data columns would drop it and make the
# "if not already present" check below always true.
selected_columns = [
    'post_url', 'post_title', 'series_number', 'blog_date',
    'blog_title', 'chunk_text', 'entities',
]
if "chunk_id" in chunks_df.columns:
    selected_columns.append("chunk_id")
chunks_df = chunks_df.select(selected_columns)

# Add a 'chunk_id' column to the DataFrame with UUIDs (if not already present)
if "chunk_id" not in chunks_df.columns:
    chunks_df = chunks_df.with_columns([
        pl.Series("chunk_id", [str(uuid.uuid4()) for _ in range(len(chunks_df))])
    ])

# Preview the frame as the cell output
chunks_df

post_url,post_title,series_number,blog_date,blog_title,chunk_text,entities,chunk_id
str,str,str,date,str,str,list[list[str]],str
"""https://www.acquired.fm/episod…","""Costco""","""Season 13, Episode 2""",2023-08-20,"""The Complete History & Strateg…","""I don't think I have ever been…","[[""CharlieMunger ?"", ""PER""], [""Ben"", ""PER""], … [""WarrenBuffett"", ""PER""]]","""b7d7697c-50e4-498b-8691-adcba1…"
"""https://www.acquired.fm/episod…","""Costco""","""Season 13, Episode 2""",2023-08-20,"""The Complete History & Strateg…","""But unlike Costco today, they …","[[""costco"", ""ORG""], [""Fedco"", ""ORG""], … [""Fedco"", ""ORG""]]","""b5079a0a-55ce-4ad7-90f2-044221…"
"""https://www.acquired.fm/episod…","""Costco""","""Season 13, Episode 2""",2023-08-20,"""The Complete History & Strateg…","""Jim started as a grocery bagge…","[[""Jim"", ""PER""], [""CraigJelinek"", ""PER""], … [""SamWalton"", ""PER""]]","""6b63cc91-dc07-424a-ad2e-7d1dac…"
"""https://www.acquired.fm/episod…","""Costco""","""Season 13, Episode 2""",2023-08-20,"""The Complete History & Strateg…","""And then they have the greates…","[[""SanDiego"", ""LOC""], [""SanDiegoCityCredit"", ""ORG""], … [""costco"", ""ORG""]]","""5d5a1307-0480-455f-bba4-160924…"
"""https://www.acquired.fm/episod…","""Costco""","""Season 13, Episode 2""",2023-08-20,"""The Complete History & Strateg…","""In 1982, they do ultimately li…","[[""NASDAQ"", ""MISC""], [""sol"", ""PER""], … [""PriceClub"", ""ORG""]]","""b107b462-a499-4a12-8061-2b0252…"
…,…,…,…,…,…,…,…
"""https://www.acquired.fm/episod…","""Special‚ An Acquirer's View in…","""Season 1, Episode 18""",2016-08-22,"""Related Episodes""","""This person who's been their r…","[[""ProfitFromThe"", ""MISC""], [""NBA"", ""MISC""]]","""0d2ff1e2-6b6e-429e-a9d7-09012b…"
"""https://www.acquired.fm/episod…","""Jet""","""Season 1, Episode 19""",2016-08-29,"""Related Episodes""","""I'm here at Adobe so feel free…","[[""Adobe"", ""ORG""], [""Barada@Adobe.com"", ""ORG""], … [""jet.com"", ""ORG""]]","""afaa536e-de79-4c5a-aa32-81700c…"
"""https://www.acquired.fm/episod…","""Jet""","""Season 1, Episode 19""",2016-08-29,"""Related Episodes""","""Well, that was what I was goin…","[[""jet"", ""ORG""], [""jet"", ""ORG""], … [""american"", ""MISC""]]","""3dd23c0e-fa1e-451d-a4e4-35f742…"
"""https://www.acquired.fm/episod…","""Jet""","""Season 1, Episode 19""",2016-08-29,"""Related Episodes""","""Boy, that doesn't sound like W…","[[""jet"", ""ORG""], [""Walmart"", ""ORG""], … [""Microsoft"", ""ORG""]]","""d72e49af-71e3-4043-a1e9-7c92cb…"


In [7]:
# Preview only the chunks whose episode title is exactly "Airbnb"
is_airbnb_episode = pl.col("post_title") == "Airbnb"
chunks_df.filter(is_airbnb_episode)

post_url,post_title,series_number,blog_date,blog_title,chunk_text,entities,chunk_id
str,str,str,date,str,str,list[list[str]],str
"""https://www.acquired.fm/episod…","""Airbnb""","""Season 7, Episode 8""",2020-12-10,"""‍""","""Similarly for gifts, it's a li…","[[""lpshow"", ""ORG""], [""BenGilbert"", ""PER""], … [""DoorDash"", ""ORG""]]","""d502b25a-75ba-4563-9636-3a8720…"
"""https://www.acquired.fm/episod…","""Airbnb""","""Season 7, Episode 8""",2020-12-10,"""‍""","""But I remember looking at hote…","[[""WWDC"", ""MISC""], ["""", ""MISC""], … [""SanFrancisco"", ""LOC""]]","""e7a9672e-80da-47fd-aed5-415566…"
"""https://www.acquired.fm/episod…","""Airbnb""","""Season 7, Episode 8""",2020-12-10,"""‍""","""They're still trying to basica…","[[""cheerio"", ""MISC""], [""y"", ""ORG""], … [""JeffBezos"", ""PER""]]","""db9b5383-786b-4db1-aba6-29f812…"
"""https://www.acquired.fm/episod…","""Airbnb""","""Season 7, Episode 8""",2020-12-10,"""‍""","""For something that seemed like…","[[""Sequoia"", ""ORG""], [""Airbnb"", ""ORG""], … [""Facebook"", ""MISC""]]","""f7929e74-6f8c-4b9f-88c2-058c0a…"
"""https://www.acquired.fm/episod…","""Airbnb""","""Season 7, Episode 8""",2020-12-10,"""‍""","""Never heard of a startup winni…","[[""NobelPeacePrize ,"", ""MISC""], [""UpstartstoBrad"", ""MISC""], … [""Samwer"", ""PER""]]","""ddd51d74-b7fa-48fc-ac20-c84149…"
"""https://www.acquired.fm/episod…","""Airbnb""","""Season 7, Episode 8""",2020-12-10,"""‍""","""Which we've alluded to in our …","[[""ipo"", ""MISC""]]","""4cced6ff-0bb9-46c2-97c0-5e4786…"
"""https://www.acquired.fm/episod…","""Airbnb""","""Season 7, Episode 8""",2020-12-10,"""‍""","""We do have some more nuggets h…","[[""s1"", ""MISC""], [""bamboo"", ""ORG""], … [""BrianChesky"", ""PER""]]","""514239df-bb9b-47d5-9571-ec98fc…"
"""https://www.acquired.fm/episod…","""Airbnb""","""Season 7, Episode 8""",2020-12-10,"""‍""","""All markets are supply and dem…","[[""Airbnb"", ""ORG""], [""Airbnb"", ""ORG""], … [""Airbnb"", ""ORG""]]","""197199bd-5086-4312-b013-73d252…"
"""https://www.acquired.fm/episod…","""Airbnb""","""Season 7, Episode 8""",2020-12-10,"""‍""","""For consumers, it's very easy …","[[""uber"", ""MISC""], [""Airbnb"", ""MISC""], … [""Airbnb"", ""ORG""]]","""44d9486f-acc4-4b38-a6ab-a4c7b0…"


In [5]:
# Create the full-text index on the `text` property of Chunk nodes.
# IF NOT EXISTS keeps this cell re-runnable: without it, a second run fails
# because an index named chunk_fulltext_index already exists.
with driver.session() as session:
    session.run(
        "CREATE FULLTEXT INDEX chunk_fulltext_index IF NOT EXISTS "
        "FOR (c:Chunk) ON EACH [c.text] "
        "OPTIONS { indexConfig: { `fulltext.analyzer`: 'standard' } }"
    ).consume()  # consume the result so any server-side error is raised here

In [6]:
# Prepare data for bulk upload with DISTINCT podcast nodes.
# Each list holds plain dicts that the upload cell sends to Neo4j.
podcast_nodes = []
chunk_nodes = []
belongs_to_rels = []
entity_nodes = []
mentions_rels = []

# Sets of already-seen keys, used to keep each list free of duplicates
unique_podcast_nodes = set()
unique_chunk_nodes = set()
unique_belongs_to_rels = set()
unique_entity_nodes = set()
unique_mentions_rels = set()

# Maps (post_url, post_title) -> generated podcast_id. Looking the id up per
# row links every chunk to its own podcast even when rows of one episode are
# not contiguous; reusing the last *created* podcast node would link such
# chunks to whichever episode was first seen most recently.
podcast_id_by_key = {}

for row in chunks_df.iter_rows(named=True):
    # --- Podcast Nodes ---

    # A podcast is identified by its URL and title
    podcast_key = (row['post_url'], row['post_title'])

    if podcast_key not in unique_podcast_nodes:
        podcast_node = {
            "podcast_id": str(uuid.uuid4()),
            "post_url": row['post_url'],
            "post_title": row['post_title'],
            "blog_date": row['blog_date'],
            "blog_title": row['blog_title'],
            "series_number": row['series_number']
        }
        podcast_nodes.append(podcast_node)
        unique_podcast_nodes.add(podcast_key)
        podcast_id_by_key[podcast_key] = podcast_node["podcast_id"]

    # --- Chunk Nodes ---
    chunk_node = {
        "chunk_id": row['chunk_id'],
        "text": row['chunk_text']
    }
    chunk_key = tuple(chunk_node.items())
    if chunk_key not in unique_chunk_nodes:
        chunk_nodes.append(chunk_node)
        unique_chunk_nodes.add(chunk_key)

    # --- BELONGS_TO Relationships ---
    belongs_to_rel = {
        "chunk_id": row['chunk_id'],
        "podcast_id": podcast_id_by_key[podcast_key]  # id of THIS row's podcast
    }
    belongs_to_key = tuple(belongs_to_rel.items())
    if belongs_to_key not in unique_belongs_to_rels:
        belongs_to_rels.append(belongs_to_rel)
        unique_belongs_to_rels.add(belongs_to_key)

    # --- Entity Nodes and MENTIONS Relationships ---
    # `or []` tolerates a null entities list for a chunk
    for entity, label in (row['entities'] or []):
        entity_node = {
            "name": entity,
            "label": label
        }
        entity_key = tuple(entity_node.items())
        if entity_key not in unique_entity_nodes:
            entity_nodes.append(entity_node)
            unique_entity_nodes.add(entity_key)

        mentions_rel = {
            "chunk_id": row['chunk_id'],
            "entity_name": entity,
            "entity_label": label
        }
        mentions_key = tuple(mentions_rel.items())
        if mentions_key not in unique_mentions_rels:
            mentions_rels.append(mentions_rel)
            unique_mentions_rels.add(mentions_key)

In [7]:
# Sizes of the five upload lists, in order:
# podcasts, chunks, BELONGS_TO rels, entities, MENTIONS rels
tuple(
    len(records)
    for records in (podcast_nodes, chunk_nodes, belongs_to_rels, entity_nodes, mentions_rels)
)

(199, 1304, 1304, 4579, 9365)

In [8]:
# Bulk upload nodes and relationships
BATCH_SIZE = 1000  # rows sent to the server per UNWIND query

# Neo4j status code reported when a transaction fails at commit time.
# Exception `.code` values are strings, so this must be compared as text;
# the DatabaseError class has no `Transaction` attribute to compare against.
COMMIT_FAILED_CODE = "Neo.DatabaseError.Transaction.TransactionCommitFailed"


def _dedupe_records(records):
    """Return `records` without repeated dicts, keeping first-seen order.

    Uses a set of hashable keys so the pass is linear instead of the
    quadratic `item not in list` scan.
    """
    seen = set()
    unique = []
    for record in records:
        key = tuple(sorted(record.items()))
        if key not in seen:
            seen.add(key)
            unique.append(record)
    return unique


def _run_in_batches(session, query, rows, desc):
    """Run `query` once per BATCH_SIZE slice of `rows`, bound to `$rows`.

    Each result is consumed so a server-side failure is raised by the batch
    that caused it.
    """
    for start in tqdm(range(0, len(rows), BATCH_SIZE), desc=desc):
        session.run(query, rows=rows[start:start + BATCH_SIZE]).consume()


# MERGE is used throughout (the original already did so for Podcast and
# Entity) so re-running this cell with the same ids does not duplicate
# chunks or relationships.
with driver.session() as session:
    # --- Podcast Nodes ---
    _run_in_batches(
        session,
        """
        UNWIND $rows AS podcast
        MERGE (p:Podcast {podcast_id: podcast.podcast_id})
        SET p.post_url = podcast.post_url,
            p.post_title = podcast.post_title,
            p.blog_date = podcast.blog_date,
            p.blog_title = podcast.blog_title,
            p.series_number = podcast.series_number
        """,
        podcast_nodes,
        "Creating Podcast Nodes",
    )

    # --- Chunk Nodes ---
    _run_in_batches(
        session,
        """
        UNWIND $rows AS chunk
        MERGE (c:Chunk {chunk_id: chunk.chunk_id})
        SET c.text = chunk.text
        """,
        chunk_nodes,
        "Creating Chunk Nodes",
    )

    # --- BELONGS_TO Relationships ---
    _run_in_batches(
        session,
        """
        UNWIND $rows AS rel
        MATCH (c:Chunk {chunk_id: rel.chunk_id})
        MATCH (p:Podcast {podcast_id: rel.podcast_id})
        MERGE (c)-[:BELONGS_TO]->(p)
        """,
        belongs_to_rels,
        "Creating BELONGS_TO Relationships",
    )

    # --- Entity Nodes and MENTIONS Relationships ---
    # These two names are rebound to de-duplicated lists, as before
    unique_entity_nodes = _dedupe_records(entity_nodes)
    _run_in_batches(
        session,
        """
        UNWIND $rows AS entity
        MERGE (e:Entity {name: entity.name, label: entity.label})
        """,
        unique_entity_nodes,
        "Creating Entity Nodes",
    )

    unique_mentions_rels = _dedupe_records(mentions_rels)
    for start in tqdm(range(0, len(unique_mentions_rels), BATCH_SIZE), desc="Creating MENTIONS Relationships"):
        batch = unique_mentions_rels[start:start + BATCH_SIZE]
        try:
            # Consume inside the try so commit failures are caught here
            session.run(
                """
                UNWIND $rows AS rel
                MATCH (c:Chunk {chunk_id: rel.chunk_id})
                MATCH (e:Entity {name: rel.entity_name, label: rel.entity_label})
                MERGE (c)-[:MENTIONS]->(e)
                """,
                rows=batch
            ).consume()
        except DatabaseError as e:
            if e.code == COMMIT_FAILED_CODE:
                # Best effort: report the failed batch and continue with the next one
                print(f"Error creating MENTIONS relationships (batch {start // BATCH_SIZE + 1}): {e.message}")
            else:
                raise  # Re-raise other types of errors with the original traceback

Creating Podcast Nodes: 100%|██████████| 1/1 [00:00<00:00,  2.51it/s]
Creating Chunk Nodes: 100%|██████████| 2/2 [00:04<00:00,  2.30s/it]
Creating BELONGS_TO Relationships: 100%|██████████| 2/2 [00:03<00:00,  1.83s/it]
Creating Entity Nodes: 100%|██████████| 5/5 [00:09<00:00,  1.86s/it]
Creating MENTIONS Relationships: 100%|██████████| 10/10 [00:42<00:00,  4.26s/it]
