# Ingest

This notebook handles data formatting for ingestion, gets embeddings if required and ingests data into the Neo4j graph.

In [1]:
import sys
import os
import time
from typing import List, Dict
import json
from uuid import uuid4

import pandas as pd

from tools.chunker import Chunker
from tools.embedding import EmbeddingService
from n4j.communicator import GraphWriter

from utils.utils import batch_method, prepare_new_nodes, remove_filler_words


## Define variables

In [2]:
with open("resources/playlist_ids.json") as json_file:
    playlists = json.load(json_file)

In [3]:
list(playlists.keys())[0]

'classics'

In [4]:
embed = EmbeddingService(service_provider="openai")
dims = embed.dimensions

In [5]:
writer = GraphWriter()

driver created


In [6]:
# playlist = 'classics'
# playlist_id = playlists[playlist]

## Load Code

In [7]:
try:
    writer.create_constraints()
except Exception as e:
    print(e)
try:
    writer.create_indexes(vector_dimensions=96)
except Exception as e:
    print(e)

'indexConfig'


In [8]:
titles_to_load = ['worst to best', 'electronic']

In [9]:
def chunk_load_sequence(title: str, playlist_id: str) -> None:
    
    print("creating chunks...")
    chunker = Chunker()
    chunker._chunked_documents = []
    chunker.chunk_youtube_transcripts(playlist_title=title,
                                    cleaning_functions=[remove_filler_words])

    result = []
    failed_idx = None
    playlist_total = len(chunker.chunks_as_list)

    print(title+": "+str(playlist_total))
    print("grabbing embeddings...")

    for idx, bat in enumerate(batch_method(chunker.chunks_as_list, 20)):
        new_nodes = prepare_new_nodes(data=bat, playlist_id=playlist_id, embedding_service=embed)
        result+=new_nodes
        print("total percent: ", str(round(((20*idx)+1) / playlist_total, 4)*100)[:4], "%", " batch", idx+1, "                  ",  end="\r")

    print("loading to graph...\n")
    for idx, bat in enumerate(batch_method(result, 500)):
        writer.load_nodes(data=bat)
        print("total percent: ", str(round(((20*idx)+1) / playlist_total, 4)*100)[:4], "%", " batch", idx+1, "                  ", end="\r")

    print()


In [10]:
for title in titles_to_load:
    chunk_load_sequence(title=title, playlist_id=playlists[title])


electronic: 62518
grabbing embeddings...
loading to graph...9 %  batch 3126

total percent:  4.0 %  batch 1265
