# PyNeoInstance Ingestion Demo

Data sourced from: https://www.kaggle.com/datasets/mexwell/countries-states-and-cities-around-the-world

In [10]:
import pandas as pd
from pyneoinstance import Neo4jInstance, load_yaml_file


## Prepare Data

### Assign File Locations

In [11]:
cities = "data/cities.csv"
countries = "data/countries.csv"
states = "data/states.csv"

### Load Into DataFrames

In [12]:
cities_df = pd.read_csv(cities)
countries_df = pd.read_csv(countries)
states_df = pd.read_csv(states)

## Load Configuration

In [13]:
config = load_yaml_file("config.yaml")
db_info = config['db_info']
constraints = config['initializing_queries']['constraints']
indexes = config['initializing_queries']['indexes']
node_load_queries = config['loading_queries']['nodes']
relationship_load_queries = config['loading_queries']['relationships']

## Load Data Into Graph

### Connect to Graph

In [14]:
graph = Neo4jInstance(db_info['uri'], db_info['user'], db_info['password'])

### Create Constraints & Indexes

In [15]:
try:
    graph.execute_write_queries(database=db_info['database'], queries=list(constraints.values()))
except Exception as e:
    print(e)




### Load Nodes

In [16]:
def get_partition(data: pd.DataFrame, batch_size: int = 500) -> int:
    """
    Determine the data partition based on the desired batch size.
    """
    
    partition = int(len(data) / batch_size)
    print("partition: "+str(partition if partition > 1 else 1))
    return partition if partition > 1 else 1

In [17]:
for data, query in list(zip([cities_df, countries_df, states_df], ['cities', 'countries', 'states'])):
    res = graph.execute_write_query_with_data(database=db_info['database'], 
                                                data=data, 
                                                query=node_load_queries[query], 
                                                partitions=get_partition(data, batch_size=500),
                                                parallel=True,
                                                workers=2)
    print(res)

partition: 300


  return bound(*args, **kwds)


{'properties_set': 150454}
partition: 1


  return bound(*args, **kwds)


{'properties_set': 500}
partition: 10


  return bound(*args, **kwds)


{'properties_set': 5077}


### Load Relationships

In [18]:
for data, query in list(zip([cities_df, countries_df, states_df], ['cities', 'countries', 'states'])):
    res = graph.execute_write_query_with_data(  database=db_info['database'], 
                                                data=data, 
                                                query=relationship_load_queries[query], 
                                                partitions=get_partition(data, batch_size=500))
    print(res)

partition: 300


  return bound(*args, **kwds)


{}
partition: 1
{}
partition: 10


  return bound(*args, **kwds)
  return bound(*args, **kwds)


{'relationships_created': 5077}
