# Demo CSV --> Graph Notebook

This notebook demonstrates the data flow of generating a graph from a CSV file.

In [1]:
import os

import json
import pandas as pd
import regex as re

from summarizer.summarizer import Summarizer
from llm.llm import LLM

## Initialize Test Data

In [2]:
# Plain-language context handed to the Summarizer as `user_input`.
# 'General Description' describes the dataset as a whole; every other key is
# treated as a CSV column name by the validation cell further down, and its
# value is a short description of that column.
USER_GENERATED_INPUT = {
    "General Description": "This is data on different countries.",
    "id": "unique id for a country.",
    "name": "the country name.",
    "phone_code": "country area code.",
    "capital": "the capital of the country.",
    "currency_name": "name of the country's currency",
    "region": "primary region of the country.",
    "subregion": "subregion location of the country.",
    "timezones": "timezones contained within the country borders.",
    "latitude": "the latitude coordinate of the country center.",
    "longitude": "the longitude coordinate of the country center.",
}

In [3]:
# Load the countries CSV into a DataFrame. The path is relative, so the
# notebook must be run from a working directory that contains data/csv/.
data = pd.read_csv("data/csv/countries.csv")

## Initialize LLM

In [4]:
# Create the project's LLM wrapper with its default arguments.
llm = LLM()

In [5]:
# Build the Summarizer from the LLM wrapper, the user-written column
# descriptions, and the loaded DataFrame.
summarizer = Summarizer(llm=llm, user_input=USER_GENERATED_INPUT, data=data)

In [6]:
# Run the discovery step and print the text it returns (the dataset summary
# shown in this cell's output).
discovery = summarizer.run_discovery()
print(discovery)

The dataset contains information about 250 different countries. Each country is uniquely identified by an 'id' and 'name'. The 'name' feature is a categorical variable with 250 unique values, each corresponding to a different country. 

The geographical location of each country is represented by 'latitude' and 'longitude', both of which are continuous variables. The mean latitude and longitude are approximately 16.4 and 13.5 respectively, but these values vary widely as indicated by their standard deviations. 

The 'phone_code' feature, which represents the country area code, has 235 unique values. This suggests that some countries share the same phone code. 

The 'capital' feature, which represents the capital of each country, has 244 unique values out of 245 non-null entries. This indicates that there is one capital city that is shared by two countries. 

The 'currency_name' feature, which represents the name of the country's currency, has 161 unique values. The most common currency 

In [7]:
# Ask the summarizer for an initial model and print it; the printed repr lists
# the proposed nodes and relationships together with their properties.
initial_model = summarizer.create_initial_model()
print(initial_model)

recieved a valid response
nodes=[Node(label='Country', properties=[Property(name='id', type='int', csv_mapping='id', is_unique=True), Property(name='name', type='str', csv_mapping='name', is_unique=True), Property(name='latitude', type='float', csv_mapping='latitude', is_unique=False), Property(name='longitude', type='float', csv_mapping='longitude', is_unique=False)]), Node(label='Capital', properties=[Property(name='name', type='str', csv_mapping='capital', is_unique=True)]), Node(label='Currency', properties=[Property(name='name', type='str', csv_mapping='currency_name', is_unique=True)]), Node(label='Region', properties=[Property(name='name', type='str', csv_mapping='region', is_unique=True)]), Node(label='Subregion', properties=[Property(name='name', type='str', csv_mapping='subregion', is_unique=True)]), Node(label='Timezone', properties=[Property(name='name', type='str', csv_mapping='timezones', is_unique=True)])] relationships=[Relationship(type='HAS_CAPITAL', properties=[], so

In [8]:
# Display the summarizer's current model (shown in the output as a dict with
# 'nodes' and 'relationships' keys).
summarizer.current_model

{'nodes': [{'label': 'Country',
   'properties': [Property(name='id', type='int', csv_mapping='id', is_unique=True),
    Property(name='name', type='str', csv_mapping='name', is_unique=True),
    Property(name='latitude', type='float', csv_mapping='latitude', is_unique=False),
    Property(name='longitude', type='float', csv_mapping='longitude', is_unique=False)]},
  {'label': 'Capital',
   'properties': [Property(name='name', type='str', csv_mapping='capital', is_unique=True)]},
  {'label': 'Currency',
   'properties': [Property(name='name', type='str', csv_mapping='currency_name', is_unique=True)]},
  {'label': 'Region',
   'properties': [Property(name='name', type='str', csv_mapping='region', is_unique=True)]},
  {'label': 'Subregion',
   'properties': [Property(name='name', type='str', csv_mapping='subregion', is_unique=True)]},
  {'label': 'Timezone',
   'properties': [Property(name='name', type='str', csv_mapping='timezones', is_unique=True)]}],
 'relationships': [{'type': 'HAS_C

In [9]:
# Call visualize() on the most recent entry of the model history.
# NOTE(review): the recorded output for this cell is
# `TypeError: can only concatenate str (not "Property") to str`, so a
# Restart & Run All stops here. It looks like visualize() concatenates
# property values as strings while this model holds Property objects --
# confirm and fix in the library.
summarizer.model_history[-1].visualize()

TypeError: can only concatenate str (not "Property") to str

In [10]:
# Validate the most recent model in the history against the CSV column names
# and print the 'message' entry of the result.
# The column names are every key of USER_GENERATED_INPUT except the free-text
# 'General Description' entry. Filtering by key name (instead of slicing off
# the first key by position) keeps the list correct if the dict is reordered.
print(
    summarizer.model_history[-1].validate_model(
        csv_columns=[key for key in USER_GENERATED_INPUT if key != 'General Description']
    )['message']
)




In [11]:
# Run a single model iteration; the value returned by the call (a DataModel in
# the recorded output) is displayed as the cell result.
summarizer.iterate_model(iterations=1)

recieved a valid response


DataModel(nodes=[Node(label='Country', properties=['id', 'name', 'phone_code', 'capital', 'currency_name', 'region', 'subregion', 'timezones', 'latitude', 'longitude'], unique_constraints=['id']), Node(label='Region', properties=['name'], unique_constraints=['name']), Node(label='Subregion', properties=['name'], unique_constraints=['name']), Node(label='Timezone', properties=['name'], unique_constraints=[])], relationships=[Relationship(type='BELONGS_TO_REGION', properties=[], unique_constraints=[], source='Country', target='Region'), Relationship(type='BELONGS_TO_SUBREGION', properties=[], unique_constraints=[], source='Country', target='Subregion'), Relationship(type='HAS_TIMEZONE', properties=[], unique_constraints=[], source='Country', target='Timezone')])

In [12]:
# Display the summarizer's current model again, after the iteration step above.
summarizer.current_model

{'nodes': [{'label': 'Country',
   'properties': ['id',
    'name',
    'phone_code',
    'capital',
    'currency_name',
    'region',
    'subregion',
    'timezones',
    'latitude',
    'longitude'],
   'unique_constraints': ['id']},
  {'label': 'Region', 'properties': ['name'], 'unique_constraints': ['name']},
  {'label': 'Subregion',
   'properties': ['name'],
   'unique_constraints': ['name']},
  {'label': 'Timezone', 'properties': ['name'], 'unique_constraints': []}],
 'relationships': [{'type': 'BELONGS_TO_REGION',
   'properties': [],
   'unique_constraints': [],
   'source': 'Country',
   'target': 'Region'},
  {'type': 'BELONGS_TO_SUBREGION',
   'properties': [],
   'unique_constraints': [],
   'source': 'Country',
   'target': 'Subregion'},
  {'type': 'HAS_TIMEZONE',
   'properties': [],
   'unique_constraints': [],
   'source': 'Country',
   'target': 'Timezone'}]}