# Demo CSV --> Graph Notebook

This notebook demonstrates the data flow of generating a graph from a CSV file.

In [1]:
import os

import json
import pandas as pd
import regex as re

from summarizer.summarizer import Summarizer
from llm.llm import LLM

## Initialize Test Data

In [2]:
# Free-text context written by the user and handed to the Summarizer below:
# one overall description of the CSV ("General Description") followed by one
# sentence per column, keyed by the column name.
USER_GENERATED_INPUT = {
    "General Description": "The data in my .csv file contains information about financial loans made to businesses.",
    # Borrower / loan identity columns
    "BorrowerName": "BorrowerName contains the name of the Business that applied for the loan.",
    "BusinessType": "BusinessType contains the type of business (i.e., Corp, Partnership, LLC, etc.)",
    "LoanNumber": "LoanNumber contains the unique identifier for the loan.",
    "CurrentApprovalAmount": "CurrentApprovalAmount contains the financial amount of the loan.",
    "JobsReported": "JobsReported contains the number of jobs the loan supports.",
    "ProjectState": "ProjectState contains the state where the funds will be used.",
    "OriginatingLender": "OriginatingLender contains the lender that originated the loan.",
    # Declared use-of-proceeds columns
    "UTILITIES_PROCEED": "UTILITIES_PROCEED contains the amount of the loan the borrower said they will use to pay utilities.",
    "PAYROLL_PROCEED": "PAYROLL_PROCEED contains the amount of the loan the borrower said they will use for payroll.",
    "MORTGAGE_INTEREST_PROCEED": "MORTGAGE_INTEREST_PROCEED contains the amount of the loan the borrower said they will use to pay mortgage interest.",
    "RENT_PROCEED": "RENT_PROCEED contains the amount of the loan the borrower said they will use to pay rent.",
    "REFINANCE_EIDL_PROCEED": "REFINANCE_EIDL_PROCEED contains the amount of the loan the borrower said they will use to refinance an existing loan.",
    "HEALTH_CARE_PROCEED": "HEALTH_CARE_PROCEED contains the amount of the loan the borrower said they will use to pay employee health care.",
    "DEBT_INTEREST_PROCEED": "DEBT_INTEREST_PROCEED contains the amount of the loan the borrower said they will use to pay debt interest.",
}

In [3]:
# Load the PPP loan CSV into a DataFrame. The path is relative, so it is
# resolved against the kernel's current working directory.
data = pd.read_csv("data/csv/ppp_loan_data.csv")

## Initialize LLM

In [4]:
# Instantiate the project's LLM wrapper; no arguments are passed here.
# NOTE(review): any model/credential configuration presumably lives inside
# llm.llm -- confirm before running on a fresh environment.
llm = LLM()

In [5]:
# Bundle the LLM client, the user-written column descriptions and the loaded
# DataFrame into a single Summarizer; all later cells operate on this object.
summarizer = Summarizer(llm=llm, user_input=USER_GENERATED_INPUT, data=data)

In [6]:
# Run the discovery step and keep its result for later inspection.
discovery = summarizer.run_discovery()
# print() rather than a bare expression: the saved output is multi-line text,
# which print renders with real line breaks instead of a quoted repr.
print(discovery)

Based on the preliminary analysis of the data, here are some important details:

1. The dataset contains 968,525 entries and 14 columns.
2. There are missing values in some columns, such as BorrowerName, BusinessType, JobsReported, ProjectState, UTILITIES_PROCEED, PAYROLL_PROCEED, MORTGAGE_INTEREST_PROCEED, RENT_PROCEED, REFINANCE_EIDL_PROCEED, HEALTH_CARE_PROCEED, and DEBT_INTEREST_PROCEED.
3. The LoanNumber column is a unique identifier for each loan, and it has a wide range of values.
4. The CurrentApprovalAmount column represents the financial amount of the loan, and it also has a wide range of values.
5. The JobsReported column indicates the number of jobs supported by each loan, with a mean of 51.89 and a maximum of 500.
6. The UTILITIES_PROCEED, PAYROLL_PROCEED, MORTGAGE_INTEREST_PROCEED, RENT_PROCEED, REFINANCE_EIDL_PROCEED, HEALTH_CARE_PROCEED, and DEBT_INTEREST_PROCEED columns represent the amount of the loan allocated for specific purposes.
7. The BorrowerName column contain

In [7]:
# NOTE(review): leftover debugging line, kept disabled. It indexes `discovery`
# like a raw chat-completion response, whereas the cell above prints
# `discovery` directly -- presumably obsolete; consider deleting this cell.
# print(discovery.choices[0].message.content)

In [8]:
# Ask the summarizer for a first graph data model and keep a handle to it.
initial_model = summarizer.create_initial_model()
# Show the model's printed form (the saved output lists nodes=[...] and
# relationships=[...]).
print(initial_model)

nodes=[Node(label='Loan', properties=['LoanNumber', 'CurrentApprovalAmount'], unique_constraints=['LoanNumber']), Node(label='Business', properties=['BorrowerName', 'BusinessType'], unique_constraints=['BorrowerName']), Node(label='State', properties=['ProjectState'], unique_constraints=['ProjectState']), Node(label='Lender', properties=['OriginatingLender'], unique_constraints=['OriginatingLender'])] relationships=[Relationship(type='SUPPORTS', properties=['JobsReported'], unique_constraints=[], source='Loan', target='Business'), Relationship(type='LOCATED_IN', properties=[], unique_constraints=[], source='Loan', target='State'), Relationship(type='ORIGINATED_BY', properties=[], unique_constraints=[], source='Loan', target='Lender')]


In [11]:
# Inspect the model's instance attributes as a plain dict; per the saved
# output this also exposes the private `_raw_response` field.
vars(initial_model)

{'nodes': [Node(label='Loan', properties=['LoanNumber', 'CurrentApprovalAmount'], unique_constraints=['LoanNumber']),
  Node(label='Business', properties=['BorrowerName', 'BusinessType'], unique_constraints=['BorrowerName']),
  Node(label='State', properties=['ProjectState'], unique_constraints=['ProjectState']),
  Node(label='Lender', properties=['OriginatingLender'], unique_constraints=['OriginatingLender'])],
 'relationships': [Relationship(type='SUPPORTS', properties=['JobsReported'], unique_constraints=[], source='Loan', target='Business'),
  Relationship(type='LOCATED_IN', properties=[], unique_constraints=[], source='Loan', target='State'),
  Relationship(type='ORIGINATED_BY', properties=[], unique_constraints=[], source='Loan', target='Lender')],
 '_raw_response': ChatCompletion(id='chatcmpl-8syPc38A59rhZL5iStMMDIOJo8Vyn', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=None, role='assistant', function_call=FunctionCall(argume

In [10]:
# Display the model currently stored on the summarizer (bare expression, so
# the notebook renders its repr as the cell output).
summarizer.current_model

DataModel(nodes=[Node(label='Loan', properties=['LoanNumber', 'CurrentApprovalAmount'], unique_constraints=['LoanNumber']), Node(label='Business', properties=['BorrowerName', 'BusinessType'], unique_constraints=['BorrowerName']), Node(label='State', properties=['ProjectState'], unique_constraints=['ProjectState']), Node(label='Lender', properties=['OriginatingLender'], unique_constraints=['OriginatingLender'])], relationships=[Relationship(type='SUPPORTS', properties=['JobsReported'], unique_constraints=[], source='Loan', target='Business'), Relationship(type='LOCATED_IN', properties=[], unique_constraints=[], source='Loan', target='State'), Relationship(type='ORIGINATED_BY', properties=[], unique_constraints=[], source='Loan', target='Lender')])

In [10]:
# Run a single iteration of model refinement. The call's return value is
# displayed as the cell output (a JSON-formatted string in the saved run).
# NOTE(review): presumably this also updates summarizer.current_model --
# confirm against the Summarizer implementation.
summarizer.iterate_model(iterations=1)

Validating response...


'```json\n{\n    "Nodes": [\n        {\n            "Label": "Loan",\n            "Properties": ["LoanNumber", "CurrentApprovalAmount", "JobsReported", "UTILITIES_PROCEED", "MORTGAGE_INTEREST_PROCEED", "RENT_PROCEED", "REFINANCE_EIDL_PROCEED", "HEALTH_CARE_PROCEED", "DEBT_INTEREST_PROCEED"],\n            "Unique Constraints": ["LoanNumber"],\n            "Reasoning": "The \'Loan\' node represents each unique loan in the dataset. The properties are all the features related to the loan itself. The \'LoanNumber\' is a unique identifier for each loan, so it is used as a unique constraint."\n        },\n        {\n            "Label": "Borrower",\n            "Properties": ["BorrowerName", "BusinessType"],\n            "Unique Constraints": ["BorrowerName"],\n            "Reasoning": "The \'Borrower\' node represents each unique borrower in the dataset. The properties are all the features related to the borrower. The \'BorrowerName\' is a unique identifier for each borrower, so it is used a

In [11]:
# Display the summarizer's current model again, this time after the
# iterate_model() call above.
# NOTE(review): the saved output here is a plain dict, not the DataModel shown
# by the earlier cell -- the outputs may come from different code versions;
# re-run the notebook from a fresh kernel to confirm.
summarizer.current_model

{'Nodes': [{'Label': 'Loan',
   'Properties': ['LoanNumber',
    'CurrentApprovalAmount',
    'JobsReported',
    'UTILITIES_PROCEED',
    'MORTGAGE_INTEREST_PROCEED',
    'RENT_PROCEED',
    'REFINANCE_EIDL_PROCEED',
    'HEALTH_CARE_PROCEED',
    'DEBT_INTEREST_PROCEED'],
   'Unique Constraints': ['LoanNumber'],
   'Reasoning': "The 'Loan' node represents each unique loan in the dataset. The properties are all the features related to the loan itself. The 'LoanNumber' is a unique identifier for each loan, so it is used as a unique constraint."},
  {'Label': 'Borrower',
   'Properties': ['BorrowerName', 'BusinessType'],
   'Unique Constraints': ['BorrowerName'],
   'Reasoning': "The 'Borrower' node represents each unique borrower in the dataset. The properties are all the features related to the borrower. The 'BorrowerName' is a unique identifier for each borrower, so it is used as a unique constraint."},
  {'Label': 'Lender',
   'Properties': ['OriginatingLender'],
   'Unique Constrai