# Demo CSV --> Graph Notebook

This notebook demonstrates the data flow of generating a graph from a CSV file.

In [1]:
import os

import json
import pandas as pd
import regex as re

from summarizer.summarizer import Summarizer
from llm.llm import LLM

## Initialize Test Data

In [2]:
# User-supplied, free-text descriptions that are handed to the Summarizer.
# Apart from "General Description", the keys are presumably the CSV column
# names (verify against the loaded DataFrame's columns).
USER_GENERATED_INPUT = {
    # Description of the dataset as a whole.
    "General Description": "The data in my .csv file contains information about financial loans made to businesses.",
    # Borrower attributes.
    "BorrowerName": "BorrowerName contains the name of the Business that applied for the loan.",
    "BusinessType": "BusinessType contains the type of business (i.e., Corp, Partnership, LLC, etc.)",
    # Loan attributes.
    "LoanNumber": "LoanNumber contains the unique identifier for the loan.",
    "CurrentApprovalAmount": "CurrentApprovalAmount contains the financial amount of the loan.",
    "JobsReported": "JobsReported contains the number of jobs the loan supports.",
    "ProjectState": "ProjectState contains the state where the funds will be used.",
    "OriginatingLender": "OriginatingLender contains the lender that originated the loan.",
    # Intended use of the loan proceeds, one entry per *_PROCEED field.
    "UTILITIES_PROCEED": "UTILITIES_PROCEED contains the amount of the loan the borrower said they will use to pay utilities.",
    "PAYROLL_PROCEED": "PAYROLL_PROCEED contains the amount of the loan the borrower said they will use for payroll.",
    "MORTGAGE_INTEREST_PROCEED": "MORTGAGE_INTEREST_PROCEED contains the amount of the loan the borrower said they will use to pay mortgage interest.",
    "RENT_PROCEED": "RENT_PROCEED contains the amount of the loan the borrower said they will use to pay rent.",
    "REFINANCE_EIDL_PROCEED": "REFINANCE_EIDL_PROCEED contains the amount of the loan the borrower said they will use to refinance an existing loan.",
    "HEALTH_CARE_PROCEED": "HEALTH_CARE_PROCEED contains the amount of the loan the borrower said they will use to pay employee health care.",
    "DEBT_INTEREST_PROCEED": "DEBT_INTEREST_PROCEED contains the amount of the loan the borrower said they will use to pay debt interest.",
}

In [3]:
# Load the loan CSV into a DataFrame. The path is relative, so it resolves
# against the kernel's current working directory.
data = pd.read_csv(filepath_or_buffer="data/csv/ppp_loan_data.csv")

## Initialize LLM

In [4]:
# Create the project's LLM wrapper using only its constructor defaults.
# NOTE(review): any model name / credentials are presumably resolved inside
# LLM itself (e.g. from the environment) — confirm before running.
llm = LLM()

In [5]:
# Wire the three inputs together: the LLM wrapper, the user-written column
# descriptions, and the DataFrame loaded from the CSV.
summarizer = Summarizer(
    llm=llm,
    user_input=USER_GENERATED_INPUT,
    data=data,
)

In [6]:
# Run the summarizer's discovery step and keep the result in `discovery`.
discovery = summarizer.run_discovery()
# Printed (rather than left as the last expression) so the multi-line text is
# shown with real line breaks — assumes `discovery` is a str; confirm.
print(discovery)

Based on the provided information about the financial loans data, here are some key points from the preliminary analysis:

1. **Data Size and Structure**:
   - The dataset contains 968,525 entries and 14 columns.
   - The columns consist of a mix of data types: object (4 columns), int64 (1 column), and float64 (9 columns).
   - There are missing values in several columns, with varying degrees of completeness.

2. **Important Features**:
   - **LoanNumber**: This column serves as a unique identifier for each loan and can be crucial for linking different entities in a graph model.
   - **CurrentApprovalAmount**: The financial amount of the loan is a key feature that can provide insights into the scale of the loans.
   - **JobsReported**: The number of jobs supported by each loan can be a critical metric for assessing the impact of the loans on employment.
   - **ProjectState**: Knowing the state where the funds will be used can be important for geographical analysis.
   - **OriginatingLe

In [8]:
# Ask the summarizer for a first-pass data model and keep a handle to it.
initial_model = summarizer.create_initial_model()
# The printed form is the model's string representation:
# nodes=[Node(...), ...] relationships=[Relationship(...), ...]
print(initial_model)

nodes=[Node(label='Loan', properties=['LoanNumber', 'CurrentApprovalAmount', 'JobsReported'], unique_constraints=[]), Node(label='Business', properties=['BorrowerName', 'BusinessType'], unique_constraints=[]), Node(label='State', properties=['ProjectState'], unique_constraints=[]), Node(label='Lender', properties=['OriginatingLender'], unique_constraints=[]), Node(label='LoanUsage', properties=['UTILITIES_PROCEED', 'PAYROLL_PROCEED', 'MORTGAGE_INTEREST_PROCEED', 'RENT_PROCEED', 'REFINANCE_EIDL_PROCEED', 'HEALTH_CARE_PROCEED', 'DEBT_INTEREST_PROCEED'], unique_constraints=[])] relationships=[Relationship(type='HAS_LOAN', properties=[], unique_constraints=[], source='Business', target='Loan'), Relationship(type='LOCATED_IN', properties=[], unique_constraints=[], source='Business', target='State'), Relationship(type='ORIGINATED_BY', properties=[], unique_constraints=[], source='Loan', target='Lender'), Relationship(type='INTENDED_FOR', properties=[], unique_constraints=[], source='Loan', t

In [9]:
# Show the model as plain nested dicts/lists.
# NOTE(review): `dict` is accessed as an attribute, not called. The recorded
# output is a dict, so it appears to be a property on the model class —
# confirm it is not a method that should be invoked as `.dict()`.
initial_model.dict

{'nodes': [{'label': 'Loan',
   'properties': ['LoanNumber', 'CurrentApprovalAmount', 'JobsReported'],
   'unique_constraints': []},
  {'label': 'Business',
   'properties': ['BorrowerName', 'BusinessType'],
   'unique_constraints': []},
  {'label': 'State', 'properties': ['ProjectState'], 'unique_constraints': []},
  {'label': 'Lender',
   'properties': ['OriginatingLender'],
   'unique_constraints': []},
  {'label': 'LoanUsage',
   'properties': ['UTILITIES_PROCEED',
    'PAYROLL_PROCEED',
    'MORTGAGE_INTEREST_PROCEED',
    'RENT_PROCEED',
    'REFINANCE_EIDL_PROCEED',
    'HEALTH_CARE_PROCEED',
    'DEBT_INTEREST_PROCEED'],
   'unique_constraints': []}],
 'relationships': [{'type': 'HAS_LOAN',
   'properties': [],
   'unique_constraints': [],
   'source': 'Business',
   'target': 'Loan'},
  {'type': 'LOCATED_IN',
   'properties': [],
   'unique_constraints': [],
   'source': 'Business',
   'target': 'State'},
  {'type': 'ORIGINATED_BY',
   'properties': [],
   'unique_constraints'

In [10]:
# Inspect the model the summarizer currently holds. In the recorded run this
# matches `initial_model.dict` as far as the truncated output shows.
summarizer.current_model

{'nodes': [{'label': 'Loan',
   'properties': ['LoanNumber', 'CurrentApprovalAmount', 'JobsReported'],
   'unique_constraints': []},
  {'label': 'Business',
   'properties': ['BorrowerName', 'BusinessType'],
   'unique_constraints': []},
  {'label': 'State', 'properties': ['ProjectState'], 'unique_constraints': []},
  {'label': 'Lender',
   'properties': ['OriginatingLender'],
   'unique_constraints': []},
  {'label': 'LoanUsage',
   'properties': ['UTILITIES_PROCEED',
    'PAYROLL_PROCEED',
    'MORTGAGE_INTEREST_PROCEED',
    'RENT_PROCEED',
    'REFINANCE_EIDL_PROCEED',
    'HEALTH_CARE_PROCEED',
    'DEBT_INTEREST_PROCEED'],
   'unique_constraints': []}],
 'relationships': [{'type': 'HAS_LOAN',
   'properties': [],
   'unique_constraints': [],
   'source': 'Business',
   'target': 'Loan'},
  {'type': 'LOCATED_IN',
   'properties': [],
   'unique_constraints': [],
   'source': 'Business',
   'target': 'State'},
  {'type': 'ORIGINATED_BY',
   'properties': [],
   'unique_constraints'

In [11]:
# Run iterate_model for a single iteration. The return value (a DataModel in
# the recorded output) is displayed as the cell result.
summarizer.iterate_model(iterations=1)

DataModel(nodes=[Node(label='Loan', properties=['LoanNumber', 'CurrentApprovalAmount', 'JobsReported'], unique_constraints=[]), Node(label='Business', properties=['BorrowerName', 'BusinessType'], unique_constraints=[]), Node(label='State', properties=['ProjectState'], unique_constraints=[]), Node(label='Lender', properties=['OriginatingLender'], unique_constraints=[]), Node(label='LoanUsage', properties=['UTILITIES_PROCEED', 'PAYROLL_PROCEED', 'MORTGAGE_INTEREST_PROCEED', 'RENT_PROCEED', 'REFINANCE_EIDL_PROCEED', 'HEALTH_CARE_PROCEED', 'DEBT_INTEREST_PROCEED'], unique_constraints=[])], relationships=[Relationship(type='HAS_LOAN', properties=[], unique_constraints=[], source='Business', target='Loan'), Relationship(type='LOCATED_IN', properties=[], unique_constraints=[], source='Business', target='State'), Relationship(type='ORIGINATED_BY', properties=[], unique_constraints=[], source='Loan', target='Lender'), Relationship(type='INTENDED_FOR', properties=[], unique_constraints=[], sourc

In [12]:
# Re-inspect the summarizer's current model after the iterate_model call, to
# compare against the model shown before iterating.
summarizer.current_model

{'nodes': [{'label': 'Loan',
   'properties': ['LoanNumber', 'CurrentApprovalAmount', 'JobsReported'],
   'unique_constraints': []},
  {'label': 'Business',
   'properties': ['BorrowerName', 'BusinessType'],
   'unique_constraints': []},
  {'label': 'State', 'properties': ['ProjectState'], 'unique_constraints': []},
  {'label': 'Lender',
   'properties': ['OriginatingLender'],
   'unique_constraints': []},
  {'label': 'LoanUsage',
   'properties': ['UTILITIES_PROCEED',
    'PAYROLL_PROCEED',
    'MORTGAGE_INTEREST_PROCEED',
    'RENT_PROCEED',
    'REFINANCE_EIDL_PROCEED',
    'HEALTH_CARE_PROCEED',
    'DEBT_INTEREST_PROCEED'],
   'unique_constraints': []}],
 'relationships': [{'type': 'HAS_LOAN',
   'properties': [],
   'unique_constraints': [],
   'source': 'Business',
   'target': 'Loan'},
  {'type': 'LOCATED_IN',
   'properties': [],
   'unique_constraints': [],
   'source': 'Business',
   'target': 'State'},
  {'type': 'ORIGINATED_BY',
   'properties': [],
   'unique_constraints'