### Pitchbook Mental Health Classification: Ground Truth Data Set ###

In [12]:
import os
import copy
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from textwrap import dedent
import logging
import datetime
from pathlib import Path

# Ollama
import ollama
from pydantic import BaseModel

# Appearance of the Notebook
# Inject a CSS rule so the notebook container spans the full browser width,
# then widen the numpy / pandas text output to match.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
np.set_printoptions(linewidth=110)         # characters per line for printed arrays
pd.set_option('display.max_rows', 500)     # maximum number of rows shown for a frame
pd.set_option('display.max_columns', 100)  # maximum number of columns shown for a frame
pd.set_option('display.width', 1000)       # width of the text representation, in characters

# Import this module with autoreload
# The extension must be loaded and switched to mode 2 before `llmt` is
# imported so that later edits to the module are picked up.
# Re-running this cell in a live kernel prints an "already loaded" notice.
%load_ext autoreload
%autoreload 2
import llmt
# print(f'Package version: {llmt.__version__}')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# Data directory and files
# Everything lives under <home>/data. Path.home() is used instead of
# os.environ.get('HOME'): the latter returns None when HOME is unset, which
# makes os.path.join fail with an unhelpful TypeError.
date_str = datetime.date.today().strftime('%y%m%d')
data_dir = os.path.join(Path.home(), 'data')
log_dir = os.path.join(data_dir, 'log')
Path(log_dir).mkdir(exist_ok=True, parents=True)
print(f'Data: {data_dir}')
print(f'Logs:  {log_dir}')

# Log file: one file per calendar day, named after today's date
log_file_name = f'ollama_log_{date_str}.log'
log_file = os.path.join(log_dir, log_file_name)
dtfmt = '%y%m%d-%H:%M'
logfmt = '%(asctime)s-%(name)s-%(levelname)s-%(message)s'

# Root logger configuration. filemode='w' truncates an existing log file for
# the same day when the handler is created.
logging.basicConfig(filename=log_file,
                    filemode='w',
                    level=logging.INFO,
                    format=logfmt,
                    datefmt=dtfmt)

# Notebook-level logger.
# NOTE(review): the level is raised to CRITICAL here, so INFO/WARNING/ERROR
# records sent through `logger` are dropped even though the root logger is
# configured at INFO -- confirm this is intended.
logger = logging.getLogger(name=__name__)
logger.setLevel(logging.CRITICAL)

Data: /app/data
Logs:  /app/data/log


In [6]:
# Create a connection to Ollama and list the models the server offers.
# The client is built exactly once, outside the try block, so `client` stays
# defined for later cells even if the listing request below fails.
client = ollama.Client(host='http://ollama:11434')
try:
    models = client.list().models
    if models:
        print("Available models:")
        for model in models:
            print(f"- {model['model']}")
    else:
        print("No models found.")
except Exception as e:
    # Best effort: report the problem and let the notebook continue.
    print(f"Error: {e}")

Available models:
- llama3.2:latest
- llama3.2:1b


### Load the data ###

In [93]:
# Folder holding the labeled Excel sheets: <home>/homedata/hcp.
# Path.home() is used instead of os.environ.get('HOME'), which returns None
# (and breaks os.path.join) when HOME is not set.
dset_dir = os.path.join(Path.home(), 'homedata', 'hcp')
xls_file_1 = 'inpatient-companies-classification-03.18.2024.xlsx'
xls_file_2 = 'inpatient-companies-classification-Additional negs-04.01.2025.xlsx'

# File name for the cleaned-up data set (only the name is built in this cell;
# nothing is written here)
output_file_name = f'hcp-classification-{date_str}.parquet'

# Read both sheets as-is
df1 = pd.read_excel(os.path.join(dset_dir, xls_file_1))
df2 = pd.read_excel(os.path.join(dset_dir, xls_file_2))

# (rows, columns) of each raw sheet
print(df1.shape)
print(df2.shape)

(2025, 10)
(35, 10)


In [90]:
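# Keep only the columns of interest from a raw sheet (company info + labels)
# and clean them up: drop unlabeled rows, rename columns, cast labels to int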
def clean_df(df):
    """Return the labeled rows of a raw classification sheet with tidy columns.

    Rows whose ``mental_health`` label is missing are dropped, the index is
    rebuilt, only the company and label columns are kept and renamed, and the
    three label columns are cast to ``int``. The input frame is not modified.

    Column headers are stripped of surrounding whitespace before selection, so
    the inpatient column is found whether the sheet spells its header
    ``'inpatient_healthcare '`` (with a trailing space) or
    ``'inpatient_healthcare'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw sheet with at least the columns ``Companies``, ``CompanyID``,
        ``Description``, ``inpatient_healthcare``, ``outpatient_healthcare``
        and ``mental_health`` (surrounding whitespace in headers is ignored).

    Returns
    -------
    pandas.DataFrame
        New frame with the columns ``name``, ``id``, ``description``,
        ``inpatient``, ``outpatient`` and ``mental_health``, in that order.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    ValueError
        If a remaining row has a label that cannot be cast to ``int``
        (for example a missing ``inpatient`` or ``outpatient`` value).
    """
    # Source header -> output name; the dict order fixes the output column order
    renames = {'Companies': 'name',
               'CompanyID': 'id',
               'Description': 'description',
               'inpatient_healthcare': 'inpatient',
               'outpatient_healthcare': 'outpatient',
               'mental_health': 'mental_health'}
    label_dtypes = {'inpatient': int, 'outpatient': int, 'mental_health': int}

    # rename() returns a new frame, so the caller's data is left untouched;
    # non-string headers are passed through unchanged
    tidy = df.rename(columns=lambda col: col.strip() if isinstance(col, str) else col)

    df_labeled = (tidy
                  .dropna(subset=['mental_health'])
                  .reset_index(drop=True)[list(renames)]
                  .rename(columns=renames)
                  .astype(label_dtypes))
    return df_labeled

# Run both raw sheets through the same clean-up
df1_labeled = clean_df(df1)
df2_labeled = clean_df(df2)

### Duplicate IDs ###

In [94]:
# Uniqueness check: for each frame, the row count in the shape should match
# the number of distinct company IDs
for labeled in (df1_labeled, df2_labeled):
    print(labeled.shape)
    print(len(labeled['id'].unique()))

# IDs of the second sheet that also occur in the first one
df2_id_list = df2_labeled['id'].unique()
duplicate_id_list = [id2 for id2 in df2_id_list
                     if (df1_labeled['id'] == id2).any()]

# Show the matching rows of both frames, separated by three blank lines
for dup_id in duplicate_id_list:
    display(df1_labeled.loc[df1_labeled['id'] == dup_id])
    display(df2_labeled.loc[df2_labeled['id'] == dup_id])
    print('\n\n')

# Are the duplicated rows identical in both frames?
# Both selections are sorted by id and re-indexed so they line up row by row.
label_dtypes = {'inpatient': int, 'outpatient': int, 'mental_health': int}
d1 = (df1_labeled.loc[df1_labeled['id'].isin(duplicate_id_list)]
      .sort_values(by='id', ascending=True)
      .reset_index(drop=True)
      .astype(label_dtypes))
d2 = (df2_labeled.loc[df2_labeled['id'].isin(duplicate_id_list)]
      .sort_values(by='id', ascending=True)
      .reset_index(drop=True))
print(d1.equals(d2))

(169, 6)
169
(35, 6)
35


Unnamed: 0,name,id,description,inpatient,outpatient,mental_health
1,Alima,310749-31,Operator of a non-governmental organization in...,0,0,0


Unnamed: 0,name,id,description,inpatient,outpatient,mental_health
0,Alima,310749-31,Operator of a non-governmental organization in...,0,0,0







Unnamed: 0,name,id,description,inpatient,outpatient,mental_health
2,Apothecare,162054-28,Provider of pharmacy services intended to prov...,0,0,0


Unnamed: 0,name,id,description,inpatient,outpatient,mental_health
1,Apothecare,162054-28,Provider of pharmacy services intended to prov...,0,0,0







Unnamed: 0,name,id,description,inpatient,outpatient,mental_health
5,Ascellus,114978-88,Operator of trauma prevention and treatment cl...,0,0,0


Unnamed: 0,name,id,description,inpatient,outpatient,mental_health
2,Ascellus,114978-88,Operator of trauma prevention and treatment cl...,0,0,0







Unnamed: 0,name,id,description,inpatient,outpatient,mental_health
14,Carisk Partners,130635-73,Operator of a risk transfer and care coordinat...,0,0,0


Unnamed: 0,name,id,description,inpatient,outpatient,mental_health
3,Carisk Partners,130635-73,Operator of a risk transfer and care coordinat...,0,0,0







Unnamed: 0,name,id,description,inpatient,outpatient,mental_health
33,HLTH,182180-08,Provider of events and large-scale forums orga...,0,0,0


Unnamed: 0,name,id,description,inpatient,outpatient,mental_health
4,HLTH,182180-08,Provider of events and large-scale forums orga...,0,0,0







Unnamed: 0,name,id,description,inpatient,outpatient,mental_health
36,Holobiome,124869-43,Operator of a biotechnology company intended t...,0,0,0


Unnamed: 0,name,id,description,inpatient,outpatient,mental_health
5,Holobiome,124869-43,Operator of a biotechnology company intended t...,0,0,0







Unnamed: 0,name,id,description,inpatient,outpatient,mental_health
51,OM Heals,615423-52,Operator of an energy medicine platform design...,0,0,0


Unnamed: 0,name,id,description,inpatient,outpatient,mental_health
6,OM Heals,615423-52,Operator of an energy medicine platform design...,0,0,0







Unnamed: 0,name,id,description,inpatient,outpatient,mental_health
68,SLR Healthcare ABL,12260-80,Provider of debt financing to healthcare servi...,0,0,0


Unnamed: 0,name,id,description,inpatient,outpatient,mental_health
7,SLR Healthcare ABL,12260-80,Provider of debt financing to healthcare servi...,0,0,0







Unnamed: 0,name,id,description,inpatient,outpatient,mental_health
75,Synchronous Health,182457-55,Developer of an artificial intelligence platfo...,0,2,0


Unnamed: 0,name,id,description,inpatient,outpatient,mental_health
8,Synchronous Health,182457-55,Developer of an artificial intelligence platfo...,0,2,0







Unnamed: 0,name,id,description,inpatient,outpatient,mental_health
77,The Medical Center of Southeast Texas,627258-25,Operator of a medical center based out of Port...,1,0,0


Unnamed: 0,name,id,description,inpatient,outpatient,mental_health
9,The Medical Center of Southeast Texas,627258-25,Operator of a medical center based out of Port...,1,0,0







Unnamed: 0,name,id,description,inpatient,outpatient,mental_health
78,Vheda Health,97840-81,Developer of an integrated healthcare platform...,0,0,0


Unnamed: 0,name,id,description,inpatient,outpatient,mental_health
10,Vheda Health,97840-81,Developer of an integrated healthcare platform...,0,0,0







Unnamed: 0,name,id,description,inpatient,outpatient,mental_health
90,Chicago House And Social Service Agency,135432-64,"Non-profit organization offering housing, heal...",0,0,0


Unnamed: 0,name,id,description,inpatient,outpatient,mental_health
11,Chicago House And Social Service Agency,135432-64,"Non-profit organization offering housing, heal...",0,0,0







Unnamed: 0,name,id,description,inpatient,outpatient,mental_health
91,Childrensharbor,253833-49,Provider of support and services to families w...,0,0,0


Unnamed: 0,name,id,description,inpatient,outpatient,mental_health
12,Childrensharbor,253833-49,Provider of support and services to families w...,0,0,0







Unnamed: 0,name,id,description,inpatient,outpatient,mental_health
100,Douglasvillegymnastics,245325-97,Provider of gymnastics and cheerleading instru...,0,0,0


Unnamed: 0,name,id,description,inpatient,outpatient,mental_health
13,Douglasvillegymnastics,245325-97,Provider of gymnastics and cheerleading instru...,0,0,0







Unnamed: 0,name,id,description,inpatient,outpatient,mental_health
101,Force Emotion,178419-79,Developer of an EDA sensor technology designed...,0,0,0


Unnamed: 0,name,id,description,inpatient,outpatient,mental_health
14,Force Emotion,178419-79,Developer of an EDA sensor technology designed...,0,0,0







Unnamed: 0,name,id,description,inpatient,outpatient,mental_health
104,Forward Edge (Commercial Products),222505-93,Provider of substance abuse program administra...,0,0,0


Unnamed: 0,name,id,description,inpatient,outpatient,mental_health
15,Forward Edge (Commercial Products),222505-93,Provider of substance abuse program administra...,0,0,0







Unnamed: 0,name,id,description,inpatient,outpatient,mental_health
105,Genomind,62322-22,Operator of a mental health biotechnology comp...,0,0,0


Unnamed: 0,name,id,description,inpatient,outpatient,mental_health
16,Genomind,62322-22,Operator of a mental health biotechnology comp...,0,0,0





True


In [86]:
# Let's combine the two data frames
