In [3]:
import pandas as pd
import sqlite3

In [1]:
# Path of the SQLite database file that the cells in this notebook open via sqlite3.connect().
database_path = '../data/hop_teaming'

In [4]:
chunksize = 1000000
n = 0  # running total of rows written; starts at 0 so the printed total is exact

# Read the hop-teaming CSV in chunks, filter each chunk, and append it to the `hopteaming` table.
with sqlite3.connect(database_path) as db:
    # Start from an empty table: the loop below only appends, so without this
    # a re-run of the cell would store every row a second time.
    db.execute('DROP TABLE IF EXISTS hopteaming')
    for chunk in pd.read_csv('../data/DocGraph_Hop_Teaming_2018_Commercial/DocGraph_Hop_Teaming_2018.csv', chunksize=chunksize):
        # Normalise column names: lower-case, spaces -> underscores.
        chunk.columns = [x.lower().replace(' ', '_') for x in chunk.columns]
        # Keep only rows with at least 50 transactions and an average wait under 50 days.
        chunk = chunk[(chunk.transaction_count >= 50) & (chunk.average_day_wait < 50)]
        chunk.to_sql('hopteaming', db, if_exists='append', index=False)
        n += chunk.shape[0]
        print("Added " + str(chunk.shape[0]) + " rows. Total = " + str(n))

Added 153332 rows
Added 155155 rows
Added 157854 rows
Added 157310 rows
Added 153192 rows
Added 150092 rows
Added 150721 rows
Added 155427 rows
Added 157028 rows
Added 150601 rows
Added 151017 rows
Added 159637 rows
Added 160506 rows
Added 150137 rows
Added 149767 rows
Added 163595 rows
Added 162169 rows
Added 163097 rows
Added 159830 rows
Added 153770 rows
Added 161355 rows
Added 160586 rows
Added 160145 rows
Added 158177 rows
Added 155225 rows
Added 156739 rows
Added 158768 rows
Added 159394 rows
Added 161332 rows
Added 157262 rows
Added 153534 rows
Added 162608 rows
Added 159421 rows
Added 155861 rows
Added 157002 rows
Added 157531 rows
Added 155274 rows
Added 156349 rows
Added 156270 rows
Added 155120 rows
Added 154790 rows
Added 158463 rows
Added 157091 rows
Added 157828 rows
Added 156131 rows
Added 153782 rows
Added 152566 rows
Added 153639 rows
Added 152816 rows
Added 155932 rows
Added 160923 rows
Added 159131 rows
Added 153004 rows
Added 155318 rows
Added 157505 rows
Added 1578

In [42]:
def get_taxo_code(row):
    """Return the primary taxonomy code of a single provider row.

    The row carries 15 numbered pairs of fields,
    ``Healthcare Provider Taxonomy Code_<n>`` and
    ``Healthcare Provider Primary Taxonomy Switch_<n>`` for n = 1..15.
    The pairs are scanned in order and the code belonging to the first
    switch equal to ``'Y'`` is returned.

    Parameters
    ----------
    row : mapping
        Anything indexable by column name (e.g. a pandas Series or a dict)
        that contains all 30 fields named above.

    Returns
    -------
    The taxonomy code of the first slot flagged ``'Y'``, or ``None`` when no
    slot is flagged. Returning ``None`` avoids silently reporting whatever
    happens to be stored in slot 15 as if it were the primary code.
    """
    code_prefix = 'Healthcare Provider Taxonomy Code_'
    switch_prefix = 'Healthcare Provider Primary Taxonomy Switch_'
    for num in range(1, 16):
        if row[switch_prefix + str(num)] == 'Y':
            return row[code_prefix + str(num)]
    # No slot is marked as primary.
    return None




In [31]:
# This section creates the npi table

chunksize = 1000000
n = 0  # running total of rows written to the table

with sqlite3.connect(database_path) as db:
    # Start from an empty table: the loop below only appends, so without this
    # a re-run of the cell would store every provider a second time.
    # (Dropping the table also drops any index defined on it.)
    db.execute('DROP TABLE IF EXISTS npi')
    for chunk in pd.read_csv('../data/NPPES_Data_Dissemination_February_2024/nppes_main.csv', dtype=str, chunksize=chunksize):
        # Column names that contain "Taxonomy Code" or "Taxonomy Switch".
        taxonomy_cols = [col for col in chunk.columns if 'Taxonomy Code' in col or 'Taxonomy Switch' in col]

        # Keep the identity/address columns plus the taxonomy columns needed by get_taxo_code.
        chunk = chunk[['NPI', 'Entity Type Code', 'Provider Organization Name (Legal Business Name)', 'Provider Last Name (Legal Name)',
                        'Provider First Name', 'Provider Middle Name', 'Provider Name Prefix Text', 'Provider Name Suffix Text',
                        'Provider Credential Text', 'Provider First Line Business Practice Location Address', 'Provider Second Line Business Practice Location Address',
                        'Provider Business Practice Location Address City Name', 'Provider Business Practice Location Address State Name',
                        'Provider Business Practice Location Address Postal Code'] + taxonomy_cols]

        rename_dict = {'Provider Organization Name (Legal Business Name)': 'Provider Business Name',
                    'Provider Last Name (Legal Name)': 'Provider Last Name',
                    'Provider Business Practice Location Address Postal Code': 'Provider Zip Code'
                    }
        df = chunk.rename(columns=rename_dict)
        # The taxonomy columns are not renamed, so the lookup can run on the original chunk.
        df["taxo_code"] = chunk.apply(get_taxo_code, axis=1)
        df = df.drop(columns=taxonomy_cols)
        # Drop rows missing any of the three fields; .copy() gives an independent
        # frame so the column assignment below does not write into a slice.
        df = df[df.NPI.notnull() & df.taxo_code.notnull() & df['Provider Zip Code'].notnull()].copy()
        # Keep only the first five characters of the postal code (all values are
        # non-null strings here because of dtype=str and the filter above).
        df['Provider Zip Code'] = df['Provider Zip Code'].str[:5]
        # Normalise column names: lower-case, spaces -> underscores, parentheses removed.
        df.columns = [x.lower().replace(' ', '_').replace("(", "").replace(")", "") for x in df.columns]
        df.to_sql('npi', db, if_exists='append', index=False)  # Append the chunk to the npi table
        n += df.shape[0]
        print("Added " + str(df.shape[0]) + " rows. Total = " + str(n))


Added 914032 rows. Total = 914032
Added 903772 rows. Total = 1817804
Added 945915 rows. Total = 2763719
Added 968799 rows. Total = 3732518
Added 979097 rows. Total = 4711615
Added 987765 rows. Total = 5699380
Added 991930 rows. Total = 6691310
Added 995928 rows. Total = 7687238
Added 180369 rows. Total = 7867607


In [7]:
# This section creates the taxonomy table (taxonomy code lookup read from the NUCC CSV)
taxonomy_code_to_classification = pd.read_csv('../data/nucc_taxonomy_240.csv', dtype=str)
with sqlite3.connect(database_path) as db:
    # The whole file is written in one call, so 'replace' keeps the cell idempotent:
    # with 'append', re-running it would duplicate every lookup row and multiply
    # the rows produced by any join against this table.
    taxonomy_code_to_classification.to_sql('taxonomy', db, if_exists='replace', index=False)

In [37]:
# This section creates the zip_cbsa table
chunksize = 10000
n = 0  # running total of rows written to the table

with sqlite3.connect(database_path) as db:
    # Start from an empty table: the loop below only appends, so without this
    # a re-run of the cell would store every ZIP row a second time.
    db.execute('DROP TABLE IF EXISTS zip_cbsa')
    for df in pd.read_csv('../data/ZIP_CBSA_122023.csv', dtype=str, chunksize=chunksize):
        df = df.drop(columns=['RES_RATIO', 'BUS_RATIO', 'OTH_RATIO', 'TOT_RATIO'])
        # Keep rows with a ZIP that belong to CBSA '34980'. A missing CBSA never
        # equals the string, so no separate null check is needed.
        # NOTE(review): '34980' is presumably the Nashville CBSA code (the
        # downstream table is called nash_referrals) -- confirm.
        df = df[df.ZIP.notnull() & (df.CBSA == '34980')]
        df.to_sql('zip_cbsa', db, if_exists='append', index=False)  # Append the chunk to the zip_cbsa table
        n += df.shape[0]
        print("Added " + str(df.shape[0]) + " rows. Total = " + str(n))

Added 0 rows. Total = 0
Added 157 rows. Total = 157
Added 0 rows. Total = 157
Added 0 rows. Total = 157
Added 0 rows. Total = 157


In [9]:
# SCRATCH 

# hopteaming = pd.read_csv('../data/DocGraph_Hop_Teaming_2018_Commercial/DocGraph_Hop_Teaming_2018.csv', 
#                           nrows = 100)


# chunk = pd.read_csv('../data/NPPES_Data_Dissemination_February_2024/nppes_main.csv', dtype=str,
#                           nrows = 100)
# taxonomy_cols = [col for col in chunk.columns if 'Taxonomy Code' in col or 'Taxonomy Switch' in col]

# chunk = chunk[['NPI', 'Entity Type Code', 'Provider Organization Name (Legal Business Name)', 'Provider Last Name (Legal Name)', 
#              'Provider First Name', 'Provider Middle Name', 'Provider Name Prefix Text', 'Provider Name Suffix Text', 
#              'Provider Credential Text', 'Provider First Line Business Practice Location Address', 'Provider Second Line Business Practice Location Address',
#              'Provider Business Practice Location Address City Name', 'Provider Business Practice Location Address State Name', 
#              'Provider Business Practice Location Address Postal Code']+taxonomy_cols]
# df = chunk.copy()
# df["taxo_code"] = chunk.apply(get_taxo_code, axis=1)
# df = df.drop(columns = taxonomy_cols)
# df =df[df.NPI.notnull() & df.taxo_code.notnull()]
# df.columns = [x.lower().replace(' ', '_') for x in df.columns] 



# df = pd.read_csv('../data/DocGraph_Hop_Teaming_2018_Commercial/DocGraph_Hop_Teaming_2018.csv', nrows=100)
# # df = df[df.ZIP.notnull() & df.CBSA.notnull()]
# df[(df.transaction_count >=50) & (df.average_day_wait <50)]

In [33]:
# Create the lookup index on npi.npi. try/finally guarantees the connection is
# closed even if index creation raises (e.g. the npi table does not exist yet).
db = sqlite3.connect(database_path)
try:
    db.execute('CREATE INDEX IF NOT EXISTS idx_npi ON npi(npi)')
    # db.execute('CREATE INDEX IF NOT EXISTS idx_from_npi ON hopteaming(from_npi)')
    # db.execute('CREATE INDEX IF NOT EXISTS idx_to_npi ON hopteaming(to_npi)')
finally:
    db.close()

In [32]:
# Sanity check: count npi rows whose provider_zip_code is NULL.
query = """
SELECT COUNT(*) AS null_count
FROM npi
WHERE provider_zip_code IS NULL;
"""

with sqlite3.connect(database_path) as db:
    res = pd.read_sql(sql=query, con=db)

# Display the one-row result frame as the cell output.
res

Unnamed: 0,null_count
0,0


In [36]:
# Maintenance cell: optionally drop the zip_cbsa table so it can be rebuilt.
# It is disabled by default because the query cell that follows joins on
# zip_cbsa; dropping it unconditionally here makes a top-to-bottom run fail
# with "no such table". Set the flag to True, run this cell, then re-run the
# cell that creates zip_cbsa.
RESET_ZIP_CBSA = False

if RESET_ZIP_CBSA:
    with sqlite3.connect(database_path) as db:
        # Drop the table
        db.execute('DROP TABLE IF EXISTS zip_cbsa')

In [41]:
# Build the nash_referrals table: hopteaming rows whose receiving NPI (to_npi)
# is an entity-type-'2' provider with a zip code present in zip_cbsa, enriched
# with that provider's name, location and taxonomy grouping/classification.
query = """
SELECT 
    ht.from_npi as referrer_npi, 
    ht.to_npi as hosp_npi, 
    ht.transaction_count AS txns,
    ht.patient_count AS patients,
    npi.provider_business_name AS hosp_business_name, 
    tx.Grouping AS hosp_grouping, 
    tx.Classification as hosp_classification,
    npi.provider_zip_code as hosp_zip_code,
    npi.provider_business_practice_location_address_city_name as hosp_city,
    npi.provider_business_practice_location_address_state_name as hosp_state
FROM 
    hopteaming AS ht
    INNER JOIN npi ON ht.to_npi = npi.npi
    INNER JOIN zip_cbsa ON npi.provider_zip_code = zip_cbsa.ZIP
    INNER JOIN taxonomy AS tx ON npi.taxo_code = tx.Code
WHERE 
    npi.entity_type_code = '2'
"""
with sqlite3.connect(database_path) as db:
    # Materialise the join result in memory first ...
    referrals = pd.read_sql(sql=query, con=db)
    # ... then rebuild the target table from scratch and write the rows into it.
    db.execute('DROP TABLE IF EXISTS nash_referrals')
    referrals.to_sql(name='nash_referrals', con=db, if_exists='append', index=False)
    # Index both NPI columns of the new table.
    for index_statement in (
        'CREATE INDEX IF NOT EXISTS idx_referrer_npi ON nash_referrals(referrer_npi)',
        'CREATE INDEX IF NOT EXISTS idx_hosp_npi ON nash_referrals(hosp_npi)',
    ):
        db.execute(index_statement)