In [1]:
import pandas as pd
import numpy as np
import sqlite3

**Dataset Downloads**

- Hop Teaming data can be found at https://careset.com/docgraph-hop-teaming-dataset/.
- Download the NPPES Data Dissemination from https://download.cms.gov/nppes/NPI_Files.html.
- Download the taxonomy code to classification crosswalk from the National Uniform Claim Committee (https://www.nucc.org/index.php/code-sets-mainmenu-41/provider-taxonomy-mainmenu-40/csv-mainmenu-57).
- Download the Zip code to CBSA crosswalk from here: https://www.huduser.gov/portal/datasets/usps_crosswalk.html. 

**Goal: Create a SQL database for the four datasets.**

In [2]:
db_path = '../data/hop_teaming.sqlite'

chunksize = 1000000

**#1 hopteaming dataset**

**Remove "accidental" referrals in hopteaming DataFrame.** 

Filter for records where transaction_count is at least 50 and the average_day_wait is less than 50.

In [3]:
n = 1
total_rows = 0

with sqlite3.connect(db_path) as db:
    for chunk in pd.read_csv('../data/DocGraph_Hop_Teaming_2018.csv', chunksize = chunksize):                
        chunk = chunk[(chunk['transaction_count'] >= 50) & (chunk['average_day_wait'] <= 50)]
        chunk.columns = [x.lower().replace(' ', '_') for x in chunk.columns]
        chunk.to_sql('hopteaming', db, if_exists = 'append', index = False) 
        print(str(n * chunksize) + " rows added")
        n += 1
        total_rows += len(chunk)
        
    db.execute('CREATE INDEX from_npi ON hopteaming(from_npi)')
    db.execute('CREATE INDEX to_npi ON hopteaming(to_npi)')

print("Done")
print("Total rows:", total_rows)


1000000 rows added
2000000 rows added
3000000 rows added
4000000 rows added
5000000 rows added
6000000 rows added
7000000 rows added
8000000 rows added
9000000 rows added
10000000 rows added
11000000 rows added
12000000 rows added
13000000 rows added
14000000 rows added
15000000 rows added
16000000 rows added
17000000 rows added
18000000 rows added
19000000 rows added
20000000 rows added
21000000 rows added
22000000 rows added
23000000 rows added
24000000 rows added
25000000 rows added
26000000 rows added
27000000 rows added
28000000 rows added
29000000 rows added
30000000 rows added
31000000 rows added
32000000 rows added
33000000 rows added
34000000 rows added
35000000 rows added
36000000 rows added
37000000 rows added
38000000 rows added
39000000 rows added
40000000 rows added
41000000 rows added
42000000 rows added
43000000 rows added
44000000 rows added
45000000 rows added
46000000 rows added
47000000 rows added
48000000 rows added
49000000 rows added
50000000 rows added
51000000 

**#2 Taxonomy Dataset**

In [4]:
n = 1
total_rows = 0

with sqlite3.connect(db_path) as db:
    for chunk in pd.read_csv('../data/nucc_taxonomy_240.csv', chunksize = chunksize): 
        chunk.columns = [x.lower().replace(' ', '_') for x in chunk.columns]
        chunk.to_sql('taxonomy', db, if_exists = 'append', index = False) 
        print(str(n * chunksize) + " rows added")
        total_rows += len(chunk)

print("Done")
print("Total rows:", total_rows)

1000000 rows added
Done
Total rows: 874


**#3 Zip_cbsa Dataset**

In [5]:
n = 1
total_rows = 0

with sqlite3.connect(db_path) as db:
    for chunk in pd.read_csv('../data/ZIP_CBSA_122023.csv', chunksize = chunksize): 
        chunk.columns = [x.lower().replace(' ', '_') for x in chunk.columns]
        chunk.to_sql('zip_cbsa', db, if_exists = 'append', index = False, dtype={'zip': str}) 
        print(str(n * chunksize) + " rows added")
        total_rows += len(chunk)

print("Done")
print("Total rows:", total_rows)


1000000 rows added
Done
Total rows: 47598


**#4 NPPES (NPI) Dataset**

**Select the relevant columns in NPPES dataset for this project.**

- 'NPI'
- Entity Type, indicated by the 'Entity Type Code' field:
    - 1 = Provider (doctors, nurses, etc.)
    - 2 = Facility (Hospitals, Urgent Care, Doctors Offices)
- Entity Name: Either First/Last or Organization or Other Organization Name contained in the following fields:
    - 'Provider Organization Name (Legal Business Name)'
    - 'Provider Last Name (Legal Name)'
    - 'Provider First Name'
    - 'Provider Middle Name'
    - 'Provider Name Prefix Text'
    - 'Provider Name Suffix Text'
    - 'Provider Credential Text'
- Address: Business Practice Location (not mailing), contained in the following fields:
    - 'Provider First Line Business Practice Location Address'
    - 'Provider Second Line Business Practice Location Address'
    - 'Provider Business Practice Location Address City Name'
    - 'Provider Business Practice Location Address State Name'
    - 'Provider Business Practice Location Address Postal Code'
- The provider's taxonomy code, which is contained in one of the 'Healthcare Provider Taxonomy Code*' columns.

**Retrieve the provider's taxonomy code from the npidata DataFrame.**

A provider can have up to 15 taxonomy codes, but we want the one which has Primary Switch = Y in the associated 'Healthcare Provider Primary Taxonomy Switch*' field. Note that this does not always occur in spot 1.

In [6]:
n = 1
total_rows = 0

with sqlite3.connect(db_path) as db:
    for chunk in pd.read_csv('../data/npidata_pfile_20050523-20240211.csv', 
                             low_memory = False, 
                             chunksize = chunksize): 
        # Create Taxonomy Code column
        chunk['Healthcare Provider Taxonomy Code'] = None
        
        # Fill the value in the column if Switch Code column has value Y
        for i in range(1, 16):  # For the 15 columns relating to taxonomy code
            code_col = f'Healthcare Provider Taxonomy Code_{i}'
            switch_col = f'Healthcare Provider Primary Taxonomy Switch_{i}'
            if switch_col in chunk.columns:
                chunk.loc[chunk[switch_col] == 'Y', 
                                   'Healthcare Provider Taxonomy Code'] = chunk.loc[chunk[switch_col] == 'Y', 
                                                                                             code_col]
        # Create a zip code column 
        chunk['Zip'] = pd.to_numeric(chunk['Provider Business Practice Location Address Postal Code'].str[:5], errors='coerce')
        
        # Select the specific columns
        chunk = chunk[['NPI',
                          'Entity Type Code',
                          'Provider Organization Name (Legal Business Name)',
                          'Provider Last Name (Legal Name)',
                          'Provider First Name',
                          'Provider Middle Name',
                          'Provider Name Prefix Text',
                          'Provider Name Suffix Text',
                          'Provider Credential Text',
                          'Provider First Line Business Practice Location Address',
                          'Provider Second Line Business Practice Location Address',
                          'Provider Business Practice Location Address City Name',
                          'Provider Business Practice Location Address Postal Code',
                          'Healthcare Provider Taxonomy Code',
                          'Zip'
                         ]]
        chunk.columns = [x.lower().replace(' ', '_') for x in chunk.columns]

        chunk.to_sql('npi', db, if_exists = 'append', index = False) 
        print(str(n * chunksize) + " rows added")
        n += 1
        total_rows += len(chunk)
        

print("Done")
print("Total rows:", total_rows)

1000000 rows added
2000000 rows added
3000000 rows added
4000000 rows added
5000000 rows added
6000000 rows added
7000000 rows added
8000000 rows added
9000000 rows added


OperationalError: there is already a table named npi

In [9]:
db = sqlite3.connect('../data/hop_teaming.sqlite')
db.execute('CREATE INDEX IF NOT EXISTS npi_index ON npi(npi)')
db.execute('CREATE INDEX IF NOT EXISTS provider_organization_name_index ON npi("provider_organization_name_(legal_business_name)")')
db.execute('CREATE INDEX IF NOT EXISTS zip_index ON npi(Zip)')
db.close()