## Read in datasets for 2015, 2016, 2017 and pickle

 - keep specific columns as discussed in brainstorming
 - add column for year

Columns to keep

- National Provider Identifier

- Last Name/Organization Name of the Provider

- Entity Type of the Provider

- City of the Provider

- Zip Code of the Provider

- State Code of the Provider

- Provider Type

- Place of Service

- HCPCS Code

- HCPCS Description

- Number of Services

- Number of Medicare Beneficiaries

- Number of Distinct Medicare Beneficiary/Per Day Services

- Average Medicare Allowed Amount

Data links:

- https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/Medicare-Provider-Charge-Data/Physician-and-Other-Supplier2017

- https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/Medicare-Provider-Charge-Data/Physician-and-Other-Supplier2016

- https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/Medicare-Provider-Charge-Data/Physician-and-Other-Supplier2015

In [2]:
import pandas as pd
import pickle
import numpy as np

## Read in full csv without chunk method

In [3]:
%%time
# read in without using chunk method - preferable since we're not filtering data on the way in
# keep only columns we want to use
# change column headers to change spaces and special characters 

cols = ['National Provider Identifier',
        'Last Name/Organization Name of the Provider',
        'Entity Type of the Provider',
        'City of the Provider',
        'Zip Code of the Provider',
        'State Code of the Provider',
        'Provider Type',
        'Place of Service', 
        'HCPCS Code', 
        'HCPCS Description', 
        'Number of Services',
        'Number of Medicare Beneficiaries',
        'Number of Distinct Medicare Beneficiary/Per Day Services',
        'Average Medicare Allowed Amount']

df_payments_2017 = pd.read_csv('../data/Medicare_Provider_Utilization_and_Payment_Data__Physician_and_Other_Supplier_PUF_CY2017.csv', 
                               usecols = cols, low_memory = False)
df_payments_2017['year'] = 2017
df_payments_2017.columns = df_payments_2017.columns.str.replace(" ", "_").str.replace("/", "_").str.lower()
df_payments_2017.head()


CPU times: user 22.8 s, sys: 8.03 s, total: 30.9 s
Wall time: 34.7 s


Unnamed: 0,national_provider_identifier,last_name_organization_name_of_the_provider,entity_type_of_the_provider,city_of_the_provider,zip_code_of_the_provider,state_code_of_the_provider,provider_type,place_of_service,hcpcs_code,hcpcs_description,number_of_services,number_of_medicare_beneficiaries,number_of_distinct_medicare_beneficiary_per_day_services,average_medicare_allowed_amount,year
0,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99217,Hospital observation care discharge,100.0,96,100,73.3988,2017
1,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99218,Hospital observation care typically 30 minutes,26.0,25,26,100.08,2017
2,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99219,Hospital observation care typically 50 minutes,52.0,51,52,136.38,2017
3,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99220,Hospital observation care typically 70 minutes...,59.0,59,59,190.363729,2017
4,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99221,"Initial hospital inpatient care, typically 30 ...",16.0,16,16,101.68,2017


In [5]:
# Row/column count, then per-column dtypes and non-null counts.
# `show_counts` replaces the `null_counts` keyword, which pandas deprecated
# in 1.2 and removed in 2.0.
print(df_payments_2017.shape)
df_payments_2017.info(verbose = True, show_counts = True)

# null values in the columns visible in the output: last_name (146), city (2), zip_code (2)

(9847443, 15)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9847443 entries, 0 to 9847442
Data columns (total 15 columns):
 #   Column                                                    Non-Null Count    Dtype  
---  ------                                                    --------------    -----  
 0   national_provider_identifier                              9847443 non-null  int64  
 1   last_name_organization_name_of_the_provider               9847297 non-null  object 
 2   entity_type_of_the_provider                               9847443 non-null  object 
 3   city_of_the_provider                                      9847441 non-null  object 
 4   zip_code_of_the_provider                                  9847441 non-null  object 
 5   state_code_of_the_provider                                9847443 non-null  object 
 6   provider_type                                             9847443 non-null  object 
 7   place_of_service                                          9847443 n

In [6]:
# Write the trimmed 2017 frame to disk as a pickle.
pd.to_pickle(df_payments_2017, "../data/df_payments_2017.pkl")

In [7]:
# Reload the 2017 frame from the pickle file.
pkl_2017 = "../data/df_payments_2017.pkl"
df_payments_2017 = pd.read_pickle(pkl_2017)

In [8]:
# Show dtypes and non-null counts of the reloaded frame.
# `show_counts` replaces `null_counts` (deprecated in pandas 1.2, removed in 2.0).
df_payments_2017.info(verbose = True, show_counts = True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9847443 entries, 0 to 9847442
Data columns (total 15 columns):
 #   Column                                                    Non-Null Count    Dtype  
---  ------                                                    --------------    -----  
 0   national_provider_identifier                              9847443 non-null  int64  
 1   last_name_organization_name_of_the_provider               9847297 non-null  object 
 2   entity_type_of_the_provider                               9847443 non-null  object 
 3   city_of_the_provider                                      9847441 non-null  object 
 4   zip_code_of_the_provider                                  9847441 non-null  object 
 5   state_code_of_the_provider                                9847443 non-null  object 
 6   provider_type                                             9847443 non-null  object 
 7   place_of_service                                          9847443 non-null  objec

## Read in, select columns, rename columns, and pickle the 2016 and 2015 CSV files

In [9]:
%%time

df_payments_2016 = pd.read_csv('../data/Medicare_Provider_Utilization_and_Payment_Data__Physician_and_Other_Supplier_PUF_CY2016.csv', 
                               usecols = cols, low_memory = False)
df_payments_2016['year'] = 2016
df_payments_2016.columns = df_payments_2016.columns.str.replace(" ", "_").str.replace("/", "_").str.lower()
df_payments_2016.head()


CPU times: user 26.2 s, sys: 7.17 s, total: 33.3 s
Wall time: 35.5 s


Unnamed: 0,national_provider_identifier,last_name_organization_name_of_the_provider,entity_type_of_the_provider,city_of_the_provider,zip_code_of_the_provider,state_code_of_the_provider,provider_type,place_of_service,hcpcs_code,hcpcs_description,number_of_services,number_of_medicare_beneficiaries,number_of_distinct_medicare_beneficiary_per_day_services,average_medicare_allowed_amount,year
0,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99217,Hospital observation care discharge,57.0,55,57,72.743158,2016
1,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99219,Hospital observation care typically 50 minutes,38.0,38,38,135.01,2016
2,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99220,Hospital observation care typically 70 minutes...,23.0,23,23,189.239565,2016
3,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99221,"Initial hospital inpatient care, typically 30 ...",20.0,20,20,100.75,2016
4,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99222,"Initial hospital inpatient care, typically 50 ...",96.0,87,96,136.25,2016


In [10]:
# Row/column count, then per-column dtypes and non-null counts.
# `show_counts` replaces the `null_counts` keyword, which pandas deprecated
# in 1.2 and removed in 2.0.
print(df_payments_2016.shape)
df_payments_2016.info(verbose = True, show_counts = True)

# null values in last_name (136), zip_code (2)

(9714896, 15)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9714896 entries, 0 to 9714895
Data columns (total 15 columns):
 #   Column                                                    Non-Null Count    Dtype  
---  ------                                                    --------------    -----  
 0   national_provider_identifier                              9714896 non-null  int64  
 1   last_name_organization_name_of_the_provider               9714760 non-null  object 
 2   entity_type_of_the_provider                               9714896 non-null  object 
 3   city_of_the_provider                                      9714896 non-null  object 
 4   zip_code_of_the_provider                                  9714894 non-null  object 
 5   state_code_of_the_provider                                9714896 non-null  object 
 6   provider_type                                             9714896 non-null  object 
 7   place_of_service                                          9714896 n

In [11]:
# Write the trimmed 2016 frame to disk as a pickle.
pd.to_pickle(df_payments_2016, '../data/df_payments_2016.pkl')

In [12]:
# Reload the 2016 frame from the pickle file.
pkl_2016 = '../data/df_payments_2016.pkl'
df_payments_2016 = pd.read_pickle(pkl_2016)

In [13]:
# Show dtypes and non-null counts of the reloaded frame.
# `show_counts` replaces `null_counts` (deprecated in pandas 1.2, removed in 2.0).
df_payments_2016.info(verbose = True, show_counts = True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9714896 entries, 0 to 9714895
Data columns (total 15 columns):
 #   Column                                                    Non-Null Count    Dtype  
---  ------                                                    --------------    -----  
 0   national_provider_identifier                              9714896 non-null  int64  
 1   last_name_organization_name_of_the_provider               9714760 non-null  object 
 2   entity_type_of_the_provider                               9714896 non-null  object 
 3   city_of_the_provider                                      9714896 non-null  object 
 4   zip_code_of_the_provider                                  9714894 non-null  object 
 5   state_code_of_the_provider                                9714896 non-null  object 
 6   provider_type                                             9714896 non-null  object 
 7   place_of_service                                          9714896 non-null  objec

In [14]:
%%time

df_payments_2015 = pd.read_csv('../data/Medicare_Provider_Utilization_and_Payment_Data__Physician_and_Other_Supplier_PUF_CY2015.csv', 
                               usecols = cols, low_memory = False)
df_payments_2015['year'] = 2015
df_payments_2015.columns = df_payments_2015.columns.str.replace(" ", "_").str.replace("/", "_").str.lower()
df_payments_2015.head()

CPU times: user 26.3 s, sys: 6.8 s, total: 33.1 s
Wall time: 35.1 s


Unnamed: 0,national_provider_identifier,last_name_organization_name_of_the_provider,entity_type_of_the_provider,city_of_the_provider,zip_code_of_the_provider,state_code_of_the_provider,provider_type,place_of_service,hcpcs_code,hcpcs_description,number_of_services,number_of_medicare_beneficiaries,number_of_distinct_medicare_beneficiary_per_day_services,average_medicare_allowed_amount,year
0,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99217,Hospital observation care discharge,23.0,23.0,23.0,72.68,2015
1,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99219,Hospital observation care typically 50 minutes,18.0,18.0,18.0,135.85,2015
2,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99221,"Initial hospital inpatient care, typically 30 ...",59.0,58.0,59.0,101.365085,2015
3,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99222,"Initial hospital inpatient care, typically 50 ...",132.0,130.0,132.0,139.010455,2015
4,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99223,"Initial hospital inpatient care, typically 70 ...",220.0,215.0,220.0,205.185955,2015


In [15]:
# Row/column count, then per-column dtypes and non-null counts.
# `show_counts` replaces the `null_counts` keyword, which pandas deprecated
# in 1.2 and removed in 2.0.
print(df_payments_2015.shape)
df_payments_2015.info(verbose = True, show_counts = True)

# null values in last_name (145), entity (1), city (4), and 1 each in: zip_code, state,
# provider_type, place, number_of_services, number_of_medicare_beneficiaries,
# number_of_distinct, average_medicare_allowed

(9497892, 15)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9497892 entries, 0 to 9497891
Data columns (total 15 columns):
 #   Column                                                    Non-Null Count    Dtype  
---  ------                                                    --------------    -----  
 0   national_provider_identifier                              9497892 non-null  int64  
 1   last_name_organization_name_of_the_provider               9497747 non-null  object 
 2   entity_type_of_the_provider                               9497891 non-null  object 
 3   city_of_the_provider                                      9497888 non-null  object 
 4   zip_code_of_the_provider                                  9497891 non-null  object 
 5   state_code_of_the_provider                                9497891 non-null  object 
 6   provider_type                                             9497891 non-null  object 
 7   place_of_service                                          9497891 n

In [16]:
# Write the trimmed 2015 frame to disk as a pickle.
pd.to_pickle(df_payments_2015, '../data/df_payments_2015.pkl')

In [17]:
# Reload the 2015 frame from the pickle file.
pkl_2015 = '../data/df_payments_2015.pkl'
df_payments_2015 = pd.read_pickle(pkl_2015)

In [18]:
# Show dtypes and non-null counts of the reloaded frame.
# `show_counts` replaces `null_counts` (deprecated in pandas 1.2, removed in 2.0).
df_payments_2015.info(verbose = True, show_counts = True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9497892 entries, 0 to 9497891
Data columns (total 15 columns):
 #   Column                                                    Non-Null Count    Dtype  
---  ------                                                    --------------    -----  
 0   national_provider_identifier                              9497892 non-null  int64  
 1   last_name_organization_name_of_the_provider               9497747 non-null  object 
 2   entity_type_of_the_provider                               9497891 non-null  object 
 3   city_of_the_provider                                      9497888 non-null  object 
 4   zip_code_of_the_provider                                  9497891 non-null  object 
 5   state_code_of_the_provider                                9497891 non-null  object 
 6   provider_type                                             9497891 non-null  object 
 7   place_of_service                                          9497891 non-null  objec

## Read in using chunks - not used in the end, as it took longer (about 1.5 minutes) than reading each csv file in one go (about 35 seconds).

In [None]:
# NOTE: reference only - every line in this cell is commented out, so running
# it does nothing. It records the chunked-read experiment in which the year
# column was added to each chunk inside the loop.
# NOTE(review): the column list below omits 'HCPCS Code' and 'HCPCS Description',
# which the active `cols` list used for the full-file reads includes.

#%%time

#add year column inside for loop

#cols = ['National Provider Identifier', 
#        'Last Name/Organization Name of the Provider', 
#        'Entity Type of the Provider', 
#        'City of the Provider', 
#        'Zip Code of the Provider', 
#        'State Code of the Provider', 
#        'Provider Type', 
#        'Place of Service', 
#        'Number of Services', 
#        'Number of Medicare Beneficiaries', 
#        'Number of Distinct Medicare Beneficiary/Per Day Services', 
#        'Average Medicare Allowed Amount']

#payment_rows = []
#for chunk in pd.read_csv('../data/Medicare_Provider_Utilization_and_Payment_Data__Physician_and_Other_Supplier_PUF_CY2017.csv', 
#                         chunksize = 1000, usecols = cols):
#    chunk['year'] = 2017
#    payment_rows.append(chunk) 
                
#df_payments_2017 = pd.concat(payment_rows, ignore_index=True)

# Timing recorded for this variant:
#CPU times: user 1min 10s, sys: 21 s, total: 1min 31s
#Wall time: 1min 32s


In [None]:
# NOTE: reference only - every line in this cell is commented out, so running
# it does nothing. It records the chunked-read experiment in which the year
# column was added once, after concatenating the chunks.
# NOTE(review): the column list below omits 'HCPCS Code' and 'HCPCS Description',
# which the active `cols` list used for the full-file reads includes.

#%%time
#add year column outside for loop, more efficient

#cols = ['National Provider Identifier',
#        'Last Name/Organization Name of the Provider',
#        'Entity Type of the Provider',
#        'City of the Provider',
#        'Zip Code of the Provider',
#        'State Code of the Provider',
#        'Provider Type',
#        'Place of Service',
#        'Number of Services',
#        'Number of Medicare Beneficiaries',
#        'Number of Distinct Medicare Beneficiary/Per Day Services',
#        'Average Medicare Allowed Amount']

#payment_rows = []
#for chunk in pd.read_csv('../data/Medicare_Provider_Utilization_and_Payment_Data__Physician_and_Other_Supplier_PUF_CY2017.csv', 
#                         chunksize = 1000, usecols = cols):
#    payment_rows.append(chunk)
    
#df_payments_2017 = pd.concat(payment_rows, ignore_index=True)
#df_payments_2017['year'] = 2017

# Timing recorded for this variant:
#CPU times: user 1min 5s, sys: 23.7 s, total: 1min 29s
#Wall time: 1min 29s
