## Read in 1 dataset
- Bring in only specific columns to save time
- Add year data

In [3]:
import pandas as pd
import pickle

#### Columns to keep

- National Provider Identifier -- int64
- Last Name/Organization Name of the Provider -- object
- Entity Type of the Provider -- object
- City of the Provider -- object
- Zip Code of the Provider -- int64
- State Code of the Provider -- object
- Provider Type -- object
- Place of Service -- object
- HCPCS Code -- object
- HCPCS Description -- object
- Number of Services -- float64
- Number of Medicare Beneficiaries -- int64
- Number of Distinct Medicare Beneficiary/Per Day Services -- int64
- Average Medicare Allowed Amount -- float64
- **ADD:** Year (added to each DataFrame on import)


In [4]:
# Columns to pull from each yearly CSV (passed to `usecols` when reading).
cols = [
    # who and where the provider is
    "National Provider Identifier",
    "Last Name/Organization Name of the Provider",
    "Entity Type of the Provider",
    "City of the Provider",
    "Zip Code of the Provider",
    "State Code of the Provider",
    "Provider Type",
    # what was billed
    "Place of Service",
    "HCPCS Code",
    "HCPCS Description",
    # volume and payment measures
    "Number of Services",
    "Number of Medicare Beneficiaries",
    "Number of Distinct Medicare Beneficiary/Per Day Services",
    "Average Medicare Allowed Amount",
]

### code using chunks
Goal: compare the processing time of reading in chunks against reading the full file at once.

In [5]:
%%time
#option 1

payment_rows =[]
for chunk in pd.read_csv('../data/1_medicare_data/Medicare_Provider_Utilization_and_Payment_Data__Physician_and_Other_Supplier_PUF_CY2017.csv', 
                         chunksize = 1000, usecols = cols):
    chunk['year'] = 2017
    payment_rows.append(chunk)
    
df_payments_2017 = pd.concat(payment_rows, ignore_index=True)

Wall time: 2min 28s


Chunking option 1 took 2 min 28 s in the run recorded above (an earlier run took 2 min 20 s).

In [6]:
%%time
#option 2

payment_rows =[]
for chunk in pd.read_csv('../data/1_medicare_data/Medicare_Provider_Utilization_and_Payment_Data__Physician_and_Other_Supplier_PUF_CY2017.csv', 
                         chunksize = 1000, usecols = cols):
    payment_rows.append(chunk)
    
df_payments_2017 = pd.concat(payment_rows, ignore_index=True)
df_payments_2017['year'] = 2017

Wall time: 2min 21s


Chunking option 2 took 2 min 21 s in the run recorded above (an earlier run took 1 min 57 s). The large DataFrame from option 1 was still held in memory, which may have made it slower.

In [7]:
# Sanity check on the chunked load: report (rows, columns), then preview the first five records.
print(df_payments_2017.shape)
df_payments_2017.head(n=5)

(9847443, 15)


Unnamed: 0,National Provider Identifier,Last Name/Organization Name of the Provider,Entity Type of the Provider,City of the Provider,Zip Code of the Provider,State Code of the Provider,Provider Type,Place of Service,HCPCS Code,HCPCS Description,Number of Services,Number of Medicare Beneficiaries,Number of Distinct Medicare Beneficiary/Per Day Services,Average Medicare Allowed Amount,year
0,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99217,Hospital observation care discharge,100.0,96,100,73.3988,2017
1,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99218,Hospital observation care typically 30 minutes,26.0,25,26,100.08,2017
2,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99219,Hospital observation care typically 50 minutes,52.0,51,52,136.38,2017
3,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99220,Hospital observation care typically 70 minutes...,59.0,59,59,190.363729,2017
4,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99221,"Initial hospital inpatient care, typically 30 ...",16.0,16,16,101.68,2017


### Code without chunks

In [8]:
%%time

df_payments_2017 = pd.read_csv('../data/1_medicare_data/Medicare_Provider_Utilization_and_Payment_Data__Physician_and_Other_Supplier_PUF_CY2017.csv', 
                               usecols = cols)
df_payments_2017['year'] = 2017



Wall time: 30.7 s


In [9]:
# Sanity check on the unchunked load: report (rows, columns), then preview the first five records.
print(df_payments_2017.shape)
df_payments_2017.head(n=5)

(9847443, 15)


Unnamed: 0,National Provider Identifier,Last Name/Organization Name of the Provider,Entity Type of the Provider,City of the Provider,Zip Code of the Provider,State Code of the Provider,Provider Type,Place of Service,HCPCS Code,HCPCS Description,Number of Services,Number of Medicare Beneficiaries,Number of Distinct Medicare Beneficiary/Per Day Services,Average Medicare Allowed Amount,year
0,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99217,Hospital observation care discharge,100.0,96,100,73.3988,2017
1,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99218,Hospital observation care typically 30 minutes,26.0,25,26,100.08,2017
2,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99219,Hospital observation care typically 50 minutes,52.0,51,52,136.38,2017
3,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99220,Hospital observation care typically 70 minutes...,59.0,59,59,190.363729,2017
4,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99221,"Initial hospital inpatient care, typically 30 ...",16.0,16,16,101.68,2017


Reading without chunks took 30.7 s in the run recorded above (24.6 s in an earlier run) — far faster than either chunked option!

In [10]:
# Normalise the headers to snake_case: lowercase everything, then turn
# spaces and '/' into underscores.
df_payments_2017.columns = (
    df_payments_2017.columns
    .str.lower()
    .str.replace(" ", "_")
    .str.replace("/", "_")
)

df_payments_2017.head(n=5)

Unnamed: 0,national_provider_identifier,last_name_organization_name_of_the_provider,entity_type_of_the_provider,city_of_the_provider,zip_code_of_the_provider,state_code_of_the_provider,provider_type,place_of_service,hcpcs_code,hcpcs_description,number_of_services,number_of_medicare_beneficiaries,number_of_distinct_medicare_beneficiary_per_day_services,average_medicare_allowed_amount,year
0,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99217,Hospital observation care discharge,100.0,96,100,73.3988,2017
1,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99218,Hospital observation care typically 30 minutes,26.0,25,26,100.08,2017
2,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99219,Hospital observation care typically 50 minutes,52.0,51,52,136.38,2017
3,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99220,Hospital observation care typically 70 minutes...,59.0,59,59,190.363729,2017
4,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99221,"Initial hospital inpatient care, typically 30 ...",16.0,16,16,101.68,2017


In [11]:
# Persist the cleaned 2017 frame as a pickle.
df_payments_2017.to_pickle(path="../data/1_medicare_data/pickled_files/payments_2017.pkl")

#### Option 3 (reading without chunks) is the fastest, so it is used to bring in the other years
#### 2016

In [12]:
%%time

df_payments_2016 = pd.read_csv('../data/1_medicare_data/Medicare_Provider_Utilization_and_Payment_Data__Physician_and_Other_Supplier_PUF_CY2016.csv', 
                               usecols = cols)
df_payments_2016['year'] = 2016



Wall time: 32.7 s


In [13]:
# Sanity check on the 2016 load: report (rows, columns), then preview the first five records.
print(df_payments_2016.shape)
df_payments_2016.head(n=5)

(9714896, 15)


Unnamed: 0,National Provider Identifier,Last Name/Organization Name of the Provider,Entity Type of the Provider,City of the Provider,Zip Code of the Provider,State Code of the Provider,Provider Type,Place of Service,HCPCS Code,HCPCS Description,Number of Services,Number of Medicare Beneficiaries,Number of Distinct Medicare Beneficiary/Per Day Services,Average Medicare Allowed Amount,year
0,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99217,Hospital observation care discharge,57.0,55,57,72.743158,2016
1,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99219,Hospital observation care typically 50 minutes,38.0,38,38,135.01,2016
2,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99220,Hospital observation care typically 70 minutes...,23.0,23,23,189.239565,2016
3,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99221,"Initial hospital inpatient care, typically 30 ...",20.0,20,20,100.75,2016
4,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99222,"Initial hospital inpatient care, typically 50 ...",96.0,87,96,136.25,2016


In [14]:
# Normalise the headers to snake_case: lowercase everything, then turn
# spaces and '/' into underscores.
df_payments_2016.columns = (
    df_payments_2016.columns
    .str.lower()
    .str.replace(" ", "_")
    .str.replace("/", "_")
)

df_payments_2016.head(n=5)

Unnamed: 0,national_provider_identifier,last_name_organization_name_of_the_provider,entity_type_of_the_provider,city_of_the_provider,zip_code_of_the_provider,state_code_of_the_provider,provider_type,place_of_service,hcpcs_code,hcpcs_description,number_of_services,number_of_medicare_beneficiaries,number_of_distinct_medicare_beneficiary_per_day_services,average_medicare_allowed_amount,year
0,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99217,Hospital observation care discharge,57.0,55,57,72.743158,2016
1,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99219,Hospital observation care typically 50 minutes,38.0,38,38,135.01,2016
2,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99220,Hospital observation care typically 70 minutes...,23.0,23,23,189.239565,2016
3,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99221,"Initial hospital inpatient care, typically 30 ...",20.0,20,20,100.75,2016
4,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99222,"Initial hospital inpatient care, typically 50 ...",96.0,87,96,136.25,2016


In [15]:
# Persist the cleaned 2016 frame as a pickle.
df_payments_2016.to_pickle(path="../data/1_medicare_data/pickled_files/payments_2016.pkl")

#### 2015

In [16]:
%%time

df_payments_2015 = pd.read_csv('../data/1_medicare_data/Medicare_Provider_Utilization_and_Payment_Data__Physician_and_Other_Supplier_PUF_CY2015.csv', 
                               usecols = cols)
df_payments_2015['year'] = 2015



Wall time: 33.5 s


In [17]:
# Sanity check on the 2015 load: report (rows, columns), then preview the first five records.
print(df_payments_2015.shape)
df_payments_2015.head(n=5)

(9497892, 15)


Unnamed: 0,National Provider Identifier,Last Name/Organization Name of the Provider,Entity Type of the Provider,City of the Provider,Zip Code of the Provider,State Code of the Provider,Provider Type,Place of Service,HCPCS Code,HCPCS Description,Number of Services,Number of Medicare Beneficiaries,Number of Distinct Medicare Beneficiary/Per Day Services,Average Medicare Allowed Amount,year
0,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99217,Hospital observation care discharge,23.0,23.0,23.0,72.68,2015
1,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99219,Hospital observation care typically 50 minutes,18.0,18.0,18.0,135.85,2015
2,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99221,"Initial hospital inpatient care, typically 30 ...",59.0,58.0,59.0,101.365085,2015
3,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99222,"Initial hospital inpatient care, typically 50 ...",132.0,130.0,132.0,139.010455,2015
4,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99223,"Initial hospital inpatient care, typically 70 ...",220.0,215.0,220.0,205.185955,2015


In [18]:
# Normalise the headers to snake_case: lowercase everything, then turn
# spaces and '/' into underscores.
df_payments_2015.columns = (
    df_payments_2015.columns
    .str.lower()
    .str.replace(" ", "_")
    .str.replace("/", "_")
)

df_payments_2015.head(n=5)

Unnamed: 0,national_provider_identifier,last_name_organization_name_of_the_provider,entity_type_of_the_provider,city_of_the_provider,zip_code_of_the_provider,state_code_of_the_provider,provider_type,place_of_service,hcpcs_code,hcpcs_description,number_of_services,number_of_medicare_beneficiaries,number_of_distinct_medicare_beneficiary_per_day_services,average_medicare_allowed_amount,year
0,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99217,Hospital observation care discharge,23.0,23.0,23.0,72.68,2015
1,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99219,Hospital observation care typically 50 minutes,18.0,18.0,18.0,135.85,2015
2,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99221,"Initial hospital inpatient care, typically 30 ...",59.0,58.0,59.0,101.365085,2015
3,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99222,"Initial hospital inpatient care, typically 50 ...",132.0,130.0,132.0,139.010455,2015
4,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99223,"Initial hospital inpatient care, typically 70 ...",220.0,215.0,220.0,205.185955,2015


In [19]:
# Inspect the dtype pandas assigned to each column of the 2015 frame.
df_payments_2015.dtypes

national_provider_identifier                                  int64
last_name_organization_name_of_the_provider                  object
entity_type_of_the_provider                                  object
city_of_the_provider                                         object
zip_code_of_the_provider                                     object
state_code_of_the_provider                                   object
provider_type                                                object
place_of_service                                             object
hcpcs_code                                                   object
hcpcs_description                                            object
number_of_services                                          float64
number_of_medicare_beneficiaries                            float64
number_of_distinct_medicare_beneficiary_per_day_services    float64
average_medicare_allowed_amount                             float64
year                                            

#### Drop blank row in 2015 file
Identified a row with no real data in the data.cms.gov file (NPI = 1, holding only a CPT copyright notice).  
It is only found in this year's data.

In [20]:
# Show any record whose NPI is the placeholder value 1.
df_payments_2015.loc[df_payments_2015["national_provider_identifier"].eq(1)]

Unnamed: 0,national_provider_identifier,last_name_organization_name_of_the_provider,entity_type_of_the_provider,city_of_the_provider,zip_code_of_the_provider,state_code_of_the_provider,provider_type,place_of_service,hcpcs_code,hcpcs_description,number_of_services,number_of_medicare_beneficiaries,number_of_distinct_medicare_beneficiary_per_day_services,average_medicare_allowed_amount,year
7205022,1,CPT copyright 2014 American Medical Associatio...,,,,,,,,,,,,,2015


In [21]:
# Remove the placeholder record (NPI == 1) by value instead of by index label.
# Filtering keeps this cell safe to re-run, whereas dropping a fixed label raises
# KeyError once the row is gone and breaks if the file's row order ever changes.
# The remaining rows keep their existing index labels.
df_payments_2015 = df_payments_2015[df_payments_2015.national_provider_identifier != 1]

In [22]:
# Persist the cleaned 2015 frame as a pickle.
df_payments_2015.to_pickle(path="../data/1_medicare_data/pickled_files/payments_2015.pkl")

### Combine 3 years into one dataframe, pickle that

In [30]:
# Stack the three yearly frames (2015, 2016, 2017, in that order) and give the
# result a fresh 0..n-1 row index.
df_payments_combined = pd.concat(
    objs=[df_payments_2015, df_payments_2016, df_payments_2017],
    axis=0,
    ignore_index=True,
)

In [31]:
# Preview the first five records of the combined frame.
df_payments_combined.head(n=5)

Unnamed: 0,national_provider_identifier,last_name_organization_name_of_the_provider,entity_type_of_the_provider,city_of_the_provider,zip_code_of_the_provider,state_code_of_the_provider,provider_type,place_of_service,hcpcs_code,hcpcs_description,number_of_services,number_of_medicare_beneficiaries,number_of_distinct_medicare_beneficiary_per_day_services,average_medicare_allowed_amount,year
0,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99217,Hospital observation care discharge,23.0,23.0,23.0,72.68,2015
1,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99219,Hospital observation care typically 50 minutes,18.0,18.0,18.0,135.85,2015
2,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99221,"Initial hospital inpatient care, typically 30 ...",59.0,58.0,59.0,101.365085,2015
3,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99222,"Initial hospital inpatient care, typically 50 ...",132.0,130.0,132.0,139.010455,2015
4,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99223,"Initial hospital inpatient care, typically 70 ...",220.0,215.0,220.0,205.185955,2015


In [32]:
# Persist the combined three-year frame as a pickle.
df_payments_combined.to_pickle(path="../data/1_medicare_data/pickled_files/payments_combined.pkl")