## Team code for loading and pickling data

Data source: 
Physician & Other Supplier Payments   
https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/Medicare-Provider-Charge-Data/Physician-and-Other-Supplier2017  

### Reading in 1 dataset
- Bringing in only specific columns, chosen after initial EDA and a review of the stated business requirements (from the HCBB presentation)
- Adding year column to each year's df

#### Columns to keep
- National Provider Identifier
- Last Name/Organization Name of the Provider
- Entity Type of the Provider
- City of the Provider
- Zip Code of the Provider
- State Code of the Provider
- Provider Type
- Place of Service
- HCPCS Code
- HCPCS Description
- Number of Services
- Number of Medicare Beneficiaries
- Number of Distinct Medicare Beneficiary/Per Day Services
- Average Medicare Allowed Amount
- ADD: Year (in each df on import)

In [1]:
import pandas as pd
import pickle

### Tried importing via chunking and that took longer than NOT chunking. See bottom of notebook for code used for chunking experiment

In [2]:
%%time
# This step loads in only the columns we want, 
# adds a column for year, and 
# converts column headers to have no spaces or special characters
# This is for 2017. Years 2016 and 2015 are below.

cols = ['National Provider Identifier',
        'Last Name/Organization Name of the Provider',
        'Entity Type of the Provider',
        'City of the Provider',
        'Zip Code of the Provider',
        'State Code of the Provider',
        'Provider Type',
        'Place of Service',
        'HCPCS Code',
        'HCPCS Description',
        'Number of Services',
        'Number of Medicare Beneficiaries',
        'Number of Distinct Medicare Beneficiary/Per Day Services',
        'Average Medicare Allowed Amount']

df_payments_2017 = pd.read_csv('../data/Medicare_Provider_Utilization_and_Payment_Data__Physician_and_Other_Supplier_PUF_CY2017.csv', 
                               usecols = cols)
df_payments_2017['year'] = 2017
df_payments_2017.columns = df_payments_2017.columns.str.replace(" ", "_").str.replace("/", "_").str.lower()
df_payments_2017.head()



Wall time: 24.5 s


Unnamed: 0,national_provider_identifier,last_name_organization_name_of_the_provider,entity_type_of_the_provider,city_of_the_provider,zip_code_of_the_provider,state_code_of_the_provider,provider_type,place_of_service,hcpcs_code,hcpcs_description,number_of_services,number_of_medicare_beneficiaries,number_of_distinct_medicare_beneficiary_per_day_services,average_medicare_allowed_amount,year
0,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99217,Hospital observation care discharge,100.0,96,100,73.3988,2017
1,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99218,Hospital observation care typically 30 minutes,26.0,25,26,100.08,2017
2,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99219,Hospital observation care typically 50 minutes,52.0,51,52,136.38,2017
3,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99220,Hospital observation care typically 70 minutes...,59.0,59,59,190.363729,2017
4,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99221,"Initial hospital inpatient care, typically 30 ...",16.0,16,16,101.68,2017


In [3]:
%%time

cols = ['National Provider Identifier',
        'Last Name/Organization Name of the Provider',
        'Entity Type of the Provider',
        'City of the Provider',
        'Zip Code of the Provider',
        'State Code of the Provider',
        'Provider Type',
        'Place of Service',
        'HCPCS Code',
        'HCPCS Description',
        'Number of Services',
        'Number of Medicare Beneficiaries',
        'Number of Distinct Medicare Beneficiary/Per Day Services',
        'Average Medicare Allowed Amount']

df_payments_2016 = pd.read_csv('../data/Medicare_Provider_Utilization_and_Payment_Data__Physician_and_Other_Supplier_PUF_CY2016.csv', 
                               usecols = cols)
df_payments_2016['year'] = 2016
df_payments_2016.columns = df_payments_2016.columns.str.replace(" ", "_").str.replace("/", "_").str.lower()
df_payments_2016.head()

Wall time: 29.7 s


Unnamed: 0,national_provider_identifier,last_name_organization_name_of_the_provider,entity_type_of_the_provider,city_of_the_provider,zip_code_of_the_provider,state_code_of_the_provider,provider_type,place_of_service,hcpcs_code,hcpcs_description,number_of_services,number_of_medicare_beneficiaries,number_of_distinct_medicare_beneficiary_per_day_services,average_medicare_allowed_amount,year
0,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99217,Hospital observation care discharge,57.0,55,57,72.743158,2016
1,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99219,Hospital observation care typically 50 minutes,38.0,38,38,135.01,2016
2,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99220,Hospital observation care typically 70 minutes...,23.0,23,23,189.239565,2016
3,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99221,"Initial hospital inpatient care, typically 30 ...",20.0,20,20,100.75,2016
4,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99222,"Initial hospital inpatient care, typically 50 ...",96.0,87,96,136.25,2016


In [4]:
%%time

cols = ['National Provider Identifier',
        'Last Name/Organization Name of the Provider',
        'Entity Type of the Provider',
        'City of the Provider',
        'Zip Code of the Provider',
        'State Code of the Provider',
        'Provider Type',
        'Place of Service',
        'HCPCS Code',
        'HCPCS Description',
        'Number of Services',
        'Number of Medicare Beneficiaries',
        'Number of Distinct Medicare Beneficiary/Per Day Services',
        'Average Medicare Allowed Amount']

df_payments_2015 = pd.read_csv('../data/Medicare_Provider_Utilization_and_Payment_Data__Physician_and_Other_Supplier_PUF_CY2015.csv', 
                               usecols = cols)
df_payments_2015['year'] = 2015
df_payments_2015.columns = df_payments_2015.columns.str.replace(" ", "_").str.replace("/", "_").str.lower()
df_payments_2015.head()


Wall time: 29.9 s


Unnamed: 0,national_provider_identifier,last_name_organization_name_of_the_provider,entity_type_of_the_provider,city_of_the_provider,zip_code_of_the_provider,state_code_of_the_provider,provider_type,place_of_service,hcpcs_code,hcpcs_description,number_of_services,number_of_medicare_beneficiaries,number_of_distinct_medicare_beneficiary_per_day_services,average_medicare_allowed_amount,year
0,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99217,Hospital observation care discharge,23.0,23.0,23.0,72.68,2015
1,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99219,Hospital observation care typically 50 minutes,18.0,18.0,18.0,135.85,2015
2,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99221,"Initial hospital inpatient care, typically 30 ...",59.0,58.0,59.0,101.365085,2015
3,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99222,"Initial hospital inpatient care, typically 50 ...",132.0,130.0,132.0,139.010455,2015
4,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99223,"Initial hospital inpatient care, typically 70 ...",220.0,215.0,220.0,205.185955,2015


In [5]:
# Persist the 2017 frame as a pickle. Forward slashes are used so the string
# contains no backslash escape sequences and the path is not Windows-specific.
df_payments_2017.to_pickle('../data/pickled_files/payments_2017.pkl')

In [6]:
# Persist the 2016 frame as a pickle. Forward slashes are used so the string
# contains no backslash escape sequences and the path is not Windows-specific.
df_payments_2016.to_pickle('../data/pickled_files/payments_2016.pkl')

In [7]:
# Persist the 2015 frame as a pickle. Forward slashes are used so the string
# contains no backslash escape sequences and the path is not Windows-specific.
df_payments_2015.to_pickle('../data/pickled_files/payments_2015.pkl')

### Reference Only - Chunking experiments. Chunking took a little longer! Didn't use this method.

In [None]:

# DON'T RUN THIS CELL. REFERENCE ONLY 
# Chunked-read experiment, option 1: read the 2017 CSV in 1000-row chunks,
# add the `year` column to every chunk, collect the chunks in a list, and
# concatenate them once at the end into `df_payments_2017`.
# NOTE(review): `%%time` below is not the first line of the cell; IPython
# normally only recognises a cell magic on the first line, so this cell would
# likely error if executed as written -- confirm before reusing.

%%time
#option 1
# This column list has no 'HCPCS Code' / 'HCPCS Description' entries, and no
# header renaming is done in this cell.
cols = ['National Provider Identifier',
        'Last Name/Organization Name of the Provider',
        'Entity Type of the Provider',
        'City of the Provider',
        'Zip Code of the Provider',
        'State Code of the Provider',
        'Provider Type',
        'Place of Service',
        'Number of Services',
        'Number of Medicare Beneficiaries',
        'Number of Distinct Medicare Beneficiary/Per Day Services',
        'Average Medicare Allowed Amount']

# Accumulates one DataFrame per chunk.
payment_rows =[]
# The CSV is read from a '1_medicare_data' sub-folder of '../data'.
# NOTE(review): chunksize=1000 produces many small chunks; presumably a larger
# value would reduce per-chunk overhead -- not measured here.
for chunk in pd.read_csv('../data/1_medicare_data/Medicare_Provider_Utilization_and_Payment_Data__Physician_and_Other_Supplier_PUF_CY2017.csv', 
                         chunksize = 1000, usecols = cols):
    # Tag each chunk with the year before storing it.
    chunk['year'] = 2017
    payment_rows.append(chunk)
    
# Single concat at the end; ignore_index=True gives a fresh 0..n-1 index.
df_payments_2017 = pd.concat(payment_rows, ignore_index=True)

In [None]:

# DON'T RUN THIS CELL. REFERENCE ONLY 
# Chunked-read experiment, option 2: read the 2017 CSV in 1000-row chunks,
# collect the chunks in a list, concatenate them once into
# `df_payments_2017`, and only then add the `year` column to the full frame.
# NOTE(review): `%%time` below is not the first line of the cell; IPython
# normally only recognises a cell magic on the first line, so this cell would
# likely error if executed as written -- confirm before reusing.

%%time
#option 2

# This column list has no 'HCPCS Code' / 'HCPCS Description' entries, and no
# header renaming is done in this cell.
cols = ['National Provider Identifier',
        'Last Name/Organization Name of the Provider',
        'Entity Type of the Provider',
        'City of the Provider',
        'Zip Code of the Provider',
        'State Code of the Provider',
        'Provider Type',
        'Place of Service',
        'Number of Services',
        'Number of Medicare Beneficiaries',
        'Number of Distinct Medicare Beneficiary/Per Day Services',
        'Average Medicare Allowed Amount']

# Accumulates one DataFrame per chunk.
payment_rows =[]
# The CSV is read from a '1_medicare_data' sub-folder of '../data'.
for chunk in pd.read_csv('../data/1_medicare_data/Medicare_Provider_Utilization_and_Payment_Data__Physician_and_Other_Supplier_PUF_CY2017.csv', 
                         chunksize = 1000, usecols = cols):
    payment_rows.append(chunk)
    
# Single concat at the end; ignore_index=True gives a fresh 0..n-1 index.
df_payments_2017 = pd.concat(payment_rows, ignore_index=True)
# Year is assigned once on the combined frame rather than per chunk.
df_payments_2017['year'] = 2017