## Medicare data cleaning

In [1]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from glob import glob

%matplotlib inline

# Read in pickle files for 2015, 2016, 2017 and combine

In [2]:
# Load each year's payments pickle into its own DataFrame.
# NOTE(review): the combined load further down re-reads the same files via glob
# and does not use these three frames - confirm they are still needed, since
# keeping them holds a second copy of the data in memory.
df_payments_2015, df_payments_2016, df_payments_2017 = map(
    pd.read_pickle,
    (
        '../data/df_payments_2015.pkl',
        '../data/df_payments_2016.pkl',
        '../data/df_payments_2017.pkl',
    ),
)

In [9]:
# Find every per-year payments pickle and put the paths in sorted (filename) order.
files = glob('../data/df_payments_*.pkl')
files.sort()

# Read the files lazily, one at a time, and stack them into a single frame with
# a fresh 0..n-1 index.
df_payments_combined = pd.concat(
    map(pd.read_pickle, files),
    ignore_index=True,
)

In [10]:
# Preview the first five rows of the combined frame.
df_payments_combined.head(n=5)

Unnamed: 0,national_provider_identifier,last_name_organization_name_of_the_provider,entity_type_of_the_provider,city_of_the_provider,zip_code_of_the_provider,state_code_of_the_provider,provider_type,place_of_service,hcpcs_code,hcpcs_description,number_of_services,number_of_medicare_beneficiaries,number_of_distinct_medicare_beneficiary_per_day_services,average_medicare_allowed_amount,year
0,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99217,Hospital observation care discharge,23.0,23.0,23.0,72.68,2015
1,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99219,Hospital observation care typically 50 minutes,18.0,18.0,18.0,135.85,2015
2,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99221,"Initial hospital inpatient care, typically 30 ...",59.0,58.0,59.0,101.365085,2015
3,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99222,"Initial hospital inpatient care, typically 50 ...",132.0,130.0,132.0,139.010455,2015
4,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99223,"Initial hospital inpatient care, typically 70 ...",220.0,215.0,220.0,205.185955,2015


In [11]:
# Row and column counts of the combined frame.
df_payments_combined.shape

(29060231, 15)

The 2015 data contains one row with no values other than an NPI of 1 and a provider name. It holds the CPT copyright notice rather than actual payment data, so we remove it.

In [12]:
# Show the row(s) whose NPI is the placeholder value 1.
df_payments_combined.loc[df_payments_combined['national_provider_identifier'] == 1]

Unnamed: 0,national_provider_identifier,last_name_organization_name_of_the_provider,entity_type_of_the_provider,city_of_the_provider,zip_code_of_the_provider,state_code_of_the_provider,provider_type,place_of_service,hcpcs_code,hcpcs_description,number_of_services,number_of_medicare_beneficiaries,number_of_distinct_medicare_beneficiary_per_day_services,average_medicare_allowed_amount,year
7205022,1,CPT copyright 2014 American Medical Associatio...,,,,,,,,,,,,,2015


In [13]:
# Drop the CPT copyright placeholder row by its content (NPI == 1) rather than by
# a hard-coded index label. The label depends on the order and size of the files
# that were concatenated, and dropping a label that is already gone raises
# KeyError when this cell is re-run. Selecting the labels from the data makes the
# cell idempotent: on a second run the selection is empty and nothing is dropped.
copyright_rows = df_payments_combined.index[
    df_payments_combined['national_provider_identifier'] == 1
]
df_payments_combined = df_payments_combined.drop(index=copyright_rows)

In [14]:
# Re-run the NPI == 1 lookup; an empty result means the placeholder row is gone.
df_payments_combined.loc[df_payments_combined['national_provider_identifier'] == 1]

Unnamed: 0,national_provider_identifier,last_name_organization_name_of_the_provider,entity_type_of_the_provider,city_of_the_provider,zip_code_of_the_provider,state_code_of_the_provider,provider_type,place_of_service,hcpcs_code,hcpcs_description,number_of_services,number_of_medicare_beneficiaries,number_of_distinct_medicare_beneficiary_per_day_services,average_medicare_allowed_amount,year


# Add column to indicate type of payment - doctor only, facility only, or doctor and facility combined

Entity Type
- I = Individual
- O = Organization

Place of Service
- O = Office
- F = Facility

Entity, Place = Description
- I, F = Doctor Only
- I, O = Doctor & Facility
- O, F = Facility Only
- O, O = Doctor & Facility

In [15]:
# Step 1 - define the rules for the payment type column (Code from Deigo)

entity = df_payments_combined['entity_type_of_the_provider']
place = df_payments_combined['place_of_service']
in_facility = place == 'F'

# One boolean mask per payment type, in the same order as `choices`.
conditions = [
    place == 'O',                   # office, for either entity type
    (entity == 'I') & in_facility,  # individual billing in a facility
    (entity == 'O') & in_facility,  # organization billing in a facility
]

choices = [
    'Doctor & Facility',
    'Doctor Only',
    'Facility Only',
]

In [16]:
# Step 2 - add the payment type column (Code from Deigo)
# Rows that match none of the conditions are labelled 'unknown'.

df_payments_combined['payment_type'] = np.select(
    condlist=conditions,
    choicelist=choices,
    default='unknown',
)

In [17]:
# Confirm there are no "unknown" values. The assertion makes a full re-run stop
# here if any row fell through to the default label, instead of relying on a
# visual check of the output below.
assert not (df_payments_combined['payment_type'] == 'unknown').any(), \
    "some rows fell through to the 'unknown' payment type"

df_payments_combined['payment_type'].unique()

array(['Doctor Only', 'Doctor & Facility', 'Facility Only'], dtype=object)

In [18]:
# Spot-check the last five rows, including the new payment_type column.

df_payments_combined.tail(n=5)

Unnamed: 0,national_provider_identifier,last_name_organization_name_of_the_provider,entity_type_of_the_provider,city_of_the_provider,zip_code_of_the_provider,state_code_of_the_provider,provider_type,place_of_service,hcpcs_code,hcpcs_description,number_of_services,number_of_medicare_beneficiaries,number_of_distinct_medicare_beneficiary_per_day_services,average_medicare_allowed_amount,year,payment_type
29060226,1992999825,DESCHENES,I,SEATTLE,981012756,WA,Otolaryngology,O,99214,Established patient office or other outpatient...,248.0,175.0,248.0,116.86,2017,Doctor & Facility
29060227,1992999874,JOFFE,I,MECHANICSVILLE,231161844,VA,Internal Medicine,F,99223,"Initial hospital inpatient care, typically 70 ...",41.0,41.0,41.0,203.4,2017,Doctor Only
29060228,1992999874,JOFFE,I,MECHANICSVILLE,231161844,VA,Internal Medicine,F,99232,"Subsequent hospital inpatient care, typically ...",101.0,57.0,101.0,72.48,2017,Doctor Only
29060229,1992999874,JOFFE,I,MECHANICSVILLE,231161844,VA,Internal Medicine,F,99233,"Subsequent hospital inpatient care, typically ...",102.0,55.0,102.0,104.76,2017,Doctor Only
29060230,1992999874,JOFFE,I,MECHANICSVILLE,231161844,VA,Internal Medicine,F,99239,"Hospital discharge day management, more than 3...",49.0,49.0,49.0,107.98,2017,Doctor Only


In [20]:
# Save the combined frame, including the payment_type column, as a pickle.
pd.to_pickle(df_payments_combined, '../data/payments_combined.pkl')