# EDA on Medicare data 2015-2017

In [27]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

%matplotlib inline

# Read in Combined DataFrame

This combined df has already eliminted the one irrelevant row from 2015, and has added a column for payment_type (Doctor Only, Facility Only, Doctor & Facility).

In [5]:
df_payments_combined = pd.read_pickle('../data/payments_combined.pkl')
print(df_payments_combined.shape)
df_payments_combined.head()

(29060230, 16)


Unnamed: 0,national_provider_identifier,last_name_organization_name_of_the_provider,entity_type_of_the_provider,city_of_the_provider,zip_code_of_the_provider,state_code_of_the_provider,provider_type,place_of_service,hcpcs_code,hcpcs_description,number_of_services,number_of_medicare_beneficiaries,number_of_distinct_medicare_beneficiary_per_day_services,average_medicare_allowed_amount,year,payment_type
0,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99217,Hospital observation care discharge,23.0,23.0,23.0,72.68,2015,Doctor Only
1,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99219,Hospital observation care typically 50 minutes,18.0,18.0,18.0,135.85,2015,Doctor Only
2,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99221,"Initial hospital inpatient care, typically 30 ...",59.0,58.0,59.0,101.365085,2015,Doctor Only
3,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99222,"Initial hospital inpatient care, typically 50 ...",132.0,130.0,132.0,139.010455,2015,Doctor Only
4,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99223,"Initial hospital inpatient care, typically 70 ...",220.0,215.0,220.0,205.185955,2015,Doctor Only


# EDA of combined data

In [6]:
df_payments_combined.shape

(29060230, 16)

In [7]:
df_payments_combined.columns

Index(['national_provider_identifier',
       'last_name_organization_name_of_the_provider',
       'entity_type_of_the_provider', 'city_of_the_provider',
       'zip_code_of_the_provider', 'state_code_of_the_provider',
       'provider_type', 'place_of_service', 'hcpcs_code', 'hcpcs_description',
       'number_of_services', 'number_of_medicare_beneficiaries',
       'number_of_distinct_medicare_beneficiary_per_day_services',
       'average_medicare_allowed_amount', 'year', 'payment_type'],
      dtype='object')

In [8]:
df_payments_combined.head()

Unnamed: 0,national_provider_identifier,last_name_organization_name_of_the_provider,entity_type_of_the_provider,city_of_the_provider,zip_code_of_the_provider,state_code_of_the_provider,provider_type,place_of_service,hcpcs_code,hcpcs_description,number_of_services,number_of_medicare_beneficiaries,number_of_distinct_medicare_beneficiary_per_day_services,average_medicare_allowed_amount,year,payment_type
0,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99217,Hospital observation care discharge,23.0,23.0,23.0,72.68,2015,Doctor Only
1,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99219,Hospital observation care typically 50 minutes,18.0,18.0,18.0,135.85,2015,Doctor Only
2,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99221,"Initial hospital inpatient care, typically 30 ...",59.0,58.0,59.0,101.365085,2015,Doctor Only
3,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99222,"Initial hospital inpatient care, typically 50 ...",132.0,130.0,132.0,139.010455,2015,Doctor Only
4,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99223,"Initial hospital inpatient care, typically 70 ...",220.0,215.0,220.0,205.185955,2015,Doctor Only


In [10]:
df_payments_combined.info(verbose = True, null_counts = True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29060230 entries, 0 to 29060230
Data columns (total 16 columns):
 #   Column                                                    Non-Null Count     Dtype  
---  ------                                                    --------------     -----  
 0   national_provider_identifier                              29060230 non-null  int64  
 1   last_name_organization_name_of_the_provider               29059803 non-null  object 
 2   entity_type_of_the_provider                               29060230 non-null  object 
 3   city_of_the_provider                                      29060225 non-null  object 
 4   zip_code_of_the_provider                                  29060226 non-null  object 
 5   state_code_of_the_provider                                29060230 non-null  object 
 6   provider_type                                             29060230 non-null  object 
 7   place_of_service                                          29060230 non

In [11]:
# look at null values

df_payments_combined.isnull().sum()

# 427 null values for provider name. When we looked at this in the data visualizer on the Medicare website,
# these rows had first names (a column we chose to drop from our dataset) and all other pertinent values.
# Since the NPI is unique to each provider, we determined that a null value in the provider name column was not
# an impediment to analysis.

national_provider_identifier                                  0
last_name_organization_name_of_the_provider                 427
entity_type_of_the_provider                                   0
city_of_the_provider                                          5
zip_code_of_the_provider                                      4
state_code_of_the_provider                                    0
provider_type                                                 0
place_of_service                                              0
hcpcs_code                                                    0
hcpcs_description                                             0
number_of_services                                            0
number_of_medicare_beneficiaries                              0
number_of_distinct_medicare_beneficiary_per_day_services      0
average_medicare_allowed_amount                               0
year                                                          0
payment_type                            

In [12]:
# number of unique HCPCS codes used

count_of_codes = df_payments_combined.hcpcs_code.nunique()
print("There are", count_of_codes, "unique HCPCS codes in 2015-2017")

There are 6763 unique HCPCS codes in 2015-2017


In [13]:
# number_of_services stats 
# min 2.4 (how is this not a round number?)
# max 7,195,536

# note: mean is 246 and median is 44

df_payments_combined.number_of_services.describe()

count    2.906023e+07
mean     2.460806e+02
std      4.828521e+03
min      2.400000e+00
25%      2.100000e+01
50%      4.400000e+01
75%      1.190000e+02
max      7.195536e+06
Name: number_of_services, dtype: float64

In [14]:
df_payments_combined.number_of_medicare_beneficiaries.describe()

count    2.906023e+07
mean     8.916477e+01
std      1.158662e+03
min      1.100000e+01
25%      1.700000e+01
50%      3.300000e+01
75%      7.600000e+01
max      7.928730e+05
Name: number_of_medicare_beneficiaries, dtype: float64

In [15]:
# mean 143.7782
# median 41

df_payments_combined.number_of_distinct_medicare_beneficiary_per_day_services.describe()

count    2.906023e+07
mean     1.437782e+02
std      2.222082e+03
min      1.100000e+01
25%      2.000000e+01
50%      4.100000e+01
75%      1.080000e+02
max      2.180422e+06
Name: number_of_distinct_medicare_beneficiary_per_day_services, dtype: float64

In [16]:
# mean 100.43
# median 63.99

# min 0.0000603 (check, this doesn't seem valid, although the numbers in this column are averages for a provider
# for a single HCPCS code so maybe it is)

df_payments_combined.average_medicare_allowed_amount.describe()

count    2.906023e+07
mean     1.004310e+02
std      2.578002e+02
min      6.035380e-05
25%      2.378000e+01
50%      6.399000e+01
75%      1.136400e+02
max      5.668435e+04
Name: average_medicare_allowed_amount, dtype: float64

Significant difference between mean and median in the stats for the measures above.

# Creating new df showing average payment by year, then by payment type, then by HCPCS code. (Code from Nicole)

In [18]:
df_avg_pmt = df_payments_combined.groupby(['year',
                                           'payment_type',
                                           'hcpcs_code']).average_medicare_allowed_amount.mean().to_frame()

In [21]:
df_avg_pmt = df_avg_pmt.reset_index()
df_avg_pmt.head()

Unnamed: 0,index,year,payment_type,hcpcs_code,average_medicare_allowed_amount
0,0,2015,Doctor & Facility,103,255.076212
1,1,2015,Doctor & Facility,104,111.614526
2,2,2015,Doctor & Facility,140,139.047955
3,3,2015,Doctor & Facility,142,102.570003
4,4,2015,Doctor & Facility,160,211.812813


In [44]:
df_avg_pmt.shape

(28879, 5)

In [60]:
df_avg_pmt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28879 entries, 0 to 28878
Data columns (total 5 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   index                            28879 non-null  int64  
 1   year                             28879 non-null  int64  
 2   payment_type                     28879 non-null  object 
 3   hcpcs_code                       28879 non-null  object 
 4   average_medicare_allowed_amount  28879 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 1.1+ MB


In [22]:
df_med_services_day = df_payments_combined.groupby(['year',
                                                    'hcpcs_code']).number_of_distinct_medicare_beneficiary_per_day_services.mean().to_frame().reset_index()

df_med_services_day.head()

Unnamed: 0,year,hcpcs_code,number_of_distinct_medicare_beneficiary_per_day_services
0,2015,100,15.0
1,2015,102,26.0
2,2015,103,29.192544
3,2015,104,48.568561
4,2015,120,22.018182


In [23]:
# still seeing wide variance between mean and median
# examine outliers

df_avg_pmt.describe()

Unnamed: 0,index,year,average_medicare_allowed_amount
count,28879.0,28879.0,28879.0
mean,14439.0,2016.0098,379.808995
std,8336.793548,0.81578,1192.70932
min,0.0,2015.0,0.01
25%,7219.5,2015.0,33.868522
50%,14439.0,2016.0,124.781988
75%,21658.5,2017.0,388.973461
max,28878.0,2017.0,39279.086545


In [24]:
# still seeing wide variance between mean and median 
# examine outliers

df_med_services_day.describe()

Unnamed: 0,year,number_of_distinct_medicare_beneficiary_per_day_services
count,18053.0,18053.0
mean,2016.003601,537.8346
std,0.816364,26050.81
min,2015.0,11.0
25%,2015.0,18.25
50%,2016.0,29.98214
75%,2017.0,70.02296
max,2017.0,2180422.0


# Pivot dataframe

In [25]:
pivot_index = ['national_provider_identifier',
               'entity_type_of_the_provider', 
               'place_of_service',
               'payment_type',
               'provider_type',
               'hcpcs_code',
               'hcpcs_description',
               'zip_code_of_the_provider', 
               'state_code_of_the_provider']

pivot_cols = ['year']

In [26]:
%%time
df_pmt_pvt = df_payments_combined.pivot_table(index = pivot_index, 
                                              columns = pivot_cols, 
                                              values = 'average_medicare_allowed_amount', 
                                              aggfunc=np.mean)

df_pmt_pvt = df_pmt_pvt.reset_index()


CPU times: user 1min 31s, sys: 27.2 s, total: 1min 58s
Wall time: 2min


In [28]:
print(df_pmt_pvt.shape)
df_pmt_pvt.head()

(16132777, 12)


year,national_provider_identifier,entity_type_of_the_provider,place_of_service,payment_type,provider_type,hcpcs_code,hcpcs_description,zip_code_of_the_provider,state_code_of_the_provider,2015,2016,2017
0,1003000126,I,F,Doctor Only,Internal Medicine,99217,Hospital observation care discharge,215021854,MD,72.68,72.743158,73.3988
1,1003000126,I,F,Doctor Only,Internal Medicine,99218,Hospital observation care typically 30 minutes,215021854,MD,,,100.08
2,1003000126,I,F,Doctor Only,Internal Medicine,99219,Hospital observation care typically 50 minutes,215021854,MD,135.85,135.01,136.38
3,1003000126,I,F,Doctor Only,Internal Medicine,99220,Hospital observation care typically 70 minutes...,215021854,MD,,189.239565,190.363729
4,1003000126,I,F,Doctor Only,Internal Medicine,99221,"Initial hospital inpatient care, typically 30 ...",215021854,MD,101.365085,100.75,101.68


In [31]:
%%time
df_bpd_pvt = df_payments_combined.pivot_table(index = pivot_index, 
                                              columns = pivot_cols, 
                                              values = 'number_of_distinct_medicare_beneficiary_per_day_services', 
                                              aggfunc=np.mean)

df_bpd_pvt = df_bpd_pvt.reset_index()


CPU times: user 1min 31s, sys: 31.8 s, total: 2min 3s
Wall time: 2min 6s


In [32]:
print(df_bpd_pvt.shape)
df_bpd_pvt.head()

(16132777, 12)


year,national_provider_identifier,entity_type_of_the_provider,place_of_service,payment_type,provider_type,hcpcs_code,hcpcs_description,zip_code_of_the_provider,state_code_of_the_provider,2015,2016,2017
0,1003000126,I,F,Doctor Only,Internal Medicine,99217,Hospital observation care discharge,215021854,MD,23.0,57.0,100.0
1,1003000126,I,F,Doctor Only,Internal Medicine,99218,Hospital observation care typically 30 minutes,215021854,MD,,,26.0
2,1003000126,I,F,Doctor Only,Internal Medicine,99219,Hospital observation care typically 50 minutes,215021854,MD,18.0,38.0,52.0
3,1003000126,I,F,Doctor Only,Internal Medicine,99220,Hospital observation care typically 70 minutes...,215021854,MD,,23.0,59.0
4,1003000126,I,F,Doctor Only,Internal Medicine,99221,"Initial hospital inpatient care, typically 30 ...",215021854,MD,59.0,20.0,16.0


In [33]:
df_pmt_pvt.describe()

year,national_provider_identifier,2015,2016,2017
count,16132780.0,9497891.0,9714894.0,9847441.0
mean,1499742000.0,99.47302,100.1832,101.5993
std,287772200.0,238.4662,256.3474,276.4955
min,1003000000.0,0.0001026528,0.0001059603,6.03538e-05
25%,1245694000.0,23.15282,23.76625,24.26084
50%,1497968000.0,62.88879,64.09979,64.99766
75%,1740673000.0,113.8798,113.56,113.5044
max,1993000000.0,52601.47,50603.38,56684.35


In [39]:
df_pmt_pvt.columns

Index(['national_provider_identifier',  'entity_type_of_the_provider',
                   'place_of_service',                 'payment_type',
                      'provider_type',                   'hcpcs_code',
                  'hcpcs_description',     'zip_code_of_the_provider',
         'state_code_of_the_provider',                           2015,
                                 2016,                           2017],
      dtype='object', name='year')

In [48]:
df_pmt_pvt.shape

(16132777, 12)

In [42]:
df_pmt_pvt.to_pickle('../data/df_pmt_pvt.pkl')

In [43]:
df_bpd_pvt.to_pickle('../data/df_bpd_pvt.pkl')

In [50]:
df_pmt_pvt_group = df_pmt_pvt.groupby(['payment_type', 
                                       'hcpcs_code'])

df_pmt_pvt_group.head()

year,national_provider_identifier,entity_type_of_the_provider,place_of_service,payment_type,provider_type,hcpcs_code,hcpcs_description,zip_code_of_the_provider,state_code_of_the_provider,2015,2016,2017
0,1003000126,I,F,Doctor Only,Internal Medicine,99217,Hospital observation care discharge,215021854,MD,72.680000,72.743158,73.398800
1,1003000126,I,F,Doctor Only,Internal Medicine,99218,Hospital observation care typically 30 minutes,215021854,MD,,,100.080000
2,1003000126,I,F,Doctor Only,Internal Medicine,99219,Hospital observation care typically 50 minutes,215021854,MD,135.850000,135.010000,136.380000
3,1003000126,I,F,Doctor Only,Internal Medicine,99220,Hospital observation care typically 70 minutes...,215021854,MD,,189.239565,190.363729
4,1003000126,I,F,Doctor Only,Internal Medicine,99221,"Initial hospital inpatient care, typically 30 ...",215021854,MD,101.365085,100.750000,101.680000
...,...,...,...,...,...,...,...,...,...,...,...,...
16106962,1992900526,I,F,Doctor Only,Urology,52334,Insertion of guide wire through kidney into ur...,200162618,DC,135.136316,,
16123307,1992961270,I,F,Doctor Only,Cardiac Surgery,35521,Bypass of diseased or blocked artery (arm to u...,937202693,CA,660.248182,,
16125064,1992965974,I,O,Doctor & Facility,General Surgery,64630,Destruction of pudendal (external genital) nerve,112235502,NY,223.511613,234.163636,210.968696
16126960,1992970230,O,F,Facility Only,Ambulatory Surgical Center,62287,"Aspiration of lower spine disc, accessed throu...",339195193,FL,,1847.770000,1831.410000


In [51]:
df_pmt_pvt_group.hcpcs_code.value_counts()

payment_type       hcpcs_code  hcpcs_code
Doctor & Facility  0008M       0008M           7
                   00103       00103         124
                   00104       00104          59
                   0010M       0010M           1
                   00120       00120           4
                                            ... 
Facility Only      Q4172       Q4172          10
                   Q9968       Q9968           6
                   R0070       R0070         121
                   R0075       R0075          78
                   V2785       V2785         317
Name: hcpcs_code, Length: 11179, dtype: int64

In [55]:
df_pmt_pvt_group_diff = df_pmt_pvt_group
df_pmt_pvt_group_diff.head()

year,national_provider_identifier,entity_type_of_the_provider,place_of_service,payment_type,provider_type,hcpcs_code,hcpcs_description,zip_code_of_the_provider,state_code_of_the_provider,2015,2016,2017
0,1003000126,I,F,Doctor Only,Internal Medicine,99217,Hospital observation care discharge,215021854,MD,72.680000,72.743158,73.398800
1,1003000126,I,F,Doctor Only,Internal Medicine,99218,Hospital observation care typically 30 minutes,215021854,MD,,,100.080000
2,1003000126,I,F,Doctor Only,Internal Medicine,99219,Hospital observation care typically 50 minutes,215021854,MD,135.850000,135.010000,136.380000
3,1003000126,I,F,Doctor Only,Internal Medicine,99220,Hospital observation care typically 70 minutes...,215021854,MD,,189.239565,190.363729
4,1003000126,I,F,Doctor Only,Internal Medicine,99221,"Initial hospital inpatient care, typically 30 ...",215021854,MD,101.365085,100.750000,101.680000
...,...,...,...,...,...,...,...,...,...,...,...,...
16106962,1992900526,I,F,Doctor Only,Urology,52334,Insertion of guide wire through kidney into ur...,200162618,DC,135.136316,,
16123307,1992961270,I,F,Doctor Only,Cardiac Surgery,35521,Bypass of diseased or blocked artery (arm to u...,937202693,CA,660.248182,,
16125064,1992965974,I,O,Doctor & Facility,General Surgery,64630,Destruction of pudendal (external genital) nerve,112235502,NY,223.511613,234.163636,210.968696
16126960,1992970230,O,F,Facility Only,Ambulatory Surgical Center,62287,"Aspiration of lower spine disc, accessed throu...",339195193,FL,,1847.770000,1831.410000


In [56]:
# not working - pivot dataframe is wierd about grouping/aggregating

df_pmt_pvt__group_diff['pmt_diff_amt'] = df_pmt_pvt_group_diff[2017] - df_pmt_pvt_group_diff[2015]

TypeError: unsupported operand type(s) for -: 'SeriesGroupBy' and 'SeriesGroupBy'

In [57]:
df_pmt_pvt.to_csv('../data/df_pmt_pvt.csv')

In [59]:
df_payments_combined.to_csv('../data/df_payments_combined.csv')