### import packages

In [25]:
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

### read in combined data

In [26]:
# Load the combined payments data from its pickle file (path is relative to the
# notebook's working directory).
# NOTE(review): unpickling can execute arbitrary code; only load files produced by
# this project's own pipeline.
df_payments_combined = pd.read_pickle('../data/1_medicare_data/pickled_files/payments_combined.pkl')

In [27]:
# Preview the first five rows to check the columns and values that were loaded.
df_payments_combined.head()

Unnamed: 0,national_provider_identifier,last_name_organization_name_of_the_provider,entity_type_of_the_provider,city_of_the_provider,zip_code_of_the_provider,state_code_of_the_provider,provider_type,place_of_service,hcpcs_code,hcpcs_description,number_of_services,number_of_medicare_beneficiaries,number_of_distinct_medicare_beneficiary_per_day_services,average_medicare_allowed_amount,year
0,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99217,Hospital observation care discharge,23.0,23.0,23.0,72.68,2015
1,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99219,Hospital observation care typically 50 minutes,18.0,18.0,18.0,135.85,2015
2,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99221,"Initial hospital inpatient care, typically 30 ...",59.0,58.0,59.0,101.365085,2015
3,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99222,"Initial hospital inpatient care, typically 50 ...",132.0,130.0,132.0,139.010455,2015
4,1003000126,ENKESHAFI,I,CUMBERLAND,215021854,MD,Internal Medicine,F,99223,"Initial hospital inpatient care, typically 70 ...",220.0,215.0,220.0,205.185955,2015


### Create new column for type of payment

In [28]:
# Boolean row masks, paired by position with the labels in `choices`:
#   1. place_of_service 'O' (any entity type)
#   2. entity type 'I' with place_of_service 'F'
#   3. entity type 'O' with place_of_service 'F'
# NOTE(review): the meaning of the 'O'/'F'/'I' codes comes from the source data
# dictionary, not from this notebook — confirm the labels against it.
conditions = [
    df_payments_combined['place_of_service'].eq('O'),
    (
        df_payments_combined['entity_type_of_the_provider'].eq('I')
        & df_payments_combined['place_of_service'].eq('F')
    ),
    (
        df_payments_combined['entity_type_of_the_provider'].eq('O')
        & df_payments_combined['place_of_service'].eq('F')
    ),
]

# Labels, in the same order as the masks above.
choices = ['Doctor & Facility', 'Doctor Only', 'Facility Only']

In [29]:
%%time
df_payments_combined['payment_type'] = np.select(conditions, choices, default = 'unknown')

Wall time: 8.47 s


In [30]:
# Tally rows per payment_type label; a label with no rows (such as the
# 'unknown' fallback) does not appear in the result.
df_payments_combined['payment_type'].value_counts()

Doctor & Facility    17733816
Doctor Only          11048040
Facility Only          278374
Name: payment_type, dtype: int64

# Pivot to get years side by side

### Payment Data

In [31]:
# List the available columns before choosing the pivot keys.
df_payments_combined.columns

Index(['national_provider_identifier',
       'last_name_organization_name_of_the_provider',
       'entity_type_of_the_provider', 'city_of_the_provider',
       'zip_code_of_the_provider', 'state_code_of_the_provider',
       'provider_type', 'place_of_service', 'hcpcs_code', 'hcpcs_description',
       'number_of_services', 'number_of_medicare_beneficiaries',
       'number_of_distinct_medicare_beneficiary_per_day_services',
       'average_medicare_allowed_amount', 'year', 'payment_type'],
      dtype='object')

In [32]:
# Row keys shared by every year-by-year pivot below: one output row per
# unique combination of these columns.
pivot_index = [
    # provider and billing context
    'national_provider_identifier',
    'entity_type_of_the_provider',
    'place_of_service',
    'payment_type',
    'provider_type',
    # service
    'hcpcs_code',
    'hcpcs_description',
    # provider location
    'zip_code_of_the_provider',
    'state_code_of_the_provider',
]

# Each distinct value of this column becomes its own output column.
pivot_cols = ['year']

In [33]:
%%time
df_pmt_pvt = df_payments_combined.pivot_table(index = pivot_index, 
                                              columns = pivot_cols, 
                                              values = 'average_medicare_allowed_amount', 
                                              aggfunc=np.mean)
df_pmt_pvt = df_pmt_pvt.reset_index()

Wall time: 2min 42s


In [34]:
# Report the pivoted frame's dimensions, then show its first rows.
print(df_pmt_pvt.shape)
df_pmt_pvt.head()

(20793075, 12)


year,national_provider_identifier,entity_type_of_the_provider,place_of_service,payment_type,provider_type,hcpcs_code,hcpcs_description,zip_code_of_the_provider,state_code_of_the_provider,2015,2016,2017
0,1003000126,I,F,Doctor Only,Internal Medicine,99217,Hospital observation care discharge,215021854,MD,72.68,72.743158,73.3988
1,1003000126,I,F,Doctor Only,Internal Medicine,99218,Hospital observation care typically 30 minutes,215021854,MD,,,100.08
2,1003000126,I,F,Doctor Only,Internal Medicine,99219,Hospital observation care typically 50 minutes,215021854,MD,135.85,135.01,136.38
3,1003000126,I,F,Doctor Only,Internal Medicine,99220,Hospital observation care typically 70 minutes...,215021854,MD,,189.239565,190.363729
4,1003000126,I,F,Doctor Only,Internal Medicine,99221,"Initial hospital inpatient care, typically 30 ...",215021854,MD,101.365085,100.75,101.68


### Number of Services - not needed per new direction

In [35]:
# Disabled: the whole cell is wrapped in a string literal, so no pivot is
# computed; the cell only echoes that string back as its output.
'''
%%time
df_services_pvt = df_payments_combined.pivot_table(index = pivot_index, 
                                              columns = pivot_cols, 
                                              values = 'number_of_services', 
                                              aggfunc=np.mean)
df_services_pvt = df_services_pvt.reset_index()'''

"\n%%time\ndf_services_pvt = df_payments_combined.pivot_table(index = pivot_index, \n                                              columns = pivot_cols, \n                                              values = 'number_of_services', \n                                              aggfunc=np.mean)\ndf_services_pvt = df_services_pvt.reset_index()"

In [36]:
# Disabled (string literal only): df_services_pvt is not created in this notebook.
'''print(df_services_pvt.shape)
df_services_pvt.head()'''

'print(df_services_pvt.shape)\ndf_services_pvt.head()'

### Number of beneficiaries - not needed per new direction

In [37]:
# Disabled: the whole cell is wrapped in a string literal, so no pivot is
# computed; the cell only echoes that string back as its output.
'''%%time
df_benefs_pvt = df_payments_combined.pivot_table(index = pivot_index, 
                                              columns = pivot_cols, 
                                              values = 'number_of_medicare_beneficiaries', 
                                              aggfunc=np.mean)
df_benefs_pvt = df_benefs_pvt.reset_index()'''

"%%time\ndf_benefs_pvt = df_payments_combined.pivot_table(index = pivot_index, \n                                              columns = pivot_cols, \n                                              values = 'number_of_medicare_beneficiaries', \n                                              aggfunc=np.mean)\ndf_benefs_pvt = df_benefs_pvt.reset_index()"

In [38]:
# Disabled (string literal only): df_benefs_pvt is not created in this notebook.
'''print(df_benefs_pvt.shape)
df_benefs_pvt.head()'''

'print(df_benefs_pvt.shape)\ndf_benefs_pvt.head()'

### Number of unique beneficiaries per day

In [39]:
%%time
df_bpd_pvt = df_payments_combined.pivot_table(index = pivot_index, 
                                              columns = pivot_cols, 
                                              values = 'number_of_distinct_medicare_beneficiary_per_day_services', 
                                              aggfunc=np.mean)
df_bpd_pvt = df_bpd_pvt.reset_index()

Wall time: 2min 38s


In [40]:
# Report the pivoted frame's dimensions, then show its first rows.
print(df_bpd_pvt.shape)
df_bpd_pvt.head()

(20793075, 12)


year,national_provider_identifier,entity_type_of_the_provider,place_of_service,payment_type,provider_type,hcpcs_code,hcpcs_description,zip_code_of_the_provider,state_code_of_the_provider,2015,2016,2017
0,1003000126,I,F,Doctor Only,Internal Medicine,99217,Hospital observation care discharge,215021854,MD,23.0,57.0,100.0
1,1003000126,I,F,Doctor Only,Internal Medicine,99218,Hospital observation care typically 30 minutes,215021854,MD,,,26.0
2,1003000126,I,F,Doctor Only,Internal Medicine,99219,Hospital observation care typically 50 minutes,215021854,MD,18.0,38.0,52.0
3,1003000126,I,F,Doctor Only,Internal Medicine,99220,Hospital observation care typically 70 minutes...,215021854,MD,,23.0,59.0
4,1003000126,I,F,Doctor Only,Internal Medicine,99221,"Initial hospital inpatient care, typically 30 ...",215021854,MD,59.0,20.0,16.0


# WILL NOT USE 
## EDA for # beneficiaries
Mary and Mahesh confirmed that HCBB suggested a different metric is better suited for measuring utilization.
This metric was assigned to me on the Kanban board; other team members will focus on other metrics.

In [41]:
#df_benefs_pvt.info(verbose = True)

In [42]:
#df_benefs_pvt.describe()

From descriptive stats:
- Means are always much higher than medians, though the numbers are in the same ballpark
- It looks like both metrics were lower in 2017

In [43]:
#data = df_benefs_pvt.iloc[:,9][df_benefs_pvt.iloc[:,9].notnull()]
#data.head()

In [44]:
# NOTE(review): disabled plotting sketch. It would fail if re-enabled as written:
# plt.subplots() only creates ax1, ax2/ax3 are never defined, and both of those
# calls plot the same '2016' column.
#f, ax1 = plt.subplots()

#ax1.violinplot(data)

#ax2.violinplot(df_benefs_pvt['2016'], showmeans=True, showextrema=True, showmedians=True)

#ax3.violinplot(df_benefs_pvt['2016'], showmeans=True, showextrema=True, showmedians=True)

In [45]:
#data.hist()

Data is very much skewed to the right.  Will need to subset somehow to look at this. 

## EDA for payment

In [46]:
# Per-row change in the average allowed amount between each pair of years
# (later year minus earlier year). The year columns carry integer labels at
# this point, and a row missing either year yields NaN for that change.
df_pmt_pvt['change_15_to_16'] = df_pmt_pvt[2016].sub(df_pmt_pvt[2015])
df_pmt_pvt['change_16_to_17'] = df_pmt_pvt[2017].sub(df_pmt_pvt[2016])
df_pmt_pvt['change_15_to_17'] = df_pmt_pvt[2017].sub(df_pmt_pvt[2015])

In [47]:
# Inspect the first rows, including the newly added change_* columns.
df_pmt_pvt.head()

year,national_provider_identifier,entity_type_of_the_provider,place_of_service,payment_type,provider_type,hcpcs_code,hcpcs_description,zip_code_of_the_provider,state_code_of_the_provider,2015,2016,2017,change_15_to_16,change_16_to_17,change_15_to_17
0,1003000126,I,F,Doctor Only,Internal Medicine,99217,Hospital observation care discharge,215021854,MD,72.68,72.743158,73.3988,0.063158,0.655642,0.7188
1,1003000126,I,F,Doctor Only,Internal Medicine,99218,Hospital observation care typically 30 minutes,215021854,MD,,,100.08,,,
2,1003000126,I,F,Doctor Only,Internal Medicine,99219,Hospital observation care typically 50 minutes,215021854,MD,135.85,135.01,136.38,-0.84,1.37,0.53
3,1003000126,I,F,Doctor Only,Internal Medicine,99220,Hospital observation care typically 70 minutes...,215021854,MD,,189.239565,190.363729,,1.124164,
4,1003000126,I,F,Doctor Only,Internal Medicine,99221,"Initial hospital inpatient care, typically 30 ...",215021854,MD,101.365085,100.75,101.68,-0.615085,0.93,0.314915


In [48]:
# Summary statistics for the numeric columns (provider identifier, yearly
# amounts and the change_* columns); `count` excludes NaN entries.
df_pmt_pvt.describe()

year,national_provider_identifier,2015,2016,2017,change_15_to_16,change_16_to_17,change_15_to_17
count,20793080.0,9497891.0,9714894.0,9847441.0,3026202.0,3883872.0,2913220.0
mean,1503401000.0,99.47302,100.1832,101.5993,-0.8862161,0.5967683,-0.5316401
std,286352300.0,238.4662,256.3474,276.4955,36.76731,29.48526,49.09106
min,1003000000.0,0.0001026528,0.0001059603,6.03538e-05,-24457.91,-12075.91,-30353.65
25%,1255400000.0,23.15282,23.76625,24.26084,-0.795,-0.07,-0.5469656
50%,1508809000.0,62.88879,64.09979,64.99766,-0.08,0.21,0.08
75%,1750349000.0,113.8798,113.56,113.5044,0.2186936,0.98,0.826
max,1993000000.0,52601.47,50603.38,56684.35,22840.54,17570.79,34305.38


### Pivot, aggregating at HCPCS and payment type level

In [50]:
# Check the column labels; the year columns are still integers here.
df_pmt_pvt.columns

Index(['national_provider_identifier',  'entity_type_of_the_provider',
                   'place_of_service',                 'payment_type',
                      'provider_type',                   'hcpcs_code',
                  'hcpcs_description',     'zip_code_of_the_provider',
         'state_code_of_the_provider',                           2015,
                                 2016,                           2017,
                    'change_15_to_16',              'change_16_to_17',
                    'change_15_to_17'],
      dtype='object', name='year')

In [54]:
# Convert every column label to a string so the year columns can be referenced
# as '2015', '2016', '2017' in the aggregation that follows.
# NOTE(review): after this cell the integer labels 2015/2016/2017 no longer exist,
# so the earlier cell that computes the change_* columns cannot be re-run
# without rebuilding df_pmt_pvt first.
df_pmt_pvt.columns = df_pmt_pvt.columns.astype(str)
df_pmt_pvt.columns

Index(['national_provider_identifier', 'entity_type_of_the_provider',
       'place_of_service', 'payment_type', 'provider_type', 'hcpcs_code',
       'hcpcs_description', 'zip_code_of_the_provider',
       'state_code_of_the_provider', '2015', '2016', '2017', 'change_15_to_16',
       'change_16_to_17', 'change_15_to_17'],
      dtype='object', name='year')

In [69]:
%%time
pivot_index2 = ['hcpcs_code',
                'hcpcs_description',
                'payment_type']
                

pivot_vals2 = ['2015',
               '2016',
               '2017',
               'change_15_to_16',
               'change_16_to_17',
               'change_15_to_17']

df_pmt_avg_pvt = df_pmt_pvt.pivot_table(index = pivot_index2,
                                        columns = None,
                                        values = pivot_vals2,
                                        aggfunc = [np.mean,np.median])

Wall time: 17.8 s


In [67]:
# Display the aggregated frame (pandas truncates the middle rows).
# NOTE(review): this cell's execution count (67) is lower than that of the
# aggregation cell before it (69), so the saved output may predate the last
# run of the aggregation — re-run to refresh.
df_pmt_avg_pvt

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mean,mean,mean,mean,mean,mean,median,median,median,median,median,median
Unnamed: 0_level_1,Unnamed: 1_level_1,year,2015,2016,2017,change_15_to_16,change_15_to_17,change_16_to_17,2015,2016,2017,change_15_to_16,change_15_to_17,change_16_to_17
hcpcs_code,hcpcs_description,payment_type,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
0008M,Onc breast risk score,Doctor & Facility,,3419.419776,3443.360000,,,23.940000,,3419.420000,3443.360000,,,23.940000
00100,Anesthesia for procedure on salivary gland with biopsy,Doctor Only,266.582807,290.975942,269.853470,,,-23.151259,286.120357,363.637143,296.281818,,,-23.151259
00102,Anesthesia for procedure to repair lip defect present at birth,Doctor Only,95.198939,,,,,,94.151639,,,,,
00103,Anesthesia for procedure on eyelid,Doctor & Facility,255.076212,128.268843,118.002858,-54.043743,,0.703266,246.696761,111.403636,99.734615,-54.043743,,-1.538811
00103,Anesthesia for procedure on eyelid,Doctor Only,153.325203,148.615364,150.600096,-4.950231,-3.986901,1.741500,158.016447,154.506923,156.974462,-4.889702,-2.699753,0.739879
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
R0075,"Transportation of portable x-ray equipment and personnel to home or nursing home, per trip to facility or location, more than one patient seen",Facility Only,81.782458,80.305087,80.649999,-1.189288,0.709595,2.784833,80.025000,81.824286,81.612353,-1.189288,0.709595,0.559016
V2632,Posterior chamber intraocular lens,Doctor & Facility,113.490000,113.040000,113.830000,,0.340000,,113.490000,113.040000,113.830000,,0.340000,
V2785,"Processing, preserving and transporting corneal tissue",Doctor & Facility,,1172.601191,1198.226601,,,,,1172.601191,1198.226601,,,
V2785,"Processing, preserving and transporting corneal tissue",Doctor Only,3735.000000,3925.000000,,,,,3735.000000,3925.000000,,,,


In [73]:
# Spot check one HCPCS code ('00103') at provider level, across all payment
# types, for comparison with the aggregated figures.
df_pmt_pvt.loc[df_pmt_pvt['hcpcs_code'] == '00103'].describe()

year,national_provider_identifier,2015,2016,2017,change_15_to_16,change_16_to_17,change_15_to_17
count,6076.0,2441.0,2567.0,2463.0,399.0,842.0,320.0
mean,1496940000.0,153.491939,148.108088,149.369263,-5.073272,1.715605,-3.986901
std,288142100.0,49.207127,47.956377,47.452792,14.021499,15.539966,17.280712
min,1003024000.0,74.019412,59.754545,62.554211,-54.043743,-80.562174,-83.324048
25%,1255322000.0,105.898824,101.580454,102.152,-10.92175,-5.176057,-11.264643
50%,1497745000.0,158.184107,153.505,155.345385,-4.921071,0.692032,-2.699753
75%,1740204000.0,189.027683,182.826879,183.254902,1.029494,6.765428,4.799938
max,1992988000.0,382.6225,340.486471,333.142857,66.494943,90.961389,68.546287


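**Note:** averaging the per-row changes before grouping does not equal the change between the grouped averages. The `change_*` columns are NaN whenever either year is missing, so their mean is taken over far fewer rows than the yearly means. For HCPCS 00103 (all payment types), the 2015 and 2016 means are 153.49 (2,441 rows) and 148.11 (2,567 rows), a difference of about -5.38, while the mean of `change_15_to_16` is -5.07 (399 rows).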