In [109]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# bring in combined pickle file all years 2015-2017


In [97]:
df_payments_combined = pd.read_pickle('../data/df_payments_combined.pkl')

- Chart showing payments (average_medicare_allowed_amount) over time
  -Two charts, based on Entity (I, O) for Individual and for Organization
- Chart showing counts over time (count of Beneficiaries and count of Services)

# Creating New Column: payment_type

- Entity I, POS F = Doctor Only
- Entity O, POS F = Facility Only
- Entity I, POS O = Doctor & Facility
- Entity O, POS O = Doctor & Facility

In [98]:
conditions = [
    (df_payments_combined['place_of_service'] == 'O'),
    (df_payments_combined['entity_type_of_the_provider'] == 'I') & (df_payments_combined['place_of_service'] == 'F'),
    (df_payments_combined['entity_type_of_the_provider'] == 'O') & (df_payments_combined['place_of_service'] == 'F')]

choices = ['Doctor & Facility','Doctor Only', 'Facility Only']

In [99]:
df_payments_combined['payment_type'] = np.select(conditions, choices, default='unknown')

In [100]:
df_payments_combined.tail()

Unnamed: 0,national_provider_identifier,last_name_organization_name_of_the_provider,entity_type_of_the_provider,city_of_the_provider,zip_code_of_the_provider,state_code_of_the_provider,provider_type,place_of_service,hcpcs_code,hcpcs_description,number_of_services,number_of_medicare_beneficiaries,number_of_distinct_medicare_beneficiary_per_day_services,average_medicare_allowed_amount,year,payment_type
29060225,1992999825,DESCHENES,I,SEATTLE,981012756,WA,Otolaryngology,O,99214,Established patient office or other outpatient...,248.0,175.0,248.0,116.86,2017,Doctor & Facility
29060226,1992999874,JOFFE,I,MECHANICSVILLE,231161844,VA,Internal Medicine,F,99223,"Initial hospital inpatient care, typically 70 ...",41.0,41.0,41.0,203.4,2017,Doctor Only
29060227,1992999874,JOFFE,I,MECHANICSVILLE,231161844,VA,Internal Medicine,F,99232,"Subsequent hospital inpatient care, typically ...",101.0,57.0,101.0,72.48,2017,Doctor Only
29060228,1992999874,JOFFE,I,MECHANICSVILLE,231161844,VA,Internal Medicine,F,99233,"Subsequent hospital inpatient care, typically ...",102.0,55.0,102.0,104.76,2017,Doctor Only
29060229,1992999874,JOFFE,I,MECHANICSVILLE,231161844,VA,Internal Medicine,F,99239,"Hospital discharge day management, more than 3...",49.0,49.0,49.0,107.98,2017,Doctor Only


In [101]:
df_payments_combined['payment_type'].value_counts()

Doctor & Facility    17733816
Doctor Only          11048039
Facility Only          278374
unknown                     1
Name: payment_type, dtype: int64

# Groupby NPI/HCPCS/POS/Entity; POS/Entity is now payment_type

In [102]:
%%time
df_avg_pmt = df_payments_combined.groupby(['year',
                                           'payment_type',
                                           'hcpcs_code']).average_medicare_allowed_amount.mean().to_frame().reset_index()

CPU times: user 9.43 s, sys: 3.85 s, total: 13.3 s
Wall time: 13.9 s


In [103]:
df_avg_pmt

Unnamed: 0,year,payment_type,hcpcs_code,average_medicare_allowed_amount
0,2015,Doctor & Facility,00103,255.076212
1,2015,Doctor & Facility,00104,111.614526
2,2015,Doctor & Facility,00140,139.047955
3,2015,Doctor & Facility,00142,102.570003
4,2015,Doctor & Facility,00160,211.812813
...,...,...,...,...
28874,2017,Facility Only,Q4172,117.524101
28875,2017,Facility Only,Q9968,3.404030
28876,2017,Facility Only,R0070,180.445074
28877,2017,Facility Only,R0075,80.649999


# Measure how payments and counts change over time
“Counts” = count of Beneficiaries and count of Services

Mean of number_of_distinct_medicare_beneficiary_per_day_services PER hcpcs PER year

In [104]:
df_avg_services_day = df_payments_combined.groupby(['year',
                                                    'hcpcs_code']).number_of_distinct_medicare_beneficiary_per_day_services.mean().to_frame().reset_index()

In [105]:
df_avg_services_day

Unnamed: 0,year,hcpcs_code,number_of_distinct_medicare_beneficiary_per_day_services
0,2015,00100,15.000000
1,2015,00102,26.000000
2,2015,00103,29.192544
3,2015,00104,48.568561
4,2015,00120,22.018182
...,...,...,...
18048,2017,Q9989,16.000000
18049,2017,R0070,2870.982456
18050,2017,R0075,1917.967213
18051,2017,V2632,89.000000


Median of number_of_distinct_medicare_beneficiary_per_day_services PER hcpcs PER year

In [106]:
df_med_services_day = df_payments_combined.groupby(['year',
                                                    'hcpcs_code']).number_of_distinct_medicare_beneficiary_per_day_services.median().to_frame().reset_index()

In [107]:
df_med_services_day

Unnamed: 0,year,hcpcs_code,number_of_distinct_medicare_beneficiary_per_day_services
0,2015,00100,13.5
1,2015,00102,18.0
2,2015,00103,19.0
3,2015,00104,26.0
4,2015,00120,15.0
...,...,...,...
18048,2017,Q9989,16.0
18049,2017,R0070,676.0
18050,2017,R0075,302.5
18051,2017,V2632,89.0


# Pivot Dataframes - for years to be side by side (Code from Diego)

Payment Data

In [125]:
df_payments_combined.columns

Index(['national_provider_identifier',
       'last_name_organization_name_of_the_provider',
       'entity_type_of_the_provider', 'city_of_the_provider',
       'zip_code_of_the_provider', 'state_code_of_the_provider',
       'provider_type', 'place_of_service', 'hcpcs_code', 'hcpcs_description',
       'number_of_services', 'number_of_medicare_beneficiaries',
       'number_of_distinct_medicare_beneficiary_per_day_services',
       'average_medicare_allowed_amount', 'year', 'payment_type'],
      dtype='object')

In [126]:
pivot_index = ['national_provider_identifier',
               'entity_type_of_the_provider', 
               'place_of_service',
               'payment_type',
               'provider_type',
               'hcpcs_code',
               'hcpcs_description',
               'zip_code_of_the_provider', 
               'state_code_of_the_provider']

pivot_cols = ['year'] #should this be index?

Avg Medicare Allowed Amount

In [129]:
%%time
df_pmt_pvt = df_payments_combined.pivot_table(index = pivot_index, 
                                              columns = pivot_cols, 
                                              values = 'average_medicare_allowed_amount', 
                                              aggfunc=np.mean)
df_pmt_pvt = df_pmt_pvt.reset_index()

CPU times: user 1min 47s, sys: 32 s, total: 2min 19s
Wall time: 2min 21s


In [130]:
print(df_pmt_pvt.shape)
df_pmt_pvt.head()

(22077080, 12)


year,national_provider_identifier,entity_type_of_the_provider,place_of_service,payment_type,provider_type,hcpcs_code,hcpcs_description,zip_code_of_the_provider,state_code_of_the_provider,2015,2016,2017
0,1003000126,I,F,Doctor Only,Internal Medicine,99217,Hospital observation care discharge,215021854,MD,72.68,72.743158,
1,1003000126,I,F,Doctor Only,Internal Medicine,99217,Hospital observation care discharge,215021854,MD,,,73.3988
2,1003000126,I,F,Doctor Only,Internal Medicine,99218,Hospital observation care typically 30 minutes,215021854,MD,,,100.08
3,1003000126,I,F,Doctor Only,Internal Medicine,99219,Hospital observation care typically 50 minutes,215021854,MD,135.85,135.01,
4,1003000126,I,F,Doctor Only,Internal Medicine,99219,Hospital observation care typically 50 minutes,215021854,MD,,,136.38


# Analysis by # of beneficiaries/day (based on Number of Distinct Medicare Beneficiary/Per Day Services)

In [249]:
%%time
df_bpd_pvt = df_payments_combined.pivot_table(index = pivot_index, 
                                              columns = pivot_cols, 
                                              values = 'number_of_distinct_medicare_beneficiary_per_day_services', 
                                              aggfunc=np.mean)
df_bpd_pvt = df_bpd_pvt.reset_index()

CPU times: user 1min 42s, sys: 30.1 s, total: 2min 12s
Wall time: 2min 13s


In [250]:
print(df_bpd_pvt.shape)
df_bpd_pvt.head()

(22077080, 12)


year,national_provider_identifier,entity_type_of_the_provider,place_of_service,payment_type,provider_type,hcpcs_code,hcpcs_description,zip_code_of_the_provider,state_code_of_the_provider,2015,2016,2017
0,1003000126,I,F,Doctor Only,Internal Medicine,99217,Hospital observation care discharge,215021854,MD,23.0,57.0,
1,1003000126,I,F,Doctor Only,Internal Medicine,99217,Hospital observation care discharge,215021854,MD,,,100.0
2,1003000126,I,F,Doctor Only,Internal Medicine,99218,Hospital observation care typically 30 minutes,215021854,MD,,,26.0
3,1003000126,I,F,Doctor Only,Internal Medicine,99219,Hospital observation care typically 50 minutes,215021854,MD,18.0,38.0,
4,1003000126,I,F,Doctor Only,Internal Medicine,99219,Hospital observation care typically 50 minutes,215021854,MD,,,52.0


In [227]:
df_bpd_pvt.describe(include = 'all')

year,national_provider_identifier,entity_type_of_the_provider,place_of_service,payment_type,provider_type,hcpcs_code,hcpcs_description,zip_code_of_the_provider,state_code_of_the_provider,2015,2016,2017
count,22077080.0,22077080,22077080,22077080,22077080,22077080.0,22077080,22077080.0,22077080,9497890.0,9714894.0,9847441.0
unique,,2,2,3,122,6763.0,6187,610939.0,61,,,
top,,I,O,Doctor & Facility,Diagnostic Radiology,99213.0,Established patient office or other outpatient...,559050001.0,CA,,,
freq,,21063281,13531659,13531659,2714019,975285.0,975285,30297.0,1690458,,,
mean,1497724000.0,,,,,,,,,147.4233,143.705,140.3349
std,288694600.0,,,,,,,,,2337.895,2329.901,1985.972
min,1003000000.0,,,,,,,,,11.0,11.0,11.0
25%,1245361000.0,,,,,,,,,20.0,20.0,20.0
50%,1497843000.0,,,,,,,,,42.0,41.0,40.0
75%,1740495000.0,,,,,,,,,110.0,108.0,106.0


In [228]:
df_bpd_pvt.info(verbose = True, null_counts = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22077080 entries, 0 to 22077079
Data columns (total 12 columns):
 #   Column                        Non-Null Count     Dtype  
---  ------                        --------------     -----  
 0   national_provider_identifier  22077080 non-null  int64  
 1   entity_type_of_the_provider   22077080 non-null  object 
 2   place_of_service              22077080 non-null  object 
 3   payment_type                  22077080 non-null  object 
 4   provider_type                 22077080 non-null  object 
 5   hcpcs_code                    22077080 non-null  object 
 6   hcpcs_description             22077080 non-null  object 
 7   zip_code_of_the_provider      22077080 non-null  object 
 8   state_code_of_the_provider    22077080 non-null  object 
 9   2015                          9497890 non-null   float64
 10  2016                          9714894 non-null   float64
 11  2017                          9847441 non-null   float64
dtypes: float64(3

if 2015 & 2017 values are null, drop/exclude from dataframe

In [242]:
df_bpd_pvt = df_bpd_pvt[df_bpd_pvt[2015].notna()]
df_bpd_pvt = df_bpd_pvt[df_bpd_pvt[2017].notna()]
df_bpd_pvt.tail()

year,national_provider_identifier,entity_type_of_the_provider,place_of_service,payment_type,provider_type,hcpcs_code,hcpcs_description,zip_code_of_the_provider,state_code_of_the_provider,2015,2016,2017
22077063,1992999825,I,O,Doctor & Facility,Otolaryngology,99203,"New patient office or other outpatient visit, ...",981012756,WA,47.0,65.0,88.0
22077064,1992999825,I,O,Doctor & Facility,Otolaryngology,99204,"New patient office or other outpatient visit, ...",981012756,WA,147.0,130.0,105.0
22077066,1992999825,I,O,Doctor & Facility,Otolaryngology,99214,Established patient office or other outpatient...,981012756,WA,220.0,268.0,248.0
22077073,1992999874,I,F,Doctor Only,Internal Medicine,99232,"Subsequent hospital inpatient care, typically ...",231161844,VA,334.0,,101.0
22077075,1992999874,I,F,Doctor Only,Internal Medicine,99233,"Subsequent hospital inpatient care, typically ...",231161844,VA,292.0,,102.0


In [243]:
df_bpd_pvt.shape

(2067642, 12)

Create column of bpd_change from 2015 to 2017

In [252]:
df_bpd_pvt['bpd_change'] = df_bpd_pvt[2017] - df_bpd_pvt[2015]

In [253]:
#remove npi -should that be included?  added 2015 & 2017 columns from original code #do we need hcpcs description here?
df_bpd_pvt = df_bpd_pvt.groupby(['national_provider_identifier',
                                 'hcpcs_code','payment_type',2015,2017]).bpd_change.mean().to_frame().reset_index()
df_bpd_pvt

Unnamed: 0,national_provider_identifier,hcpcs_code,payment_type,2015,2017,bpd_change
0,1003000597,99222,Doctor Only,18.0,15.0,-3.0
1,1003000738,29826,Doctor Only,17.0,28.0,11.0
2,1003000902,81002,Doctor & Facility,44.0,35.0,-9.0
3,1003000902,99203,Doctor & Facility,50.0,35.0,-15.0
4,1003000902,99212,Doctor & Facility,16.0,23.0,7.0
...,...,...,...,...,...,...
2067637,1992999825,99204,Doctor & Facility,147.0,105.0,-42.0
2067638,1992999825,99214,Doctor & Facility,220.0,248.0,28.0
2067639,1992999825,99214,Doctor Only,155.0,103.0,-52.0
2067640,1992999874,99232,Doctor Only,334.0,101.0,-233.0


Code below from Ari to sort by max to min

In [254]:
df_bpd_sorted=df_bpd_pvt.sort_values('bpd_change',ascending=False)
df_bpd_sorted.reset_index()

Unnamed: 0,index,national_provider_identifier,hcpcs_code,payment_type,2015,2017,bpd_change
0,1969773,1952347791,36415,Doctor & Facility,113068.0,373124.0,260056.0
1,613782,1295823540,P9603,Doctor & Facility,56760.0,239422.0,182662.0
2,1969775,1952347791,80061,Doctor & Facility,54710.0,174170.0,119460.0
3,1969853,1952347791,84443,Doctor & Facility,40376.0,132794.0,92418.0
4,1927660,1932145778,83036,Doctor & Facility,244719.0,336444.0,91725.0
...,...,...,...,...,...,...,...
2067637,1439863,1699782722,82542,Doctor & Facility,153274.0,27054.0,-126220.0
2067638,358277,1174569909,36415,Doctor & Facility,225775.0,43276.0,-182499.0
2067639,1030032,1497933162,82570,Doctor & Facility,244910.0,118.0,-244792.0
2067640,1030034,1497933162,84311,Doctor & Facility,245068.0,118.0,-244950.0


In [235]:
#groupby hcpcs_code and payment_type 
#df_bpd_sorted = df_bpd_pvt.groupby(['payment_type','hcpcs_code']).bpd_change.mean().to_frame().reset_index()
#df_bpd_sorted.sort_values('bpd_change',ascending=False)

In [255]:
#calculate percent change - possible absolute value of change?
df_bpd_sorted['%_Change']=  ((df_bpd_pvt['bpd_change'])/(df_bpd_pvt[2017])*100) 

In [256]:
df_bpd_sorted

Unnamed: 0,national_provider_identifier,hcpcs_code,payment_type,2015,2017,bpd_change,%_Change
1969773,1952347791,36415,Doctor & Facility,113068.0,373124.0,260056.0,69.696937
613782,1295823540,P9603,Doctor & Facility,56760.0,239422.0,182662.0,76.292905
1969775,1952347791,80061,Doctor & Facility,54710.0,174170.0,119460.0,68.588161
1969853,1952347791,84443,Doctor & Facility,40376.0,132794.0,92418.0,69.595012
1927660,1932145778,83036,Doctor & Facility,244719.0,336444.0,91725.0,27.263081
...,...,...,...,...,...,...,...
1439863,1699782722,82542,Doctor & Facility,153274.0,27054.0,-126220.0,-466.548385
358277,1174569909,36415,Doctor & Facility,225775.0,43276.0,-182499.0,-421.709493
1030032,1497933162,82570,Doctor & Facility,244910.0,118.0,-244792.0,-207450.847458
1030034,1497933162,84311,Doctor & Facility,245068.0,118.0,-244950.0,-207584.745763


In [237]:
df_bpd_sorted['bpd_change'].sort_values().head(20) #keep the highest change values - top 20? (+ or -)

321443    -245322.0
327259    -244950.0
291479    -244792.0
107716    -182499.0
286120    -126220.0
245560     -77102.0
245562     -74263.0
107710     -69444.0
1539792    -69084.0
336633     -68381.0
1541422    -67366.0
245557     -66127.0
352303     -63591.0
372592     -58458.0
352298     -56640.0
874205     -51670.0
107722     -47177.0
336616     -42540.0
362821     -42251.0
251946     -41884.0
Name: bpd_change, dtype: float64

In [238]:
df_bpd_sorted['bpd_change'].sort_values().tail(20)

251971     52725.0
401891     52970.0
311691     54770.0
107720     54889.0
386174     55862.0
251968     63476.0
251961     66354.0
311690     68287.0
840726     77526.0
291471     77648.0
245573     81532.0
107723     83084.0
352315     84227.0
245582     88022.0
245580     89263.0
311688     91725.0
336614     92418.0
251949    119460.0
907258    182662.0
107711    260056.0
Name: bpd_change, dtype: float64

In [239]:
df_bpd_sorted['bpd_change'].value_counts(ascending=False)

 0.0        23836
 1.0        23390
-1.0        23333
 2.0        22775
-2.0        22612
            ...  
-20298.0        1
 2528.0         1
-2253.0         1
 5790.0         1
 4731.0         1
Name: bpd_change, Length: 5961, dtype: int64

In [None]:
#sns.pairplot(df_bpd_pvt)