In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [2]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

Prescriber Data  
https://data.cms.gov/Medicare-Part-D/Medicare-Provider-Utilization-and-Payment-Data-201/77gb-8z53  
Physician Compare data  
https://data.medicare.gov/Physician-Compare/Physician-Compare-National-Downloadable-File/mj5m-pzi6


#### read in all columns, rename some columns 

In [3]:
%%time
prescriber_df = pd.read_csv('../data/archive/Medicare_Provider_Utilization_and_Payment_Data__2017_Part_D_Prescriber.csv')
prescriber_df = prescriber_df.rename(columns={'nppes_provider_last_org_name':'last_name','nppes_provider_first_name':'first_name',
                                              'nppes_provider_city':'city','nppes_provider_state': 'state',
                                              'specialty_description':'specialty','generic_name':'drug',})

Wall time: 3min 47s


#### create list of antipsych drugs

In [4]:
list_of_all_antipsychs = ['ARIPIPRAZOLE',
'ARIPIPRAZOLE LAUROXIL',
'CHLORPROMAZINE HCL',
'CLOZAPINE',
'ILOPERIDONE',
'FLUPHENAZINE DECANOATE',
'FLUPHENAZINE HCL',
'ZIPRASIDONE HCL',
'ZIPRASIDONE MESYLATE',
'HALOPERIDOL LACTATE',
'HALOPERIDOL DECANOATE',
'HALOPERIDOL',
'PALIPERIDONE',
'PALIPERIDONE PALMITATE',
'LURASIDONE HCL',
'LOXAPINE SUCCINATE',
'MOLINDONE HCL',
'PIMAVANSERIN TARTRATE',
'OLANZAPINE',
'OLANZAPINE/FLUOXETINE HCL',
'PIMOZIDE',
'PERPHENAZINE',
'PERPHENAZINE/AMITRIPTYLINE HCL',
'QUETIAPINE FUMARATE',
'BREXPIPRAZOLE',
'RISPERIDONE',
'RISPERIDONE MICROSPHERES',
'ASENAPINE MALEATE',
'THIORIDAZINE HCL',
'THIOTHIXENE',
'TRIFLUOPERAZINE HCL',
'CARIPRAZINE HCL',
'OLANZAPINE PAMOATE']

In [5]:
prescriber_df.shape

(25209130, 21)

#### Run query on dataframe for list of drugs (credit: Mary's Queries)

In [6]:
prescriber_df = prescriber_df.query('drug in @list_of_all_antipsychs')

In [7]:
prescriber_df.shape

(509235, 21)

#### Some EDA:
    are there duplicates? there shouldn't be
    notnulls
    how many unique values for each column?
    

In [8]:
prescriber_df_drop_dup = prescriber_df.drop_duplicates()
prescriber_df_drop_dup.shape

(509235, 21)

In [9]:
prescriber_df.total_claim_count_ge65.sum()

7536552.0

In [10]:
prescriber_df.bene_count_ge65.sum()

294370.0

In [11]:
prescriber_df.notnull().sum()

npi                              509235
last_name                        509233
first_name                       509232
city                             509235
state                            509235
specialty                        509235
description_flag                 509235
drug_name                        509235
drug                             509235
bene_count                        99247
total_claim_count                509235
total_30_day_fill_count          509235
total_day_supply                 509235
total_drug_cost                  509235
bene_count_ge65                  152815
bene_count_ge65_suppress_flag    356420
total_claim_count_ge65           347730
ge65_suppress_flag               161505
total_30_day_fill_count_ge65     347730
total_day_supply_ge65            347730
total_drug_cost_ge65             347730
dtype: int64

In [12]:
prescriber_df.nunique()

npi                              149764
last_name                         62083
first_name                        20671
city                               8361
state                                59
specialty                           118
description_flag                      2
drug_name                            57
drug                                 33
bene_count                          251
total_claim_count                  1192
total_30_day_fill_count            6369
total_day_supply                  14128
total_drug_cost                  329524
bene_count_ge65                     161
bene_count_ge65_suppress_flag         2
total_claim_count_ge65              755
ge65_suppress_flag                    2
total_30_day_fill_count_ge65       3613
total_day_supply_ge65              8075
total_drug_cost_ge65             139208
dtype: int64

In [13]:
prescriber_df[prescriber_df['bene_count_ge65'].isnull()]

Unnamed: 0,npi,last_name,first_name,city,state,specialty,description_flag,drug_name,drug,bene_count,total_claim_count,total_30_day_fill_count,total_day_supply,total_drug_cost,bene_count_ge65,bene_count_ge65_suppress_flag,total_claim_count_ge65,ge65_suppress_flag,total_30_day_fill_count_ge65,total_day_supply_ge65,total_drug_cost_ge65
47,1417981960,TILLMANNS,REGINE,MELROSE,MA,Internal Medicine,S,QUETIAPINE FUMARATE,QUETIAPINE FUMARATE,,13,23.0,680,106.34,,*,,*,,,
103,1497031454,ORTIZ,CHRISTIAN,DALLAS,TX,Psychiatry,S,HALOPERIDOL,HALOPERIDOL,,19,19.0,570,414.11,,*,,*,,,
105,1437247335,AFSHAR,PARVIN,IRVINE,CA,Psychiatry,S,SAPHRIS,ASENAPINE MALEATE,,11,11.0,320,9079.50,,*,11.0,,11.0,320.0,9079.50
190,1205899572,SARKAR,PURNIMA,EARLE,AR,Internal Medicine,S,HALOPERIDOL,HALOPERIDOL,,23,23.2,695,1390.46,,*,11.0,,11.0,330.0,258.83
215,1841603305,KING,SARA,WATERLOO,IA,Nurse Practitioner,S,ARIPIPRAZOLE,ARIPIPRAZOLE,,30,30.2,855,6001.11,,#,,#,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25208581,1598188617,CURTISS,LAWRENCE,MOLINE,IL,Nurse Practitioner,S,RISPERIDONE,RISPERIDONE,42.0,337,363.9,10324,3421.38,,*,58.0,,62.0,1834.0,410.53
25208619,1962452391,MAGNUSON,THOMAS,OMAHA,NE,Geriatric Psychiatry,S,CLOZAPINE,CLOZAPINE,,25,25.0,648,3886.51,,*,25.0,,25.0,648.0,3886.51
25208680,1801996764,MCLAUGHLIN,JULE,BOULDER,CO,Physician Assistant,S,OLANZAPINE,OLANZAPINE,20.0,152,182.0,5239,2998.17,,*,19.0,,21.0,630.0,255.61
25208800,1558377648,MALLOZZI,MARINO,EPHRATA,PA,Internal Medicine,S,QUETIAPINE FUMARATE ER,QUETIAPINE FUMARATE,,12,20.0,600,2781.07,,*,,*,,,


#### We can see ^ that there are 356,420 rows with NaN value for the bene_count_ge65...
    This is because the data is suppressed for any beneficiary count below 11 in order to protect privacy.
    I will fill these NaN values with 5 (a presumed average).

In [14]:
prescriber_df['bene_count_ge65'] = prescriber_df['bene_count_ge65'].fillna(5)

In [15]:
prescriber_df.bene_count_ge65.value_counts()

5.0      356420
0.0      138976
11.0       1955
12.0       1596
13.0       1274
14.0       1065
15.0        904
16.0        698
17.0        643
18.0        558
19.0        506
20.0        432
21.0        362
22.0        309
23.0        255
24.0        243
25.0        238
26.0        203
27.0        195
29.0        159
28.0        155
31.0        139
30.0        139
33.0        122
32.0        111
34.0         99
35.0         93
36.0         81
38.0         76
37.0         74
39.0         63
41.0         59
42.0         52
44.0         51
40.0         49
43.0         36
50.0         35
48.0         34
47.0         33
55.0         32
45.0         32
51.0         31
49.0         31
46.0         29
57.0         25
53.0         23
54.0         23
56.0         21
62.0         21
59.0         20
58.0         19
67.0         17
65.0         17
61.0         16
52.0         16
66.0         15
72.0         13
63.0         13
76.0         12
68.0         12
64.0         12
73.0         11
70.0    

#### Now what are all these zeros?  hmmm

new_df = old_df[((old_df['C1'] > 0)
                & (old_df['C1'] < 20))] 
                


In [33]:
#prescriber_df[(prescriber_df.bene_count_ge65 == 5) ].total_claim_count_ge65.value_counts()
prescriber_df.total_claim_count_ge65.isnull()
              ##& (prescriber_df.total_claim_count_ge65 > 0)]

26          False
47           True
103          True
105         False
190         False
            ...  
25208680    False
25208800     True
25208880     True
25208944    False
25209124    False
Name: total_claim_count_ge65, Length: 509235, dtype: bool

In [88]:
prescriber_df = prescriber_df[prescriber_df.bene_count_ge65 != 0]
prescriber_df.shape

(370259, 21)

### Below is some more eda that has not materially impacted my presentation yet. 
    eventually, I would like to use the physician compare data to link antipsychotic use with specific hospitals

In [89]:
## confirming that bene_count_ge65 is included in bene_countS
prescriber_df[prescriber_df.bene_count < prescriber_df.bene_count_ge65]

Unnamed: 0,npi,last_name,first_name,city,state,specialty,description_flag,drug_name,drug,bene_count,total_claim_count,total_30_day_fill_count,total_day_supply,total_drug_cost,bene_count_ge65,bene_count_ge65_suppress_flag,total_claim_count_ge65,ge65_suppress_flag,total_30_day_fill_count_ge65,total_day_supply_ge65,total_drug_cost_ge65


In [90]:
## there are 138,976 rows with '0'for bene_count_ge65
## confirming that these are for rows that have data for beneficiaries younger than 65
prescriber_df[prescriber_df.bene_count_ge65 == 0].head()

Unnamed: 0,npi,last_name,first_name,city,state,specialty,description_flag,drug_name,drug,bene_count,total_claim_count,total_30_day_fill_count,total_day_supply,total_drug_cost,bene_count_ge65,bene_count_ge65_suppress_flag,total_claim_count_ge65,ge65_suppress_flag,total_30_day_fill_count_ge65,total_day_supply_ge65,total_drug_cost_ge65


In [91]:
total_benef = prescriber_df.groupby(['state'])['bene_count_ge65'].sum().to_frame().sort_values(by= ['bene_count_ge65'],ascending=False)

In [92]:
ak_df = prescriber_df.loc[prescriber_df['state']=='AK']

In [93]:
ak_df.bene_count_ge65.sum()

2642.0

In [94]:
ak_df.npi.count()

526

#### Bring in physician data and merge

In [95]:
physician_df = pd.read_csv('../data/archive/Physician_Compare_National_Downloadable_File.csv')
physician_df = physician_df.rename(columns={'NPI': 'npi2','Hospital affiliation CCN 1':'ccn1','Hospital affiliation LBN 1':'lbn1',
                                            'Hospital affiliation CCN 2':'ccn2','Hospital affiliation LBN 2':'lbn2',
                                            'Hospital affiliation CCN 3':'ccn3','Hospital affiliation LBN 3':'lbn3',
                                            'Hospital affiliation CCN 4':'ccn4','Hospital affiliation LBN 4':'lbn4',
                                            'Hospital affiliation CCN 5':'ccn5','Hospital affiliation LBN 5':'lbn5'})

  interactivity=interactivity, compiler=compiler, result=result)


In [96]:
physician_df.shape

(2183992, 38)

In [97]:
physician_df = physician_df.drop_duplicates(subset='npi2')
physician_df.shape

(1138209, 38)

In [98]:
physician_df.npi2.notnull().sum()

1138209

In [99]:
physician_df.npi2.nunique()

1138209

In [100]:
prescriber_and_physician_df = pd.merge(prescriber_df,physician_df,how= left_on='npi',right_on='npi2')
prescriber_and_physician_df.head(1)

SyntaxError: invalid syntax (<ipython-input-100-b2773cd13e12>, line 1)

In [None]:
prescriber_and_physician_df.shape

In [None]:
prescriber_and_physician_df = prescriber_and_physician_df.drop_duplicates()

In [None]:
prescriber_and_physician_df.shape

In [None]:
prescriber_and_physician_df.head(2)

In [None]:
prescriber_and_physician_df.shape

In [None]:
prescriber_and_physician_df.to_csv('../data/1.csv', index = False)