In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [14]:
# Show up to 500 columns and 500 rows before pandas truncates a displayed frame.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500

Prescriber Data  
https://data.cms.gov/Medicare-Part-D/Medicare-Provider-Utilization-and-Payment-Data-201/77gb-8z53  
Physician Compare data  
https://data.medicare.gov/Physician-Compare/Physician-Compare-National-Downloadable-File/mj5m-pzi6


#### read in all columns, rename some columns 

In [3]:
%%time
prescriber_df = pd.read_csv('../data/archive/Medicare_Provider_Utilization_and_Payment_Data__2017_Part_D_Prescriber.csv')
prescriber_df = prescriber_df.rename(columns={'nppes_provider_last_org_name':'last_name','nppes_provider_first_name':'first_name',
                                              'nppes_provider_city':'city','nppes_provider_state': 'state',
                                              'specialty_description':'specialty','generic_name':'drug',})

Wall time: 2min 57s


#### Create list of antipsychotic drugs

In [4]:
# Generic drug names (as they appear in the `drug` column) used to filter
# the prescriber table down to antipsychotics.
list_of_all_antipsychs = [
    'ARIPIPRAZOLE',
    'ARIPIPRAZOLE LAUROXIL',
    'CHLORPROMAZINE HCL',
    'CLOZAPINE',
    'ILOPERIDONE',
    'FLUPHENAZINE DECANOATE',
    'FLUPHENAZINE HCL',
    'ZIPRASIDONE HCL',
    'ZIPRASIDONE MESYLATE',
    'HALOPERIDOL LACTATE',
    'HALOPERIDOL DECANOATE',
    'HALOPERIDOL',
    'PALIPERIDONE',
    'PALIPERIDONE PALMITATE',
    'LURASIDONE HCL',
    'LOXAPINE SUCCINATE',
    'MOLINDONE HCL',
    'PIMAVANSERIN TARTRATE',
    'OLANZAPINE',
    'OLANZAPINE/FLUOXETINE HCL',
    'PIMOZIDE',
    'PERPHENAZINE',
    'PERPHENAZINE/AMITRIPTYLINE HCL',
    'QUETIAPINE FUMARATE',
    'BREXPIPRAZOLE',
    'RISPERIDONE',
    'RISPERIDONE MICROSPHERES',
    'ASENAPINE MALEATE',
    'THIORIDAZINE HCL',
    'THIOTHIXENE',
    'TRIFLUOPERAZINE HCL',
    'CARIPRAZINE HCL',
    'OLANZAPINE PAMOATE',
]

In [5]:
# Display the (rows, columns) shape of prescriber_df.
prescriber_df.shape

(25209130, 21)

#### Run query on dataframe for list of drugs (credit: Mary's Queries)

In [6]:
# Keep only the rows whose `drug` value is in the antipsychotic list.
# `.copy()` makes the filtered result an independent frame, so later column
# assignments on prescriber_df (e.g. filling bene_count_ge65) do not raise
# SettingWithCopyWarning.
prescriber_df = prescriber_df.query('drug in @list_of_all_antipsychs').copy()

In [18]:
# Display the (rows, columns) shape of prescriber_df.
prescriber_df.shape

(509235, 21)

#### Some EDA:
- Are there duplicate rows? There shouldn't be.
- How many non-null values does each column have?
- How many unique values does each column have?
    

In [8]:
# Drop fully duplicated rows into a separate frame; if its shape matches
# prescriber_df.shape, there were no duplicates.
prescriber_df_drop_dup = prescriber_df.drop_duplicates(keep='first')
prescriber_df_drop_dup.shape

(509235, 21)

In [None]:
# Sum of the total_claim_count_ge65 column.
prescriber_df['total_claim_count_ge65'].sum()

In [None]:
# Sum of the bene_count_ge65 column.
prescriber_df['bene_count_ge65'].sum()

In [None]:
# Number of non-null values in each column.
prescriber_df.notna().sum()

In [None]:
# Number of distinct non-null values in each column.
prescriber_df.nunique(axis=0, dropna=True)

In [20]:
# Show the rows whose bene_count_ge65 value is missing.
prescriber_df.loc[prescriber_df['bene_count_ge65'].isna()]

Unnamed: 0,npi,last_name,first_name,city,state,specialty,description_flag,drug_name,drug,bene_count,total_claim_count,total_30_day_fill_count,total_day_supply,total_drug_cost,bene_count_ge65,bene_count_ge65_suppress_flag,total_claim_count_ge65,ge65_suppress_flag,total_30_day_fill_count_ge65,total_day_supply_ge65,total_drug_cost_ge65


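#### We can see ^ that there are 356,420 rows with a NaN value for bene_count_ge65...
    (The filter above only shows these rows if it is run before the fill below; run afterwards, it returns no rows.)
    This is because the data is suppressed for any beneficiary count below 11.
    I will fill these NaN values with 5 (a presumed average).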

In [9]:
# Value substituted for missing (suppressed) bene_count_ge65 entries.
# 5 is a presumed average for the suppressed counts, which are below 11;
# naming it keeps the assumption in one visible, easily changed place.
SUPPRESSED_BENE_COUNT_FILL = 5
prescriber_df['bene_count_ge65'] = prescriber_df['bene_count_ge65'].fillna(SUPPRESSED_BENE_COUNT_FILL)

In [11]:
# Frequency of each bene_count_ge65 value, most common first.
prescriber_df['bene_count_ge65'].value_counts()

5.0      356420
0.0      138976
11.0       1955
12.0       1596
13.0       1274
          ...  
303.0         1
367.0         1
162.0         1
219.0         1
509.0         1
Name: bene_count_ge65, Length: 162, dtype: int64

#### Now what are all these zeros?  hmmm

Unnamed: 0,npi,last_name,first_name,city,state,specialty,description_flag,drug_name,drug,bene_count,total_claim_count,total_30_day_fill_count,total_day_supply,total_drug_cost,bene_count_ge65,bene_count_ge65_suppress_flag,total_claim_count_ge65,ge65_suppress_flag,total_30_day_fill_count_ge65,total_day_supply_ge65,total_drug_cost_ge65
26,1659682904,JESUBATHAM,JULIAN,CLARKSDALE,MS,Internal Medicine,S,ZIPRASIDONE HCL,ZIPRASIDONE HCL,,11,11.0,330,554.09,0.0,,0.0,,0.0,0.0,0.0
471,1942466735,PIERRE-LYNCH,NATACHA,HAMMONTON,NJ,Nurse Practitioner,S,QUETIAPINE FUMARATE,QUETIAPINE FUMARATE,11.0,30,30.0,900,817.21,0.0,,0.0,,0.0,0.0,0.0
760,1447337258,CHIANESE,CLAIRE,GEORGETOWN,DE,Nurse Practitioner,S,RISPERIDONE ODT,RISPERIDONE,,13,13.0,388,1600.61,0.0,,0.0,,0.0,0.0,0.0
774,1215978770,WRIGHT,KENNETH,BENTON,AR,Psychiatry,S,PALIPERIDONE ER,PALIPERIDONE,15.0,71,71.9,1798,52158.9,0.0,,0.0,,0.0,0.0,0.0
780,1811985708,HERNANDO,ROBERTO,MIAMI,FL,Psychiatry,S,LATUDA,LURASIDONE HCL,,23,24.0,720,42603.21,0.0,,0.0,,0.0,0.0,0.0


### Below is some more EDA that has not materially impacted my presentation yet.
    Eventually, I would like to use the Physician Compare data to link antipsychotic use with specific hospitals.

In [None]:
## Sanity check that bene_count_ge65 is included in bene_count:
## list rows where the 65+ count exceeds the overall count (an empty result is expected).
## Rows where either value is NaN never satisfy the comparison, so they are not listed.
prescriber_df.loc[prescriber_df['bene_count'].lt(prescriber_df['bene_count_ge65'])]

In [None]:
## There are 138,976 rows with 0 for bene_count_ge65.
## Preview a few of them to confirm they describe beneficiaries younger than 65.
prescriber_df.loc[prescriber_df['bene_count_ge65'].eq(0)].head()

In [None]:
# Per-state sum of bene_count_ge65 as a one-column frame, largest totals first.
total_benef = (
    prescriber_df
    .groupby('state')[['bene_count_ge65']]
    .sum()
    .sort_values(by='bene_count_ge65', ascending=False)
)

In [None]:
# Subset of rows whose state is 'AK'.
is_alaska = prescriber_df['state'].eq('AK')
ak_df = prescriber_df.loc[is_alaska]

In [None]:
# Sum of bene_count_ge65 over the 'AK' rows.
ak_df['bene_count_ge65'].sum()

In [None]:
# Number of non-null npi values among the 'AK' rows (rows, not distinct prescribers).
ak_df['npi'].count()

#### Bring in physician data and merge

In [None]:
# Read the Physician Compare file and shorten the NPI and hospital-affiliation
# column names (CCN/LBN slots 1-5).
physician_df = pd.read_csv(
    '../data/archive/Physician_Compare_National_Downloadable_File.csv'
).rename(
    columns={
        'NPI': 'npi2',
        'Hospital affiliation CCN 1': 'ccn1',
        'Hospital affiliation LBN 1': 'lbn1',
        'Hospital affiliation CCN 2': 'ccn2',
        'Hospital affiliation LBN 2': 'lbn2',
        'Hospital affiliation CCN 3': 'ccn3',
        'Hospital affiliation LBN 3': 'lbn3',
        'Hospital affiliation CCN 4': 'ccn4',
        'Hospital affiliation LBN 4': 'lbn4',
        'Hospital affiliation CCN 5': 'ccn5',
        'Hospital affiliation LBN 5': 'lbn5',
    }
)

In [None]:
# Display the (rows, columns) shape of physician_df.
physician_df.shape

In [None]:
# Keep only the first row seen for each npi2 value, then show the new shape.
physician_df = physician_df.drop_duplicates(subset=['npi2'], keep='first')
physician_df.shape

In [None]:
# Number of non-null npi2 values.
physician_df['npi2'].notna().sum()

In [None]:
# Number of distinct npi2 values.
physician_df['npi2'].nunique()

In [None]:
# Attach Physician Compare columns to each prescriber row by NPI.
# NOTE(review): the original call had `how=` with no value (a SyntaxError);
# 'left' is assumed so that every prescriber row is kept even when its NPI has
# no Physician Compare record — confirm this is the intended join type.
# validate='many_to_one' raises MergeError if npi2 is not unique in
# physician_df, guarding against silently duplicated prescriber rows.
prescriber_and_physician_df = pd.merge(
    prescriber_df,
    physician_df,
    how='left',
    left_on='npi',
    right_on='npi2',
    validate='many_to_one',
)
prescriber_and_physician_df.head(1)

In [None]:
# Display the (rows, columns) shape of the merged frame.
prescriber_and_physician_df.shape

In [None]:
# Remove fully duplicated rows from the merged frame, keeping the first occurrence.
prescriber_and_physician_df = prescriber_and_physician_df.drop_duplicates(keep='first')

In [None]:
# Display the (rows, columns) shape of the merged frame.
prescriber_and_physician_df.shape

In [None]:
# Preview the first two rows of the merged frame.
prescriber_and_physician_df.head(2)

In [None]:
# Display the (rows, columns) shape of the merged frame.
prescriber_and_physician_df.shape

In [None]:
# Write the merged frame to CSV without the index column.
prescriber_and_physician_df.to_csv(
    '../data/1.csv',
    index=False,
)