<a href="https://colab.research.google.com/github/anju0007/Open-Payments-Database-Analyses/blob/main/Endocrinology%2C%20Diabetes%20%26%20Metabolism/Endocrinology%2C_Diabetes_%26_Metabolism_version_Diabetes_Care_R2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Open Payments Database analysis
# Endocrinology, Diabetes & Metabolism

# Specialty

In [1]:
specialty = "Endocrinology, Diabetes & Metabolism"

# Import package

## import package

In [2]:
# Check Python Version
!python --version

Python 3.8.10


In [3]:
# Check GPU
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [4]:
import os
import glob
import sys
import numpy as np
import numexpr
import pandas as pd
from tqdm import tqdm


## mount Google drive

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Declare path to import and save payment dataset

In [6]:
BASE = "/content/drive/MyDrive/OPD_data out put"

In [7]:
path_out1 = BASE + '/'  + specialty+ "/general/"
path_out2 = BASE + '/' + specialty+ "/research/"
path_out3 = BASE + '/' + specialty+ "/ownership/"
path_out_profile = BASE + '/' + specialty+ "/profile/"
path_out_stata = BASE + '/' + specialty+ "/stata/"

# NPPES specialty taxonomy

In [None]:
special = pd.read_csv("/content/drive/MyDrive/national provider identifier file/nucc_taxonomy_221.csv", 
                   low_memory=False,
                  keep_default_na= "0")

In [None]:
special

Unnamed: 0,Code,Grouping,Classification,Specialization,Definition,Effective Date,Deactivation Date,Last Modified Date,Notes,Display Name,Section
0,193200000X,Group,Multi-Specialty,,A business group of one or more individual pra...,10/1/2003,,,[7/1/2003: new],Multi-Specialty Group,Individual
1,193400000X,Group,Single Specialty,,A business group of one or more individual pra...,10/1/2003,,,[7/1/2003: new],Single Specialty Group,Individual
2,207K00000X,Allopathic & Osteopathic Physicians,Allergy & Immunology,,An allergist-immunologist is trained in evalua...,4/1/2003,,7/1/2007,"Source: American Board of Medical Specialties,...",Allergy & Immunology Physician,Individual
3,207KA0200X,Allopathic & Osteopathic Physicians,Allergy & Immunology,Allergy,Definition to come...,4/1/2003,,,,Allergy Physician,Individual
4,207KI0005X,Allopathic & Osteopathic Physicians,Allergy & Immunology,Clinical & Laboratory Immunology,Definition to come...,4/1/2003,,,,Clinical & Laboratory Immunology (Allergy & Im...,Individual
...,...,...,...,...,...,...,...,...,...,...,...
863,343800000X,Transportation Services,Secured Medical Transport (VAN),,A public or privately owned transportation ser...,4/1/2002,,,,Secured Medical Transport (VAN),Non-Individual
864,344600000X,Transportation Services,Taxi,,A land commercial vehicle used for the transpo...,4/1/2002,,,,Taxi,Non-Individual
865,347D00000X,Transportation Services,Train,,An organization or business licensed to provid...,4/1/2002,,,,Train,Non-Individual
866,347E00000X,Transportation Services,Transportation Broker,,An organization that provides transportation f...,4/1/2002,,1/1/2021,Source: Section 6083 of the Deficit Reduction ...,Transportation Broker,Non-Individual


In [None]:
physician = special.query('Grouping == "Allopathic & Osteopathic Physicians"')
internal = physician[physician['Specialization']== specialty]
spe =list(internal['Code'])
spe

['207RE0101X']

# NPI database extraction

## NPI dataset

In [None]:
usecols = ['NPI', 'Entity Type Code', 'Replacement NPI', 
           'Provider Organization Name (Legal Business Name)', 'Provider Last Name (Legal Name)', 
           'Provider First Name', 'Provider Middle Name', 
           'Provider Other Organization Name', 'Provider Other Organization Name Type Code', 
           'Provider Other Last Name', 'Provider Other First Name', 
           'Provider Other Middle Name',  'Provider Business Practice Location Address City Name', 
           'Provider Business Practice Location Address State Name', 'Provider Enumeration Date', 
           'Last Update Date', 'NPI Deactivation Reason Code', 'NPI Deactivation Date', 
           'NPI Reactivation Date', 'Provider Gender Code', 'Authorized Official Title or Position', 
           'Healthcare Provider Taxonomy Code_1', 'Healthcare Provider Primary Taxonomy Switch_1', 
           'Healthcare Provider Taxonomy Code_2', 'Healthcare Provider Primary Taxonomy Switch_2', 
           'Healthcare Provider Taxonomy Code_3', 'Healthcare Provider Primary Taxonomy Switch_3', 
           'Healthcare Provider Taxonomy Code_4', 'Healthcare Provider Primary Taxonomy Switch_4', 
           'Healthcare Provider Taxonomy Code_5', 'Healthcare Provider Primary Taxonomy Switch_5', 
           'Healthcare Provider Taxonomy Code_6', 'Healthcare Provider Primary Taxonomy Switch_6', 
           'Healthcare Provider Taxonomy Code_7', 'Healthcare Provider Primary Taxonomy Switch_7', 
           'Healthcare Provider Taxonomy Code_8', 'Healthcare Provider Primary Taxonomy Switch_8', 
           'Healthcare Provider Taxonomy Code_9', 'Healthcare Provider Primary Taxonomy Switch_9', 
           'Healthcare Provider Taxonomy Code_10', 'Healthcare Provider Primary Taxonomy Switch_10',
           'Healthcare Provider Taxonomy Code_11', 'Healthcare Provider Primary Taxonomy Switch_11',
           'Healthcare Provider Taxonomy Code_12', 'Healthcare Provider Primary Taxonomy Switch_12',
           'Healthcare Provider Taxonomy Code_13', 'Healthcare Provider Primary Taxonomy Switch_13',
           'Healthcare Provider Taxonomy Code_14', 'Healthcare Provider Primary Taxonomy Switch_14',
           'Healthcare Provider Taxonomy Code_15', 'Healthcare Provider Primary Taxonomy Switch_15', 
           'Certification Date' , 'Provider Business Practice Location Address Postal Code', 
           'Provider Business Practice Location Address Country Code (If outside U.S.)']

In [None]:
npi = pd.read_csv("/content/drive/MyDrive/national provider identifier file/npidata_pfile_20050523-20220710.csv", 
                    chunksize= 1000000, usecols= usecols, low_memory=False,
                    keep_default_na= "0")
npi = pd.concat((r for r in npi), ignore_index=True)
npi = npi[npi["Entity Type Code"] == 1]

In [None]:
npi_specialty = npi[((npi["Healthcare Provider Primary Taxonomy Switch_1"] == "Y" ) & (npi["Healthcare Provider Taxonomy Code_1"].isin(spe))) |
                   ((npi["Healthcare Provider Primary Taxonomy Switch_2"] == "Y" ) & (npi["Healthcare Provider Taxonomy Code_2"].isin(spe))) |
                    ((npi["Healthcare Provider Primary Taxonomy Switch_3"] == "Y" ) & (npi["Healthcare Provider Taxonomy Code_3"].isin(spe))) |
                    ((npi["Healthcare Provider Primary Taxonomy Switch_4"] == "Y" ) & (npi["Healthcare Provider Taxonomy Code_4"].isin(spe))) |
                    ((npi["Healthcare Provider Primary Taxonomy Switch_5"] == "Y" ) & (npi["Healthcare Provider Taxonomy Code_5"].isin(spe))) |
                    ((npi["Healthcare Provider Primary Taxonomy Switch_6"] == "Y" ) & (npi["Healthcare Provider Taxonomy Code_6"].isin(spe))) |
                    ((npi["Healthcare Provider Primary Taxonomy Switch_7"] == "Y" ) & (npi["Healthcare Provider Taxonomy Code_7"].isin(spe) )) |
                    ((npi["Healthcare Provider Primary Taxonomy Switch_8"] == "Y" ) & (npi["Healthcare Provider Taxonomy Code_8"].isin(spe) )) |
                    ((npi["Healthcare Provider Primary Taxonomy Switch_9"] == "Y" ) & (npi["Healthcare Provider Taxonomy Code_9"].isin(spe) )) |
                    ((npi["Healthcare Provider Primary Taxonomy Switch_10"] == "Y" ) & (npi["Healthcare Provider Taxonomy Code_10"].isin(spe) )) |
                    ((npi["Healthcare Provider Primary Taxonomy Switch_11"] == "Y" ) & (npi["Healthcare Provider Taxonomy Code_11"].isin(spe) )) |
                    ((npi["Healthcare Provider Primary Taxonomy Switch_12"] == "Y" ) & (npi["Healthcare Provider Taxonomy Code_12"].isin(spe) )) |
                    ((npi["Healthcare Provider Primary Taxonomy Switch_13"] == "Y" ) & (npi["Healthcare Provider Taxonomy Code_13"].isin(spe) )) |
                    ((npi["Healthcare Provider Primary Taxonomy Switch_14"] == "Y" ) & (npi["Healthcare Provider Taxonomy Code_14"].isin(spe) )) |
                    ((npi["Healthcare Provider Primary Taxonomy Switch_15"] == "Y" ) & (npi["Healthcare Provider Taxonomy Code_15"].isin(spe) )) ]
del npi

In [None]:
npi_specialty

Unnamed: 0,NPI,Entity Type Code,Replacement NPI,Provider Organization Name (Legal Business Name),Provider Last Name (Legal Name),Provider First Name,Provider Middle Name,Provider Other Organization Name,Provider Other Organization Name Type Code,Provider Other Last Name,...,Healthcare Provider Primary Taxonomy Switch_11,Healthcare Provider Taxonomy Code_12,Healthcare Provider Primary Taxonomy Switch_12,Healthcare Provider Taxonomy Code_13,Healthcare Provider Primary Taxonomy Switch_13,Healthcare Provider Taxonomy Code_14,Healthcare Provider Primary Taxonomy Switch_14,Healthcare Provider Taxonomy Code_15,Healthcare Provider Primary Taxonomy Switch_15,Certification Date
85,1740283779,1.0,,,JAVIER,EMMANUEL,,,,,...,,,,,,,,,,
130,1316940463,1.0,,,DIEZ,JORGE,L.,,,,...,,,,,,,,,,06/18/2021
344,1659374676,1.0,,,PERKINS,VICTORIA,ANNE,,,,...,,,,,,,,,,
658,1942203930,1.0,,,SOTOUDEH,FARANAK,FOROOZANFAR,,,,...,,,,,,,,,,
822,1457354581,1.0,,,CAVALE,ARVIND,RAMACHANDRARAO,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7230783,1861141657,1.0,,,CHURCHILL,JESSIE,,,,,...,,,,,,,,,,06/17/2022
7272235,1528700523,1.0,,,DAVIS,AMALA,,,,,...,,,,,,,,,,07/07/2022
7286104,1679216519,1.0,,,LAND,ANICA,PEROS,,,,...,,,,,,,,,,04/19/2022
7328551,1043958036,1.0,,,LEE,SHARON,,,,,...,,,,,,,,,,05/20/2022


In [None]:
npi_specialty["month"] = npi_specialty["Provider Enumeration Date"].str[:2].astype(int)
npi_specialty["day"] = npi_specialty["Provider Enumeration Date"].str.slice(start=3, stop=5).astype(int)
npi_specialty["year"] = npi_specialty["Provider Enumeration Date"].str.slice(start=6, stop=10).astype(int)

# define January 2000 as month 1
npi_specialty["cmonth"] = ((npi_specialty["year"]-2000)*12 + npi_specialty["month"])
npi_specialty = npi_specialty.query('cmonth < 164')
list_npi = list(npi_specialty["NPI"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  npi_specialty["month"] = npi_specialty["Provider Enumeration Date"].str[:2].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  npi_specialty["day"] = npi_specialty["Provider Enumeration Date"].str.slice(start=3, stop=5).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  npi_specialt

## Merge taxonomy code and specialty name

In [None]:
special = special.fillna("NA")

spe = list(special['Specialization'])

In [None]:
npi_specialty = npi_specialty.assign(cla = "" ,spec = "", note = "")
npi_specialty

Unnamed: 0,NPI,Entity Type Code,Replacement NPI,Provider Organization Name (Legal Business Name),Provider Last Name (Legal Name),Provider First Name,Provider Middle Name,Provider Other Organization Name,Provider Other Organization Name Type Code,Provider Other Last Name,...,Healthcare Provider Taxonomy Code_15,Healthcare Provider Primary Taxonomy Switch_15,Certification Date,month,day,year,cmonth,cla,spec,note
85,1740283779,1.0,,,JAVIER,EMMANUEL,,,,,...,,,,5,23,2005,65,,,
130,1316940463,1.0,,,DIEZ,JORGE,L.,,,,...,,,06/18/2021,5,23,2005,65,,,
344,1659374676,1.0,,,PERKINS,VICTORIA,ANNE,,,,...,,,,5,23,2005,65,,,
658,1942203930,1.0,,,SOTOUDEH,FARANAK,FOROOZANFAR,,,,...,,,,5,23,2005,65,,,
822,1457354581,1.0,,,CAVALE,ARVIND,RAMACHANDRARAO,,,,...,,,,5,23,2005,65,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4068265,1265864409,1.0,,,ALEKSIC,SANDRA,,,,,...,,,01/12/2021,7,30,2013,163,,,
4068308,1912349143,1.0,,,SABHA,NOUR,MOHAMMED,,,,...,,,05/10/2022,7,30,2013,163,,,
4068513,1487086625,1.0,,,JALIL,FATIMA,,,,,...,,,11/09/2021,7,30,2013,163,,,
4069338,1538591623,1.0,,,GANESH,MALINI,,,,,...,,,04/23/2021,7,31,2013,163,,,


In [None]:
npi_specialty.groupby(['spec'])['NPI'].count()

spec
    7965
Name: NPI, dtype: int64

## Match NPI with OPD

In [None]:
df_profile = pd.read_csv("/content/drive/MyDrive/OPD/data (22).csv", low_memory=False)

In [None]:
df_profile['covered_recipient_npi'] = df_profile['covered_recipient_npi'].fillna(0)
df_profile['covered_recipient_npi'] = df_profile['covered_recipient_npi'].astype('int64')

In [None]:
df_id = df_profile[df_profile["covered_recipient_npi"].isin(list_npi)] 
df_id

Unnamed: 0,covered_recipient_profile_type,covered_recipient_profile_id,covered_recipient_npi,covered_recipient_profile_first_name,covered_recipient_profile_middle_name,covered_recipient_profile_last_name,covered_recipient_profile_suffix,covered_recipient_profile_alternate_first_name1,covered_recipient_profile_alternate_middle_name1,covered_recipient_profile_alternate_last_name1,...,covered_recipient_profile_license_state_code_5,covered_recipient_profile_alternate_first_name2,covered_recipient_profile_alternate_last_name2,covered_recipient_profile_alternate_first_name3,covered_recipient_profile_alternate_last_name3,covered_recipient_profile_alternate_first_name4,covered_recipient_profile_alternate_last_name4,covered_recipient_profile_alternate_first_name5,covered_recipient_profile_alternate_last_name5,has_multiple_ids
3,Covered Recipient Physician,4,1003024811,LISA,L,HAMAKER,,,,,...,,,,,,,,,,False
37,Covered Recipient Physician,39,1003042169,JACQUELINE,Y,LONIER,,JACQUELINE,YUEY,LONIER,...,,,,JACKIE,LONIER,,,,,False
59,Covered Recipient Physician,64,1003081944,TONSLYN,A,TOURE,,TONSLYN,,TOURE,...,,,,,,,,,,False
182,Covered Recipient Physician,196,1003832437,LAWRENCE,FAYEZ,NESHIWAT,,,,,...,,,,,,,,,,False
193,Covered Recipient Physician,207,1003839101,THOMAS,M,FLOOD,,,,,...,,,,,,,,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1077216,Covered Recipient Physician,8798837,1376719617,SHIARA,M,ORTIZ-PUJOLS,,SHIARA,MELISSA,ORTIZ-PUJOLS,...,,,,,,,,,,False
1077948,Covered Recipient Physician,8800626,1427076199,ADY,L.,DJERASSI,,,,,...,,,,,,,,,,False
1087343,Covered Recipient Physician/Covered Recipient ...,9676379,1659516615,JARRA,L,GREEN-CALDERON,,,,,...,,,,,,,,,,False
1122041,Covered Recipient Physician/Covered Recipient ...,10529468,1881642007,CHRISTINA,P,DAVIS,,CHRISTINA,P,COCHRAN,...,,,,,,,,,,False


In [None]:
df_id

Unnamed: 0,covered_recipient_profile_type,covered_recipient_profile_id,covered_recipient_npi,covered_recipient_profile_first_name,covered_recipient_profile_middle_name,covered_recipient_profile_last_name,covered_recipient_profile_suffix,covered_recipient_profile_alternate_first_name1,covered_recipient_profile_alternate_middle_name1,covered_recipient_profile_alternate_last_name1,...,covered_recipient_profile_license_state_code_5,covered_recipient_profile_alternate_first_name2,covered_recipient_profile_alternate_last_name2,covered_recipient_profile_alternate_first_name3,covered_recipient_profile_alternate_last_name3,covered_recipient_profile_alternate_first_name4,covered_recipient_profile_alternate_last_name4,covered_recipient_profile_alternate_first_name5,covered_recipient_profile_alternate_last_name5,has_multiple_ids
3,Covered Recipient Physician,4,1003024811,LISA,L,HAMAKER,,,,,...,,,,,,,,,,False
37,Covered Recipient Physician,39,1003042169,JACQUELINE,Y,LONIER,,JACQUELINE,YUEY,LONIER,...,,,,JACKIE,LONIER,,,,,False
59,Covered Recipient Physician,64,1003081944,TONSLYN,A,TOURE,,TONSLYN,,TOURE,...,,,,,,,,,,False
182,Covered Recipient Physician,196,1003832437,LAWRENCE,FAYEZ,NESHIWAT,,,,,...,,,,,,,,,,False
193,Covered Recipient Physician,207,1003839101,THOMAS,M,FLOOD,,,,,...,,,,,,,,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1077216,Covered Recipient Physician,8798837,1376719617,SHIARA,M,ORTIZ-PUJOLS,,SHIARA,MELISSA,ORTIZ-PUJOLS,...,,,,,,,,,,False
1077948,Covered Recipient Physician,8800626,1427076199,ADY,L.,DJERASSI,,,,,...,,,,,,,,,,False
1087343,Covered Recipient Physician/Covered Recipient ...,9676379,1659516615,JARRA,L,GREEN-CALDERON,,,,,...,,,,,,,,,,False
1122041,Covered Recipient Physician/Covered Recipient ...,10529468,1881642007,CHRISTINA,P,DAVIS,,CHRISTINA,P,COCHRAN,...,,,,,,,,,,False


In [None]:
type(df_id['covered_recipient_profile_zipcode'])

pandas.core.series.Series

In [None]:
list_id = list(df_id["covered_recipient_profile_id"])         

## save Profile dataset matching with NPI

In [None]:
npi_specialty.to_csv(path_out_profile + "NPI physician profile data.csv", index=None)
df_id.to_csv(path_out_profile + "matched physician profile data.csv", index=None)

#Delete dataframe

In [None]:
del df_profile

# Extract payment data and save payment dataset

## General payments

### Extract payment data between 2013 and 2021

#### Define file name

In [20]:
list_general2013 = glob.glob("/content/drive/MyDrive/OPD/2013/OP_DTL_GNRL_PGYR*.csv")
list_general2014 = glob.glob("/content/drive/MyDrive/OPD/2014/OP_DTL_GNRL_PGYR*.csv")
list_general2015 = glob.glob("/content/drive/MyDrive/OPD/2015/OP_DTL_GNRL_PGYR*.csv")
list_general2016 = glob.glob("/content/drive/MyDrive/OPD/2016/OP_DTL_GNRL_PGYR*.csv")
list_general2017 = glob.glob("/content/drive/MyDrive/OPD/2017/OP_DTL_GNRL_PGYR*.csv")
list_general2018 = glob.glob("/content/drive/MyDrive/OPD/2018/OP_DTL_GNRL_PGYR*.csv")
list_general2019 = glob.glob("/content/drive/MyDrive/OPD/2019/OP_DTL_GNRL_PGYR*.csv")
list_general2020 = glob.glob("/content/drive/MyDrive/OPD/2020/OP_DTL_GNRL_PGYR*.csv")
list_general2021 = glob.glob("/content/drive/MyDrive/OPD/2021/OP_DTL_GNRL_PGYR*.csv")

#### 2013 data

In [None]:
df_pay13 = pd.read_csv(list_general2013[0], low_memory=False)
list_df_pay_columns13 = list(df_pay13.columns)
df_pay_id13g = pd.DataFrame(columns=list_df_pay_columns13)
df_pay_id13g

Unnamed: 0,Change_Type,Covered_Recipient_Type,Teaching_Hospital_CCN,Teaching_Hospital_ID,Teaching_Hospital_Name,Physician_Profile_ID,Physician_First_Name,Physician_Middle_Name,Physician_Last_Name,Physician_Name_Suffix,...,NDC_of_Associated_Covered_Drug_or_Biological3,NDC_of_Associated_Covered_Drug_or_Biological4,NDC_of_Associated_Covered_Drug_or_Biological5,Name_of_Associated_Covered_Device_or_Medical_Supply1,Name_of_Associated_Covered_Device_or_Medical_Supply2,Name_of_Associated_Covered_Device_or_Medical_Supply3,Name_of_Associated_Covered_Device_or_Medical_Supply4,Name_of_Associated_Covered_Device_or_Medical_Supply5,Program_Year,Payment_Publication_Date


In [None]:
for path_data in list_general2013:
    df_payg = pd.read_csv(path_data, low_memory=False)
    df_payg = df_payg[df_payg["Physician_Profile_ID"].isin(list_id)]
    df_payg = df_payg.reset_index(drop=True)
    df_pay_id13g = pd.concat([df_pay_id13g, df_payg], axis=0)
del df_pay13

#### 2014 data

In [None]:
df_pay14 = pd.read_csv(list_general2014[0], low_memory=False)
list_df_pay_columns14 = list(df_pay14.columns)
df_pay_id14g = pd.DataFrame(columns=list_df_pay_columns14)
df_pay_id14g

for path_data in list_general2014:
    df_payg = pd.read_csv(path_data, low_memory=False)
    df_payg = df_payg[df_payg["Physician_Profile_ID"].isin(list_id)]
    df_payg = df_payg.reset_index(drop=True)
    df_pay_id14g = pd.concat([df_pay_id14g, df_payg], axis=0)
del df_pay14

#### 2015 data

In [None]:
df_pay15 = pd.read_csv(list_general2015[0], low_memory=False)
list_df_pay_columns15 = list(df_pay15.columns)
df_pay_id15g = pd.DataFrame(columns=list_df_pay_columns15)
df_pay_id15g

for path_data in list_general2015:
    df_payg = pd.read_csv(path_data, low_memory=False)
    df_payg = df_payg[df_payg["Physician_Profile_ID"].isin(list_id)]
    df_payg = df_payg.reset_index(drop=True)
    df_pay_id15g = pd.concat([df_pay_id15g, df_payg], axis=0)
del df_pay15
del list_df_pay_columns15

#### 2016

In [None]:
df_pay16 = pd.read_csv(list_general2016[0], low_memory=False)
list_df_pay_columns16 = list(df_pay16.columns)
df_pay_id16g = pd.DataFrame(columns=list_df_pay_columns16)
df_pay_id16g

for path_data in list_general2016:
    df_payg = pd.read_csv(path_data, low_memory=False)
    df_payg = df_payg[df_payg["Covered_Recipient_Profile_ID"].isin(list_id)]
    df_payg = df_payg.reset_index(drop=True)
    df_pay_id16g = pd.concat([df_pay_id16g, df_payg], axis=0)
del df_pay16
del list_df_pay_columns16

#### 2017

In [None]:
df_pay17 = pd.read_csv(list_general2017[0], low_memory=False)
list_df_pay_columns17 = list(df_pay17.columns)
df_pay_id17g = pd.DataFrame(columns=list_df_pay_columns17)
df_pay_id17g

for path_data in list_general2017:
    df_payg = pd.read_csv(path_data, low_memory=False)
    df_payg = df_payg[df_payg["Covered_Recipient_Profile_ID"].isin(list_id)]
    df_payg = df_payg.reset_index(drop=True)
    df_pay_id17g = pd.concat([df_pay_id17g, df_payg], axis=0)
del df_pay17
del list_df_pay_columns17

#### 2018

In [None]:
df_pay18 = pd.read_csv(list_general2018[0], low_memory=False)
list_df_pay_columns18 = list(df_pay18.columns)
df_pay_id18g = pd.DataFrame(columns=list_df_pay_columns18)
df_pay_id18g

for path_data in list_general2018:
    df_payg = pd.read_csv(path_data, low_memory=False)
    df_payg = df_payg[df_payg["Covered_Recipient_Profile_ID"].isin(list_id)]
    df_payg = df_payg.reset_index(drop=True)
    df_pay_id18g = pd.concat([df_pay_id18g, df_payg], axis=0)
del df_pay18
del list_df_pay_columns18

#### 2019

In [None]:
df_pay19 = pd.read_csv(list_general2019[0], low_memory=False)
list_df_pay_columns19 = list(df_pay19.columns)
df_pay_id19g = pd.DataFrame(columns=list_df_pay_columns19)

for path_data in list_general2019:
    df_payg = pd.read_csv(path_data, low_memory=False)
    df_payg = df_payg[df_payg["Covered_Recipient_Profile_ID"].isin(list_id)]
    df_payg = df_payg.reset_index(drop=True)
    df_pay_id19g = pd.concat([df_pay_id19g, df_payg], axis=0)
del df_pay19
del list_df_pay_columns19

#### 2020

In [None]:
df_pay20 = pd.read_csv(list_general2020[0], low_memory=False)
list_df_pay_columns20 = list(df_pay20.columns)
df_pay_id20g = pd.DataFrame(columns=list_df_pay_columns20)

for path_data in list_general2020:
    df_payg = pd.read_csv(path_data, low_memory=False)
    df_payg = df_payg[df_payg["Covered_Recipient_Profile_ID"].isin(list_id)]
    df_payg = df_payg.reset_index(drop=True)
    df_pay_id20g = pd.concat([df_pay_id20g, df_payg], axis=0)
del df_pay20
del list_df_pay_columns20

#### 2021

In [None]:
df_pay21 = pd.read_csv(list_general2021[0], low_memory=False)
list_df_pay_columns21 = list(df_pay21.columns)
df_pay_id21g = pd.DataFrame(columns=list_df_pay_columns21)

for path_data in list_general2021:
    df_payg = pd.read_csv(path_data, low_memory=False)
    df_payg = df_payg[df_payg["Covered_Recipient_Profile_ID"].isin(list_id)]
    df_payg = df_payg.reset_index(drop=True)
    df_pay_id21g = pd.concat([df_pay_id21g, df_payg], axis=0)
del df_pay21
del list_df_pay_columns21

df_pay_id21g

Unnamed: 0,Change_Type,Covered_Recipient_Type,Teaching_Hospital_CCN,Teaching_Hospital_ID,Teaching_Hospital_Name,Covered_Recipient_Profile_ID,Covered_Recipient_NPI,Covered_Recipient_First_Name,Covered_Recipient_Middle_Name,Covered_Recipient_Last_Name,...,Associated_Drug_or_Biological_NDC_4,Associated_Device_or_Medical_Supply_PDI_4,Covered_or_Noncovered_Indicator_5,Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_5,Product_Category_or_Therapeutic_Area_5,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_5,Associated_Drug_or_Biological_NDC_5,Associated_Device_or_Medical_Supply_PDI_5,Program_Year,Payment_Publication_Date
0,NEW,Covered Recipient Physician,,,,317531.0,1.003123e+09,GIANLUCA,,IACOBELLIS,...,,,,,,,,,2021,06/30/2022
1,NEW,Covered Recipient Physician,,,,1121472.0,1.629162e+09,Elizabeth,,King,...,,,,,,,,,2021,06/30/2022
2,NEW,Covered Recipient Physician,,,,36160.0,1.801844e+09,Richard,,Sachson,...,,,,,,,,,2021,06/30/2022
3,NEW,Covered Recipient Physician,,,,166619.0,1.689628e+09,Audrey,,Miklius,...,,,,,,,,,2021,06/30/2022
4,NEW,Covered Recipient Physician,,,,1261902.0,1.598080e+09,Jaime,,Wiebel,...,,,,,,,,,2021,06/30/2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2875,NEW,Covered Recipient Physician,,,,302701.0,1.689656e+09,SONIA,A,TALWAR,...,,,,,,,,,2021,06/30/2022
2876,NEW,Covered Recipient Physician,,,,154591.0,1.417161e+09,JENNIFER,C,WHEATON,...,,,,,,,,,2021,06/30/2022
2877,NEW,Covered Recipient Physician,,,,111418.0,1.467481e+09,MEHTAP,,BERKMEN,...,,,,,,,,,2021,06/30/2022
2878,NEW,Covered Recipient Physician,,,,111418.0,1.467481e+09,MEHTAP,,BERKMEN,...,,,,,,,,,2021,06/30/2022


### Combine each dataset to one dataset

In [None]:
df1315g = pd.concat([df_pay_id13g, df_pay_id14g, df_pay_id15g], axis = 0)

del df_pay_id13g
del df_pay_id14g
del df_pay_id15g

In [None]:
df1315g= df1315g.rename(columns={"Physician_Profile_ID":"Covered_Recipient_Profile_ID"})
df1315g= df1315g.rename(columns={"Physician_First_Name":"Covered_Recipient_First_Name"})
df1315g= df1315g.rename(columns={"Physician_Middle_Name":"Covered_Recipient_Middle_Name"})
df1315g= df1315g.rename(columns={"Physician_Last_Name":"Covered_Recipient_Last_Name"})
df1315g= df1315g.rename(columns={"Physician_Name_Suffix":"Covered_Recipient_Name_Suffix"})
df1315g= df1315g.rename(columns={"Physician_Primary_Type":"Covered_Recipient_Primary_Type_1"})
df1315g= df1315g.rename(columns={"Physician_Specialty":"Covered_Recipient_Specialty_1"})
df1315g= df1315g.rename(columns={"Physician_License_State_code1" :"Covered_Recipient_License_State_code1"})
df1315g= df1315g.rename(columns={"Physician_License_State_code2" :"Covered_Recipient_License_State_code2"})
df1315g= df1315g.rename(columns={"Physician_License_State_code3" :"Covered_Recipient_License_State_code3"})
df1315g= df1315g.rename(columns={"Physician_License_State_code4" :"Covered_Recipient_License_State_code4"})
df1315g= df1315g.rename(columns={"Physician_License_State_code5" :"Covered_Recipient_License_State_code5"})

In [None]:
df1315g= df1315g.rename(columns={"Product_Indicator":"Related_Product_Indicator"})
df1315g= df1315g.rename(columns={"NDC_of_Associated_Covered_Drug_or_Biological1":"Associated_Drug_or_Biological_NDC_1"})
df1315g= df1315g.rename(columns={"NDC_of_Associated_Covered_Drug_or_Biological2":"Associated_Drug_or_Biological_NDC_2"})
df1315g= df1315g.rename(columns={"NDC_of_Associated_Covered_Drug_or_Biological3":"Associated_Drug_or_Biological_NDC_3"})
df1315g= df1315g.rename(columns={"NDC_of_Associated_Covered_Drug_or_Biological4":"Associated_Drug_or_Biological_NDC_4"})
df1315g= df1315g.rename(columns={"NDC_of_Associated_Covered_Drug_or_Biological5":"Associated_Drug_or_Biological_NDC_5"})
df1315g= df1315g.rename(columns = {"Physician_NPI":"Covered_Recipient_NPI"})

In [None]:
df1315g["Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_1"] = df1315g['Name_of_Associated_Covered_Drug_or_Biological1'].astype(str) 
+ df1315g["Name_of_Associated_Covered_Device_or_Medical_Supply1"]
df1315g["Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_2"] = df1315g['Name_of_Associated_Covered_Drug_or_Biological2'].astype(str) 
+ df1315g["Name_of_Associated_Covered_Device_or_Medical_Supply2"]
df1315g["Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_3"] = df1315g['Name_of_Associated_Covered_Drug_or_Biological3'].astype(str) 
+ df1315g["Name_of_Associated_Covered_Device_or_Medical_Supply3"]
df1315g["Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_4"] = df1315g['Name_of_Associated_Covered_Drug_or_Biological4'].astype(str) 
+ df1315g["Name_of_Associated_Covered_Device_or_Medical_Supply4"]
df1315g["Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_5"] = df1315g['Name_of_Associated_Covered_Drug_or_Biological5'].astype(str) 
+ df1315g["Name_of_Associated_Covered_Device_or_Medical_Supply5"]

0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
        ... 
22666    NaN
22667    NaN
22668    NaN
22669    NaN
22670    NaN
Name: Name_of_Associated_Covered_Device_or_Medical_Supply5, Length: 744736, dtype: object

In [None]:
df1621g = pd.concat([df_pay_id16g, df_pay_id17g, 
                    df_pay_id18g, df_pay_id19g, 
                    df_pay_id20g, df_pay_id21g], axis = 0)
del df_pay_id16g
del df_pay_id17g
del df_pay_id18g
del df_pay_id19g
del df_pay_id20g
del df_pay_id21g
del df_payg

In [None]:
dfg = pd.concat([df1315g,df1621g], axis = 0)

In [None]:
del df1315g
del df1621g

In [None]:
dfg = dfg.drop(["Name_of_Associated_Covered_Device_or_Medical_Supply1",
             "Name_of_Associated_Covered_Device_or_Medical_Supply2",
             "Name_of_Associated_Covered_Device_or_Medical_Supply3",
             "Name_of_Associated_Covered_Device_or_Medical_Supply4",
             "Name_of_Associated_Covered_Device_or_Medical_Supply5",
             "Name_of_Associated_Covered_Drug_or_Biological1", 
             "Name_of_Associated_Covered_Drug_or_Biological2", 
             "Name_of_Associated_Covered_Drug_or_Biological3", 
             "Name_of_Associated_Covered_Drug_or_Biological4", 
             "Name_of_Associated_Covered_Drug_or_Biological5"], 
             axis=1)
dfg=dfg.replace("nan", "")
dfg=dfg.replace("NaN", "")

In [None]:
dfg.to_csv(path_out1 + "full_general payments dataset2013-2021.csv", index=None)

In [None]:
dfg = dfg[['Covered_Recipient_Profile_ID', 'Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name', 
           'Change_Type', 'Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Name',
           'Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_ID', 'Total_Amount_of_Payment_USDollars', 
           'Date_of_Payment', 'Number_of_Payments_Included_in_Total_Amount', 
           'Nature_of_Payment_or_Transfer_of_Value', 'Dispute_Status_for_Publication', 
           'Associated_Drug_or_Biological_NDC_1', 'Associated_Drug_or_Biological_NDC_2', 
           'Associated_Drug_or_Biological_NDC_3', 'Associated_Drug_or_Biological_NDC_4', 
           'Associated_Drug_or_Biological_NDC_5', 'Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_1', 
           'Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_2', 'Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_3', 
           'Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_4', 'Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_5', 
           'Program_Year', 'Contextual_Information', 
           'Record_ID']]

In [None]:
dfg

Unnamed: 0,Covered_Recipient_Profile_ID,Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name,Change_Type,Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Name,Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_ID,Total_Amount_of_Payment_USDollars,Date_of_Payment,Number_of_Payments_Included_in_Total_Amount,Nature_of_Payment_or_Transfer_of_Value,Dispute_Status_for_Publication,...,Associated_Drug_or_Biological_NDC_4,Associated_Drug_or_Biological_NDC_5,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_1,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_2,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_3,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_4,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_5,Program_Year,Contextual_Information,Record_ID
0,263236.0,"Forest Laboratories, Inc.",UNCHANGED,"FOREST PHARMACEUTICALS, INC.",100000005529,13.43,10/02/2013,1,Food and Beverage,No,...,,,BYSTOLIC,,,,,2013,,100471904
1,117891.0,"Forest Laboratories, Inc.",UNCHANGED,"FOREST PHARMACEUTICALS, INC.",100000005529,18.86,08/02/2013,1,Education,No,...,,,DALIRESP,,,,,2013,,100427638
2,198071.0,"Forest Laboratories, Inc.",UNCHANGED,"FOREST PHARMACEUTICALS, INC.",100000005529,13.35,08/29/2013,1,Food and Beverage,No,...,,,TUDORZA,DALIRESP,,,,2013,,100379410
3,52956.0,"Forest Laboratories, Inc.",UNCHANGED,"FOREST PHARMACEUTICALS, INC.",100000005529,12.16,11/05/2013,1,Food and Beverage,No,...,,,VIIBRYD,,,,,2013,,100418426
4,327548.0,"Forest Laboratories, Inc.",UNCHANGED,"FOREST PHARMACEUTICALS, INC.",100000005529,78.97,08/02/2013,1,Food and Beverage,No,...,,,TEFLARO,,,,,2013,,100366212
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2875,302701.0,"Galderma Laboratories, L.P.",NEW,"Galderma Laboratories, L.P.",100000010375,85.47,12/04/2021,1,Food and Beverage,No,...,,,,,,,,2021,US,855305147
2876,154591.0,"Galderma Laboratories, L.P.",NEW,"Galderma Laboratories, L.P.",100000010375,13.18,10/21/2021,1,Food and Beverage,No,...,,,ORACEA,AKLIEF,,,,2021,,855308433
2877,111418.0,"Penumbra, Inc.",NEW,"Penumbra, Inc.",100000010583,25.00,01/18/2021,1,Food and Beverage,No,...,,,Indigo System,,,,,2021,,855243571
2878,111418.0,"Penumbra, Inc.",NEW,"Penumbra, Inc.",100000010583,146.85,10/19/2021,1,Food and Beverage,No,...,,,Penumbra System,,,,,2021,,855243573


## Direct research payments

###Extract payment data

#### 2013 research data

In [None]:
df_pay13r = pd.read_csv("/content/drive/MyDrive/OPD/PGYR13_P012221/OP_DTL_RSRCH_PGYR2013_P01222021.csv", 
                        low_memory=False, chunksize= 200000)

In [None]:
df_pay13r = pd.concat((r for r in df_pay13r), ignore_index=True)

In [None]:
df_pay_id13r = df_pay13r[df_pay13r["Physician_Profile_ID"].isin(list_id)]

#### 2014 payment data

In [None]:
df_pay14r = pd.read_csv("/content/drive/MyDrive/OPD/PGYR14_P012122/OP_DTL_RSRCH_PGYR2014_P01212022.csv", 
                        low_memory=False, chunksize= 200000)
df_pay14r = pd.concat((r for r in df_pay14r), ignore_index=True)

In [None]:
df_pay_id14r = df_pay14r[df_pay14r["Physician_Profile_ID"].isin(list_id)  ]

#### 2015 payment data

In [None]:
df_pay15r = pd.read_csv("/content/drive/MyDrive/OPD/PGYR15_P063022/OP_DTL_RSRCH_PGYR2015_P06302022.csv", 
                        low_memory=False, chunksize= 200000)

In [None]:
df_pay15r = pd.concat((r for r in df_pay15r), ignore_index=True)

In [None]:
df_pay_id15r = df_pay15r[df_pay15r["Physician_Profile_ID"].isin(list_id) ]

#### 2016 payment data

In [None]:
df_pay16r = pd.read_csv("/content/drive/MyDrive/OPD/PGYR16_P063022/OP_DTL_RSRCH_PGYR2016_P06302022.csv", 
                        low_memory=False, chunksize= 200000)

In [None]:
df_pay16r = pd.concat((r for r in df_pay16r), ignore_index=True)

In [None]:
df_pay_id16r = df_pay16r[df_pay16r["Covered_Recipient_Profile_ID"].isin(list_id) ]

#### 2017 payment data

In [None]:
df_pay17r = pd.read_csv("/content/drive/MyDrive/OPD/PGYR17_P063022/OP_DTL_RSRCH_PGYR2017_P06302022.csv", 
                        low_memory=False, chunksize= 200000)

In [None]:
df_pay17r = pd.concat((r for r in df_pay17r), ignore_index=True)

In [None]:
df_pay_id17r = df_pay17r[df_pay17r["Covered_Recipient_Profile_ID"].isin(list_id)  ]

#### 2018 payment data

In [None]:
df_pay18r = pd.read_csv("/content/drive/MyDrive/OPD/PGYR18_P063022/OP_DTL_RSRCH_PGYR2018_P06302022.csv", 
                        low_memory=False, chunksize= 200000)


In [None]:
df_pay18r = pd.concat((r for r in df_pay18r), ignore_index=True)

In [None]:
df_pay_id18r = df_pay18r[df_pay18r["Covered_Recipient_Profile_ID"].isin(list_id)  ]

#### 2019 payment data

In [None]:
df_pay19r = pd.read_csv("/content/drive/MyDrive/OPD/PGYR19_P063022/OP_DTL_RSRCH_PGYR2019_P06302022.csv", 
                        low_memory=False, chunksize= 200000)

In [None]:
df_pay19r = pd.concat((r for r in df_pay19r), ignore_index=True)

In [None]:
df_pay_id19r = df_pay19r[df_pay19r["Covered_Recipient_Profile_ID"].isin(list_id)  ]

#### 2020 payment data

In [None]:
df_pay20r = pd.read_csv("/content/drive/MyDrive/OPD/PGYR20_P063022/OP_DTL_RSRCH_PGYR2020_P06302022.csv", 
                        low_memory=False, chunksize= 200000)

In [None]:
df_pay20r = pd.concat((r for r in df_pay20r), ignore_index=True)

In [None]:
df_pay_id20r = df_pay20r[df_pay20r["Covered_Recipient_Profile_ID"].isin(list_id) ]

#### 2021 payment data

In [None]:
df_pay21r = pd.read_csv("/content/drive/MyDrive/OPD/PGYR21_P063022/OP_DTL_RSRCH_PGYR2021_P06302022.csv", 
                        low_memory=False, chunksize= 200000)

In [None]:
df_pay21r = pd.concat((r for r in df_pay21r), ignore_index=True)

In [None]:
df_pay_id21r = df_pay21r[df_pay21r["Covered_Recipient_Profile_ID"].isin(list_id)  ]

### Rename columns

In [None]:
del df_pay13r
del df_pay14r
del df_pay15r
del df_pay16r
del df_pay17r
del df_pay18r 
del df_pay19r
del df_pay20r
del df_pay21r

In [None]:
df1315r = pd.concat([df_pay_id13r, df_pay_id14r, df_pay_id15r], axis = 0)

In [None]:
df1315r= df1315r.rename(columns={"Physician_Profile_ID":"Covered_Recipient_Profile_ID"})
df1315r= df1315r.rename(columns={"Physician_First_Name":"Covered_Recipient_First_Name"})
df1315r= df1315r.rename(columns={"Physician_Middle_Name":"Covered_Recipient_Middle_Name"})
df1315r= df1315r.rename(columns={"Physician_Last_Name":"Covered_Recipient_Last_Name"})
df1315r= df1315r.rename(columns={"Physician_Name_Suffix":"Covered_Recipient_Name_Suffix"})
df1315r= df1315r.rename(columns={"Physician_Primary_Type":"Covered_Recipient_Primary_Type_1"})
df1315r= df1315r.rename(columns={"Physician_Specialty":"Covered_Recipient_Specialty_1"})
df1315r= df1315r.rename(columns={"Principal_Investigator_1_Primary_Type":"Principal_Investigator_1_Primary_Type_1"})
df1315r= df1315r.rename(columns={"Principal_Investigator_1_Specialty":"Principal_Investigator_1_Specialty_1"})
df1315r= df1315r.rename(columns={"Principal_Investigator_2_Primary_Type":"Principal_Investigator_2_Primary_Type_1"})
df1315r= df1315r.rename(columns={"Principal_Investigator_2_Specialty":"Principal_Investigator_2_Specialty_1"})
df1315r= df1315r.rename(columns={"Principal_Investigator_3_Primary_Type":"Principal_Investigator_3_Primary_Type_1"})
df1315r= df1315r.rename(columns={"Principal_Investigator_3_Specialty":"Principal_Investigator_3_Specialty_1"})
df1315r= df1315r.rename(columns={"Principal_Investigator_4_Primary_Type":"Principal_Investigator_4_Primary_Type_1"})
df1315r= df1315r.rename(columns={"Principal_Investigator_4_Specialty":"Principal_Investigator_4_Specialty_1"})
df1315r= df1315r.rename(columns={"Principal_Investigator_5_Primary_Type":"Principal_Investigator_5_Primary_Type_1"})
df1315r= df1315r.rename(columns={"Principal_Investigator_5_Specialty":"Principal_Investigator_5_Specialty_1"})

In [None]:
df1315r= df1315r.rename(columns={"Physician_License_State_code1" :"Covered_Recipient_License_State_code1"})
df1315r= df1315r.rename(columns={"Physician_License_State_code2" :"Covered_Recipient_License_State_code2"})
df1315r= df1315r.rename(columns={"Physician_License_State_code3" :"Covered_Recipient_License_State_code3"})
df1315r= df1315r.rename(columns={"Physician_License_State_code4" :"Covered_Recipient_License_State_code4"})
df1315r= df1315r.rename(columns={"Physician_License_State_code5" :"Covered_Recipient_License_State_code5"})

df1315r= df1315r.rename(columns={"Product_Indicator":"Related_Product_Indicator"})
df1315r= df1315r.rename(columns={"NDC_of_Associated_Covered_Drug_or_Biological1":"Associated_Drug_or_Biological_NDC_1"})
df1315r= df1315r.rename(columns={"NDC_of_Associated_Covered_Drug_or_Biological2":"Associated_Drug_or_Biological_NDC_2"})
df1315r= df1315r.rename(columns={"NDC_of_Associated_Covered_Drug_or_Biological3":"Associated_Drug_or_Biological_NDC_3"})
df1315r= df1315r.rename(columns={"NDC_of_Associated_Covered_Drug_or_Biological4":"Associated_Drug_or_Biological_NDC_4"})
df1315r= df1315r.rename(columns={"NDC_of_Associated_Covered_Drug_or_Biological5":"Associated_Drug_or_Biological_NDC_5"})
df1315r= df1315r.rename(columns = {"Physician_NPI":"Covered_Recipient_Profile_NPI"})

df1315r["Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_1"] = df1315r['Name_of_Associated_Covered_Drug_or_Biological1'].astype(str) 
+ df1315r["Name_of_Associated_Covered_Device_or_Medical_Supply1"]
df1315r["Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_2"] = df1315r['Name_of_Associated_Covered_Drug_or_Biological2'].astype(str) 
+ df1315r["Name_of_Associated_Covered_Device_or_Medical_Supply2"]
df1315r["Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_3"] = df1315r['Name_of_Associated_Covered_Drug_or_Biological3'].astype(str) 
+ df1315r["Name_of_Associated_Covered_Device_or_Medical_Supply3"]
df1315r["Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_4"] = df1315r['Name_of_Associated_Covered_Drug_or_Biological4'].astype(str) 
+ df1315r["Name_of_Associated_Covered_Device_or_Medical_Supply4"]
df1315r["Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_5"] = df1315r['Name_of_Associated_Covered_Drug_or_Biological5'].astype(str) 
+ df1315r["Name_of_Associated_Covered_Device_or_Medical_Supply5"]

48        NaN
68        NaN
252       NaN
464       NaN
473       NaN
         ... 
172078    NaN
172141    NaN
172292    NaN
172398    NaN
172411    NaN
Name: Name_of_Associated_Covered_Device_or_Medical_Supply5, Length: 3987, dtype: object

In [None]:
df1621r = pd.concat([df_pay_id16r, df_pay_id17r, 
                    df_pay_id18r, df_pay_id19r, 
                    df_pay_id20r, df_pay_id21r], axis = 0)

In [None]:
dfrd = pd.concat([df1315r, df1621r])

In [None]:
dfrd = dfrd.drop(["Name_of_Associated_Covered_Device_or_Medical_Supply1",
                  "Name_of_Associated_Covered_Device_or_Medical_Supply2",
                  "Name_of_Associated_Covered_Device_or_Medical_Supply3",
                  "Name_of_Associated_Covered_Device_or_Medical_Supply4",
                  "Name_of_Associated_Covered_Device_or_Medical_Supply5",
                  "Name_of_Associated_Covered_Drug_or_Biological1", 
                  "Name_of_Associated_Covered_Drug_or_Biological2", 
                  "Name_of_Associated_Covered_Drug_or_Biological3", 
                  "Name_of_Associated_Covered_Drug_or_Biological4", 
                  "Name_of_Associated_Covered_Drug_or_Biological5"], 
             axis=1)
dfrd =dfrd.replace("nan", "")
dfrd =dfrd.replace("NaN", "")


In [None]:
del df_pay_id13r
del df_pay_id14r
del df_pay_id15r
del df_pay_id16r
del df_pay_id17r
del df_pay_id18r
del df_pay_id19r
del df_pay_id20r
del df_pay_id21r

In [None]:
del df1315r
del df1621r

In [None]:
dfrd.to_csv(path_out2 + "full_direct research payments dataset2013-2021.csv", index=None)

In [None]:
dfrd = dfrd[['Covered_Recipient_Profile_ID', 
           'Principal_Investigator_1_Profile_ID', 'Principal_Investigator_2_Profile_ID', 
           'Principal_Investigator_3_Profile_ID', 'Principal_Investigator_4_Profile_ID', 
           'Principal_Investigator_5_Profile_ID', 
           'Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name', 
           'Change_Type', 'Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Name',
           'Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_ID', 'Total_Amount_of_Payment_USDollars', 
           'Date_of_Payment', 'Preclinical_Research_Indicator', 
           'Name_of_Study', 'Dispute_Status_for_Publication',
           'Program_Year',  'ClinicalTrials_Gov_Identifier', 
           'Associated_Drug_or_Biological_NDC_1', 'Associated_Drug_or_Biological_NDC_2', 
           'Associated_Drug_or_Biological_NDC_3', 'Associated_Drug_or_Biological_NDC_4', 
           'Associated_Drug_or_Biological_NDC_5', 
           'Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_1', 
           'Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_2', 'Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_3', 
           'Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_4', 'Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_5',
           'Record_ID' ]]

In [None]:
exclude_d_research = dfrd['Record_ID'].values.tolist()
exclude_d_research

[31509840,
 106168165,
 281409338,
 281315834,
 281350428,
 281380662,
 281381702,
 281382812,
 281340984,
 106041524,
 281329666,
 106042714,
 281347124,
 281409876,
 281336108,
 281352708,
 281411918,
 281393278,
 106048905,
 281340990,
 106042716,
 281343572,
 281400984,
 281409598,
 281410438,
 106042919,
 106048950,
 281350430,
 106044921,
 106099567,
 106111921,
 105889560,
 106035921,
 4805301,
 106042528,
 281337626,
 106056940,
 281323270,
 106042180,
 281381706,
 281403770,
 281347130,
 281405368,
 281352180,
 281409882,
 281411920,
 281393282,
 281336342,
 105768965,
 281339470,
 200637074,
 281333608,
 281409346,
 281352196,
 281354426,
 24448862,
 281341474,
 293382848,
 106058114,
 281410440,
 106044922,
 106168156,
 106061501,
 281352184,
 281354414,
 281403774,
 106042922,
 281400988,
 24470083,
 106111137,
 29631040,
 281327962,
 106168152,
 106167761,
 106168166,
 23928315,
 200652680,
 106045307,
 281354422,
 24454907,
 24449284,
 281412640,
 281410442,
 281354420,
 

## Associated research payment

###Extract payment data

#### 2013 payment data

In [None]:
df_pay13r = pd.read_csv("/content/drive/MyDrive/OPD/PGYR13_P012221/OP_DTL_RSRCH_PGYR2013_P01222021.csv", 
                        low_memory=False, chunksize= 200000)

In [None]:
df_pay13r = pd.concat((r for r in df_pay13r), ignore_index=True)

In [None]:
df_pay_id13r = df_pay13r[df_pay13r["Principal_Investigator_1_Profile_ID"].isin(list_id) |
                      df_pay13r["Principal_Investigator_2_Profile_ID"].isin(list_id) |
                      df_pay13r["Principal_Investigator_3_Profile_ID"].isin(list_id) |
                      df_pay13r["Principal_Investigator_4_Profile_ID"].isin(list_id) |
                      df_pay13r["Principal_Investigator_5_Profile_ID"].isin(list_id) ]

#### 2014 payment data

In [None]:
df_pay14r = pd.read_csv("/content/drive/MyDrive/OPD/PGYR14_P012122/OP_DTL_RSRCH_PGYR2014_P01212022.csv", 
                        low_memory=False, chunksize= 200000)

In [None]:
df_pay14r = pd.concat((r for r in df_pay14r), ignore_index=True)

In [None]:
df_pay_id14r = df_pay14r[df_pay14r["Principal_Investigator_1_Profile_ID"].isin(list_id) |
                      df_pay14r["Principal_Investigator_2_Profile_ID"].isin(list_id) |
                      df_pay14r["Principal_Investigator_3_Profile_ID"].isin(list_id) |
                      df_pay14r["Principal_Investigator_4_Profile_ID"].isin(list_id) |
                      df_pay14r["Principal_Investigator_5_Profile_ID"].isin(list_id) ]

#### 2015 payment data

In [None]:
df_pay15r = pd.read_csv("/content/drive/MyDrive/OPD/PGYR15_P063022/OP_DTL_RSRCH_PGYR2015_P06302022.csv", 
                        low_memory=False, chunksize= 200000)

In [None]:
df_pay15r = pd.concat((r for r in df_pay15r), ignore_index=True)

In [None]:
df_pay_id15r = df_pay15r[df_pay15r["Principal_Investigator_1_Profile_ID"].isin(list_id) |
                      df_pay15r["Principal_Investigator_2_Profile_ID"].isin(list_id) |
                      df_pay15r["Principal_Investigator_3_Profile_ID"].isin(list_id) |
                      df_pay15r["Principal_Investigator_4_Profile_ID"].isin(list_id) |
                      df_pay15r["Principal_Investigator_5_Profile_ID"].isin(list_id) ]

#### 2016 payment data

In [None]:
df_pay16r = pd.read_csv("/content/drive/MyDrive/OPD/PGYR16_P063022/OP_DTL_RSRCH_PGYR2016_P06302022.csv", 
                        low_memory=False, chunksize= 200000)

In [None]:
df_pay16r = pd.concat((r for r in df_pay16r), ignore_index=True)

In [None]:
df_pay_id16r = df_pay16r[df_pay16r["Principal_Investigator_1_Profile_ID"].isin(list_id) |
                      df_pay16r["Principal_Investigator_2_Profile_ID"].isin(list_id) |
                      df_pay16r["Principal_Investigator_3_Profile_ID"].isin(list_id) |
                      df_pay16r["Principal_Investigator_4_Profile_ID"].isin(list_id) |
                      df_pay16r["Principal_Investigator_5_Profile_ID"].isin(list_id) ]

#### 2017 payment data

In [None]:
df_pay17r = pd.read_csv("/content/drive/MyDrive/OPD/PGYR17_P063022/OP_DTL_RSRCH_PGYR2017_P06302022.csv", 
                        low_memory=False, chunksize= 200000)

In [None]:
df_pay17r = pd.concat((r for r in df_pay17r), ignore_index=True)

In [None]:
df_pay_id17r = df_pay17r[df_pay17r["Principal_Investigator_1_Profile_ID"].isin(list_id) |
                      df_pay17r["Principal_Investigator_2_Profile_ID"].isin(list_id) |
                      df_pay17r["Principal_Investigator_3_Profile_ID"].isin(list_id) |
                      df_pay17r["Principal_Investigator_4_Profile_ID"].isin(list_id) |
                      df_pay17r["Principal_Investigator_5_Profile_ID"].isin(list_id) ]

#### 2018 payment data

In [None]:
df_pay18r = pd.read_csv("/content/drive/MyDrive/OPD/PGYR18_P063022/OP_DTL_RSRCH_PGYR2018_P06302022.csv", 
                        low_memory=False, chunksize= 200000)


In [None]:
df_pay18r = pd.concat((r for r in df_pay18r), ignore_index=True)

In [None]:
df_pay_id18r = df_pay18r[df_pay18r["Principal_Investigator_1_Profile_ID"].isin(list_id) |
                      df_pay18r["Principal_Investigator_2_Profile_ID"].isin(list_id) |
                      df_pay18r["Principal_Investigator_3_Profile_ID"].isin(list_id) |
                      df_pay18r["Principal_Investigator_4_Profile_ID"].isin(list_id) |
                      df_pay18r["Principal_Investigator_5_Profile_ID"].isin(list_id) ]

#### 2019 payment data

In [None]:
df_pay19r = pd.read_csv("/content/drive/MyDrive/OPD/PGYR19_P063022/OP_DTL_RSRCH_PGYR2019_P06302022.csv", 
                        low_memory=False, chunksize= 200000)

In [None]:
df_pay19r = pd.concat((r for r in df_pay19r), ignore_index=True)

In [None]:
df_pay_id19r = df_pay19r[df_pay19r["Principal_Investigator_1_Profile_ID"].isin(list_id) |
                      df_pay19r["Principal_Investigator_2_Profile_ID"].isin(list_id) |
                      df_pay19r["Principal_Investigator_3_Profile_ID"].isin(list_id) |
                      df_pay19r["Principal_Investigator_4_Profile_ID"].isin(list_id) |
                      df_pay19r["Principal_Investigator_5_Profile_ID"].isin(list_id) ]

#### 2020 payment data

In [None]:
df_pay20r = pd.read_csv("/content/drive/MyDrive/OPD/PGYR20_P063022/OP_DTL_RSRCH_PGYR2020_P06302022.csv", 
                        low_memory=False, chunksize= 200000)

In [None]:
df_pay20r = pd.concat((r for r in df_pay20r), ignore_index=True)

In [None]:
df_pay_id20r = df_pay20r[df_pay20r["Principal_Investigator_1_Profile_ID"].isin(list_id) |
                      df_pay20r["Principal_Investigator_2_Profile_ID"].isin(list_id) |
                      df_pay20r["Principal_Investigator_3_Profile_ID"].isin(list_id) |
                      df_pay20r["Principal_Investigator_4_Profile_ID"].isin(list_id) |
                      df_pay20r["Principal_Investigator_5_Profile_ID"].isin(list_id) ]

#### 2021 payment data

In [None]:
df_pay21r = pd.read_csv("/content/drive/MyDrive/OPD/PGYR21_P063022/OP_DTL_RSRCH_PGYR2021_P06302022.csv", 
                        low_memory=False, chunksize= 200000)

In [None]:
df_pay21r = pd.concat((r for r in df_pay21r), ignore_index=True)

In [None]:
df_pay_id21r = df_pay21r[df_pay21r["Principal_Investigator_1_Profile_ID"].isin(list_id) |
                      df_pay21r["Principal_Investigator_2_Profile_ID"].isin(list_id) |
                      df_pay21r["Principal_Investigator_3_Profile_ID"].isin(list_id) |
                      df_pay21r["Principal_Investigator_4_Profile_ID"].isin(list_id) |
                      df_pay21r["Principal_Investigator_5_Profile_ID"].isin(list_id) ]

### Rename columns

In [None]:
del df_pay13r
del df_pay14r
del df_pay15r
del df_pay16r
del df_pay17r
del df_pay18r 
del df_pay19r
del df_pay20r
del df_pay21r

In [None]:
df1315r = pd.concat([df_pay_id13r, df_pay_id14r, df_pay_id15r], axis = 0)

In [None]:
df1315r= df1315r.rename(columns={"Physician_Profile_ID":"Covered_Recipient_Profile_ID"})
df1315r= df1315r.rename(columns={"Physician_First_Name":"Covered_Recipient_First_Name"})
df1315r= df1315r.rename(columns={"Physician_Middle_Name":"Covered_Recipient_Middle_Name"})
df1315r= df1315r.rename(columns={"Physician_Last_Name":"Covered_Recipient_Last_Name"})
df1315r= df1315r.rename(columns={"Physician_Name_Suffix":"Covered_Recipient_Name_Suffix"})
df1315r= df1315r.rename(columns={"Physician_Primary_Type":"Covered_Recipient_Primary_Type_1"})
df1315r= df1315r.rename(columns={"Physician_Specialty":"Covered_Recipient_Specialty_1"})
df1315r= df1315r.rename(columns={"Principal_Investigator_1_Primary_Type":"Principal_Investigator_1_Primary_Type_1"})
df1315r= df1315r.rename(columns={"Principal_Investigator_1_Specialty":"Principal_Investigator_1_Specialty_1"})
df1315r= df1315r.rename(columns={"Principal_Investigator_2_Primary_Type":"Principal_Investigator_2_Primary_Type_1"})
df1315r= df1315r.rename(columns={"Principal_Investigator_2_Specialty":"Principal_Investigator_2_Specialty_1"})
df1315r= df1315r.rename(columns={"Principal_Investigator_3_Primary_Type":"Principal_Investigator_3_Primary_Type_1"})
df1315r= df1315r.rename(columns={"Principal_Investigator_3_Specialty":"Principal_Investigator_3_Specialty_1"})
df1315r= df1315r.rename(columns={"Principal_Investigator_4_Primary_Type":"Principal_Investigator_4_Primary_Type_1"})
df1315r= df1315r.rename(columns={"Principal_Investigator_4_Specialty":"Principal_Investigator_4_Specialty_1"})
df1315r= df1315r.rename(columns={"Principal_Investigator_5_Primary_Type":"Principal_Investigator_5_Primary_Type_1"})
df1315r= df1315r.rename(columns={"Principal_Investigator_5_Specialty":"Principal_Investigator_5_Specialty_1"})

In [None]:
df1315r= df1315r.rename(columns={"Physician_License_State_code1" :"Covered_Recipient_License_State_code1"})
df1315r= df1315r.rename(columns={"Physician_License_State_code2" :"Covered_Recipient_License_State_code2"})
df1315r= df1315r.rename(columns={"Physician_License_State_code3" :"Covered_Recipient_License_State_code3"})
df1315r= df1315r.rename(columns={"Physician_License_State_code4" :"Covered_Recipient_License_State_code4"})
df1315r= df1315r.rename(columns={"Physician_License_State_code5" :"Covered_Recipient_License_State_code5"})

df1315r= df1315r.rename(columns={"Product_Indicator":"Related_Product_Indicator"})
df1315r= df1315r.rename(columns={"NDC_of_Associated_Covered_Drug_or_Biological1":"Associated_Drug_or_Biological_NDC_1"})
df1315r= df1315r.rename(columns={"NDC_of_Associated_Covered_Drug_or_Biological2":"Associated_Drug_or_Biological_NDC_2"})
df1315r= df1315r.rename(columns={"NDC_of_Associated_Covered_Drug_or_Biological3":"Associated_Drug_or_Biological_NDC_3"})
df1315r= df1315r.rename(columns={"NDC_of_Associated_Covered_Drug_or_Biological4":"Associated_Drug_or_Biological_NDC_4"})
df1315r= df1315r.rename(columns={"NDC_of_Associated_Covered_Drug_or_Biological5":"Associated_Drug_or_Biological_NDC_5"})
df1315r= df1315r.rename(columns = {"Physician_NPI":"Covered_Recipient_Profile_NPI"})

df1315r["Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_1"] = df1315r['Name_of_Associated_Covered_Drug_or_Biological1'].astype(str) 
+ df1315r["Name_of_Associated_Covered_Device_or_Medical_Supply1"]
df1315r["Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_2"] = df1315r['Name_of_Associated_Covered_Drug_or_Biological2'].astype(str) 
+ df1315r["Name_of_Associated_Covered_Device_or_Medical_Supply2"]
df1315r["Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_3"] = df1315r['Name_of_Associated_Covered_Drug_or_Biological3'].astype(str) 
+ df1315r["Name_of_Associated_Covered_Device_or_Medical_Supply3"]
df1315r["Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_4"] = df1315r['Name_of_Associated_Covered_Drug_or_Biological4'].astype(str) 
+ df1315r["Name_of_Associated_Covered_Device_or_Medical_Supply4"]
df1315r["Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_5"] = df1315r['Name_of_Associated_Covered_Drug_or_Biological5'].astype(str) 
+ df1315r["Name_of_Associated_Covered_Device_or_Medical_Supply5"]

97        NaN
156       NaN
290       NaN
506       NaN
791       NaN
         ... 
956906    NaN
956916    NaN
956943    NaN
956955    NaN
956957    NaN
Name: Name_of_Associated_Covered_Device_or_Medical_Supply5, Length: 102428, dtype: object

In [None]:
df1621r = pd.concat([df_pay_id16r, df_pay_id17r, 
                    df_pay_id18r, df_pay_id19r, 
                    df_pay_id20r, df_pay_id21r], axis = 0)

In [None]:
dfr = pd.concat([df1315r, df1621r])

In [None]:
dfr = dfr.drop(["Name_of_Associated_Covered_Device_or_Medical_Supply1",
             "Name_of_Associated_Covered_Device_or_Medical_Supply2",
             "Name_of_Associated_Covered_Device_or_Medical_Supply3",
             "Name_of_Associated_Covered_Device_or_Medical_Supply4",
             "Name_of_Associated_Covered_Device_or_Medical_Supply5",
             "Name_of_Associated_Covered_Drug_or_Biological1", 
             "Name_of_Associated_Covered_Drug_or_Biological2", 
             "Name_of_Associated_Covered_Drug_or_Biological3", 
             "Name_of_Associated_Covered_Drug_or_Biological4", 
             "Name_of_Associated_Covered_Drug_or_Biological5"], 
             axis=1)
dfr=dfr.replace("nan", "")
dfr=dfr.replace("NaN", "")


In [None]:
del df_pay_id13r
del df_pay_id14r
del df_pay_id15r
del df_pay_id16r
del df_pay_id17r
del df_pay_id18r
del df_pay_id19r
del df_pay_id20r
del df_pay_id21r

In [None]:
del df1315r
del df1621r

In [None]:
dfr['PI_count']= dfr[['Principal_Investigator_1_Profile_ID', 
           'Principal_Investigator_2_Profile_ID', 
           'Principal_Investigator_3_Profile_ID', 
           'Principal_Investigator_4_Profile_ID', 
           'Principal_Investigator_5_Profile_ID']].count(axis = 1)
           
dfr['per_payment'] = round((dfr['Total_Amount_of_Payment_USDollars'] / dfr['PI_count']),5)
dfr['per_payment'].replace([np.inf, -np.inf], 0, inplace=True)

dfr['PI1'] = dfr['Principal_Investigator_1_Profile_ID'].isin(list_id)
dfr['PI2'] = dfr['Principal_Investigator_2_Profile_ID'].isin(list_id)
dfr['PI3'] = dfr['Principal_Investigator_3_Profile_ID'].isin(list_id)
dfr['PI4'] = dfr['Principal_Investigator_4_Profile_ID'].isin(list_id)
dfr['PI5'] = dfr['Principal_Investigator_5_Profile_ID'].isin(list_id)

dfr['PI_OPDcount'] = dfr.iloc[:,255:260].sum(axis=1)
dfr['PI_OPD_per_payment'] = dfr['PI_OPDcount'] * dfr['per_payment'] 

###Exclude associated research payments which overlap with the direct research payments

In [None]:
dfr2 = dfr
dfr2

Unnamed: 0,Change_Type,Covered_Recipient_Type,Noncovered_Recipient_Entity_Name,Teaching_Hospital_CCN,Teaching_Hospital_ID,Teaching_Hospital_Name,Covered_Recipient_Profile_ID,Covered_Recipient_First_Name,Covered_Recipient_Middle_Name,Covered_Recipient_Last_Name,...,Associated_Device_or_Medical_Supply_PDI_5,PI_count,per_payment,PI1,PI2,PI3,PI4,PI5,PI_OPDcount,PI_OPD_per_payment
97,UNCHANGED,Covered Recipient Teaching Hospital,,70033.0,46.0,DANBURY HOSPITAL,,,,,...,,1,128.7,True,False,False,False,False,1,128.7
156,UNCHANGED,Covered Recipient Teaching Hospital,,374000.0,904.0,State Of Oklahoma,,,,,...,,1,67.5,True,False,False,False,False,1,67.5
290,UNCHANGED,Covered Recipient Teaching Hospital,,450647.0,811.0,Columbia Hospital At Medical City Dallas Subsi...,,,,,...,,1,2500.0,True,False,False,False,False,1,2500.0
506,UNCHANGED,Covered Recipient Teaching Hospital,,300003.0,5.0,Mary Hitchcock Memorial Hospital,,,,,...,,1,447.7,True,False,False,False,False,1,447.7
791,UNCHANGED,Covered Recipient Teaching Hospital,,260162.0,593.0,Barnes Jewish West County Hospital,,,,,...,,1,84.0,True,False,False,False,False,1,84.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
672748,NEW,Non-covered Recipient Entity,ST LOUIS UNIV,,,,,,,,...,,1,1005.0,True,False,False,False,False,1,1005.0
672768,NEW,Non-covered Recipient Entity,UT SOUTHWESTERN CLINICAL TRIALS,,,,,,,,...,,1,9.0,True,False,False,False,False,1,9.0
672798,NEW,Non-covered Recipient Entity,UT SOUTHWESTERN CLINICAL TRIALS,,,,,,,,...,,1,5.3,True,False,False,False,False,1,5.3
672800,NEW,Non-covered Recipient Entity,FOUNDATION FOR ADVANCING VETERANS' HEALTH RESE...,,,,,,,,...,,1,56.9,True,False,False,False,False,1,56.9


In [None]:
dfr3 = dfr[dfr['Record_ID'].isin(exclude_d_research)]
dfr3.to_csv(path_out2 + "full_associated research payments dataset2013-2021_exclude.csv", index=None)

In [None]:
dfr = dfr[~dfr['Record_ID'].isin(exclude_d_research)]
dfr

Unnamed: 0,Change_Type,Covered_Recipient_Type,Noncovered_Recipient_Entity_Name,Teaching_Hospital_CCN,Teaching_Hospital_ID,Teaching_Hospital_Name,Covered_Recipient_Profile_ID,Covered_Recipient_First_Name,Covered_Recipient_Middle_Name,Covered_Recipient_Last_Name,...,Associated_Device_or_Medical_Supply_PDI_5,PI_count,per_payment,PI1,PI2,PI3,PI4,PI5,PI_OPDcount,PI_OPD_per_payment
97,UNCHANGED,Covered Recipient Teaching Hospital,,70033.0,46.0,DANBURY HOSPITAL,,,,,...,,1,128.7,True,False,False,False,False,1,128.7
156,UNCHANGED,Covered Recipient Teaching Hospital,,374000.0,904.0,State Of Oklahoma,,,,,...,,1,67.5,True,False,False,False,False,1,67.5
290,UNCHANGED,Covered Recipient Teaching Hospital,,450647.0,811.0,Columbia Hospital At Medical City Dallas Subsi...,,,,,...,,1,2500.0,True,False,False,False,False,1,2500.0
506,UNCHANGED,Covered Recipient Teaching Hospital,,300003.0,5.0,Mary Hitchcock Memorial Hospital,,,,,...,,1,447.7,True,False,False,False,False,1,447.7
791,UNCHANGED,Covered Recipient Teaching Hospital,,260162.0,593.0,Barnes Jewish West County Hospital,,,,,...,,1,84.0,True,False,False,False,False,1,84.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
672748,NEW,Non-covered Recipient Entity,ST LOUIS UNIV,,,,,,,,...,,1,1005.0,True,False,False,False,False,1,1005.0
672768,NEW,Non-covered Recipient Entity,UT SOUTHWESTERN CLINICAL TRIALS,,,,,,,,...,,1,9.0,True,False,False,False,False,1,9.0
672798,NEW,Non-covered Recipient Entity,UT SOUTHWESTERN CLINICAL TRIALS,,,,,,,,...,,1,5.3,True,False,False,False,False,1,5.3
672800,NEW,Non-covered Recipient Entity,FOUNDATION FOR ADVANCING VETERANS' HEALTH RESE...,,,,,,,,...,,1,56.9,True,False,False,False,False,1,56.9


In [None]:
dfr.to_csv(path_out2 + "full_associated research payments dataset2013-2021_revision.csv", index=None)

In [None]:
dfr2.to_csv(path_out2 + "full_associated research payments dataset2013-2021_initial.csv", index=None)

In [None]:
dfr = dfr[["Covered_Recipient_Profile_ID", 
           'Principal_Investigator_1_Profile_ID', 'Principal_Investigator_2_Profile_ID', 
           'Principal_Investigator_3_Profile_ID', 'Principal_Investigator_4_Profile_ID', 
           'Principal_Investigator_5_Profile_ID', 
           'Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name', 
           'Change_Type', 'Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Name',
           'Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_ID', 'Total_Amount_of_Payment_USDollars', 
           'Date_of_Payment', 'Preclinical_Research_Indicator', 
           'Name_of_Study', 'Dispute_Status_for_Publication',
           'Program_Year',  'ClinicalTrials_Gov_Identifier', 
           'Record_ID', 
           'Associated_Drug_or_Biological_NDC_1', 'Associated_Drug_or_Biological_NDC_2', 
           'Associated_Drug_or_Biological_NDC_3', 'Associated_Drug_or_Biological_NDC_4', 
           'Associated_Drug_or_Biological_NDC_5', 
           'Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_1', 
           'Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_2', 'Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_3', 
           'Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_4', 'Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_5', 
           'PI_OPD_per_payment', 
           'per_payment', 'PI_OPDcount'
           ]]

## Ownership payments

In [None]:
df_pay13o = pd.read_csv("/content/drive/MyDrive/OPD/PGYR13_P012221/OP_DTL_OWNRSHP_PGYR2013_P01222021.csv", low_memory=False)
df_pay_id13o = df_pay13o[df_pay13o["Physician_Profile_ID"].isin(list_id)]

df_pay14o = pd.read_csv("/content/drive/MyDrive/OPD/PGYR14_P012122/OP_DTL_OWNRSHP_PGYR2014_P01212022.csv", low_memory=False)
df_pay_id14o = df_pay14o[df_pay14o["Physician_Profile_ID"].isin(list_id)]

df_pay15o = pd.read_csv("/content/drive/MyDrive/OPD/PGYR15_P063022/OP_DTL_OWNRSHP_PGYR2015_P06302022.csv", low_memory=False)
df_pay_id15o = df_pay15o[df_pay15o["Physician_Profile_ID"].isin(list_id)]

df_pay16o = pd.read_csv("/content/drive/MyDrive/OPD/PGYR16_P063022/OP_DTL_OWNRSHP_PGYR2016_P06302022.csv", low_memory=False)
df_pay_id16o = df_pay16o[df_pay16o["Physician_Profile_ID"].isin(list_id)]

df_pay17o = pd.read_csv("/content/drive/MyDrive/OPD/PGYR17_P063022/OP_DTL_OWNRSHP_PGYR2017_P06302022.csv", low_memory=False)
df_pay_id17o = df_pay17o[df_pay17o["Physician_Profile_ID"].isin(list_id)]

df_pay18o = pd.read_csv("/content/drive/MyDrive/OPD/PGYR18_P063022/OP_DTL_OWNRSHP_PGYR2018_P06302022.csv", low_memory=False)
df_pay_id18o = df_pay18o[df_pay18o["Physician_Profile_ID"].isin(list_id)]

df_pay19o = pd.read_csv("/content/drive/MyDrive/OPD/PGYR19_P063022/OP_DTL_OWNRSHP_PGYR2019_P06302022.csv", low_memory=False)
df_pay_id19o = df_pay19o[df_pay19o["Physician_Profile_ID"].isin(list_id)]

df_pay20o = pd.read_csv("/content/drive/MyDrive/OPD/PGYR20_P063022/OP_DTL_OWNRSHP_PGYR2020_P06302022.csv", low_memory=False)
df_pay_id20o = df_pay20o[df_pay20o["Physician_Profile_ID"].isin(list_id)]

df_pay21o = pd.read_csv("/content/drive/MyDrive/OPD/PGYR21_P063022/OP_DTL_OWNRSHP_PGYR2021_P06302022.csv", low_memory=False)
df_pay_id21o = df_pay21o[df_pay21o["Physician_Profile_ID"].isin(list_id)]

### merge ownership interest

In [None]:
dfo = pd.concat([df_pay_id13o, df_pay_id14o ,
                   df_pay_id15o, df_pay_id16o, 
                    df_pay_id17o, df_pay_id18o, 
                    df_pay_id19o, df_pay_id20o, 
                    df_pay_id21o], axis = 0)
dfo
dfo.to_csv(path_out3 + "ownership_interest2013-2021_extract.csv", index=None)

In [None]:
del df_pay_id13o
del df_pay_id14o
del df_pay_id15o
del df_pay_id16o
del df_pay_id17o
del df_pay_id18o
del df_pay_id19o
del df_pay_id20o
del df_pay_id21o

In [None]:
dfo.query('Program_Year == 2017')

Unnamed: 0,Change_Type,Physician_Profile_ID,Physician_First_Name,Physician_Middle_Name,Physician_Last_Name,Physician_Name_Suffix,Recipient_Primary_Business_Street_Address_Line1,Recipient_Primary_Business_Street_Address_Line2,Recipient_City,Recipient_State,...,Terms_of_Interest,Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name,Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_ID,Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Name,Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_State,Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Country,Dispute_Status_for_Publication,Interest_Held_by_Physician_or_an_Immediate_Family_Member,Payment_Publication_Date,Physician_NPI
114,ADD,1375341,Christopher,,Martin,,701 Park Avenue,,Minneapolis,MN,...,common stock,"CVRx, Inc.",100000010805,"CVRx, Inc.",MN,United States,No,Immediate family member,06/30/2022,1679850000.0
1192,UNCHANGED,326577,Mark,Windell,TRUE,,914 La Garganta,,San Antonio,TX,...,stock,"SI-BONE, Inc.",100000011102,"SI-BONE, Inc.",CA,United States,No,Immediate family member,06/30/2022,1205983000.0
1199,UNCHANGED,251977,Paul,Malcom,Rudolf,,9110 N Branch Dr,,Bethesda,MD,...,Stock - Reflects the ownership interest of one...,"SI-BONE, Inc.",100000011102,"SI-BONE, Inc.",CA,United States,No,Immediate family member,06/30/2022,1548527000.0
2950,UNCHANGED,2777,Jaime,Abraham,Davidson,,777 Forest Ln,C-204,Dallas,TX,...,Stock Options,"Aspire Bariatrics, Inc.",100000151641,"Aspire Bariatrics, Inc.",PA,United States,No,Physician Covered Recipient,06/30/2022,1053345000.0


In [None]:
dfo.query("Physician_Profile_ID == 334910")

Unnamed: 0,Change_Type,Physician_Profile_ID,Physician_First_Name,Physician_Middle_Name,Physician_Last_Name,Physician_Name_Suffix,Recipient_Primary_Business_Street_Address_Line1,Recipient_Primary_Business_Street_Address_Line2,Recipient_City,Recipient_State,...,Terms_of_Interest,Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name,Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_ID,Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Name,Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_State,Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Country,Dispute_Status_for_Publication,Interest_Held_by_Physician_or_an_Immediate_Family_Member,Payment_Publication_Date,Physician_NPI


# Pivot table

## Pivot stata

In [None]:
dfg["month"] = dfg["Date_of_Payment"].str[:2].astype(int)
dfg["day"] = dfg["Date_of_Payment"].str.slice(start=3, stop=5).astype(int)
dfg["year"] = dfg["Date_of_Payment"].str.slice(start=6, stop=10).astype(int)
dfg['cmonth'] = (dfg["year"]-2013) * 12 + dfg["month"] - 7

dfg['category'] = dfg['Nature_of_Payment_or_Transfer_of_Value']

dfg['category'] = dfg['category'].replace(['Compensation for services other than consulting, including serving as faculty or as a speaker at a venue other than a continuing education program'] , 
                                            'non-cme_speaking')

dfg['category'] = dfg['category'].replace(['Compensation for serving as faculty or as a speaker for a medical education program',
                                             'Compensation for serving as faculty or as a speaker for a non-accredited and noncertified continuing education program', 
                                             'Compensation for serving as faculty or as a speaker for an accredited or certified continuing education program'],
                                            'cme_speaking')
dfg['category'] = dfg['category'].replace(['Consulting Fee'],
                                            'consulting')

dfg['category'] = dfg['category'].replace(['Education'],
                                            'education')

dfg['category'] = dfg['category'].replace(['Honoraria'],
                                            'honoraria')

dfg['category'] = dfg['category'].replace(['Food and Beverage'],
                                            'meal')

dfg['category'] = dfg['category'].replace(['Gift'],
                                            'gift')

dfg['category'] = dfg['category'].replace(['Travel and Lodging'],
                                            'travel')

dfg['category'] = dfg['category'].replace(['Charitable Contribution',
                                             'Entertainment',
                                             'Grant'],
                                            'other')

dfg['category'] = dfg['category'].replace(['Current or prospective ownership or investment interest'],
                                            'c_ownership')

dfg['category'] = dfg['category'].replace(['Royalty or License'],
                                            'royalty')

dfg['category'] = dfg['category'].replace(['Long term medical supply or device loan'],
                                            'device_loan')

dfgl2 = dfg.query('category != "Acquisitions" & category != "Debt forgiveness" & category != "device_loan"')

dfg_wide_monthl = dfgl2.pivot_table(index ='Covered_Recipient_Profile_ID', 
                                   columns = ['category','cmonth'], 
                                  values= 'Total_Amount_of_Payment_USDollars', 
                                   aggfunc = ['sum'] ).fillna(0)

dfg_wide_monthl.columns = dfg_wide_monthl.columns.droplevel(0)
dfg_wide_monthl.columns = dfg_wide_monthl.columns.map(lambda x: ''.join([str(i) for i in x]))
dfg_wide_monthl  = dfg_wide_monthl.reset_index()

In [None]:
dfg_wide_monthl.to_csv(path_out1 + "category_wide_2013-2021_full.csv", index=None)

In [None]:
dfg_wide_monthlc = dfgl2.pivot_table(index ='Covered_Recipient_Profile_ID', 
                                   columns = ['category','cmonth'], 
                                  values= 'Number_of_Payments_Included_in_Total_Amount', 
                                   aggfunc = ['sum'] ).fillna(0)

dfg_wide_monthlc.columns = dfg_wide_monthlc.columns.droplevel(0)
dfg_wide_monthlc.columns = dfg_wide_monthlc.columns.map(lambda x: ''.join([str(i) for i in x]))
dfg_wide_monthlc  = dfg_wide_monthlc.reset_index()

In [None]:
dfg_wide_monthlc.to_csv(path_out1 + "category_wide_2013-2021_full_case.csv", index=None)

In [None]:
dfg_limit = dfg.query('(Nature_of_Payment_or_Transfer_of_Value != "Acquisitions") \
& (Nature_of_Payment_or_Transfer_of_Value != "Current or prospective ownership or investment interest") & \
(Nature_of_Payment_or_Transfer_of_Value != "Debt forgiveness") & \
 (Nature_of_Payment_or_Transfer_of_Value != "Long term medical supply or device loan" ) & \
 (Nature_of_Payment_or_Transfer_of_Value != "Royalty or License") ')

dfg_wide_month_limit = dfg_limit.pivot_table(index ='Covered_Recipient_Profile_ID', 
                                             columns = 'cmonth',
                                             values= 'Total_Amount_of_Payment_USDollars',
                                             aggfunc = ['sum'] ).fillna(0)

dfg_wide_month_limit.columns = dfg_wide_month_limit.columns.droplevel(0)
dfg_wide_month_limit  = dfg_wide_month_limit.rename_axis(None, axis=1)
dfg_wide_month_limit  = dfg_wide_month_limit.reset_index()
dfg_wide_month_limit.columns

cmonth_list = list(dfg_wide_month_limit.iloc[:, 1:].columns)
dfg_long_limit = pd.melt(dfg_wide_month_limit, id_vars= 'Covered_Recipient_Profile_ID', 
                  value_vars= cmonth_list,
                  value_name='pay', ignore_index=False)
dfg_long_limit.rename({'variable': 'cmonth', 
                       'Covered_Recipient_Profile_ID': 'id'}, axis=1, inplace=True)

dfg_long_limit.to_csv(path_out1 + "general_long_2013-2021_limit.csv", index=None)
del dfg_long_limit
del dfg_wide_month_limit

In [None]:
dfg_wide_monthl.index = dfg_wide_monthl['Covered_Recipient_Profile_ID']
dfg_wide_monthl.drop('Covered_Recipient_Profile_ID', axis=1, inplace=True)
dfg_wide_monthl

Unnamed: 0_level_0,c_ownership10,c_ownership77,c_ownership82,c_ownership94,cme_speaking1,cme_speaking2,cme_speaking3,cme_speaking4,cme_speaking5,cme_speaking7,...,travel92,travel93,travel94,travel95,travel96,travel97,travel98,travel99,travel100,travel101
Covered_Recipient_Profile_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
64.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
196.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
207.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8798837.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8800626.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9676379.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10529468.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
del dfg_wide_monthlc
del dfgl2

## Make pivot tables by physician profile ID

### general payments pivot by physician profile ID

In [None]:
df_general_pivot = dfg.pivot_table(index ='Covered_Recipient_Profile_ID', 
                                   columns = 'Program_Year', 
                                  values= 'Total_Amount_of_Payment_USDollars', 
                                   aggfunc = ['sum'] ).fillna(0)
df_general_pivot['id', 'covered_recipient_profile_id'] = df_general_pivot.index

df_general_pivot = df_general_pivot.reset_index(level=0, drop=True)
df_general_pivot.columns = df_general_pivot.columns.droplevel(0)
df_general_pivot = df_general_pivot.reset_index()
df_general_pivot.drop('index', axis=1, inplace=True)

df_general_pivot.rename(columns = {2013:'general2013', 
                                   2014:'general2014',
                                   2015:'general2015',
                                   2016:'general2016',
                                   2017:'general2017',
                                   2018:'general2018',
                                   2019:'general2019',
                                   2020:'general2020',
                                   2021:'general2021'}
                        , inplace = True)

df_general_pivot.index = df_general_pivot['covered_recipient_profile_id']
df_general_pivot.drop('covered_recipient_profile_id', axis=1, inplace=True)
df_general_pivot

Program_Year,general2013,general2014,general2015,general2016,general2017,general2018,general2019,general2020,general2021
covered_recipient_profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
4.0,170.12,77.38,295.25,265.49,75.27,158.68,0.00,12.29,26.87
39.0,144.29,4354.38,47.32,22.43,0.00,0.00,140.22,0.00,0.00
64.0,120.59,443.61,1247.85,772.65,37.65,0.00,0.00,0.00,0.00
196.0,2751.44,3394.84,4018.80,4946.66,4374.20,4279.13,5166.89,1025.59,2455.94
207.0,13288.45,26061.42,32002.91,5556.34,936.98,15.96,143.58,0.00,238.51
...,...,...,...,...,...,...,...,...,...
8798837.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,32.27,628.61
8800626.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,355.25,129.53
9676379.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,602.60
10529468.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,131.03


#### save pivot table

In [None]:
df_general_pivot["index"] = df_general_pivot.index

#### Combine pivot table with the NPI profile dataset

In [None]:
df_id.index = df_id['covered_recipient_profile_id']
df_id2 = pd.concat([df_id[['covered_recipient_profile_type', 
                           'covered_recipient_npi', 
                           'covered_recipient_profile_primary_specialty', 
                           'has_multiple_ids']], df_general_pivot],
                        axis = 1)
df_id2

Unnamed: 0_level_0,covered_recipient_profile_type,covered_recipient_npi,covered_recipient_profile_primary_specialty,has_multiple_ids,general2013,general2014,general2015,general2016,general2017,general2018,general2019,general2020,general2021,index
covered_recipient_profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
4.0,Covered Recipient Physician,1003024811,Allopathic & Osteopathic Physicians|Internal M...,False,170.12,77.38,295.25,265.49,75.27,158.68,0.00,12.29,26.87,4.0
39.0,Covered Recipient Physician,1003042169,Allopathic & Osteopathic Physicians|Internal M...,False,144.29,4354.38,47.32,22.43,0.00,0.00,140.22,0.00,0.00,39.0
64.0,Covered Recipient Physician,1003081944,Allopathic & Osteopathic Physicians|Internal M...,False,120.59,443.61,1247.85,772.65,37.65,0.00,0.00,0.00,0.00,64.0
196.0,Covered Recipient Physician,1003832437,Allopathic & Osteopathic Physicians|Internal M...,False,2751.44,3394.84,4018.80,4946.66,4374.20,4279.13,5166.89,1025.59,2455.94,196.0
207.0,Covered Recipient Physician,1003839101,Allopathic & Osteopathic Physicians|Internal M...,False,13288.45,26061.42,32002.91,5556.34,936.98,15.96,143.58,0.00,238.51,207.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8798837.0,Covered Recipient Physician,1376719617,Allopathic & Osteopathic Physicians|Internal M...,False,0.00,0.00,0.00,0.00,0.00,0.00,0.00,32.27,628.61,8798837.0
8800626.0,Covered Recipient Physician,1427076199,,False,0.00,0.00,0.00,0.00,0.00,0.00,0.00,355.25,129.53,8800626.0
9676379.0,Covered Recipient Physician/Covered Recipient ...,1659516615,Allopathic & Osteopathic Physicians|Internal M...,False,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,602.60,9676379.0
10529468.0,Covered Recipient Physician/Covered Recipient ...,1881642007,Allopathic & Osteopathic Physicians|Internal M...,False,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,131.03,10529468.0


In [None]:
df_general_pivot = dfg.pivot_table(index ='Covered_Recipient_Profile_ID', 
                                   columns = 'Program_Year', 
                                  values= 'Number_of_Payments_Included_in_Total_Amount', 
                                   aggfunc = ['sum'] ).fillna(0)
df_general_pivot['id', 'covered_recipient_profile_id'] = df_general_pivot.index

df_general_pivot = df_general_pivot.reset_index(level=0, drop=True)
df_general_pivot.columns = df_general_pivot.columns.droplevel(0)
df_general_pivot = df_general_pivot.reset_index()
df_general_pivot.drop('index', axis=1, inplace=True)

df_general_pivot.rename(columns = {2013:'generalc2013', 
                                   2014:'generalc2014',
                                   2015:'generalc2015',
                                   2016:'generalc2016',
                                   2017:'generalc2017',
                                   2018:'generalc2018',
                                   2019:'generalc2019',
                                   2020:'generalc2020',
                                   2021:'generalc2021'}
                        , inplace = True)

df_general_pivot.index = df_general_pivot['covered_recipient_profile_id']
df_general_pivot.drop('covered_recipient_profile_id', axis=1, inplace=True)


df_general_pivot["index"] = df_general_pivot.index

df_id2 = pd.concat([df_id2, df_general_pivot],
                        axis = 1)
df_id2

Unnamed: 0_level_0,covered_recipient_profile_type,covered_recipient_npi,covered_recipient_profile_primary_specialty,has_multiple_ids,general2013,general2014,general2015,general2016,general2017,general2018,...,generalc2013,generalc2014,generalc2015,generalc2016,generalc2017,generalc2018,generalc2019,generalc2020,generalc2021,index
covered_recipient_profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4.0,Covered Recipient Physician,1003024811,Allopathic & Osteopathic Physicians|Internal M...,False,170.12,77.38,295.25,265.49,75.27,158.68,...,11.0,6.0,8.0,6.0,1.0,6.0,0.0,1.0,2.0,4.0
39.0,Covered Recipient Physician,1003042169,Allopathic & Osteopathic Physicians|Internal M...,False,144.29,4354.38,47.32,22.43,0.00,0.00,...,1.0,5.0,2.0,1.0,0.0,0.0,2.0,0.0,0.0,39.0
64.0,Covered Recipient Physician,1003081944,Allopathic & Osteopathic Physicians|Internal M...,False,120.59,443.61,1247.85,772.65,37.65,0.00,...,7.0,29.0,61.0,26.0,1.0,0.0,0.0,0.0,0.0,64.0
196.0,Covered Recipient Physician,1003832437,Allopathic & Osteopathic Physicians|Internal M...,False,2751.44,3394.84,4018.80,4946.66,4374.20,4279.13,...,74.0,174.0,166.0,239.0,244.0,232.0,187.0,53.0,101.0,196.0
207.0,Covered Recipient Physician,1003839101,Allopathic & Osteopathic Physicians|Internal M...,False,13288.45,26061.42,32002.91,5556.34,936.98,15.96,...,42.0,60.0,46.0,32.0,20.0,1.0,3.0,0.0,2.0,207.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8798837.0,Covered Recipient Physician,1376719617,Allopathic & Osteopathic Physicians|Internal M...,False,0.00,0.00,0.00,0.00,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,18.0,8798837.0
8800626.0,Covered Recipient Physician,1427076199,,False,0.00,0.00,0.00,0.00,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,8.0,8800626.0
9676379.0,Covered Recipient Physician/Covered Recipient ...,1659516615,Allopathic & Osteopathic Physicians|Internal M...,False,0.00,0.00,0.00,0.00,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,9676379.0
10529468.0,Covered Recipient Physician/Covered Recipient ...,1881642007,Allopathic & Osteopathic Physicians|Internal M...,False,0.00,0.00,0.00,0.00,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,10529468.0


### Payment categories

In [None]:
df_general_pivotc = dfg.pivot_table(index ='Covered_Recipient_Profile_ID', 
                                   columns = ['category',  'Program_Year'], 
                                  values= 'Total_Amount_of_Payment_USDollars', 
                                   aggfunc = ['sum'] ).fillna(0)
df_general_pivotc['index', 'index', 'covered_recipient_profile_id'] = df_general_pivotc.index


df_general_pivotc.columns = df_general_pivotc.columns.droplevel(0)
df_general_pivotc.columns = df_general_pivotc.columns.map(lambda x: ''.join([str(i) for i in x]))
df_general_pivotc  = df_general_pivotc.reset_index()


df_general_pivotc.drop('indexcovered_recipient_profile_id', axis=1, inplace=True)

df_general_pivotc.index = df_general_pivotc['Covered_Recipient_Profile_ID']
df_general_pivotc.drop('Covered_Recipient_Profile_ID', axis=1, inplace=True)

df_general_pivotc

Unnamed: 0_level_0,Acquisitions2021,Debt forgiveness2021,c_ownership2014,c_ownership2019,c_ownership2020,c_ownership2021,cme_speaking2013,cme_speaking2014,cme_speaking2015,cme_speaking2016,...,royalty2021,travel2013,travel2014,travel2015,travel2016,travel2017,travel2018,travel2019,travel2020,travel2021
Covered_Recipient_Profile_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0
39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,3996.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0
64.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0
196.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,25.00,0.00,0.00,0.0,0.0,0.0,46.0,0.0,0.0
207.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,816.72,2485.27,1303.67,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8798837.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0
8800626.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0
9676379.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0
10529468.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
df_id2 = pd.concat([df_id2, df_general_pivotc],
                        axis = 1)
df_id2

Unnamed: 0,covered_recipient_profile_type,covered_recipient_npi,covered_recipient_profile_primary_specialty,has_multiple_ids,general2013,general2014,general2015,general2016,general2017,general2018,...,royalty2021,travel2013,travel2014,travel2015,travel2016,travel2017,travel2018,travel2019,travel2020,travel2021
4.0,Covered Recipient Physician,1003024811,Allopathic & Osteopathic Physicians|Internal M...,False,170.12,77.38,295.25,265.49,75.27,158.68,...,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0
39.0,Covered Recipient Physician,1003042169,Allopathic & Osteopathic Physicians|Internal M...,False,144.29,4354.38,47.32,22.43,0.00,0.00,...,0.0,0.00,3996.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0
64.0,Covered Recipient Physician,1003081944,Allopathic & Osteopathic Physicians|Internal M...,False,120.59,443.61,1247.85,772.65,37.65,0.00,...,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0
196.0,Covered Recipient Physician,1003832437,Allopathic & Osteopathic Physicians|Internal M...,False,2751.44,3394.84,4018.80,4946.66,4374.20,4279.13,...,0.0,25.00,0.00,0.00,0.0,0.0,0.0,46.0,0.0,0.0
207.0,Covered Recipient Physician,1003839101,Allopathic & Osteopathic Physicians|Internal M...,False,13288.45,26061.42,32002.91,5556.34,936.98,15.96,...,0.0,816.72,2485.27,1303.67,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8798837.0,Covered Recipient Physician,1376719617,Allopathic & Osteopathic Physicians|Internal M...,False,0.00,0.00,0.00,0.00,0.00,0.00,...,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0
8800626.0,Covered Recipient Physician,1427076199,,False,0.00,0.00,0.00,0.00,0.00,0.00,...,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0
9676379.0,Covered Recipient Physician/Covered Recipient ...,1659516615,Allopathic & Osteopathic Physicians|Internal M...,False,0.00,0.00,0.00,0.00,0.00,0.00,...,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0
10529468.0,Covered Recipient Physician/Covered Recipient ...,1881642007,Allopathic & Osteopathic Physicians|Internal M...,False,0.00,0.00,0.00,0.00,0.00,0.00,...,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0


### General payments excluding 2021 categories and ownership

In [None]:
df_general_pivotl = dfg.query('(Nature_of_Payment_or_Transfer_of_Value != "Acquisitions") \
& (Nature_of_Payment_or_Transfer_of_Value != "Current or prospective ownership or investment interest") & \
(Nature_of_Payment_or_Transfer_of_Value != "Debt forgiveness") & \
 (Nature_of_Payment_or_Transfer_of_Value != "Long term medical supply or device loan" ) & \
 (Nature_of_Payment_or_Transfer_of_Value != "Royalty or License") ').pivot_table(index ='Covered_Recipient_Profile_ID', 
                                   columns = 'Program_Year', 
                                  values= 'Total_Amount_of_Payment_USDollars', 
                                   aggfunc = ['sum'] ).fillna(0)
df_general_pivotl['id', 'covered_recipient_profile_id'] = df_general_pivotl.index

df_general_pivotl = df_general_pivotl.reset_index(level=0, drop=True)
df_general_pivotl.columns = df_general_pivotl.columns.droplevel(0)
df_general_pivotl = df_general_pivotl.reset_index()
df_general_pivotl.drop('index', axis=1, inplace=True)

df_general_pivotl.rename(columns = {2013:'generall2013', 
                                   2014:'generall2014',
                                   2015:'generall2015',
                                   2016:'generall2016',
                                   2017:'generall2017',
                                   2018:'generall2018',
                                   2019:'generall2019',
                                   2020:'generall2020',
                                   2021:'generall2021'}
                        , inplace = True)

df_general_pivotl.index = df_general_pivotl['covered_recipient_profile_id']
df_general_pivotl.drop('covered_recipient_profile_id', axis=1, inplace=True)

df_id2 = pd.concat([df_id2, df_general_pivotl],
                        axis = 1)
df_id2

Unnamed: 0,covered_recipient_profile_type,covered_recipient_npi,covered_recipient_profile_primary_specialty,has_multiple_ids,general2013,general2014,general2015,general2016,general2017,general2018,...,travel2021,generall2013,generall2014,generall2015,generall2016,generall2017,generall2018,generall2019,generall2020,generall2021
4.0,Covered Recipient Physician,1003024811,Allopathic & Osteopathic Physicians|Internal M...,False,170.12,77.38,295.25,265.49,75.27,158.68,...,0.0,170.12,77.38,295.25,265.49,75.27,158.68,0.00,12.29,26.87
39.0,Covered Recipient Physician,1003042169,Allopathic & Osteopathic Physicians|Internal M...,False,144.29,4354.38,47.32,22.43,0.00,0.00,...,0.0,144.29,4354.38,47.32,22.43,0.00,0.00,140.22,0.00,0.00
64.0,Covered Recipient Physician,1003081944,Allopathic & Osteopathic Physicians|Internal M...,False,120.59,443.61,1247.85,772.65,37.65,0.00,...,0.0,120.59,443.61,1247.85,772.65,37.65,0.00,0.00,0.00,0.00
196.0,Covered Recipient Physician,1003832437,Allopathic & Osteopathic Physicians|Internal M...,False,2751.44,3394.84,4018.80,4946.66,4374.20,4279.13,...,0.0,2751.44,3394.84,4018.80,4946.66,4374.20,4279.13,5166.89,1025.59,2455.94
207.0,Covered Recipient Physician,1003839101,Allopathic & Osteopathic Physicians|Internal M...,False,13288.45,26061.42,32002.91,5556.34,936.98,15.96,...,0.0,13288.45,26061.42,32002.91,4472.84,936.98,15.96,143.58,0.00,238.51
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8798837.0,Covered Recipient Physician,1376719617,Allopathic & Osteopathic Physicians|Internal M...,False,0.00,0.00,0.00,0.00,0.00,0.00,...,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,32.27,628.61
8800626.0,Covered Recipient Physician,1427076199,,False,0.00,0.00,0.00,0.00,0.00,0.00,...,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,355.25,129.53
9676379.0,Covered Recipient Physician/Covered Recipient ...,1659516615,Allopathic & Osteopathic Physicians|Internal M...,False,0.00,0.00,0.00,0.00,0.00,0.00,...,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,602.60
10529468.0,Covered Recipient Physician/Covered Recipient ...,1881642007,Allopathic & Osteopathic Physicians|Internal M...,False,0.00,0.00,0.00,0.00,0.00,0.00,...,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,131.03


### Associate research payments pivot by physician profile ID

In [None]:
dfr['Total_Amount_of_Payment_USDollars'].sum()

1131152705.74

In [None]:
dfr['Total_Amount_of_Payment_USDollars'].sum()

1131152705.74

In [None]:
dfr['per_payment'].sort_values(ascending=False).sum()

1102661688.5876803

In [None]:
dfr['per_payment'].sort_values(ascending=False).sum()

1102661688.5876803

In [None]:
df_research_pivot1 = dfr.pivot_table(index ='Principal_Investigator_1_Profile_ID', 
                                   columns = 'Program_Year',                                   
                                   values= 'per_payment', 
                                   aggfunc = ['sum']).fillna(0)

df_research_pivot2 = dfr.pivot_table(index ='Principal_Investigator_2_Profile_ID', 
                                   columns = 'Program_Year', 
                                  values= 'per_payment', 
                                   aggfunc = ['sum'] ).fillna(0)

df_research_pivot3 = dfr.pivot_table(index ='Principal_Investigator_3_Profile_ID', 
                                   columns = 'Program_Year', 
                                  values= 'per_payment', 
                                   aggfunc = ['sum']).fillna(0)

df_research_pivot4 = dfr.pivot_table(index ='Principal_Investigator_4_Profile_ID', 
                                   columns = 'Program_Year', 
                                  values= 'per_payment', 
                                   aggfunc = ['sum']).fillna(0)

df_research_pivot5 = dfr.pivot_table(index ='Principal_Investigator_5_Profile_ID', 
                                   columns = 'Program_Year', 
                                  values= 'per_payment', 
                                   aggfunc =['sum']).fillna(0)


In [None]:
df_research_pivot = pd.concat([df_research_pivot1, 
                               df_research_pivot2, 
                               df_research_pivot3, 
                               df_research_pivot4, 
                               df_research_pivot5], axis = 0).fillna(0)
df_research_pivot['id', 'covered_recipient_profile_id'] = df_research_pivot.index     
df_research_pivot = df_research_pivot.reset_index(level=0, drop=True)
df_research_pivot.columns = df_research_pivot.columns.droplevel(0)
df_research_pivot = df_research_pivot.reset_index()
df_research_pivot.drop('index', axis=1, inplace=True)
df_research_pivot.columns
df_research_pivot.rename(columns = {2013:'research2013', 
                                   2014:'research2014',
                                   2015:'research2015',
                                   2016:'research2016',
                                   2017:'research2017',
                                   2018:'research2018',
                                   2019:'research2019',
                                   2020:'research2020',
                                   2021:'research2021'}, inplace = True)
df_research_pivot.sort_values('covered_recipient_profile_id')

Program_Year,research2013,research2014,research2015,research2016,research2017,research2018,research2019,research2020,research2021,covered_recipient_profile_id
1271,3.829626e+07,1.193706e+08,1.425446e+08,1.396515e+08,1.421175e+08,1.274403e+08,1.436919e+08,1.094699e+08,1.219624e+08,0.0
1408,3.862584e+07,1.212054e+08,1.432727e+08,1.401729e+08,1.422244e+08,1.274563e+08,1.456271e+08,1.156599e+08,1.229396e+08,0.0
1449,3.972785e+07,1.221047e+08,1.434755e+08,1.404932e+08,1.424834e+08,1.274563e+08,1.456271e+08,1.156599e+08,1.229396e+08,0.0
1466,3.974317e+07,1.225513e+08,1.434755e+08,1.405230e+08,1.424834e+08,1.274563e+08,1.456271e+08,1.156599e+08,1.229396e+08,0.0
0,0.000000e+00,0.000000e+00,3.079630e+05,1.613939e+06,1.305000e+06,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,567.0
...,...,...,...,...,...,...,...,...,...,...
1266,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,2.483136e+04,0.000000e+00,4211337.0
1267,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,4.018900e+03,4216790.0
1268,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,2.400000e+04,1.631900e+04,0.000000e+00,4216796.0
1269,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1.050837e+04,5691235.0


In [None]:
df_research_pivot2 = df_research_pivot.pivot_table(index ='covered_recipient_profile_id', 
                                                   values= ['research2013',
                                                            'research2014', 
                                                            'research2015', 
                                                            'research2016', 
                                                            'research2017', 
                                                            'research2018', 
                                                            'research2019', 
                                                            'research2020', 
                                                            'research2021'], 
                                                    aggfunc = ['sum'] ).fillna(0)

df_research_pivot2['id', 'covered_recipient_profile_id'] = df_research_pivot2.index

df_research_pivot2 = df_research_pivot2.reset_index(level=0, drop=True)
df_research_pivot2.columns = df_research_pivot2.columns.droplevel(0)
df_research_pivot2 = df_research_pivot2.reset_index()
df_research_pivot2.drop('index', axis=1, inplace=True)
df_research_pivot2.index = df_research_pivot2['covered_recipient_profile_id']

df_research_pivot2

Program_Year,research2013,research2014,research2015,research2016,research2017,research2018,research2019,research2020,research2021,covered_recipient_profile_id
covered_recipient_profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0.0,1.563931e+08,4.852321e+08,5.727682e+08,5.608405e+08,5.693086e+08,5.098092e+08,5.805732e+08,4.564496e+08,4.907812e+08,0.0
567.0,0.000000e+00,0.000000e+00,3.079630e+05,1.613939e+06,1.305000e+06,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,567.0
940.0,2.992125e+05,3.354910e+05,2.570907e+05,5.762005e+05,1.336714e+05,1.691239e+05,3.749398e+05,7.109840e+05,7.204339e+04,940.0
1794.0,1.664739e+04,2.564658e+05,2.607976e+05,3.131288e+05,6.410161e+05,4.108642e+05,2.887836e+05,1.066869e+05,6.060949e+04,1794.0
2671.0,0.000000e+00,2.195947e+05,1.336725e+05,4.671556e+04,1.995089e+05,1.753295e+05,2.389084e+04,6.053570e+03,0.000000e+00,2671.0
...,...,...,...,...,...,...,...,...,...,...
4211337.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,2.483136e+04,0.000000e+00,4211337.0
4216790.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,4.018900e+03,4216790.0
4216796.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,2.400000e+04,1.631900e+04,0.000000e+00,4216796.0
5691235.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1.050837e+04,5691235.0


In [None]:
df_research_pivot2 = df_research_pivot2[df_research_pivot2["covered_recipient_profile_id"].isin(list_id)] 
df_research_pivot2.drop('covered_recipient_profile_id', axis=1, inplace=True)
df_research_pivot2

df_research_pivot2.to_csv(path_out2 + "pivot2013-2021_extract.csv", index=None)
df_research_pivot2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Program_Year,research2013,research2014,research2015,research2016,research2017,research2018,research2019,research2020,research2021
covered_recipient_profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
567.0,0.00,0.00,307963.00,1613939.00,1305000.00,0.00,0.00,0.00,0.00
940.0,299212.46,335490.95,257090.70,576200.47,133671.41,169123.88,374939.84,710984.03,72043.39
1794.0,16647.39,256465.81,260797.57,313128.85,641016.10,410864.22,288783.56,106686.91,60609.49
2671.0,0.00,219594.70,133672.55,46715.56,199508.93,175329.54,23890.84,6053.57,0.00
2777.0,0.00,1779.04,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...
4211337.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,24831.36,0.00
4216790.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,4018.90
4216796.0,0.00,0.00,0.00,0.00,0.00,0.00,24000.00,16319.00,0.00
5691235.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10508.37


In [None]:
df_id3 = pd.concat([df_id2, df_research_pivot2],
                        axis = 1).fillna(0)

### Direct research payments

In [None]:
df_dresearch_pivot = dfrd.pivot_table(index ='Covered_Recipient_Profile_ID', 
                                      columns = 'Program_Year', 
                                      values= 'Total_Amount_of_Payment_USDollars', 
                                      aggfunc = ['sum'] ).fillna(0)
df_dresearch_pivot['id', 'covered_recipient_profile_id'] = df_dresearch_pivot.index

df_dresearch_pivot = df_dresearch_pivot.reset_index(level=0, drop=True)
df_dresearch_pivot.columns = df_dresearch_pivot.columns.droplevel(0)
df_dresearch_pivot = df_dresearch_pivot.reset_index()
df_dresearch_pivot.drop('index', axis=1, inplace=True)

df_dresearch_pivot.rename(columns = {2013:'researchd2013', 
                                     2014:'researchd2014',
                                     2015:'researchd2015',
                                     2016:'researchd2016',
                                     2017:'researchd2017',
                                     2018:'researchd2018',
                                     2019:'researchd2019',
                                     2020:'researchd2020',
                                     2021:'researchd2021'}
                        , inplace = True)

df_dresearch_pivot.index = df_dresearch_pivot['covered_recipient_profile_id']
df_dresearch_pivot.drop('covered_recipient_profile_id', axis=1, inplace=True)

df_dresearch_pivot

Program_Year,researchd2013,researchd2014,researchd2015,researchd2016,researchd2017,researchd2018,researchd2019,researchd2020,researchd2021
covered_recipient_profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
940.0,0.0,0.0,0.00,0.00,0.00,0.00,0.00,15000.00,0.0
2671.0,0.0,0.0,0.00,1943.24,22273.27,47466.80,4529.00,0.00,0.0
2777.0,2000.0,0.0,3079.00,2638.40,8540.84,6231.45,508.58,0.00,0.0
3954.0,0.0,2600.0,36100.00,21557.00,0.00,0.00,0.00,0.00,0.0
4305.0,0.0,0.0,2300.22,5757.24,79.20,2995.29,0.00,0.00,0.0
...,...,...,...,...,...,...,...,...,...
1389637.0,0.0,0.0,0.00,1392.25,335.15,0.00,0.00,0.00,0.0
1820061.0,0.0,0.0,0.00,0.00,0.00,761.45,0.00,0.00,0.0
2782156.0,0.0,0.0,0.00,0.00,0.00,0.00,11275.92,24462.97,14963.2
2785202.0,0.0,0.0,0.00,0.00,0.00,0.00,829.61,0.00,0.0


In [None]:
df_id3 = pd.concat([df_id3, df_dresearch_pivot],
                   axis = 1).fillna(0)

In [None]:
df_id3

Unnamed: 0,covered_recipient_profile_type,covered_recipient_npi,covered_recipient_profile_primary_specialty,has_multiple_ids,general2013,general2014,general2015,general2016,general2017,general2018,...,research2021,researchd2013,researchd2014,researchd2015,researchd2016,researchd2017,researchd2018,researchd2019,researchd2020,researchd2021
4.0,Covered Recipient Physician,1003024811,Allopathic & Osteopathic Physicians|Internal M...,False,170.12,77.38,295.25,265.49,75.27,158.68,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39.0,Covered Recipient Physician,1003042169,Allopathic & Osteopathic Physicians|Internal M...,False,144.29,4354.38,47.32,22.43,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
64.0,Covered Recipient Physician,1003081944,Allopathic & Osteopathic Physicians|Internal M...,False,120.59,443.61,1247.85,772.65,37.65,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
196.0,Covered Recipient Physician,1003832437,Allopathic & Osteopathic Physicians|Internal M...,False,2751.44,3394.84,4018.80,4946.66,4374.20,4279.13,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
207.0,Covered Recipient Physician,1003839101,Allopathic & Osteopathic Physicians|Internal M...,False,13288.45,26061.42,32002.91,5556.34,936.98,15.96,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8798837.0,Covered Recipient Physician,1376719617,Allopathic & Osteopathic Physicians|Internal M...,False,0.00,0.00,0.00,0.00,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8800626.0,Covered Recipient Physician,1427076199,0,False,0.00,0.00,0.00,0.00,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9676379.0,Covered Recipient Physician/Covered Recipient ...,1659516615,Allopathic & Osteopathic Physicians|Internal M...,False,0.00,0.00,0.00,0.00,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10529468.0,Covered Recipient Physician/Covered Recipient ...,1881642007,Allopathic & Osteopathic Physicians|Internal M...,False,0.00,0.00,0.00,0.00,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Pivot table by physician profile ID for ownership payments

In [None]:
dfo.groupby(['Program_Year', 'Terms_of_Interest'])['Value_of_Interest'].agg(sum)

Program_Year  Terms_of_Interest                                                                                                                                                                                                                                                          
2013          Limited partnership interest                                                                                                                                                                                                                                                       653.83
              stock                                                                                                                                                                                                                                                                           945979.00
2014          Limited Partnership Interest                                                                                    

In [None]:
df_ownership_pivot = dfo.pivot_table(index ='Physician_Profile_ID', 
                                   columns = 'Program_Year', 
                                  values= 'Value_of_Interest', 
                                   aggfunc = ['sum']).fillna(0)
df_ownership_pivot["index"] = df_ownership_pivot.index

df_ownership_pivot['id', 'covered_recipient_profile_id'] = df_ownership_pivot.index

df_ownership_pivot = df_ownership_pivot.reset_index(level=0, drop=True)
df_ownership_pivot.columns = df_ownership_pivot.columns.droplevel(0)
df_ownership_pivot = df_ownership_pivot.reset_index()
df_ownership_pivot.drop('index', axis=1, inplace=True)

df_ownership_pivot.rename(columns = {2013:'ownership2013', 
                                   2014:'ownership2014',
                                   2015:'ownership2015',
                                   2016:'ownership2016',
                                   2017:'ownership2017',
                                   2018:'ownership2018',
                                   2019:'ownership2019',
                                   2020:'ownership2020',
                                   2021:'ownership2021'}
                        , inplace = True)
df_ownership_pivot.index =df_ownership_pivot['covered_recipient_profile_id']
df_ownership_pivot.drop('covered_recipient_profile_id', axis=1, inplace=True)

In [None]:
df_ownership_pivot

Program_Year,ownership2013,ownership2014,ownership2015,ownership2016,ownership2017,ownership2018,ownership2019,ownership2020,ownership2021,Unnamed: 10_level_0
covered_recipient_profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2777,0.0,0.0,0.0,4906.0,3572.0,5603.0,0.0,0.0,0.0,2777
67861,203.79,15.78,15.78,15.78,0.0,0.0,0.0,0.0,0.0,67861
134322,246.25,8.84,8.84,8.84,0.0,0.0,0.0,0.0,0.0,134322
191630,0.0,0.0,0.0,0.0,0.0,0.0,0.0,358750.0,0.0,191630
204852,0.0,15.78,15.78,15.78,0.0,0.0,0.0,0.0,0.0,204852
227831,203.79,15.78,15.78,15.78,0.0,0.0,0.0,0.0,0.0,227831
246797,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,250375.84,246797
251977,739381.0,1764115.0,1825064.0,1339154.0,1240585.28,0.0,0.0,0.0,0.0,251977
273589,0.0,15.78,15.78,15.78,0.0,0.0,0.0,0.0,0.0,273589
324225,0.0,15.78,15.78,15.78,0.0,0.0,0.0,0.0,0.0,324225


### Payments by month

In [None]:
dfg['category'] = dfg['Nature_of_Payment_or_Transfer_of_Value']

dfg['category'] = dfg['category'].replace(['Compensation for services other than consulting, including serving as faculty or as a speaker at a venue other than a continuing education program'] , 
                                            'non-cme_speakingm')

dfg['category'] = dfg['category'].replace(['Compensation for serving as faculty or as a speaker for a medical education program',
                                             'Compensation for serving as faculty or as a speaker for a non-accredited and noncertified continuing education program', 
                                             'Compensation for serving as faculty or as a speaker for an accredited or certified continuing education program'],
                                            'cme_speakingm')


dfg['category'] = dfg['category'].replace(['Consulting Fee'],
                                            'consultingm')

dfg['category'] = dfg['category'].replace(['Education'],
                                            'educationm')

dfg['category'] = dfg['category'].replace(['Honoraria'],
                                            'honorariam')

dfg['category'] = dfg['category'].replace(['Food and Beverage'],
                                            'mealm')

dfg['category'] = dfg['category'].replace(['Gift'],
                                            'giftm')

dfg['category'] = dfg['category'].replace(['Travel and Lodging'],
                                            'travelm')

dfg['category'] = dfg['category'].replace(['Charitable Contribution',
                                             'Entertainment',
                                             'Grant'],
                                            'otherm')

dfg['category'] = dfg['category'].replace(['Current or prospective ownership or investment interest'],
                                            'c_ownershipm')

dfg['category'] = dfg['category'].replace(['Royalty or License'],
                                            'royaltym')

dfg['category'] = dfg['category'].replace(['Long term medical supply or device loan'],
                                            'device_loanm')

dfgl2 = dfg.query('category != "Acquisitions" & category != "Debt forgiveness" & category != "device_loan"')

dfg_wide_monthl = dfgl2.pivot_table(index ='Covered_Recipient_Profile_ID', 
                                   columns = ['category','cmonth'], 
                                  values= 'Total_Amount_of_Payment_USDollars', 
                                   aggfunc = ['sum'] ).fillna(0)

dfg_wide_monthl.columns = dfg_wide_monthl.columns.droplevel(0)
dfg_wide_monthl.columns = dfg_wide_monthl.columns.map(lambda x: ''.join([str(i) for i in x]))
dfg_wide_monthl  = dfg_wide_monthl.reset_index()

dfg_wide_monthl.index = dfg_wide_monthl['Covered_Recipient_Profile_ID']
dfg_wide_monthl.drop('Covered_Recipient_Profile_ID', axis=1, inplace=True)
del dfgl2

dfg_wide_monthl

Unnamed: 0_level_0,c_ownershipm10,c_ownershipm77,c_ownershipm82,c_ownershipm94,cme_speakingm1,cme_speakingm2,cme_speakingm3,cme_speakingm4,cme_speakingm5,cme_speakingm7,...,travelm92,travelm93,travelm94,travelm95,travelm96,travelm97,travelm98,travelm99,travelm100,travelm101
Covered_Recipient_Profile_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
64.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
196.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
207.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8798837.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8800626.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9676379.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10529468.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
df_idm = pd.concat([df_id[['covered_recipient_profile_type', 
                           'covered_recipient_npi', 
                           'covered_recipient_profile_primary_specialty', 
                           'has_multiple_ids']], dfg_wide_monthl],
                        axis = 1)
df_idm

Unnamed: 0,covered_recipient_profile_type,covered_recipient_npi,covered_recipient_profile_primary_specialty,has_multiple_ids,c_ownershipm10,c_ownershipm77,c_ownershipm82,c_ownershipm94,cme_speakingm1,cme_speakingm2,...,travelm92,travelm93,travelm94,travelm95,travelm96,travelm97,travelm98,travelm99,travelm100,travelm101
4.0,Covered Recipient Physician,1003024811,Allopathic & Osteopathic Physicians|Internal M...,False,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39.0,Covered Recipient Physician,1003042169,Allopathic & Osteopathic Physicians|Internal M...,False,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
64.0,Covered Recipient Physician,1003081944,Allopathic & Osteopathic Physicians|Internal M...,False,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
196.0,Covered Recipient Physician,1003832437,Allopathic & Osteopathic Physicians|Internal M...,False,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
207.0,Covered Recipient Physician,1003839101,Allopathic & Osteopathic Physicians|Internal M...,False,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8798837.0,Covered Recipient Physician,1376719617,Allopathic & Osteopathic Physicians|Internal M...,False,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8800626.0,Covered Recipient Physician,1427076199,,False,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9676379.0,Covered Recipient Physician/Covered Recipient ...,1659516615,Allopathic & Osteopathic Physicians|Internal M...,False,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10529468.0,Covered Recipient Physician/Covered Recipient ...,1881642007,Allopathic & Osteopathic Physicians|Internal M...,False,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Save profile dataset

In [None]:
npi_specialty.drop(npi_specialty.loc[:,'Healthcare Provider Taxonomy Code_1' : 'Healthcare Provider Primary Taxonomy Switch_15'].columns
                   , axis=1, inplace=True)

In [None]:
df_id4 = pd.concat([df_id3, df_ownership_pivot],
                   axis = 1)

In [None]:
del df_id
del df_id2
del df_id3

In [None]:
df_id4['OPD_ID']= df_id4.index
df_id4.index= df_id4['covered_recipient_npi']
df_id4

Unnamed: 0_level_0,covered_recipient_profile_type,covered_recipient_npi,covered_recipient_profile_primary_specialty,has_multiple_ids,general2013,general2014,general2015,general2016,general2017,general2018,...,ownership2014,ownership2015,ownership2016,ownership2017,ownership2018,ownership2019,ownership2020,ownership2021,Unnamed: 20_level_0,OPD_ID
covered_recipient_npi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1003024811,Covered Recipient Physician,1003024811,Allopathic & Osteopathic Physicians|Internal M...,False,170.12,77.38,295.25,265.49,75.27,158.68,...,,,,,,,,,,4.0
1003042169,Covered Recipient Physician,1003042169,Allopathic & Osteopathic Physicians|Internal M...,False,144.29,4354.38,47.32,22.43,0.00,0.00,...,,,,,,,,,,39.0
1003081944,Covered Recipient Physician,1003081944,Allopathic & Osteopathic Physicians|Internal M...,False,120.59,443.61,1247.85,772.65,37.65,0.00,...,,,,,,,,,,64.0
1003832437,Covered Recipient Physician,1003832437,Allopathic & Osteopathic Physicians|Internal M...,False,2751.44,3394.84,4018.80,4946.66,4374.20,4279.13,...,,,,,,,,,,196.0
1003839101,Covered Recipient Physician,1003839101,Allopathic & Osteopathic Physicians|Internal M...,False,13288.45,26061.42,32002.91,5556.34,936.98,15.96,...,,,,,,,,,,207.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1376719617,Covered Recipient Physician,1376719617,Allopathic & Osteopathic Physicians|Internal M...,False,0.00,0.00,0.00,0.00,0.00,0.00,...,,,,,,,,,,8798837.0
1427076199,Covered Recipient Physician,1427076199,0,False,0.00,0.00,0.00,0.00,0.00,0.00,...,,,,,,,,,,8800626.0
1659516615,Covered Recipient Physician/Covered Recipient ...,1659516615,Allopathic & Osteopathic Physicians|Internal M...,False,0.00,0.00,0.00,0.00,0.00,0.00,...,,,,,,,,,,9676379.0
1881642007,Covered Recipient Physician/Covered Recipient ...,1881642007,Allopathic & Osteopathic Physicians|Internal M...,False,0.00,0.00,0.00,0.00,0.00,0.00,...,,,,,,,,,,10529468.0


In [None]:
npi_specialty.index= npi_specialty['NPI']
df_npi_merge = pd.concat([npi_specialty, df_id4],
                         axis = 1).fillna(0)
df_npi_merge

Unnamed: 0,NPI,Entity Type Code,Replacement NPI,Provider Organization Name (Legal Business Name),Provider Last Name (Legal Name),Provider First Name,Provider Middle Name,Provider Other Organization Name,Provider Other Organization Name Type Code,Provider Other Last Name,...,ownership2014,ownership2015,ownership2016,ownership2017,ownership2018,ownership2019,ownership2020,ownership2021,Unnamed: 20,OPD_ID
1003002049,1003002049,1.0,0.0,0.0,SRINIVASAN,LAKSHMI,0,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1046622.0
1003010943,1003010943,1.0,0.0,0.0,KOSHY,ANOOPA,A.,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1292351.0
1003011800,1003011800,1.0,0.0,0.0,BLOMEIER,HERMAN,0,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,363237.0
1003013673,1003013673,1.0,0.0,0.0,KRETCHMAN,ERICA,MICHELLE,0.0,0.0,WESTLEY,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,136033.0
1003014705,1003014705,1.0,0.0,0.0,JINDAL,ANKUR,0,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,90602.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1992980619,1992980619,1.0,0.0,0.0,GO,CHRISTINA,GRACE,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,363145.0
1992988935,1992988935,1.0,0.0,0.0,OMRY ORBACH,GAL,0,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,181448.0
1992992895,1992992895,1.0,0.0,0.0,HAWKINS,KARA,BRENN,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,363158.0
1992996375,1992996375,1.0,0.0,0.0,CHONGKRAIRATANAKUL,NATSURANG,0,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,45154.0


In [None]:
del df_id4

In [None]:
df_idm['OPD_ID']= df_idm.index
df_idm.index= df_idm['covered_recipient_npi']

df_npi_mergem = pd.concat([npi_specialty, df_idm],
                          axis = 1)
df_npi_mergem

Unnamed: 0,NPI,Entity Type Code,Replacement NPI,Provider Organization Name (Legal Business Name),Provider Last Name (Legal Name),Provider First Name,Provider Middle Name,Provider Other Organization Name,Provider Other Organization Name Type Code,Provider Other Last Name,...,travelm93,travelm94,travelm95,travelm96,travelm97,travelm98,travelm99,travelm100,travelm101,OPD_ID
1003002049,1003002049,1.0,,,SRINIVASAN,LAKSHMI,,,,,...,0.0,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.00,1046622.0
1003010943,1003010943,1.0,,,KOSHY,ANOOPA,A.,,,,...,0.0,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.00,1292351.0
1003011800,1003011800,1.0,,,BLOMEIER,HERMAN,,,,,...,0.0,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.00,363237.0
1003013673,1003013673,1.0,,,KRETCHMAN,ERICA,MICHELLE,,,WESTLEY,...,0.0,2958.68,220.0,0.0,2616.93,2257.22,1833.68,2045.47,2495.35,136033.0
1003014705,1003014705,1.0,,,JINDAL,ANKUR,,,,,...,0.0,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.00,90602.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1992980619,1992980619,1.0,,,GO,CHRISTINA,GRACE,,,,...,0.0,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.00,363145.0
1992988935,1992988935,1.0,,,OMRY ORBACH,GAL,,,,,...,0.0,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.00,181448.0
1992992895,1992992895,1.0,,,HAWKINS,KARA,BRENN,,,,...,0.0,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.00,363158.0
1992996375,1992996375,1.0,,,CHONGKRAIRATANAKUL,NATSURANG,,,,,...,0.0,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.00,45154.0


In [None]:
df_npi_mergem

Unnamed: 0,NPI,Entity Type Code,Replacement NPI,Provider Organization Name (Legal Business Name),Provider Last Name (Legal Name),Provider First Name,Provider Middle Name,Provider Other Organization Name,Provider Other Organization Name Type Code,Provider Other Last Name,...,travelm93,travelm94,travelm95,travelm96,travelm97,travelm98,travelm99,travelm100,travelm101,OPD_ID
1003002049,1003002049,1.0,,,SRINIVASAN,LAKSHMI,,,,,...,0.0,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.00,1046622.0
1003010943,1003010943,1.0,,,KOSHY,ANOOPA,A.,,,,...,0.0,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.00,1292351.0
1003011800,1003011800,1.0,,,BLOMEIER,HERMAN,,,,,...,0.0,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.00,363237.0
1003013673,1003013673,1.0,,,KRETCHMAN,ERICA,MICHELLE,,,WESTLEY,...,0.0,2958.68,220.0,0.0,2616.93,2257.22,1833.68,2045.47,2495.35,136033.0
1003014705,1003014705,1.0,,,JINDAL,ANKUR,,,,,...,0.0,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.00,90602.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1992980619,1992980619,1.0,,,GO,CHRISTINA,GRACE,,,,...,0.0,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.00,363145.0
1992988935,1992988935,1.0,,,OMRY ORBACH,GAL,,,,,...,0.0,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.00,181448.0
1992992895,1992992895,1.0,,,HAWKINS,KARA,BRENN,,,,...,0.0,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.00,363158.0
1992996375,1992996375,1.0,,,CHONGKRAIRATANAKUL,NATSURANG,,,,,...,0.0,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.00,45154.0


In [None]:
del dfg_wide_monthl

In [None]:
pd.set_option('max_columns', 190)

In [None]:
df_npi_merge

Unnamed: 0,NPI,Entity Type Code,Replacement NPI,Provider Organization Name (Legal Business Name),Provider Last Name (Legal Name),Provider First Name,Provider Middle Name,Provider Other Organization Name,Provider Other Organization Name Type Code,Provider Other Last Name,Provider Other First Name,Provider Other Middle Name,Provider Business Practice Location Address City Name,Provider Business Practice Location Address State Name,Provider Business Practice Location Address Postal Code,Provider Business Practice Location Address Country Code (If outside U.S.),Provider Enumeration Date,Last Update Date,NPI Deactivation Reason Code,NPI Deactivation Date,NPI Reactivation Date,Provider Gender Code,Authorized Official Title or Position,Certification Date,month,day,year,cmonth,cla,spec,note,covered_recipient_profile_type,covered_recipient_npi,covered_recipient_profile_primary_specialty,has_multiple_ids,general2013,general2014,general2015,general2016,general2017,general2018,general2019,general2020,general2021,index,generalc2013,generalc2014,generalc2015,generalc2016,generalc2017,generalc2018,generalc2019,generalc2020,generalc2021,index.1,Acquisitions2021,Debt forgiveness2021,c_ownership2014,c_ownership2019,c_ownership2020,c_ownership2021,cme_speaking2013,cme_speaking2014,cme_speaking2015,cme_speaking2016,cme_speaking2017,cme_speaking2018,cme_speaking2019,cme_speaking2020,cme_speaking2021,consulting2013,consulting2014,consulting2015,consulting2016,consulting2017,consulting2018,consulting2019,consulting2020,consulting2021,device_loan2021,education2013,education2014,education2015,education2016,education2017,education2018,education2019,education2020,education2021,gift2013,gift2014,gift2015,gift2016,gift2017,gift2018,gift2019,gift2020,gift2021,honoraria2013,honoraria2014,honoraria2015,honoraria2016,honoraria2017,honoraria2018,honoraria2019,honoraria2020,honoraria2021,meal2013,meal2014,meal2015,meal2016,meal2017,meal2018,meal2019,meal2020,meal2021,non-cme_speaking2013,non-cme_speaking2014,non-cme_speaking2015,non-cme_speaking2016,non-cme_speaking2017,non-cme_speaking2018,non-cme_speaking2019,non-cme_speaking2020,non-cme_speaking2021,other2013,other2014,other2015,other2016,other2017,other2018,other2019,other2020,other2021,royalty2013,royalty2014,royalty2015,royalty2016,royalty2017,royalty2018,royalty2019,royalty2020,royalty2021,travel2013,travel2014,travel2015,travel2016,travel2017,travel2018,travel2019,travel2020,travel2021,generall2013,generall2014,generall2015,generall2016,generall2017,generall2018,generall2019,generall2020,generall2021,research2013,research2014,research2015,research2016,research2017,research2018,research2019,research2020,research2021,researchd2013,researchd2014,researchd2015,researchd2016,researchd2017,researchd2018,researchd2019,researchd2020,researchd2021,ownership2013,ownership2014,ownership2015,ownership2016,ownership2017,ownership2018,ownership2019,ownership2020,ownership2021,Unnamed: 189,OPD_ID
1003002049,1003002049,1.0,0.0,0.0,SRINIVASAN,LAKSHMI,0,0.0,0.0,0,0,0,FREMONT,CA,945382299,US,09/24/2007,05/27/2020,0.0,0,0,F,0.0,05/27/2020,9,24,2007,93,0.0,0.0,0.0,Covered Recipient Physician,1.003002e+09,Allopathic & Osteopathic Physicians|Internal M...,False,57.00,0.00,37.25,45.92,0.00,13.04,0.00,20.61,0.00,1046622.0,2.0,0.0,3.0,4.0,0.0,1.0,0.0,1.0,0.0,1046622.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,57.00,0.00,0.00,13.90,0.00,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,37.25,32.02,0.00,13.04,0.00,20.61,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,57.00,0.00,37.25,45.92,0.00,13.04,0.00,20.61,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1046622.0
1003010943,1003010943,1.0,0.0,0.0,KOSHY,ANOOPA,A.,0.0,0.0,0,0,0,CHICAGO,IL,606113370,US,06/13/2007,05/10/2018,0.0,0,0,F,0.0,0,6,13,2007,90,0.0,0.0,0.0,Covered Recipient Physician,1.003011e+09,Allopathic & Osteopathic Physicians|Internal M...,False,0.00,0.00,25.20,0.00,0.00,0.00,79.86,0.00,501.80,1292351.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,20.0,1292351.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,25.20,0.00,0.00,0.00,79.86,0.00,501.80,0.0,0.0,0.00,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,25.20,0.00,0.00,0.00,79.86,0.00,501.80,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1292351.0
1003011800,1003011800,1.0,0.0,0.0,BLOMEIER,HERMAN,0,0.0,0.0,0,0,0,SKOKIE,IL,600771057,US,06/20/2007,02/10/2021,0.0,0,0,M,0.0,02/10/2021,6,20,2007,90,0.0,0.0,0.0,Covered Recipient Physician,1.003012e+09,Allopathic & Osteopathic Physicians|Internal M...,False,1151.12,2731.51,117.88,0.00,0.00,18.92,0.00,101.53,0.00,363237.0,3.0,10.0,5.0,0.0,0.0,1.0,0.0,1.0,0.0,363237.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.00,15.00,30.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.12,434.75,87.88,0.00,0.00,18.92,0.00,101.53,0.00,1125.0,1125.0,0.00,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1156.76,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1151.12,2731.51,117.88,0.00,0.00,18.92,0.00,101.53,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,363237.0
1003013673,1003013673,1.0,0.0,0.0,KRETCHMAN,ERICA,MICHELLE,0.0,0.0,WESTLEY,ERICA,MICHELLE,RICHMOND,IN,473741155,US,06/28/2007,07/08/2022,0.0,0,0,F,0.0,07/08/2022,6,28,2007,90,0.0,0.0,0.0,Covered Recipient Physician,1.003014e+09,Allopathic & Osteopathic Physicians|Internal M...,False,454.39,3674.31,7440.92,20482.52,34545.67,57622.12,56531.08,90933.38,121285.44,136033.0,34.0,181.0,227.0,258.0,301.0,338.0,302.0,267.0,347.0,136033.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9641.25,8851.0,0.0,0.0,0.0,0.0,0.0,893.75,2925.0,0.0,0.0,0.0,35.88,80.80,728.36,42.94,29.02,29.35,14.35,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,418.51,2218.78,2555.84,2874.67,3085.59,3663.54,3751.76,3622.62,4115.33,0.0,0.0,3421.34,16492.46,30177.19,47206.89,44725.0,74713.5,93872.5,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1374.73,735.38,1072.45,1253.87,5828.59,5114.97,2956.01,14446.61,454.39,3674.31,7440.92,20482.52,34545.67,57622.12,56531.08,90933.38,121285.44,0.0,0.0,0.0,0.0,0.0,0.0,4000.0,6490.00,5160.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,136033.0
1003014705,1003014705,1.0,0.0,0.0,JINDAL,ANKUR,0,0.0,0.0,0,0,0,HUNTSVILLE,AL,358015134,US,07/05/2007,04/03/2017,0.0,0,0,M,0.0,0,7,5,2007,91,0.0,0.0,0.0,Covered Recipient Physician,1.003015e+09,Allopathic & Osteopathic Physicians|Internal M...,False,90.84,0.00,560.59,19.38,234.48,530.62,742.76,392.94,403.88,90602.0,2.0,0.0,6.0,1.0,16.0,36.0,51.0,27.0,29.0,90602.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.00,0.00,93.05,0.00,0.00,0.00,0.00,12.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,90.84,0.00,51.87,19.38,234.48,530.62,742.76,379.95,403.88,0.0,0.0,0.00,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,415.67,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,90.84,0.00,560.59,19.38,234.48,530.62,742.76,392.94,403.88,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,90602.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1992980619,1992980619,1.0,0.0,0.0,GO,CHRISTINA,GRACE,0.0,0.0,0,0,0,ARLINGTON,VA,222053683,US,12/31/2007,01/24/2014,0.0,0,0,F,0.0,0,12,31,2007,96,0.0,0.0,0.0,Covered Recipient Physician,1.992981e+09,Allopathic & Osteopathic Physicians|Internal M...,False,440.43,1584.34,1180.23,428.74,428.41,652.18,421.07,89.64,420.73,363145.0,30.0,80.0,46.0,26.0,23.0,35.0,24.0,5.0,22.0,363145.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,125.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,2.04,0.43,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,438.39,1583.91,1055.23,428.74,428.41,652.18,421.07,89.64,420.73,0.0,0.0,0.00,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,440.43,1584.34,1180.23,428.74,428.41,652.18,421.07,89.64,420.73,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,363145.0
1992988935,1992988935,1.0,0.0,0.0,OMRY ORBACH,GAL,0,0.0,0.0,0,0,0,SEATTLE,WA,981012756,US,12/13/2007,07/15/2013,0.0,0,0,F,0.0,0,12,13,2007,96,0.0,0.0,0.0,Covered Recipient Physician,1.992989e+09,Allopathic & Osteopathic Physicians|Internal M...,False,40.55,0.00,160.65,0.00,0.00,0.00,0.00,0.00,0.00,181448.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,181448.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.55,0.00,160.65,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,40.55,0.00,160.65,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,181448.0
1992992895,1992992895,1.0,0.0,0.0,HAWKINS,KARA,BRENN,0.0,0.0,0,0,0,CHARLOTTESVILLE,VA,22911,US,09/25/2007,07/03/2018,0.0,0,0,F,0.0,0,9,25,2007,93,0.0,0.0,0.0,Covered Recipient Physician,1.992993e+09,Allopathic & Osteopathic Physicians|Internal M...,False,10.92,0.00,104.91,156.42,256.04,977.81,336.64,6125.69,2040.00,363158.0,1.0,0.0,5.0,5.0,5.0,9.0,5.0,6.0,2.0,363158.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,5780.0,2040.0,0.0,0.00,0.00,15.21,32.27,6.61,0.00,0.00,240.54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.92,0.00,89.70,124.15,249.43,293.18,336.64,105.15,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,684.63,0.00,0.00,0.00,10.92,0.00,104.91,156.42,256.04,977.81,336.64,6125.69,2040.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,363158.0
1992996375,1992996375,1.0,0.0,0.0,CHONGKRAIRATANAKUL,NATSURANG,0,0.0,0.0,0,0,0,ESCONDIDO,CA,920256428,US,08/09/2007,04/16/2021,0.0,0,0,F,0.0,04/16/2021,8,9,2007,92,0.0,0.0,0.0,Covered Recipient Physician,1.992996e+09,Allopathic & Osteopathic Physicians|Internal M...,False,190.16,1590.14,659.30,575.34,252.11,23.54,309.33,69.81,0.00,45154.0,4.0,36.0,31.0,35.0,9.0,1.0,15.0,4.0,0.0,45154.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,84.15,0.00,103.99,0.00,229.18,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,106.01,581.41,555.31,575.34,22.93,23.54,309.33,69.81,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1008.73,0.00,0.00,0.00,0.00,0.00,0.00,0.00,190.16,1590.14,659.30,575.34,252.11,23.54,309.33,69.81,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,45154.0


In [None]:
df_npi_merge.to_csv(path_out_profile + "npi_dataset_payment2013-2021.csv", index=None)

In [None]:
df_npi_mergem.to_csv(path_out_profile + "npi_dataset_payment2013-2021month.csv", index=None)

In [None]:
del df_dresearch_pivot
del df_general_pivot
del df_general_pivotc
del df_general_pivotl
del df_research_pivot
del df_research_pivot1
del df_research_pivot2
del df_research_pivot3
del df_research_pivot4
del df_research_pivot5
del df_ownership_pivot

## Pivot table by pharmaceutical companies

### Pivot table by company for associate research payments

In [None]:
dfr.loc[dfr["Principal_Investigator_1_Profile_ID"].isin(list_id),  "PI1"]=1
dfr.loc[~dfr["Principal_Investigator_1_Profile_ID"].isin(list_id),  "PI1"]=0

dfr.loc[dfr["Principal_Investigator_2_Profile_ID"].isin(list_id),  "PI2"]=1
dfr.loc[~dfr["Principal_Investigator_2_Profile_ID"].isin(list_id),  "PI2"]=0

dfr.loc[dfr["Principal_Investigator_3_Profile_ID"].isin(list_id),  "PI3"]=1
dfr.loc[~dfr["Principal_Investigator_3_Profile_ID"].isin(list_id),  "PI3"]=0

dfr.loc[dfr["Principal_Investigator_4_Profile_ID"].isin(list_id),  "PI4"]=1
dfr.loc[~dfr["Principal_Investigator_4_Profile_ID"].isin(list_id),  "PI4"]=0

dfr.loc[dfr["Principal_Investigator_5_Profile_ID"].isin(list_id),  "PI5"]=1
dfr.loc[~dfr["Principal_Investigator_5_Profile_ID"].isin(list_id),  "PI5"]=0

dfr["PI"] = dfr["PI1"] +dfr["PI2"] +dfr["PI3"] +dfr["PI4"] +dfr["PI5"]

In [None]:
dfr["payment_DMdoc"] = dfr['per_payment'] *dfr["PI"]

In [None]:
df_pharmaryr = dfr.pivot_table(index ='Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_ID', 
                             columns = 'Program_Year', 
                             values= 'PI_OPD_per_payment', 
                             aggfunc = ['sum']).fillna(0)
df_pharmaryr["index"] = df_pharmaryr.index

In [None]:
df_pharmar = dfr.pivot_table(index ='Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_ID', 
                             values= 'PI_OPD_per_payment', 
                             aggfunc = ['sum']).fillna(0)
df_pharmar["index", 'ID'] = df_pharmar.index
df_pharmar = df_pharmar.reset_index(level=0, drop=True)
df_pharmar.columns = df_pharmar.columns.droplevel(0)
df_pharmar = df_pharmar.reset_index()
df_pharmar.drop('index', axis=1, inplace=True)
df_pharmar = df_pharmar.rename(columns={'PI_OPD_per_payment': 'Associate research'})
df_pharmar.index = df_pharmar['ID']
df_pharmar.drop('ID', axis=1, inplace=True)
df_pharmar

Unnamed: 0_level_0,Associate research
ID,Unnamed: 1_level_1
100000000053,4.928598e+07
100000000058,4.853340e+03
100000000062,2.161170e+03
100000000066,2.651467e+06
100000000067,1.276298e+08
...,...
100000801820,4.052987e+05
100000806824,3.441030e+03
100000961851,9.271682e+04
100000966832,5.602500e+03


### Pivot table by company for direct research payments

In [None]:
df_pharmard = dfrd.pivot_table(index ='Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_ID', 
                              values= 'Total_Amount_of_Payment_USDollars', 
                              aggfunc = ['sum']).fillna(0)

df_pharmard["index", "ID"] = df_pharmard.index
df_pharmard = df_pharmard.reset_index(level=0, drop=True)
df_pharmard.columns = df_pharmard.columns.droplevel(0)
df_pharmard = df_pharmard.reset_index()
df_pharmard.drop('index', axis=1, inplace=True)
df_pharmard = df_pharmard.rename(columns={'Total_Amount_of_Payment_USDollars': 'Direct research'})

df_pharmard.index = df_pharmard['ID']

df_pharmard.drop('ID', axis=1, inplace=True)
df_pharmard

Unnamed: 0_level_0,Direct research
ID,Unnamed: 1_level_1
100000000053,1662052.31
100000000056,27269.00
100000000058,1462.00
100000000066,2953.25
100000000067,3035428.29
...,...
100000291822,31590.00
100000316825,2300.00
100000576811,10669.73
100000801820,42908.36


### Pivot table by company for general payments

In [None]:
df_pharmagyr = dfg.pivot_table(index ='Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_ID', 
                               columns = 'Program_Year', 
                               values= 'Total_Amount_of_Payment_USDollars', 
                               aggfunc = ['sum']).fillna(0)
df_pharmagyr["index"] = df_pharmagyr.index

In [None]:
df_pharmag = dfg.pivot_table(index ='Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_ID', 
                                   columns = 'Nature_of_Payment_or_Transfer_of_Value', 
                                  values= 'Total_Amount_of_Payment_USDollars', 
                                   aggfunc = ['sum']).fillna(0)
df_pharmag["index", "ID"] = df_pharmag.index
df_pharmag = df_pharmag.reset_index(level=0, drop=True)
df_pharmag.columns = df_pharmag.columns.droplevel(0)
df_pharmag= df_pharmag.reset_index()
df_pharmag.drop('index', axis=1, inplace=True)
df_pharmag.index = df_pharmag['ID']

df_pharmag.drop('ID', axis=1, inplace=True)
df_pharmag

Nature_of_Payment_or_Transfer_of_Value,Acquisitions,Charitable Contribution,"Compensation for services other than consulting, including serving as faculty or as a speaker at a venue other than a continuing education program",Compensation for serving as faculty or as a speaker for a medical education program,Compensation for serving as faculty or as a speaker for a non-accredited and noncertified continuing education program,Compensation for serving as faculty or as a speaker for an accredited or certified continuing education program,Consulting Fee,Current or prospective ownership or investment interest,Debt forgiveness,Education,Entertainment,Food and Beverage,Gift,Grant,Honoraria,Long term medical supply or device loan,Royalty or License,Travel and Lodging
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
100000000053,0.0,0.0,10726701.73,0.0,6752.0,0.0,2203603.6,0.0,0.0,86525.27,0.0,1223878.03,0.00,5000.0,0.0,0.0,0.0,2786599.55
100000000055,0.0,0.0,0.00,0.0,0.0,0.0,14675.0,0.0,0.0,0.00,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.00
100000000056,0.0,0.0,0.00,0.0,0.0,0.0,11715.0,0.0,0.0,0.00,0.0,747.32,282.17,0.0,0.0,0.0,0.0,20715.23
100000000057,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,19.91,0.00,0.0,0.0,0.0,0.0,0.00
100000000058,0.0,0.0,0.00,0.0,0.0,0.0,18500.0,0.0,0.0,0.00,0.0,91.81,0.00,0.0,0.0,0.0,0.0,1046.35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100000966856,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,18.86,0.00,0.0,0.0,0.0,0.0,0.00
100000971834,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,8.74,0.00,0.0,0.0,0.0,0.0,0.00
100000971837,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,53.68,0.00,0.0,0.0,0.0,0.0,0.00
100000971862,0.0,0.0,0.00,0.0,0.0,0.0,300.0,0.0,0.0,0.00,0.0,1671.14,0.00,0.0,0.0,0.0,0.0,0.00


### Pivot tables by company for ownership payments

In [None]:
df_pharmao = dfo.pivot_table(index ='Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_ID', 
                                  values= 'Total_Amount_Invested_USDollars', 
                                   aggfunc = ['sum']).fillna(0)
df_pharmao["index", 'ID'] = df_pharmao.index
df_pharmao = df_pharmao.reset_index(level=0, drop=True)
df_pharmao.columns = df_pharmao.columns.droplevel(0)
df_pharmao = df_pharmao.reset_index()
df_pharmao.drop('index', axis=1, inplace=True)

df_pharmao = df_pharmao.rename(columns={'Total_Amount_Invested_USDollars': 'Investment and ownership'})
df_pharmao.index = df_pharmao['ID']
df_pharmao.drop('ID', axis=1, inplace=True)
df_pharmao

Unnamed: 0_level_0,Investment and ownership
ID,Unnamed: 1_level_1
100000000247,116250.0
100000010805,0.0
100000010809,49.54
100000011102,1904142.09
100000151641,0.0
100000316816,0.0
100000966862,0.0


### save pivot for pharma

In [None]:
df_pharma = pd.concat([df_pharmag, 
                       df_pharmar, 
                       df_pharmard, 
                       df_pharmao],
                        axis = 1).fillna(0)
df_pharma['ID']= df_pharma.index
df_pharma

Unnamed: 0_level_0,Acquisitions,Charitable Contribution,"Compensation for services other than consulting, including serving as faculty or as a speaker at a venue other than a continuing education program",Compensation for serving as faculty or as a speaker for a medical education program,Compensation for serving as faculty or as a speaker for a non-accredited and noncertified continuing education program,Compensation for serving as faculty or as a speaker for an accredited or certified continuing education program,Consulting Fee,Current or prospective ownership or investment interest,Debt forgiveness,Education,Entertainment,Food and Beverage,Gift,Grant,Honoraria,Long term medical supply or device loan,Royalty or License,Travel and Lodging,Associate research,Direct research,Investment and ownership,ID
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
100000000053,0.0,0.0,10726701.73,0.0,6752.0,0.0,2203603.6,0.0,0.0,86525.27,0.0,1223878.03,0.00,5000.0,0.0,0.0,0.0,2786599.55,49285983.15,1662052.31,0.0,100000000053
100000000055,0.0,0.0,0.00,0.0,0.0,0.0,14675.0,0.0,0.0,0.00,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.0,100000000055
100000000056,0.0,0.0,0.00,0.0,0.0,0.0,11715.0,0.0,0.0,0.00,0.0,747.32,282.17,0.0,0.0,0.0,0.0,20715.23,0.00,27269.00,0.0,100000000056
100000000057,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,19.91,0.00,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.0,100000000057
100000000058,0.0,0.0,0.00,0.0,0.0,0.0,18500.0,0.0,0.0,0.00,0.0,91.81,0.00,0.0,0.0,0.0,0.0,1046.35,4853.34,1462.00,0.0,100000000058
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100000966862,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.00,44691.41,0.00,0.0,100000966862
100000971834,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,8.74,0.00,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.0,100000971834
100000971837,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,53.68,0.00,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.0,100000971837
100000971862,0.0,0.0,0.00,0.0,0.0,0.0,300.0,0.0,0.0,0.00,0.0,1671.14,0.00,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.0,100000971862


In [None]:
df_pharma.to_csv(path_out_profile + "pivot_pay_pharma.csv", index=None)

### Company payment by submitting companies 

In [None]:
dfr.loc[dfr["Principal_Investigator_1_Profile_ID"].isin(list_id),  "PI1"]=1
dfr.loc[~dfr["Principal_Investigator_1_Profile_ID"].isin(list_id),  "PI1"]=0

dfr.loc[dfr["Principal_Investigator_2_Profile_ID"].isin(list_id),  "PI2"]=1
dfr.loc[~dfr["Principal_Investigator_2_Profile_ID"].isin(list_id),  "PI2"]=0

dfr.loc[dfr["Principal_Investigator_3_Profile_ID"].isin(list_id),  "PI3"]=1
dfr.loc[~dfr["Principal_Investigator_3_Profile_ID"].isin(list_id),  "PI3"]=0

dfr.loc[dfr["Principal_Investigator_4_Profile_ID"].isin(list_id),  "PI4"]=1
dfr.loc[~dfr["Principal_Investigator_4_Profile_ID"].isin(list_id),  "PI4"]=0

dfr.loc[dfr["Principal_Investigator_5_Profile_ID"].isin(list_id),  "PI5"]=1
dfr.loc[~dfr["Principal_Investigator_5_Profile_ID"].isin(list_id),  "PI5"]=0

dfr["PI"] = dfr["PI1"] +dfr["PI2"] +dfr["PI3"] +dfr["PI4"] +dfr["PI5"]

dfr["payment_DMdoc"] = dfr['per_payment'] *dfr["PI"]

df_pharmar2 = dfr.pivot_table(index ='Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name', 
                             values= 'payment_DMdoc', 
                             aggfunc = ['sum']).fillna(0)
df_pharmar2["index", 'ID'] = df_pharmar2.index
df_pharmar2 = df_pharmar2.reset_index(level=0, drop=True)
df_pharmar2.columns = df_pharmar2.columns.droplevel(0)
df_pharmar2 = df_pharmar2.reset_index()
df_pharmar2.drop('index', axis=1, inplace=True)
df_pharmar2 = df_pharmar2.rename(columns={'payment_DMdoc': 'Research'})
df_pharmar2.index = df_pharmar2['ID']
df_pharmar2.drop('ID', axis=1, inplace=True)
df_pharmar2

Unnamed: 0_level_0,Research
ID,Unnamed: 1_level_1
ABBVIE INC.,1743979.38
ADAPT Pharma Inc.,804.08
AKEBIA THERAPEUTICS INC,113356.82
"AMAG Pharmaceuticals, Inc.",178157.24
AbbVie Inc.,2133546.80
...,...
ViiV Healthcare Company,832552.75
"Visby Medical, Inc.",44691.41
Welch Allyn,109378.75
"Xeris Pharmaceuticals, Inc.",296730.85


In [None]:
df_pharmag2 = dfg.pivot_table(index ='Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name', 
                                   columns = 'Nature_of_Payment_or_Transfer_of_Value', 
                                  values= 'Total_Amount_of_Payment_USDollars', 
                                   aggfunc = ['sum']).fillna(0)
df_pharmag2["index", "ID"] = df_pharmag2.index
df_pharmag2 = df_pharmag2.reset_index(level=0, drop=True)
df_pharmag2.columns = df_pharmag2.columns.droplevel(0)
df_pharmag2= df_pharmag2.reset_index()
df_pharmag2.drop('index', axis=1, inplace=True)
df_pharmag2.index = df_pharmag2['ID']

df_pharmag2.drop('ID', axis=1, inplace=True)
df_pharmag2

Nature_of_Payment_or_Transfer_of_Value,Acquisitions,Charitable Contribution,"Compensation for services other than consulting, including serving as faculty or as a speaker at a venue other than a continuing education program",Compensation for serving as faculty or as a speaker for a medical education program,Compensation for serving as faculty or as a speaker for a non-accredited and noncertified continuing education program,Compensation for serving as faculty or as a speaker for an accredited or certified continuing education program,Consulting Fee,Current or prospective ownership or investment interest,Debt forgiveness,Education,Entertainment,Food and Beverage,Gift,Grant,Honoraria,Long term medical supply or device loan,Royalty or License,Travel and Lodging
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
"A-dec, Inc.",0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,65.00,0.0,0.0,0.0,0.0,0.00
ABB Con-Cise Optical Group LLC,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,52.52,0.00,0.0,0.0,0.0,0.0,0.00
ABBVIE INC.,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,140.74,0.00,0.0,0.0,0.0,0.0,0.00
ABIOMED,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2515.57,0.00,0.0,0.0,0.0,0.0,0.00
ACADIA Pharmaceuticals Inc,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1322.37,0.00,0.0,0.0,0.0,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"diaDexus, Inc.",0.0,0.0,2000.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3089.87,999.18,0.0,5200.0,0.0,0.0,0.00
"iRhythm Technologies, Inc.",0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,236.02,0.00,0.0,0.0,0.0,0.0,0.00
iScreen Vision Inc.,0.0,0.0,0.00,0.0,0.0,0.0,350.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.00
"kaleo, Inc.",0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,322.64,0.00,0.0,0.0,0.0,0.0,0.00


In [None]:
df_pharma2 = pd.concat([df_pharmag2, df_pharmar2],
                        axis = 1).fillna(0)
df_pharma2['ID']= df_pharma2.index
df_pharma2

Unnamed: 0_level_0,Acquisitions,Charitable Contribution,"Compensation for services other than consulting, including serving as faculty or as a speaker at a venue other than a continuing education program",Compensation for serving as faculty or as a speaker for a medical education program,Compensation for serving as faculty or as a speaker for a non-accredited and noncertified continuing education program,Compensation for serving as faculty or as a speaker for an accredited or certified continuing education program,Consulting Fee,Current or prospective ownership or investment interest,Debt forgiveness,Education,Entertainment,Food and Beverage,Gift,Grant,Honoraria,Long term medical supply or device loan,Royalty or License,Travel and Lodging,Research,ID
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
"A-dec, Inc.",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,65.0,0.0,0.0,0.0,0.0,0.0,0.00,"A-dec, Inc."
ABB Con-Cise Optical Group LLC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,52.52,0.0,0.0,0.0,0.0,0.0,0.0,0.00,ABB Con-Cise Optical Group LLC
ABBVIE INC.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,140.74,0.0,0.0,0.0,0.0,0.0,0.0,1743979.38,ABBVIE INC.
ABIOMED,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2515.57,0.0,0.0,0.0,0.0,0.0,0.0,0.00,ABIOMED
ACADIA Pharmaceuticals Inc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1322.37,0.0,0.0,0.0,0.0,0.0,0.0,0.00,ACADIA Pharmaceuticals Inc
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Soleno Therapeutics, Inc.",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,9845.00,"Soleno Therapeutics, Inc."
"Takeda Development Center Americas, Inc.",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,36293.75,"Takeda Development Center Americas, Inc."
Valeant Pharmaceuticals North America LLC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,59999.70,Valeant Pharmaceuticals North America LLC
Valneva Austria GmbH,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,1832652.09,Valneva Austria GmbH


In [None]:
df_pharma2.to_csv(path_out_profile + "pivot_pay_pharma2.csv", index=None)

## Pivot table by payment nature

### Pivot table by payment natures and Physician profile ID

### Payment nature

In [None]:
dfg['category2'] = dfg['Nature_of_Payment_or_Transfer_of_Value']
dfg['category2'] = dfg['category2'].replace(['Compensation for services other than consulting, including serving as faculty or as a speaker at a venue other than a continuing education program'] , 
                                            'non-cme_speaking')
dfg['category2'] = dfg['category2'].replace(['Compensation for serving as faculty or as a speaker for a medical education program',
                                             'Compensation for serving as faculty or as a speaker for a non-accredited and noncertified continuing education program', 
                                             'Compensation for serving as faculty or as a speaker for an accredited or certified continuing education program'],
                                            'cme_speaking')

In [None]:
nature_g = dfg.pivot_table(index = 'Program_Year',
                           columns = 'category2', 
                           values= ['Total_Amount_of_Payment_USDollars', 
                                    'Number_of_Payments_Included_in_Total_Amount'],
                           aggfunc = ['sum'] ).fillna(0)
nature_g.to_csv(path_out1 + "category_general.csv", index=None)
nature_g

Unnamed: 0_level_0,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum
Unnamed: 0_level_1,Number_of_Payments_Included_in_Total_Amount,Number_of_Payments_Included_in_Total_Amount,Number_of_Payments_Included_in_Total_Amount,Number_of_Payments_Included_in_Total_Amount,Number_of_Payments_Included_in_Total_Amount,Number_of_Payments_Included_in_Total_Amount,Number_of_Payments_Included_in_Total_Amount,Number_of_Payments_Included_in_Total_Amount,Number_of_Payments_Included_in_Total_Amount,Number_of_Payments_Included_in_Total_Amount,Number_of_Payments_Included_in_Total_Amount,Number_of_Payments_Included_in_Total_Amount,Number_of_Payments_Included_in_Total_Amount,Number_of_Payments_Included_in_Total_Amount,Number_of_Payments_Included_in_Total_Amount,Number_of_Payments_Included_in_Total_Amount,Total_Amount_of_Payment_USDollars,Total_Amount_of_Payment_USDollars,Total_Amount_of_Payment_USDollars,Total_Amount_of_Payment_USDollars,Total_Amount_of_Payment_USDollars,Total_Amount_of_Payment_USDollars,Total_Amount_of_Payment_USDollars,Total_Amount_of_Payment_USDollars,Total_Amount_of_Payment_USDollars,Total_Amount_of_Payment_USDollars,Total_Amount_of_Payment_USDollars,Total_Amount_of_Payment_USDollars,Total_Amount_of_Payment_USDollars,Total_Amount_of_Payment_USDollars,Total_Amount_of_Payment_USDollars,Total_Amount_of_Payment_USDollars
category2,Acquisitions,Charitable Contribution,Consulting Fee,Current or prospective ownership or investment interest,Debt forgiveness,Education,Entertainment,Food and Beverage,Gift,Grant,Honoraria,Long term medical supply or device loan,Royalty or License,Travel and Lodging,cme_speaking,non-cme_speaking,Acquisitions,Charitable Contribution,Consulting Fee,Current or prospective ownership or investment interest,Debt forgiveness,Education,Entertainment,Food and Beverage,Gift,Grant,Honoraria,Long term medical supply or device loan,Royalty or License,Travel and Lodging,cme_speaking,non-cme_speaking
Program_Year,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3,Unnamed: 23_level_3,Unnamed: 24_level_3,Unnamed: 25_level_3,Unnamed: 26_level_3,Unnamed: 27_level_3,Unnamed: 28_level_3,Unnamed: 29_level_3,Unnamed: 30_level_3,Unnamed: 31_level_3,Unnamed: 32_level_3
2013,0.0,0.0,3179.0,0.0,0.0,2617.0,3.0,81132.0,123.0,0.0,409.0,0.0,2.0,10414.0,47.0,9514.0,0.0,0.0,4885299.09,0.0,0.0,68945.79,139.06,2178387.89,151045.1,0.0,720611.36,0.0,115511.34,3497453.9,74480.8,16014879.72
2014,0.0,1.0,5788.0,1.0,0.0,6956.0,3.0,241310.0,110.0,9.0,641.0,0.0,8.0,29304.0,101.0,26800.0,0.0,500.0,10577652.38,232498.0,0.0,208290.63,61.55,5693497.22,5205.03,83998.0,1013126.51,0.0,4664040.71,8364810.4,100948.0,45994804.55
2015,0.0,0.0,5045.0,0.0,0.0,8117.0,3.0,273676.0,30.0,31.0,74.0,0.0,10.0,27743.0,76.0,29920.0,0.0,0.0,8581714.88,0.0,0.0,247685.48,107.11,6185335.0,1469.74,93131.31,245121.3,0.0,6695616.6,8266409.66,99468.0,51258758.21
2016,0.0,1.0,2650.0,0.0,0.0,7011.0,0.0,281190.0,106.0,56.0,316.0,0.0,13.0,28212.0,32.0,26664.0,0.0,1500.0,8362563.0,0.0,0.0,183526.17,0.0,6089812.45,11803.46,224659.26,668710.96,0.0,6437637.27,7987681.11,38017.22,46674953.86
2017,0.0,0.0,3061.0,0.0,0.0,5755.0,5.0,264580.0,368.0,15.0,304.0,0.0,5.0,32107.0,37.0,30024.0,0.0,0.0,8400652.89,0.0,0.0,115599.06,70.28,6215304.96,6354.42,31540.57,648406.75,0.0,5765340.0,9222618.76,66445.02,55260402.25
2018,0.0,0.0,2652.0,0.0,0.0,3804.0,5.0,246072.0,631.0,6.0,450.0,0.0,2.0,26882.0,35.0,25026.0,0.0,0.0,8210956.93,0.0,0.0,75144.38,211.94,5668725.96,52299.39,114304.0,942803.39,0.0,1472413.0,7774921.16,113671.19,45554948.25
2019,0.0,0.0,2504.0,1.0,0.0,4610.0,1.0,231408.0,438.0,7.0,179.0,0.0,2.0,22358.0,189.0,22761.0,0.0,0.0,7809646.08,3500.0,0.0,73671.74,119.3,5095833.78,39438.26,9317.0,427275.0,0.0,7555.89,6969503.63,337822.63,39370376.3
2020,0.0,1.0,1360.0,1.0,0.0,1205.0,0.0,132405.0,218.0,2.0,145.0,0.0,7.0,6126.0,343.0,19535.0,0.0,500.0,3787863.8,333115.5,0.0,47340.66,0.0,2519113.42,5296.55,3197.5,294627.0,0.0,470289.64,1385670.24,455862.66,22376919.08
2021,3.0,0.0,1591.0,1.0,1.0,1250.0,1.0,158596.0,200.0,2.0,229.0,3.0,1.0,4575.0,496.0,17166.0,18763.14,0.0,4193742.98,307881.0,12.62,55395.09,74.0,3155946.72,5519.93,9677.08,424259.98,484.98,67.21,1230075.88,496905.08,21950773.33


In [None]:
df_pivot_g_nature = dfg.pivot_table(index ='Covered_Recipient_Profile_ID', 
                                   columns = 'category2', 
                                   values= 'Total_Amount_of_Payment_USDollars', 
                                   aggfunc = ['sum'] ).fillna(0)

df_pivot_g_nature.columns = df_pivot_g_nature.columns.droplevel(0)
df_pivot_g_nature = df_pivot_g_nature.rename_axis(None, axis=1)
df_pivot_g_nature = df_pivot_g_nature.reset_index()
category_n = (df_pivot_g_nature>0).sum()
category_n.to_csv(path_out1 + "category_n.csv", index=None)
category_n

Covered_Recipient_Profile_ID                               6991
Acquisitions                                                  3
Charitable Contribution                                       3
Consulting Fee                                             2376
Current or prospective ownership or investment interest       2
Debt forgiveness                                              1
Education                                                  4998
Entertainment                                                15
Food and Beverage                                          6845
Gift                                                        912
Grant                                                        90
Honoraria                                                   630
Long term medical supply or device loan                       3
Royalty or License                                           13
Travel and Lodging                                         2679
cme_speaking                            

## Pivot table of products

### General payments

In [None]:
dfg

Unnamed: 0,Change_Type,Covered_Recipient_Profile_ID,Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name,Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_ID,Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Name,Total_Amount_of_Payment_USDollars,Date_of_Payment,Number_of_Payments_Included_in_Total_Amount,Nature_of_Payment_or_Transfer_of_Value,Contextual_Information,Dispute_Status_for_Publication,Associated_Drug_or_Biological_NDC_1,Associated_Drug_or_Biological_NDC_2,Associated_Drug_or_Biological_NDC_3,Associated_Drug_or_Biological_NDC_4,Associated_Drug_or_Biological_NDC_5,Program_Year,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_1,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_2,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_3,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_4,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_5,month,day,year,cmonth,category,category2
0,UNCHANGED,263236.0,"Forest Laboratories, Inc.",100000005529,"FOREST PHARMACEUTICALS, INC.",13.43,10/02/2013,1,Food and Beverage,,No,0456140530,,,,,2013,BYSTOLIC,,,,,10,2,2013,3,mealm,Food and Beverage
1,UNCHANGED,117891.0,"Forest Laboratories, Inc.",100000005529,"FOREST PHARMACEUTICALS, INC.",18.86,08/02/2013,1,Education,,No,0456009530,,,,,2013,DALIRESP,,,,,8,2,2013,1,educationm,Education
2,UNCHANGED,198071.0,"Forest Laboratories, Inc.",100000005529,"FOREST PHARMACEUTICALS, INC.",13.35,08/29/2013,1,Food and Beverage,,No,0456080060,0456009530,,,,2013,TUDORZA,DALIRESP,,,,8,29,2013,1,mealm,Food and Beverage
3,UNCHANGED,52956.0,"Forest Laboratories, Inc.",100000005529,"FOREST PHARMACEUTICALS, INC.",12.16,11/05/2013,1,Food and Beverage,,No,0456114030,,,,,2013,VIIBRYD,,,,,11,5,2013,4,mealm,Food and Beverage
4,UNCHANGED,327548.0,"Forest Laboratories, Inc.",100000005529,"FOREST PHARMACEUTICALS, INC.",78.97,08/02/2013,1,Food and Beverage,,No,0456060010,,,,,2013,TEFLARO,,,,,8,2,2013,1,mealm,Food and Beverage
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2311353,NEW,302701.0,"Galderma Laboratories, L.P.",100000010375,"Galderma Laboratories, L.P.",85.47,12/04/2021,1,Food and Beverage,US,No,,,,,,2021,,,,,,12,4,2021,101,mealm,Food and Beverage
2311354,NEW,154591.0,"Galderma Laboratories, L.P.",100000010375,"Galderma Laboratories, L.P.",13.18,10/21/2021,1,Food and Beverage,,No,0299-3822-30,0299-5935-30,,,,2021,ORACEA,AKLIEF,,,,10,21,2021,99,mealm,Food and Beverage
2311355,NEW,111418.0,"Penumbra, Inc.",100000010583,"Penumbra, Inc.",25.00,01/18/2021,1,Food and Beverage,,No,,,,,,2021,Indigo System,,,,,1,18,2021,90,mealm,Food and Beverage
2311356,NEW,111418.0,"Penumbra, Inc.",100000010583,"Penumbra, Inc.",146.85,10/19/2021,1,Food and Beverage,,No,,,,,,2021,Penumbra System,,,,,10,19,2021,99,mealm,Food and Beverage


### Make NDC code2 without -

In [None]:
dfg['NDC1']= dfg['Associated_Drug_or_Biological_NDC_1'].str.replace("-","")
dfg['NDC2']= dfg['Associated_Drug_or_Biological_NDC_2'].str.replace("-","")
dfg['NDC3']= dfg['Associated_Drug_or_Biological_NDC_3'].str.replace("-","")
dfg['NDC4']= dfg['Associated_Drug_or_Biological_NDC_4'].str.replace("-","")
dfg['NDC5']= dfg['Associated_Drug_or_Biological_NDC_5'].str.replace("-","")

In [None]:
dfg['product_count']= dfg[['Associated_Drug_or_Biological_NDC_1', 
           'Associated_Drug_or_Biological_NDC_2', 
           'Associated_Drug_or_Biological_NDC_3', 
           'Associated_Drug_or_Biological_NDC_4', 
           'Associated_Drug_or_Biological_NDC_5']].count(axis = 1)

In [None]:
dfg['product_payment'] = round((dfg['Total_Amount_of_Payment_USDollars'] / dfg['product_count']),3)

In [None]:
dfg['product_payment'].replace([np.inf, -np.inf], 0, inplace=True)

In [None]:
df_general_pro_pivot1 = dfg.pivot_table(index ='Associated_Drug_or_Biological_NDC_1', 
                                   columns = 'Program_Year', 
                                  values= 'product_payment', 
                                   aggfunc = ['sum']).fillna(0)

df_general_pro_pivot2 = dfg.pivot_table(index ='Associated_Drug_or_Biological_NDC_2', 
                                   columns = 'Program_Year', 
                                  values= 'product_payment', 
                                   aggfunc = ['sum'] ).fillna(0)

df_general_pro_pivot3 = dfg.pivot_table(index ='Associated_Drug_or_Biological_NDC_3', 
                                   columns = 'Program_Year', 
                                  values= 'product_payment', 
                                   aggfunc = ['sum']).fillna(0)

df_general_pro_pivot4 = dfg.pivot_table(index ='Associated_Drug_or_Biological_NDC_4', 
                                   columns = 'Program_Year', 
                                  values= 'product_payment', 
                                   aggfunc = ['sum']).fillna(0)

df_general_pro_pivot5 = dfg.pivot_table(index ='Associated_Drug_or_Biological_NDC_5', 
                                   columns = 'Program_Year', 
                                  values= 'product_payment', 
                                   aggfunc =['sum'], fill_value=0)

df_general_pro_pivot = pd.concat([df_general_pro_pivot1, 
                               df_general_pro_pivot2, 
                               df_general_pro_pivot3, 
                               df_general_pro_pivot4, 
                               df_general_pro_pivot5], axis = 0).fillna(0)


In [None]:
df_general_pro_pivot.columns = df_general_pro_pivot.columns.droplevel(0)
df_general_pro_pivot = df_general_pro_pivot.rename_axis(None, axis=1)
df_general_pro_pivot = df_general_pro_pivot.reset_index()
df_general_pro_pivot.columns

Index(['index', 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021], dtype='object')

In [None]:
df_general_pro_pivot['index']= df_general_pro_pivot['index'].str.replace("-","")

In [None]:
df_general_pro_pivot

Unnamed: 0,index,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,0000031427,0.00,0.000,0.000,7779314.066,0.0,0.0,0.000,0.00,0.0
1,0000034214,0.00,0.000,0.000,98086.402,0.0,0.0,0.000,0.00,0.0
2,0000034214,36572.88,0.000,0.000,0.000,0.0,0.0,0.000,0.00,0.0
3,0000034221,468.20,0.000,0.000,0.000,0.0,0.0,0.000,0.00,0.0
4,0001860370,0.00,11.220,7224.364,6452.062,0.0,0.0,0.000,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...
3625,6658232354.0,0.00,163.114,0.000,0.000,0.0,0.0,0.000,0.00,0.0
3626,667800226.0,0.00,114.068,0.000,0.000,0.0,0.0,0.000,0.00,0.0
3627,6791904101,0.00,0.000,2.726,0.000,0.0,0.0,29.636,0.00,0.0
3628,7118000315,0.00,0.000,0.000,0.000,0.0,0.0,2.426,13.26,0.0


In [None]:
year = [2013,2014,2015,2016,2017, 2018, 2019, 2020, 2021]
dfs_names = ('df1', 'df2', 'df3', 'df4', 'df5', 'df6', 'df7', 'df8', 'df9')
dfs ={}
for dfn, year in zip(dfs_names, year): 
   dfs[dfn] = df_general_pro_pivot.pivot_table(index = 'index', 
                                  values= year, 
                                   aggfunc =['sum'], fill_value=0)


In [None]:
df_general_pro_pivot = pd.concat([dfs['df1'], 
                                  dfs['df2'], 
                                  dfs['df3'], 
                                  dfs['df4'], 
                                  dfs['df5'], 
                                  dfs['df6'], 
                                  dfs['df7'], 
                                  dfs['df8'], 
                                  dfs['df9']], axis = 1).fillna(0)

df_general_pro_pivot.columns = df_general_pro_pivot.columns.droplevel(0)
df_general_pro_pivot = df_general_pro_pivot.rename_axis(None, axis=1)
df_general_pro_pivot = df_general_pro_pivot.reset_index()

df_general_pro_pivot['total'] = df_general_pro_pivot[2013] + \
+ df_general_pro_pivot[2014] + df_general_pro_pivot[2015] + \
df_general_pro_pivot[2016] + df_general_pro_pivot[2017] + \
df_general_pro_pivot[2018] + df_general_pro_pivot[2019] + \
df_general_pro_pivot[2020] + df_general_pro_pivot[2021] 

df_general_pro_pivot

Unnamed: 0,index,2013,2014,2015,2016,2017,2018,2019,2020,2021,total
0,0000031427,0.000,0.000,0.000,7783147.536,0.0,0.00,0.0,0.0,0.0,7783147.536
1,0000034214,36709.662,0.000,0.000,98462.758,0.0,0.00,0.0,0.0,0.0,135172.420
2,0000034221,475.600,0.000,0.000,0.000,0.0,0.00,0.0,0.0,0.0,475.600
3,0001860004,0.000,87.625,0.000,0.000,0.0,0.00,0.0,0.0,0.0,87.625
4,0001860370,2374.054,7382.696,8426.617,7734.287,0.0,0.00,0.0,0.0,0.0,25917.654
...,...,...,...,...,...,...,...,...,...,...,...
1987,9513701.0,0.000,189.200,0.000,0.000,0.0,0.00,0.0,0.0,0.0,189.200
1988,9517602.0,0.000,22371.470,0.000,0.000,0.0,0.00,0.0,0.0,0.0,22371.470
1989,9920704633,0.000,60.000,0.000,0.000,0.0,0.00,0.0,0.0,0.0,60.000
1990,9920746330,0.000,0.000,99.670,26.637,0.0,0.00,0.0,0.0,0.0,126.307


In [None]:
df_general_pro_pivot = round(df_general_pro_pivot, 2)

In [None]:
df_general_pro_pivot.to_csv(path_out1 + "general_product.csv", index=None)

### example

In [None]:
df_general_pro_pivot['total'] = df_general_pro_pivot[2013] + \
+ df_general_pro_pivot[2014] + df_general_pro_pivot[2015] + \
df_general_pro_pivot[2016] + df_general_pro_pivot[2017] + \
df_general_pro_pivot[2018] + df_general_pro_pivot[2019] + \
df_general_pro_pivot[2020] + df_general_pro_pivot[2021] 
df_general_pro_pivot.sort_values('total', ascending=False)

Unnamed: 0,index,2013,2014,2015,2016,2017,2018,2019,2020,2021,total
881,5045814030,0.00,1013372.54,9640686.15,9163967.27,8120515.62,4035983.06,3591069.00,509976.60,10764.32,36086334.56
11,0002143380,0.00,121140.80,2794020.06,3895820.83,6139293.06,5893238.16,6530487.08,3896840.32,2836324.90,32107165.21
190,0024586903,0.00,0.00,6651396.93,7975282.62,4913955.93,3683190.50,1520317.61,346970.62,675383.50,25766497.71
355,0169406090,5347392.11,7024172.01,5434116.87,3880297.94,3677053.50,0.00,0.00,0.00,0.00,25363032.43
554,0667800212,0.00,4590899.00,6664027.00,6407390.00,5766290.96,0.00,0.00,0.00,0.00,23428606.96
...,...,...,...,...,...,...,...,...,...,...,...
1618,6468200.0,0.00,1.41,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.41
1641,6482700.0,0.00,1.22,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.22
1608,6404741.0,0.00,1.15,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.15
694,2814801.0,1.09,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.09


### Associated research payments

In [None]:
dfr

Unnamed: 0,Change_Type,Covered_Recipient_Type,Noncovered_Recipient_Entity_Name,Teaching_Hospital_CCN,Teaching_Hospital_ID,Teaching_Hospital_Name,Covered_Recipient_Profile_ID,Covered_Recipient_First_Name,Covered_Recipient_Middle_Name,Covered_Recipient_Last_Name,...,PI_OPDcount,PI_OPD_per_payment,NDC1,NDC2,NDC3,NDC4,NDC5,product_count,product_payment,total_productpay
0,UNCHANGED,Covered Recipient Teaching Hospital,0,70033.0,46.0,DANBURY HOSPITAL,0.0,0,0,0,...,1,128.7,0169266015,0,0,0.0,0.0,5,3,128.7
1,UNCHANGED,Covered Recipient Teaching Hospital,0,374000.0,904.0,State Of Oklahoma,0.0,0,0,0,...,1,67.5,0169266015,0,0,0.0,0.0,5,3,67.5
2,UNCHANGED,Covered Recipient Teaching Hospital,0,450647.0,811.0,Columbia Hospital At Medical City Dallas Subsi...,0.0,0,0,0,...,1,2500.0,5045814001,0,0,0.0,0.0,5,3,2500.0
3,UNCHANGED,Covered Recipient Teaching Hospital,0,300003.0,5.0,Mary Hitchcock Memorial Hospital,0.0,0,0,0,...,1,447.7,0169406090,0,0,0.0,0.0,5,3,447.7
4,UNCHANGED,Covered Recipient Teaching Hospital,0,260162.0,593.0,Barnes Jewish West County Hospital,0.0,0,0,0,...,1,84.0,0,0,0,0.0,0.0,5,3,84.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
274907,NEW,Non-covered Recipient Entity,ST LOUIS UNIV,0.0,0.0,0,0.0,0,0,0,...,1,1005.0,0169413212,0,0,0.0,0.0,5,3,1005.0
274908,NEW,Non-covered Recipient Entity,UT SOUTHWESTERN CLINICAL TRIALS,0.0,0.0,0,0.0,0,0,0,...,1,9.0,0169413212,0,0,0.0,0.0,5,3,9.0
274909,NEW,Non-covered Recipient Entity,UT SOUTHWESTERN CLINICAL TRIALS,0.0,0.0,0,0.0,0,0,0,...,1,5.3,0169430313,0,0,0.0,0.0,5,3,5.3
274910,NEW,Non-covered Recipient Entity,FOUNDATION FOR ADVANCING VETERANS' HEALTH RESE...,0.0,0.0,0,0.0,0,0,0,...,1,56.9,0,0,0,0.0,0.0,5,3,56.9


In [None]:
dfr['NDC1']= dfr['Associated_Drug_or_Biological_NDC_1'].str.replace("-","")
dfr['NDC2']= dfr['Associated_Drug_or_Biological_NDC_2'].str.replace("-","")
dfr['NDC3']= dfr['Associated_Drug_or_Biological_NDC_3'].str.replace("-","")
dfr['NDC4']= dfr['Associated_Drug_or_Biological_NDC_4']
dfr['NDC5']= dfr['Associated_Drug_or_Biological_NDC_5']

In [None]:
dfr['NDC1'] = dfr['NDC1'].replace(0.0, 0)
dfr['NDC2'] = dfr['NDC2'].replace(0.0, 0)
dfr['NDC3'] = dfr['NDC3'].replace(0.0, 0)
dfr['NDC4'] = dfr['NDC4'].replace(0.0, 0)
dfr['NDC5'] = dfr['NDC5'].replace(0.0, 0)

In [None]:
dfr[['NDC1', 'NDC2', 'NDC3', 'NDC4', 'NDC5']] = dfr[['NDC1', 'NDC2', 'NDC3', 'NDC4', 'NDC5']].astype(int)

In [None]:
dfr['product_count']= dfr[['NDC1', 'NDC2', 'NDC3', 'NDC4', 'NDC5']].apply(lambda x: (x != 0).sum(), axis=1)
dfr['product_count']                              

0         1
1         1
2         1
3         1
4         0
         ..
274907    1
274908    1
274909    1
274910    0
274911    0
Name: product_count, Length: 274912, dtype: int64

In [None]:
dfr[['NDC1', 'NDC2', 'NDC3', 'NDC4', 'NDC5']]

Unnamed: 0,NDC1,NDC2,NDC3,NDC4,NDC5
0,169266015,0,0,0,0
1,169266015,0,0,0,0
2,5045814001,0,0,0,0
3,169406090,0,0,0,0
4,0,0,0,0,0
...,...,...,...,...,...
274907,169413212,0,0,0,0
274908,169413212,0,0,0,0
274909,169430313,0,0,0,0
274910,0,0,0,0,0


In [None]:
dfr['product_payment'] = round((dfr['PI_OPD_per_payment'] / dfr['product_count']),5)           
dfr['product_payment'].replace([np.inf, -np.inf], 0, inplace=True)
dfr['total_productpay'] = round((dfr['product_payment'] * dfr['product_count']),5) 

In [None]:
dfr['PI_OPD_per_payment'].sum(axis=0)

1105404960.7660701

In [None]:
dfr['product_count'].sum(axis=0)

158244

In [None]:
dfr['total_productpay'].sum(axis=0)

523363867.1288199

In [None]:
dfr

Unnamed: 0,Change_Type,Covered_Recipient_Type,Noncovered_Recipient_Entity_Name,Teaching_Hospital_CCN,Teaching_Hospital_ID,Teaching_Hospital_Name,Covered_Recipient_Profile_ID,Covered_Recipient_First_Name,Covered_Recipient_Middle_Name,Covered_Recipient_Last_Name,...,PI_OPDcount,PI_OPD_per_payment,NDC1,NDC2,NDC3,NDC4,NDC5,product_count,product_payment,total_productpay
0,UNCHANGED,Covered Recipient Teaching Hospital,0,70033.0,46.0,DANBURY HOSPITAL,0.0,0,0,0,...,1,128.7,169266015,0,0,0,0,1,128.7,128.7
1,UNCHANGED,Covered Recipient Teaching Hospital,0,374000.0,904.0,State Of Oklahoma,0.0,0,0,0,...,1,67.5,169266015,0,0,0,0,1,67.5,67.5
2,UNCHANGED,Covered Recipient Teaching Hospital,0,450647.0,811.0,Columbia Hospital At Medical City Dallas Subsi...,0.0,0,0,0,...,1,2500.0,5045814001,0,0,0,0,1,2500.0,2500.0
3,UNCHANGED,Covered Recipient Teaching Hospital,0,300003.0,5.0,Mary Hitchcock Memorial Hospital,0.0,0,0,0,...,1,447.7,169406090,0,0,0,0,1,447.7,447.7
4,UNCHANGED,Covered Recipient Teaching Hospital,0,260162.0,593.0,Barnes Jewish West County Hospital,0.0,0,0,0,...,1,84.0,0,0,0,0,0,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
274907,NEW,Non-covered Recipient Entity,ST LOUIS UNIV,0.0,0.0,0,0.0,0,0,0,...,1,1005.0,169413212,0,0,0,0,1,1005.0,1005.0
274908,NEW,Non-covered Recipient Entity,UT SOUTHWESTERN CLINICAL TRIALS,0.0,0.0,0,0.0,0,0,0,...,1,9.0,169413212,0,0,0,0,1,9.0,9.0
274909,NEW,Non-covered Recipient Entity,UT SOUTHWESTERN CLINICAL TRIALS,0.0,0.0,0,0.0,0,0,0,...,1,5.3,169430313,0,0,0,0,1,5.3,5.3
274910,NEW,Non-covered Recipient Entity,FOUNDATION FOR ADVANCING VETERANS' HEALTH RESE...,0.0,0.0,0,0.0,0,0,0,...,1,56.9,0,0,0,0,0,0,0.0,0.0


In [None]:
df_research_pro_pivot1 = dfr.pivot_table(index ='NDC1', 
                                   columns = 'Program_Year', 
                                  values= 'product_payment', 
                                   aggfunc = ['sum']).fillna(0)

df_research_pro_pivot2 = dfr.pivot_table(index ='NDC2', 
                                         columns = 'Program_Year', 
                                         values= 'product_payment',
                                         aggfunc = ['sum'] ).fillna(0)

df_research_pro_pivot3 = dfr.pivot_table(index ='NDC3', 
                                   columns = 'Program_Year', 
                                  values= 'product_payment', 
                                   aggfunc = ['sum']).fillna(0)

df_research_pro_pivot4 = dfr.pivot_table(index ='NDC4', 
                                   columns = 'Program_Year', 
                                  values= 'product_payment', 
                                   aggfunc = ['sum']).fillna(0)

df_research_pro_pivot5 = dfr.pivot_table(index ='NDC5', 
                                   columns = 'Program_Year', 
                                  values= 'product_payment', 
                                   aggfunc = ['sum']).fillna(0)

df_research_pro_pivot = pd.concat([df_research_pro_pivot1, 
                               df_research_pro_pivot2, 
                               df_research_pro_pivot3,
                               df_research_pro_pivot4,
                               df_research_pro_pivot5], axis = 0).fillna(0)
df_research_pro_pivot.columns = df_research_pro_pivot.columns.droplevel(0)
df_research_pro_pivot = df_research_pro_pivot.rename_axis(None, axis=1)
df_research_pro_pivot = df_research_pro_pivot.reset_index()
df_research_pro_pivot.columns

Index(['index', 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021], dtype='object')

In [None]:
year = [2013,2014,2015,2016,2017, 2018, 2019, 2020, 2021]
dfs_names = ('df1', 'df2', 'df3', 'df4', 'df5', 'df6', 'df7', 'df8', 'df9')
dfs ={}
for dfn, year in zip(dfs_names, year): 
   dfs[dfn] = df_research_pro_pivot.pivot_table(index = 'index', 
                                                values= year, 
                                                aggfunc =['sum'], fill_value=0)
   
df_research_pro_pivot = pd.concat([dfs['df1'], 
                                  dfs['df2'], 
                                  dfs['df3'], 
                                  dfs['df4'], 
                                  dfs['df5'], 
                                  dfs['df6'], 
                                  dfs['df7'], 
                                  dfs['df8'], 
                                  dfs['df9']], axis = 1).fillna(0)

df_research_pro_pivot.columns = df_research_pro_pivot.columns.droplevel(0)
df_research_pro_pivot = df_research_pro_pivot.rename_axis(None, axis=1)
df_research_pro_pivot = df_research_pro_pivot.reset_index()

df_research_pro_pivot['total'] = df_research_pro_pivot[2013] + \
+ df_research_pro_pivot[2014] + df_research_pro_pivot[2015] + \
df_research_pro_pivot[2016] + df_research_pro_pivot[2017] + \
df_research_pro_pivot[2018] + df_research_pro_pivot[2019] + \
df_research_pro_pivot[2020] + df_research_pro_pivot[2021] 

df_research_pro_pivot

Unnamed: 0,index,2013,2014,2015,2016,2017,2018,2019,2020,2021,total
0,0,7.912439e+07,2.574010e+08,3.257950e+08,3.590212e+08,3.494838e+08,2.385449e+08,2.102366e+08,1.521150e+08,1.128767e+08,2.084599e+09
1,31427,1.707073e+05,0.000000e+00,0.000000e+00,1.859181e+06,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,2.029888e+06
2,34214,1.932853e+05,0.000000e+00,0.000000e+00,1.148919e+06,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1.342204e+06
3,1860776,0.000000e+00,0.000000e+00,2.742396e+05,2.505943e+05,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,5.248339e+05
4,1860777,4.371997e+04,1.350617e+05,0.000000e+00,0.000000e+00,1.527673e+05,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,3.315489e+05
...,...,...,...,...,...,...,...,...,...,...,...
290,7598705006,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,4.680000e+02,0.000000e+00,0.000000e+00,0.000000e+00,4.680000e+02
291,7598713015,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,2.130700e+04,3.779100e+04,5.909800e+04
292,7643110501,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,9.625000e+01,0.000000e+00,9.625000e+01
293,7643121001,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,5.758896e+04,1.140102e+05,1.715992e+05


In [None]:
df_research_pro_pivot['total'].sum()

2607962631.7111

In [None]:
df_research_pro_pivot.to_csv(path_out2 + "research_product.csv", index=None)

### Direct research payments

In [None]:
dfrd

Unnamed: 0,Change_Type,Covered_Recipient_Type,Noncovered_Recipient_Entity_Name,Teaching_Hospital_CCN,Teaching_Hospital_ID,Teaching_Hospital_Name,Covered_Recipient_Profile_ID,Covered_Recipient_First_Name,Covered_Recipient_Middle_Name,Covered_Recipient_Last_Name,Covered_Recipient_Name_Suffix,Recipient_Primary_Business_Street_Address_Line1,Recipient_Primary_Business_Street_Address_Line2,Recipient_City,Recipient_State,Recipient_Zip_Code,Recipient_Country,Recipient_Province,Recipient_Postal_Code,Covered_Recipient_Primary_Type_1,Covered_Recipient_Specialty_1,Covered_Recipient_License_State_code1,Covered_Recipient_License_State_code2,Covered_Recipient_License_State_code3,Covered_Recipient_License_State_code4,Covered_Recipient_License_State_code5,Principal_Investigator_1_Profile_ID,Principal_Investigator_1_First_Name,Principal_Investigator_1_Middle_Name,Principal_Investigator_1_Last_Name,Principal_Investigator_1_Name_Suffix,Principal_Investigator_1_Business_Street_Address_Line1,Principal_Investigator_1_Business_Street_Address_Line2,Principal_Investigator_1_City,Principal_Investigator_1_State,Principal_Investigator_1_Zip_Code,Principal_Investigator_1_Country,Principal_Investigator_1_Province,Principal_Investigator_1_Postal_Code,Principal_Investigator_1_Primary_Type_1,Principal_Investigator_1_Specialty_1,Principal_Investigator_1_License_State_code1,Principal_Investigator_1_License_State_code2,Principal_Investigator_1_License_State_code3,Principal_Investigator_1_License_State_code4,Principal_Investigator_1_License_State_code5,Principal_Investigator_2_Profile_ID,Principal_Investigator_2_First_Name,Principal_Investigator_2_Middle_Name,Principal_Investigator_2_Last_Name,Principal_Investigator_2_Name_Suffix,Principal_Investigator_2_Business_Street_Address_Line1,Principal_Investigator_2_Business_Street_Address_Line2,Principal_Investigator_2_City,Principal_Investigator_2_State,Principal_Investigator_2_Zip_Code,Principal_Investigator_2_Country,Principal_Investigator_2_Province,Principal_Investigator_2_Postal_Code,Principal_Investigator_2_Primary_Type_1,Principal_Investigator_2_Specialty_1,Principal_Investigator_2_License_State_code1,Principal_Investigator_2_License_State_code2,Principal_Investigator_2_License_State_code3,Principal_Investigator_2_License_State_code4,Principal_Investigator_2_License_State_code5,Principal_Investigator_3_Profile_ID,Principal_Investigator_3_First_Name,Principal_Investigator_3_Middle_Name,Principal_Investigator_3_Last_Name,Principal_Investigator_3_Name_Suffix,Principal_Investigator_3_Business_Street_Address_Line1,Principal_Investigator_3_Business_Street_Address_Line2,Principal_Investigator_3_City,Principal_Investigator_3_State,Principal_Investigator_3_Zip_Code,Principal_Investigator_3_Country,Principal_Investigator_3_Province,Principal_Investigator_3_Postal_Code,Principal_Investigator_3_Primary_Type_1,Principal_Investigator_3_Specialty_1,Principal_Investigator_3_License_State_code1,Principal_Investigator_3_License_State_code2,Principal_Investigator_3_License_State_code3,Principal_Investigator_3_License_State_code4,Principal_Investigator_3_License_State_code5,Principal_Investigator_4_Profile_ID,Principal_Investigator_4_First_Name,Principal_Investigator_4_Middle_Name,Principal_Investigator_4_Last_Name,Principal_Investigator_4_Name_Suffix,Principal_Investigator_4_Business_Street_Address_Line1,Principal_Investigator_4_Business_Street_Address_Line2,Principal_Investigator_4_City,Principal_Investigator_4_State,...,Principal_Investigator_2_NPI,Principal_Investigator_3_NPI,Principal_Investigator_4_NPI,Principal_Investigator_5_NPI,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_1,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_2,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_3,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_4,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_5,Covered_Recipient_NPI,Covered_Recipient_Primary_Type_2,Covered_Recipient_Primary_Type_3,Covered_Recipient_Primary_Type_4,Covered_Recipient_Primary_Type_5,Covered_Recipient_Primary_Type_6,Covered_Recipient_Specialty_2,Covered_Recipient_Specialty_3,Covered_Recipient_Specialty_4,Covered_Recipient_Specialty_5,Covered_Recipient_Specialty_6,Principal_Investigator_1_Covered_Recipient_Type,Principal_Investigator_1_Primary_Type_2,Principal_Investigator_1_Primary_Type_3,Principal_Investigator_1_Primary_Type_4,Principal_Investigator_1_Primary_Type_5,Principal_Investigator_1_Primary_Type_6,Principal_Investigator_1_Specialty_2,Principal_Investigator_1_Specialty_3,Principal_Investigator_1_Specialty_4,Principal_Investigator_1_Specialty_5,Principal_Investigator_1_Specialty_6,Principal_Investigator_2_Covered_Recipient_Type,Principal_Investigator_2_Primary_Type_2,Principal_Investigator_2_Primary_Type_3,Principal_Investigator_2_Primary_Type_4,Principal_Investigator_2_Primary_Type_5,Principal_Investigator_2_Primary_Type_6,Principal_Investigator_2_Specialty_2,Principal_Investigator_2_Specialty_3,Principal_Investigator_2_Specialty_4,Principal_Investigator_2_Specialty_5,Principal_Investigator_2_Specialty_6,Principal_Investigator_3_Covered_Recipient_Type,Principal_Investigator_3_Primary_Type_2,Principal_Investigator_3_Primary_Type_3,Principal_Investigator_3_Primary_Type_4,Principal_Investigator_3_Primary_Type_5,Principal_Investigator_3_Primary_Type_6,Principal_Investigator_3_Specialty_2,Principal_Investigator_3_Specialty_3,Principal_Investigator_3_Specialty_4,Principal_Investigator_3_Specialty_5,Principal_Investigator_3_Specialty_6,Principal_Investigator_4_Covered_Recipient_Type,Principal_Investigator_4_Primary_Type_2,Principal_Investigator_4_Primary_Type_3,Principal_Investigator_4_Primary_Type_4,Principal_Investigator_4_Primary_Type_5,Principal_Investigator_4_Primary_Type_6,Principal_Investigator_4_Specialty_2,Principal_Investigator_4_Specialty_3,Principal_Investigator_4_Specialty_4,Principal_Investigator_4_Specialty_5,Principal_Investigator_4_Specialty_6,Principal_Investigator_5_Covered_Recipient_Type,Principal_Investigator_5_Primary_Type_2,Principal_Investigator_5_Primary_Type_3,Principal_Investigator_5_Primary_Type_4,Principal_Investigator_5_Primary_Type_5,Principal_Investigator_5_Primary_Type_6,Principal_Investigator_5_Specialty_2,Principal_Investigator_5_Specialty_3,Principal_Investigator_5_Specialty_4,Principal_Investigator_5_Specialty_5,Principal_Investigator_5_Specialty_6,Covered_or_Noncovered_Indicator_1,Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_1,Product_Category_or_Therapeutic_Area_1,Associated_Device_or_Medical_Supply_PDI_1,Covered_or_Noncovered_Indicator_2,Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_2,Product_Category_or_Therapeutic_Area_2,Associated_Device_or_Medical_Supply_PDI_2,Covered_or_Noncovered_Indicator_3,Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_3,Product_Category_or_Therapeutic_Area_3,Associated_Device_or_Medical_Supply_PDI_3,Covered_or_Noncovered_Indicator_4,Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_4,Product_Category_or_Therapeutic_Area_4,Associated_Device_or_Medical_Supply_PDI_4,Covered_or_Noncovered_Indicator_5,Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_5,Product_Category_or_Therapeutic_Area_5,Associated_Device_or_Medical_Supply_PDI_5
0,UNCHANGED,Covered Recipient Physician,,,,,223086.0,DAVID,,LILJENQUIST,,2220 E 25TH ST,,IDAHO FALLS,ID,83404,United States,,,Medical Doctor,Allopathic & Osteopathic Physicians|Internal M...,ID,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,SYNCRIA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,UNCHANGED,Covered Recipient Physician,,,,,214700.0,NELSON,BARNETT,WATTS,,4760 E GALBRAITH RD,STE 208,CINCINNATI,OH,45236-6703,United States,,,Medical Doctor,Allopathic & Osteopathic Physicians|Internal M...,OH,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,NATPARA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,UNCHANGED,Covered Recipient Physician,,,,,65906.0,DAN,ALEXANDRU,STREJA,,7345 MEDICAL CENTER DR STE 600,,WEST HILLS,CA,91307,United States,,,Medical Doctor,Allopathic & Osteopathic Physicians|Internal M...,CA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,MK-3102,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,UNCHANGED,Covered Recipient Physician,,,,,651870.0,RONALD,K,MAYFIELD,,100 E. Wood Street Suite 101,,SPARTANBURG,SC,29303,United States,,,Medical Doctor,Allopathic & Osteopathic Physicians|Internal M...,SC,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,MK-3102,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,UNCHANGED,Covered Recipient Physician,,,,,222358.0,OPADA,,ALZOHAILI,,540 E Canfield St Dept Med,,Detroit,MI,48201,United States,,,Medical Doctor,Allopathic & Osteopathic Physicians|Internal M...,MI,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,MK-3102,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12390,NEW,Covered Recipient Physician,,,,,276270.0,ILEANA,JOSEFINA,TANDRON,,2240 GAUSE BLVD E,,SLIDELL,LA,70461,United States,,,Medical Doctor,Allopathic & Osteopathic Physicians|Internal M...,LA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,RYBELSUS,,,,,1.083615e+09,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Covered,Drug,Diabetes,,,,,,,,,,,,,,,,,
12391,NEW,Covered Recipient Physician,,,,,276270.0,ILEANA,JOSEFINA,TANDRON,,2240 GAUSE BLVD E,,SLIDELL,LA,70461,United States,,,Medical Doctor,Allopathic & Osteopathic Physicians|Internal M...,LA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,RYBELSUS,,,,,1.083615e+09,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Covered,Drug,Diabetes,,,,,,,,,,,,,,,,,
12392,NEW,Covered Recipient Physician,,,,,276270.0,ILEANA,JOSEFINA,TANDRON,,2240 GAUSE BLVD E,,SLIDELL,LA,70461,United States,,,Medical Doctor,Allopathic & Osteopathic Physicians|Internal M...,LA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,RYBELSUS,,,,,1.083615e+09,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Covered,Drug,Diabetes,,,,,,,,,,,,,,,,,
12393,NEW,Covered Recipient Physician,,,,,181001.0,CARLOS,,ARAUZ PACHECO,,12606 GREENVILLE AVE,STE 215,DALLAS,TX,75243,United States,,,Medical Doctor,Allopathic & Osteopathic Physicians|Internal M...,TX,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,Ozempic,,,,,1.992766e+09,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Covered,Drug,Diabetes,,,,,,,,,,,,,,,,,


In [None]:
dfrd['NDC1']= dfrd['Associated_Drug_or_Biological_NDC_1'].str.replace("-","")
dfrd['NDC2']= dfrd['Associated_Drug_or_Biological_NDC_2'].str.replace("-","")
dfrd['NDC3']= dfrd['Associated_Drug_or_Biological_NDC_3'].str.replace("-","")

In [None]:
dfrd['NDC4']= dfrd['Associated_Drug_or_Biological_NDC_4'].str.replace("-","")
dfrd['NDC5']= dfrd['Associated_Drug_or_Biological_NDC_5'].str.replace("-","")

In [None]:
dfrd['product_count']= dfrd[['Associated_Drug_or_Biological_NDC_1', 
           'Associated_Drug_or_Biological_NDC_2', 
           'Associated_Drug_or_Biological_NDC_3', 
           'Associated_Drug_or_Biological_NDC_4', 
           'Associated_Drug_or_Biological_NDC_5']].count(axis = 1)
dfrd['product_payment'] = round((dfrd['Total_Amount_of_Payment_USDollars'] / dfrd['product_count']),5)           
dfrd['product_payment'].replace([np.inf, -np.inf], 0, inplace=True)

In [None]:
df_research_pro_pivot1 = dfrd.pivot_table(index ='NDC1', 
                                   columns = 'Program_Year', 
                                  values= 'product_payment', 
                                   aggfunc = ['sum']).fillna(0)

df_research_pro_pivot2 = dfrd.pivot_table(index ='NDC2', 
                                         columns = 'Program_Year', 
                                         values= 'product_payment',
                                         aggfunc = ['sum'] ).fillna(0)

df_research_pro_pivot3 = dfrd.pivot_table(index ='NDC3', 
                                   columns = 'Program_Year', 
                                  values= 'product_payment', 
                                   aggfunc = ['sum']).fillna(0)

df_research_pro_pivot4 = dfrd.pivot_table(index ='NDC4', 
                                   columns = 'Program_Year', 
                                  values= 'product_payment', 
                                   aggfunc = ['sum']).fillna(0)

df_research_pro_pivot5 = dfrd.pivot_table(index ='NDC5', 
                                   columns = 'Program_Year', 
                                  values= 'product_payment', 
                                   aggfunc = ['sum']).fillna(0)

df_research_pro_pivot = pd.concat([df_research_pro_pivot1, 
                               df_research_pro_pivot2, 
                               df_research_pro_pivot3], axis = 0).fillna(0)
df_research_pro_pivot.columns = df_research_pro_pivot.columns.droplevel(0)
df_research_pro_pivot = df_research_pro_pivot.rename_axis(None, axis=1)
df_research_pro_pivot = df_research_pro_pivot.reset_index()
df_research_pro_pivot.columns

Index(['index', 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021], dtype='object')

In [None]:
year = [2013,2014,2015,2016,2017, 2018, 2019, 2020, 2021]
dfs_names = ('df1', 'df2', 'df3', 'df4', 'df5', 'df6', 'df7', 'df8', 'df9')
dfs ={}
for dfn, year in zip(dfs_names, year): 
   dfs[dfn] = df_research_pro_pivot.pivot_table(index = 'index', 
                                  values= year, 
                                   aggfunc =['sum'], fill_value=0)
   
df_research_pro_pivot = pd.concat([dfs['df1'], 
                                  dfs['df2'], 
                                  dfs['df3'], 
                                  dfs['df4'], 
                                  dfs['df5'], 
                                  dfs['df6'], 
                                  dfs['df7'], 
                                  dfs['df8'], 
                                  dfs['df9']], axis = 1).fillna(0)

df_research_pro_pivot.columns = df_research_pro_pivot.columns.droplevel(0)
df_research_pro_pivot = df_research_pro_pivot.rename_axis(None, axis=1)
df_research_pro_pivot = df_research_pro_pivot.reset_index()

df_research_pro_pivot['total'] = df_research_pro_pivot[2013] + \
+ df_research_pro_pivot[2014] + df_research_pro_pivot[2015] + \
df_research_pro_pivot[2016] + df_research_pro_pivot[2017] + \
df_research_pro_pivot[2018] + df_research_pro_pivot[2019] + \
df_research_pro_pivot[2020] + df_research_pro_pivot[2021] 

df_research_pro_pivot

Unnamed: 0,index,2013,2014,2015,2016,2017,2018,2019,2020,2021,total
0,0000031427,2016.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,2016.00
1,0000034214,2000.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,2000.00
2,0002143380,0.0,52132.89,71466.49,113054.85,87219.09,175047.76,79923.43,22085.87,10916.49,611846.87
3,0002144511,0.0,0.00,0.00,568.00,0.00,0.00,0.00,0.00,0.00,568.00
4,0002197590,52.0,7.50,0.00,0.00,0.00,0.00,0.00,0.00,0.00,59.50
...,...,...,...,...,...,...,...,...,...,...,...
97,7019400330,0.0,0.00,0.00,0.00,0.00,0.00,9584.00,0.00,0.00,9584.00
98,7185800104,0.0,0.00,0.00,0.00,0.00,0.00,67285.00,0.00,0.00,67285.00
99,7251176002,0.0,0.00,0.00,0.00,0.00,0.00,50403.21,0.00,0.00,50403.21
100,7261830002,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,500.00,500.00


In [None]:
df_research_pro_pivot['total'].sum()

10901983.57

In [None]:
df_research_pro_pivot.to_csv(path_out2 + "direct_research_product.csv", index=None)

# Payments analysis

## General payments

In [11]:
dfg.sort_values(by=['Total_Amount_of_Payment_USDollars'], ascending=False)

Unnamed: 0,Change_Type,Covered_Recipient_Profile_ID,Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name,Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_ID,Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Name,Total_Amount_of_Payment_USDollars,Date_of_Payment,Number_of_Payments_Included_in_Total_Amount,Nature_of_Payment_or_Transfer_of_Value,Contextual_Information,...,Associated_Drug_or_Biological_NDC_2,Associated_Drug_or_Biological_NDC_3,Associated_Drug_or_Biological_NDC_4,Associated_Drug_or_Biological_NDC_5,Program_Year,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_1,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_2,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_3,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_4,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_5
722204,UNCHANGED,663788.0,AstraZeneca Pharmaceuticals LP,100000000146,AstraZeneca Pharmaceuticals LP,1826075.0,11/16/2015,1,Royalty or License,,...,,,,,2015,BYETTA,,,,
722205,UNCHANGED,663788.0,AstraZeneca Pharmaceuticals LP,100000000146,AstraZeneca Pharmaceuticals LP,1734921.0,05/14/2015,1,Royalty or License,,...,,,,,2015,BYETTA,,,,
1015348,UNCHANGED,663788.0,AstraZeneca Pharmaceuticals LP,100000000146,AstraZeneca Pharmaceuticals LP,1720480.0,08/14/2016,1,Royalty or License,,...,,,,,2016,BYETTA,,,,
722203,UNCHANGED,663788.0,AstraZeneca Pharmaceuticals LP,100000000146,AstraZeneca Pharmaceuticals LP,1695981.0,08/20/2015,1,Royalty or License,,...,,,,,2015,BYETTA,,,,
1015347,UNCHANGED,663788.0,AstraZeneca Pharmaceuticals LP,100000000146,AstraZeneca Pharmaceuticals LP,1653615.0,11/02/2016,1,Royalty or License,,...,,,,,2016,BYETTA,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245154,UNCHANGED,54579.0,Sanofi and Genzyme US Companies,100000000076,SANOFI-AVENTIS U.S. LLC,0.0,11/21/2014,1,Travel and Lodging,,...,,,,,2014,,,,,
245276,UNCHANGED,506323.0,Sanofi and Genzyme US Companies,100000000076,SANOFI-AVENTIS U.S. LLC,0.0,02/13/2014,1,Travel and Lodging,,...,,,,,2014,,,,,
245316,UNCHANGED,275081.0,Sanofi and Genzyme US Companies,100000000076,SANOFI-AVENTIS U.S. LLC,0.0,02/27/2014,1,Compensation for services other than consultin...,,...,,,,,2014,,,,,
245332,UNCHANGED,361549.0,Sanofi and Genzyme US Companies,100000000076,SANOFI-AVENTIS U.S. LLC,0.0,08/20/2014,1,Compensation for services other than consultin...,,...,,,,,2014,,,,,


In [16]:
dfg_check = dfg[dfg['Total_Amount_of_Payment_USDollars'] > 1000000]
dfg['Total_Amount_of_Payment_USDollars'][dfg['Total_Amount_of_Payment_USDollars'] > 1000000].count()

16

In [17]:
dfg_check

Unnamed: 0,Change_Type,Covered_Recipient_Profile_ID,Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name,Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_ID,Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Name,Total_Amount_of_Payment_USDollars,Date_of_Payment,Number_of_Payments_Included_in_Total_Amount,Nature_of_Payment_or_Transfer_of_Value,Contextual_Information,...,Associated_Drug_or_Biological_NDC_2,Associated_Drug_or_Biological_NDC_3,Associated_Drug_or_Biological_NDC_4,Associated_Drug_or_Biological_NDC_5,Program_Year,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_1,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_2,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_3,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_4,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_5
408282,UNCHANGED,663788.0,AstraZeneca Pharmaceuticals LP,100000000146,AstraZeneca Pharmaceuticals LP,1485149.0,05/15/2014,1,Royalty or License,,...,,,,,2014,BYETTA,,,,
408283,UNCHANGED,663788.0,AstraZeneca Pharmaceuticals LP,100000000146,AstraZeneca Pharmaceuticals LP,1477915.0,08/15/2014,1,Royalty or License,,...,,,,,2014,BYETTA,,,,
408284,UNCHANGED,663788.0,AstraZeneca Pharmaceuticals LP,100000000146,AstraZeneca Pharmaceuticals LP,1627835.0,11/13/2014,1,Royalty or License,,...,,,,,2014,BYETTA,,,,
722202,UNCHANGED,663788.0,AstraZeneca Pharmaceuticals LP,100000000146,AstraZeneca Pharmaceuticals LP,1407050.0,02/15/2015,1,Royalty or License,,...,,,,,2015,BYETTA,,,,
722203,UNCHANGED,663788.0,AstraZeneca Pharmaceuticals LP,100000000146,AstraZeneca Pharmaceuticals LP,1695981.0,08/20/2015,1,Royalty or License,,...,,,,,2015,BYETTA,,,,
722204,UNCHANGED,663788.0,AstraZeneca Pharmaceuticals LP,100000000146,AstraZeneca Pharmaceuticals LP,1826075.0,11/16/2015,1,Royalty or License,,...,,,,,2015,BYETTA,,,,
722205,UNCHANGED,663788.0,AstraZeneca Pharmaceuticals LP,100000000146,AstraZeneca Pharmaceuticals LP,1734921.0,05/14/2015,1,Royalty or License,,...,,,,,2015,BYETTA,,,,
1015347,UNCHANGED,663788.0,AstraZeneca Pharmaceuticals LP,100000000146,AstraZeneca Pharmaceuticals LP,1653615.0,11/02/2016,1,Royalty or License,,...,,,,,2016,BYETTA,,,,
1015348,UNCHANGED,663788.0,AstraZeneca Pharmaceuticals LP,100000000146,AstraZeneca Pharmaceuticals LP,1720480.0,08/14/2016,1,Royalty or License,,...,,,,,2016,BYETTA,,,,
1015349,UNCHANGED,663788.0,AstraZeneca Pharmaceuticals LP,100000000146,AstraZeneca Pharmaceuticals LP,1505772.0,05/15/2016,1,Royalty or License,,...,,,,,2016,BYETTA,,,,


In [15]:
dfg_check = dfg[dfg['Total_Amount_of_Payment_USDollars'] > 100000]
dfg_check

Unnamed: 0,Change_Type,Covered_Recipient_Profile_ID,Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name,Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_ID,Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Name,Total_Amount_of_Payment_USDollars,Date_of_Payment,Number_of_Payments_Included_in_Total_Amount,Nature_of_Payment_or_Transfer_of_Value,Contextual_Information,...,Associated_Drug_or_Biological_NDC_2,Associated_Drug_or_Biological_NDC_3,Associated_Drug_or_Biological_NDC_4,Associated_Drug_or_Biological_NDC_5,Program_Year,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_1,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_2,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_3,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_4,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_5
71006,UNCHANGED,240104.0,Sanofi and Genzyme US Companies,100000000076,SANOFI-AVENTIS U.S. LLC,323145.0,10/03/2013,1,Consulting Fee,Made pursuant to a Settlement Agreement resolv...,...,,,,,2013,,,,,
71007,UNCHANGED,16338.0,Sanofi and Genzyme US Companies,100000000076,SANOFI-AVENTIS U.S. LLC,273933.0,10/03/2013,1,Compensation for services other than consultin...,Made pursuant to a Settlement Agreement resolv...,...,,,,,2013,,,,,
76119,UNCHANGED,240104.0,Sanofi and Genzyme US Companies,100000000076,SANOFI-AVENTIS U.S. LLC,273933.0,10/03/2013,1,Compensation for services other than consultin...,Made pursuant to a Settlement Agreement resolv...,...,,,,,2013,,,,,
76834,UNCHANGED,16338.0,Sanofi and Genzyme US Companies,100000000076,SANOFI-AVENTIS U.S. LLC,217738.0,10/03/2013,1,Consulting Fee,Made pursuant to a Settlement Agreement resolv...,...,,,,,2013,,,,,
235396,UNCHANGED,64469.0,"Dexcom, Inc.",100000010478,"Dexcom, Inc.",232498.0,05/30/2014,1,Current or prospective ownership or investment...,Annual stock grant to member of Dexcom Board o...,...,,,,,2014,,,,,
408282,UNCHANGED,663788.0,AstraZeneca Pharmaceuticals LP,100000000146,AstraZeneca Pharmaceuticals LP,1485149.0,05/15/2014,1,Royalty or License,,...,,,,,2014,BYETTA,,,,
408283,UNCHANGED,663788.0,AstraZeneca Pharmaceuticals LP,100000000146,AstraZeneca Pharmaceuticals LP,1477915.0,08/15/2014,1,Royalty or License,,...,,,,,2014,BYETTA,,,,
408284,UNCHANGED,663788.0,AstraZeneca Pharmaceuticals LP,100000000146,AstraZeneca Pharmaceuticals LP,1627835.0,11/13/2014,1,Royalty or License,,...,,,,,2014,BYETTA,,,,
678860,UNCHANGED,64469.0,"Dexcom, Inc.",100000010478,"Dexcom, Inc.",324344.0,06/03/2015,1,Consulting Fee,BOARD OF DIRECTORS - MEMBER COMPENSATION,...,,,,,2015,,,,,
722202,UNCHANGED,663788.0,AstraZeneca Pharmaceuticals LP,100000000146,AstraZeneca Pharmaceuticals LP,1407050.0,02/15/2015,1,Royalty or License,,...,,,,,2015,BYETTA,,,,


In [None]:
dfg_check_speak = dfg[dfg['Nature_of_Payment_or_Transfer_of_Value'] == ""]
dfg_check_speak

In [None]:
dfg[dfg['Contextual_Information'].str.contains('company acquisition', na=False)].sort_values(by=['Total_Amount_of_Payment_USDollars'], ascending=False)

Unnamed: 0,Change_Type,Covered_Recipient_Profile_ID,Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name,Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_ID,Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Name,Total_Amount_of_Payment_USDollars,Date_of_Payment,Number_of_Payments_Included_in_Total_Amount,Nature_of_Payment_or_Transfer_of_Value,Contextual_Information,Dispute_Status_for_Publication,Associated_Drug_or_Biological_NDC_1,Associated_Drug_or_Biological_NDC_2,Associated_Drug_or_Biological_NDC_3,Associated_Drug_or_Biological_NDC_4,Associated_Drug_or_Biological_NDC_5,Program_Year,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_1,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_2,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_3,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_4,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_5,month,day,year,cmonth,category,category2,NDC1,NDC2,NDC3,NDC4,NDC5,product_count,product_payment


In [None]:
dfg["Number_of_Payments_Included_in_Total_Amount"].sum()

2381195

In [None]:
dfg.groupby(['Dispute_Status_for_Publication'])["Number_of_Payments_Included_in_Total_Amount"].sum()

Dispute_Status_for_Publication
No     2381004
Yes        191
Name: Number_of_Payments_Included_in_Total_Amount, dtype: int64

In [22]:
dfg.groupby(['Dispute_Status_for_Publication', 'Program_Year'])["Number_of_Payments_Included_in_Total_Amount"].sum()

Dispute_Status_for_Publication  Program_Year
No                              2013            107422
                                2014            310955
                                2015            344719
                                2016            346246
                                2017            336243
                                2018            305537
                                2019            284445
                                2020            161323
                                2021            184114
Yes                             2013                18
                                2014                77
                                2015                 6
                                2016                 5
                                2017                18
                                2018                28
                                2019                13
                                2020                25
                    

In [None]:
dfg.groupby(['Dispute_Status_for_Publication', 'Change_Type'])["Number_of_Payments_Included_in_Total_Amount"].sum()

Dispute_Status_for_Publication  Change_Type
No                              ADD                918
                                CHANGED           1166
                                NEW             185125
                                UNCHANGED      2193795
Yes                             NEW                  1
                                UNCHANGED          190
Name: Number_of_Payments_Included_in_Total_Amount, dtype: int64

In [None]:
dfg.groupby(['Contextual_Information'])["Number_of_Payments_Included_in_Total_Amount"].sum()

Contextual_Information
.                                                                       1
02953.01-HF-071716                                                      1
1 Consulting Fee                                                        2
1 hour product feedback call with marketing for next gen. dxa unit      1
1 meal/break                                                          100
                                                                     ... 
inservice                                                               1
loaner product provided for more than 90 days                           1
referral lunch Dr Walsh Dr guleria Palm Springs                         1
referral lunch for Dr Dicks with Hillcrest Internal Medicine 20         1
review of mens health portfolio                                         1
Name: Number_of_Payments_Included_in_Total_Amount, Length: 17392, dtype: int64

In [None]:
dfg['Total_Amount_of_Payment_USDollars'].sum()

542388438.3100001

In [None]:
dfg.groupby('Program_Year')['Total_Amount_of_Payment_USDollars'].sum()

Program_Year
2013    27706754.05
2014    76939432.98
2015    81674817.29
2016    76680864.76
2017    85732734.96
2018    69980399.59
2019    60144059.61
2020    31679796.05
2021    31849579.02
Name: Total_Amount_of_Payment_USDollars, dtype: float64

In [None]:
dfg.groupby('Program_Year')['Number_of_Payments_Included_in_Total_Amount'].sum()

Program_Year
2013    107440
2014    311032
2015    344725
2016    346251
2017    336261
2018    305565
2019    284458
2020    161348
2021    184115
Name: Number_of_Payments_Included_in_Total_Amount, dtype: int64

In [None]:
dfg['Total_Amount_of_Payment_USDollars'].quantile([0.25, 0.5, 0.75])

0.25    11.96
0.50    16.69
0.75    34.99
Name: Total_Amount_of_Payment_USDollars, dtype: float64

In [None]:
round(dfg['Total_Amount_of_Payment_USDollars'].mean(),2)

234.66

In [None]:
round(dfg['Total_Amount_of_Payment_USDollars'].std(),2)

4203.09

In [None]:
round(dfg.groupby(['Program_Year'])['Total_Amount_of_Payment_USDollars'].quantile([0.25, 0.5, 0.75]),2)

Program_Year      
2013          0.25    11.86
              0.50    16.91
              0.75    71.50
2014          0.25    11.60
              0.50    16.71
              0.75    59.52
2015          0.25    11.31
              0.50    16.24
              0.75    40.00
2016          0.25    11.30
              0.50    16.10
              0.75    30.00
2017          0.25    11.90
              0.50    16.80
              0.75    50.00
2018          0.25    12.17
              0.50    16.67
              0.75    35.97
2019          0.25    12.16
              0.50    16.63
              0.75    30.00
2020          0.25    13.39
              0.50    17.50
              0.75    24.50
2021          0.25    13.26
              0.50    17.39
              0.75    23.55
Name: Total_Amount_of_Payment_USDollars, dtype: float64

In [None]:
dfg.groupby(['Program_Year'])['Total_Amount_of_Payment_USDollars'].mean()

Program_Year
2013    261.248918
2014    254.383916
2015    242.915701
2016    229.461826
2017    263.559476
2018    235.935644
2019    217.738122
2020    202.838970
2021    178.784580
Name: Total_Amount_of_Payment_USDollars, dtype: float64

In [None]:
dfr.groupby('Program_Year')['Total_Amount_of_Payment_USDollars'].count()

Program_Year
2013    12338
2014    41902
2015    48120
2016    38084
2017    31430
2018    27559
2019    18923
2020    30475
2021    26081
Name: Total_Amount_of_Payment_USDollars, dtype: int64

In [None]:
dfr.groupby('Program_Year')['Total_Amount_of_Payment_USDollars'].count()

Program_Year
2013    12338
2014    41902
2015    48120
2016    38084
2017    31430
2018    27559
2019    18923
2020    30475
2021    26081
Name: Total_Amount_of_Payment_USDollars, dtype: int64

## Associated research payments

In [None]:
round(dfr.agg({'Total_Amount_of_Payment_USDollars': ['sum','count']}),2)

Unnamed: 0,Total_Amount_of_Payment_USDollars
sum,1131153000.0
count,274912.0


In [None]:
round(dfr['Total_Amount_of_Payment_USDollars'].sum(), 2)

1131152705.74

In [None]:
dfr

Unnamed: 0,Change_Type,Covered_Recipient_Type,Noncovered_Recipient_Entity_Name,Teaching_Hospital_CCN,Teaching_Hospital_ID,Teaching_Hospital_Name,Covered_Recipient_Profile_ID,Covered_Recipient_First_Name,Covered_Recipient_Middle_Name,Covered_Recipient_Last_Name,Covered_Recipient_Name_Suffix,Recipient_Primary_Business_Street_Address_Line1,Recipient_Primary_Business_Street_Address_Line2,Recipient_City,Recipient_State,Recipient_Zip_Code,Recipient_Country,Recipient_Province,Recipient_Postal_Code,Covered_Recipient_Primary_Type_1,Covered_Recipient_Specialty_1,Covered_Recipient_License_State_code1,Covered_Recipient_License_State_code2,Covered_Recipient_License_State_code3,Covered_Recipient_License_State_code4,Covered_Recipient_License_State_code5,Principal_Investigator_1_Profile_ID,Principal_Investigator_1_First_Name,Principal_Investigator_1_Middle_Name,Principal_Investigator_1_Last_Name,Principal_Investigator_1_Name_Suffix,Principal_Investigator_1_Business_Street_Address_Line1,Principal_Investigator_1_Business_Street_Address_Line2,Principal_Investigator_1_City,Principal_Investigator_1_State,Principal_Investigator_1_Zip_Code,Principal_Investigator_1_Country,Principal_Investigator_1_Province,Principal_Investigator_1_Postal_Code,Principal_Investigator_1_Primary_Type_1,Principal_Investigator_1_Specialty_1,Principal_Investigator_1_License_State_code1,Principal_Investigator_1_License_State_code2,Principal_Investigator_1_License_State_code3,Principal_Investigator_1_License_State_code4,Principal_Investigator_1_License_State_code5,Principal_Investigator_2_Profile_ID,Principal_Investigator_2_First_Name,Principal_Investigator_2_Middle_Name,Principal_Investigator_2_Last_Name,Principal_Investigator_2_Name_Suffix,Principal_Investigator_2_Business_Street_Address_Line1,Principal_Investigator_2_Business_Street_Address_Line2,Principal_Investigator_2_City,Principal_Investigator_2_State,Principal_Investigator_2_Zip_Code,Principal_Investigator_2_Country,Principal_Investigator_2_Province,Principal_Investigator_2_Postal_Code,Principal_Investigator_2_Primary_Type_1,Principal_Investigator_2_Specialty_1,Principal_Investigator_2_License_State_code1,Principal_Investigator_2_License_State_code2,Principal_Investigator_2_License_State_code3,Principal_Investigator_2_License_State_code4,Principal_Investigator_2_License_State_code5,Principal_Investigator_3_Profile_ID,Principal_Investigator_3_First_Name,Principal_Investigator_3_Middle_Name,Principal_Investigator_3_Last_Name,Principal_Investigator_3_Name_Suffix,Principal_Investigator_3_Business_Street_Address_Line1,Principal_Investigator_3_Business_Street_Address_Line2,Principal_Investigator_3_City,Principal_Investigator_3_State,Principal_Investigator_3_Zip_Code,Principal_Investigator_3_Country,Principal_Investigator_3_Province,Principal_Investigator_3_Postal_Code,Principal_Investigator_3_Primary_Type_1,Principal_Investigator_3_Specialty_1,Principal_Investigator_3_License_State_code1,Principal_Investigator_3_License_State_code2,Principal_Investigator_3_License_State_code3,Principal_Investigator_3_License_State_code4,Principal_Investigator_3_License_State_code5,Principal_Investigator_4_Profile_ID,Principal_Investigator_4_First_Name,Principal_Investigator_4_Middle_Name,Principal_Investigator_4_Last_Name,Principal_Investigator_4_Name_Suffix,Principal_Investigator_4_Business_Street_Address_Line1,Principal_Investigator_4_Business_Street_Address_Line2,Principal_Investigator_4_City,Principal_Investigator_4_State,...,Covered_Recipient_Specialty_3,Covered_Recipient_Specialty_4,Covered_Recipient_Specialty_5,Covered_Recipient_Specialty_6,Principal_Investigator_1_Covered_Recipient_Type,Principal_Investigator_1_Primary_Type_2,Principal_Investigator_1_Primary_Type_3,Principal_Investigator_1_Primary_Type_4,Principal_Investigator_1_Primary_Type_5,Principal_Investigator_1_Primary_Type_6,Principal_Investigator_1_Specialty_2,Principal_Investigator_1_Specialty_3,Principal_Investigator_1_Specialty_4,Principal_Investigator_1_Specialty_5,Principal_Investigator_1_Specialty_6,Principal_Investigator_2_Covered_Recipient_Type,Principal_Investigator_2_Primary_Type_2,Principal_Investigator_2_Primary_Type_3,Principal_Investigator_2_Primary_Type_4,Principal_Investigator_2_Primary_Type_5,Principal_Investigator_2_Primary_Type_6,Principal_Investigator_2_Specialty_2,Principal_Investigator_2_Specialty_3,Principal_Investigator_2_Specialty_4,Principal_Investigator_2_Specialty_5,Principal_Investigator_2_Specialty_6,Principal_Investigator_3_Covered_Recipient_Type,Principal_Investigator_3_Primary_Type_2,Principal_Investigator_3_Primary_Type_3,Principal_Investigator_3_Primary_Type_4,Principal_Investigator_3_Primary_Type_5,Principal_Investigator_3_Primary_Type_6,Principal_Investigator_3_Specialty_2,Principal_Investigator_3_Specialty_3,Principal_Investigator_3_Specialty_4,Principal_Investigator_3_Specialty_5,Principal_Investigator_3_Specialty_6,Principal_Investigator_4_Covered_Recipient_Type,Principal_Investigator_4_Primary_Type_2,Principal_Investigator_4_Primary_Type_3,Principal_Investigator_4_Primary_Type_4,Principal_Investigator_4_Primary_Type_5,Principal_Investigator_4_Primary_Type_6,Principal_Investigator_4_Specialty_2,Principal_Investigator_4_Specialty_3,Principal_Investigator_4_Specialty_4,Principal_Investigator_4_Specialty_5,Principal_Investigator_4_Specialty_6,Principal_Investigator_5_Covered_Recipient_Type,Principal_Investigator_5_Primary_Type_2,Principal_Investigator_5_Primary_Type_3,Principal_Investigator_5_Primary_Type_4,Principal_Investigator_5_Primary_Type_5,Principal_Investigator_5_Primary_Type_6,Principal_Investigator_5_Specialty_2,Principal_Investigator_5_Specialty_3,Principal_Investigator_5_Specialty_4,Principal_Investigator_5_Specialty_5,Principal_Investigator_5_Specialty_6,Covered_or_Noncovered_Indicator_1,Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_1,Product_Category_or_Therapeutic_Area_1,Associated_Device_or_Medical_Supply_PDI_1,Covered_or_Noncovered_Indicator_2,Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_2,Product_Category_or_Therapeutic_Area_2,Associated_Device_or_Medical_Supply_PDI_2,Covered_or_Noncovered_Indicator_3,Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_3,Product_Category_or_Therapeutic_Area_3,Associated_Device_or_Medical_Supply_PDI_3,Covered_or_Noncovered_Indicator_4,Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_4,Product_Category_or_Therapeutic_Area_4,Associated_Device_or_Medical_Supply_PDI_4,Covered_or_Noncovered_Indicator_5,Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_5,Product_Category_or_Therapeutic_Area_5,Associated_Device_or_Medical_Supply_PDI_5,PI_count,per_payment,PI1,PI2,PI3,PI4,PI5,PI_OPDcount,PI_OPD_per_payment,PI,payment_DMdoc,NDC1,NDC2,NDC3,product_count,product_payment
0,UNCHANGED,Covered Recipient Teaching Hospital,0,70033.0,46.0,DANBURY HOSPITAL,0.0,0,0,0,0,25 GERMANTOWN RD,0,DANBURY,CT,06810,United States,0,0,0,0,0,0.0,0.0,0.0,0.0,525710.0,ROBERT,0,SAVINO,0,25 GERMANTOWN RD,STE 1A,DANBURY,CT,06810,United States,0,0,Doctor of Osteopathy,Allopathic & Osteopathic Physicians|Family Med...,CT,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0.0,5,25.74,True,False,False,False,False,025 GERMANTOWN RDSTE 1ADANBURY,128.7,1,128.7,0169266015,0,0,5,25.74
1,UNCHANGED,Covered Recipient Teaching Hospital,0,374000.0,904.0,State Of Oklahoma,0.0,0,0,0,0,4444 E 41ST ST,0,TULSA,OK,74135,United States,0,0,0,0,0,0.0,0.0,0.0,0.0,239459.0,JONEA,0,LIM,0,1000 N LINCOLN BLVD,0,OKLAHOMA CITY,OK,73104,United States,0,0,Medical Doctor,Allopathic & Osteopathic Physicians|Internal M...,OK,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0.0,5,13.50,True,False,False,False,False,01000 N LINCOLN BLVD0OKLAHOMA CITY,67.5,1,67.5,0169266015,0,0,5,13.50
2,UNCHANGED,Covered Recipient Teaching Hospital,0,450647.0,811.0,Columbia Hospital At Medical City Dallas Subsi...,0.0,0,0,0,0,7777 Forest Ln,0,Dallas,TX,75230,United States,0,0,0,0,0,0.0,0.0,0.0,0.0,174102.0,JULIO,0,ROSENSTOCK,0,7777 FOREST LN,SUITE C-685,DALLAS,TX,75230,United States,0,0,Medical Doctor,Allopathic & Osteopathic Physicians|Internal M...,TX,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0.0,5,500.00,True,False,False,False,False,07777 FOREST LNSUITE C-685DALLAS,2500.0,1,2500.0,5045814001,0,0,5,500.00
3,UNCHANGED,Covered Recipient Teaching Hospital,0,300003.0,5.0,Mary Hitchcock Memorial Hospital,0.0,0,0,0,0,1 MEDICAL CENTER DR,0,LEBANON,NH,03756,United States,0,0,0,0,0,0.0,0.0,0.0,0.0,300734.0,SUSHELA,S,CHAIDARUN,0,8 COLLEGE HL 8,0,HANOVER,NH,03755,United States,0,0,Medical Doctor,Allopathic & Osteopathic Physicians|Internal M...,NH,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0.0,5,89.54,True,False,False,False,False,08 COLLEGE HL 80HANOVER,447.7,1,447.7,0169406090,0,0,5,89.54
4,UNCHANGED,Covered Recipient Teaching Hospital,0,260162.0,593.0,Barnes Jewish West County Hospital,0.0,0,0,0,0,12634 OLIVE BLVD,0,CREVE COEUR,MO,63141,United States,0,0,0,0,0,0.0,0.0,0.0,0.0,1023685.0,ROBERTO,0,CIVITELLI,0,1020 N MASON RD,0,SAINT LOUIS,MO,63141,United States,0,0,Medical Doctor,Allopathic & Osteopathic Physicians|Internal M...,MO,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0.0,5,16.80,True,False,False,False,False,01020 N MASON RD0SAINT LOUIS,84.0,1,84.0,0,0,0,5,16.80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
274907,NEW,Non-covered Recipient Entity,ST LOUIS UNIV,0.0,0.0,0,0.0,0,0,0,0,3700 W PINE MALL,0,SAINT LOUIS,MO,63108,United States,0,0,0,0,0,0.0,0.0,0.0,0.0,644735.0,SANDEEP,S,DHINDSA,0,1225 S GRAND BLVD,# 2,SAINT LOUIS,MO,63104,United States,0,0,Medical Doctor,Allopathic & Osteopathic Physicians|Internal M...,MO,TX,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,Covered Recipient Physician,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Covered,Drug,Diabetes,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0.0,5,201.00,True,False,False,False,False,01225 S GRAND BLVD# 2SAINT LOUIS,1005.0,1,1005.0,0169413212,0,0,5,201.00
274908,NEW,Non-covered Recipient Entity,UT SOUTHWESTERN CLINICAL TRIALS,0.0,0.0,0,0.0,0,0,0,0,5323 HARRY HINES BLVD,STOP 9302,DALLAS,TX,75390,United States,0,0,0,0,0,0.0,0.0,0.0,0.0,489709.0,ILDIKO,0,LINGVAY,0,5323 HARRY HINES BLVD,STOP 8858,DALLAS,TX,75390,United States,0,0,Medical Doctor,Allopathic & Osteopathic Physicians|Internal M...,TX,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,Covered Recipient Physician,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Covered,Drug,Diabetes,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0.0,5,1.80,True,False,False,False,False,05323 HARRY HINES BLVDSTOP 8858DALLAS,9.0,1,9.0,0169413212,0,0,5,1.80
274909,NEW,Non-covered Recipient Entity,UT SOUTHWESTERN CLINICAL TRIALS,0.0,0.0,0,0.0,0,0,0,0,5323 HARRY HINES BLVD,STOP 9302,DALLAS,TX,75390,United States,0,0,0,0,0,0.0,0.0,0.0,0.0,489709.0,ILDIKO,0,LINGVAY,0,5323 HARRY HINES BLVD,STOP 8858,DALLAS,TX,75390,United States,0,0,Medical Doctor,Allopathic & Osteopathic Physicians|Internal M...,TX,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,Covered Recipient Physician,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Covered,Drug,Diabetes,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0.0,5,1.06,True,False,False,False,False,05323 HARRY HINES BLVDSTOP 8858DALLAS,5.3,1,5.3,0169430313,0,0,5,1.06
274910,NEW,Non-covered Recipient Entity,FOUNDATION FOR ADVANCING VETERANS' HEALTH RESE...,0.0,0.0,0,0.0,0,0,0,0,7400 Merton Minter Blvd,Bartter Research Unit,San Antonio,TX,78229,United States,0,0,0,0,0,0.0,0.0,0.0,0.0,309798.0,DEVJIT,0,TRIPATHY,0,701 S ZARZAMORA ST,0,SAN ANTONIO,TX,78207,United States,0,0,Medical Doctor,Allopathic & Osteopathic Physicians|Internal M...,TX,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,Covered Recipient Physician,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Non-Covered,Drug,Diabetes,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0.0,5,11.38,True,False,False,False,False,0701 S ZARZAMORA ST0SAN ANTONIO,56.9,1,56.9,0,0,0,5,11.38


In [None]:
round(dfr['PI_OPD_per_payment'].sum(), 2)

1105404960.77

In [None]:
round(dfr['PI_OPD_per_payment'].count(),0)

274912

In [None]:
round(dfr.groupby('Program_Year')['PI_OPD_per_payment'].count(), 2)

Program_Year
2013    12338
2014    41902
2015    48120
2016    38084
2017    31430
2018    27559
2019    18923
2020    30475
2021    26081
Name: PI_OPD_per_payment, dtype: int64

In [None]:
round(dfr.groupby('Preclinical_Research_Indicator').agg({'Total_Amount_of_Payment_USDollars': ['sum','count']}),2)

Unnamed: 0_level_0,Total_Amount_of_Payment_USDollars,Total_Amount_of_Payment_USDollars
Unnamed: 0_level_1,sum,count
Preclinical_Research_Indicator,Unnamed: 1_level_2,Unnamed: 2_level_2
No,1079610000.0,274388
Yes,51543030.0,524


In [None]:
round(dfr.groupby('Preclinical_Research_Indicator').agg({'PI_OPD_per_payment': ['sum','count']}),2)

Unnamed: 0_level_0,PI_OPD_per_payment,PI_OPD_per_payment
Unnamed: 0_level_1,sum,count
Preclinical_Research_Indicator,Unnamed: 1_level_2,Unnamed: 2_level_2
No,1062472000.0,274388
Yes,42933150.0,524


In [None]:
clitri = dfr.groupby('ClinicalTrials_Gov_Identifier').agg({'PI_OPD_per_payment': ['sum','count']})
clitri['index', 'id'] = clitri.index
clitri

Unnamed: 0_level_0,PI_OPD_per_payment,PI_OPD_per_payment,index
Unnamed: 0_level_1,sum,count,id
ClinicalTrials_Gov_Identifier,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,8.692108e+08,235804,0
CIMRD001469,4.098948e+04,13,CIMRD001469
CIMRD005914,1.165589e+05,2,CIMRD005914
NCT00087984,5.335370e+03,1,NCT00087984
NCT00127634,1.149600e+02,1,NCT00127634
...,...,...,...
NCT05051579,3.768782e+04,8,NCT05051579
NCT05080660,2.398350e+03,3,NCT05080660
NCT05086289,8.840500e+02,1,NCT05086289
NCT05127486,1.951600e+02,1,NCT05127486


In [None]:
clitri.to_csv(path_out2 + "research_Cli.csv", index=None)

In [None]:
dfr["Total_Amount_of_Payment_USDollars"].count()

274912

In [None]:
dfr.groupby(['Dispute_Status_for_Publication'])["Total_Amount_of_Payment_USDollars"].count()

Dispute_Status_for_Publication
No     274613
Yes       299
Name: Total_Amount_of_Payment_USDollars, dtype: int64

In [None]:
dfr.groupby(['Dispute_Status_for_Publication', 'Change_Type'])["PI_OPD_per_payment"].count()

Dispute_Status_for_Publication  Change_Type
No                              ADD             15749
                                CHANGED            17
                                NEW             27884
                                UNCHANGED      230963
Yes                             UNCHANGED         299
Name: PI_OPD_per_payment, dtype: int64

## Direct research payments

In [None]:
dfrd["Total_Amount_of_Payment_USDollars"].count()

12395

In [None]:
dfrd.groupby(['Dispute_Status_for_Publication', 'Change_Type'])["Total_Amount_of_Payment_USDollars"].count()

Dispute_Status_for_Publication  Change_Type
No                              ADD              428
                                NEW              955
                                UNCHANGED      11008
Yes                             UNCHANGED          4
Name: Total_Amount_of_Payment_USDollars, dtype: int64

In [None]:
dfrd['Total_Amount_of_Payment_USDollars'].count()

12395

In [None]:
dfrd.groupby('Program_Year')['Total_Amount_of_Payment_USDollars'].count()

Program_Year
2013    1634
2014    1170
2015    1183
2016    1681
2017    1632
2018    1477
2019    1173
2020    1670
2021     775
Name: Total_Amount_of_Payment_USDollars, dtype: int64

# Import dataset

In [8]:
usecols = ['Covered_Recipient_Profile_ID', 'Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name', 
           'Change_Type', 'Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Name',
           'Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_ID', 'Total_Amount_of_Payment_USDollars', 
           'Date_of_Payment', 'Number_of_Payments_Included_in_Total_Amount', 
           'Nature_of_Payment_or_Transfer_of_Value', 'Dispute_Status_for_Publication', 
           'Associated_Drug_or_Biological_NDC_1', 'Associated_Drug_or_Biological_NDC_2', 
           'Associated_Drug_or_Biological_NDC_3', 'Associated_Drug_or_Biological_NDC_4', 
           'Associated_Drug_or_Biological_NDC_5', 'Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_1', 
           'Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_2', 'Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_3', 
           'Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_4', 'Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_5', 
           'Program_Year', 'Contextual_Information']

In [9]:
dfg = pd.read_csv(BASE + '/' + specialty + "/general/full_general payments dataset2013-2021.csv", 
                    low_memory=False, chunksize = 10000000, usecols= usecols)

In [10]:
dfg = pd.concat((r for r in dfg), ignore_index=True)

In [None]:
dfr = pd.read_csv(BASE + '/' + specialty + "/research/full_research payments dataset2013-2021.csv", 
                    low_memory=False)

In [None]:
dfrd = pd.read_csv(BASE + '/' + specialty + "/research/full_direct research payments dataset2013-2021.csv", 
                     low_memory=False)

In [None]:
dfo = pd.read_csv(BASE + '/' + specialty + "/ownership/ownership_interest2013-2021_extract.csv",
                    low_memory=False )

In [None]:
df_id = pd.read_csv(BASE + '/' + specialty + "/profile/matched physician profile data.csv",
                 low_memory=False )

In [None]:
npi_specialty = pd.read_csv(BASE + '/' + specialty + "/profile/NPI physician profile data.csv",
                 low_memory=False )

In [None]:
list_id = list(df_id["covered_recipient_profile_id"])

# Extract certification data from ABIM

In [None]:
!pip install webdriver

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[31mERROR: Could not find a version that satisfies the requirement webdriver (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for webdriver[0m[31m
[0m

In [None]:
!pip install selenium

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd

In [None]:
!apt-get update
!apt install chromium-chromedriver

Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Ign:2 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:3 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:6 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Hit:7 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:8 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:9 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:10 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Get:12 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [83.3 kB]
Hit:13 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease
Get:14 http://security.ubuntu.com/ubuntu bionic-sec

In [None]:
import requests
from bs4 import BeautifulSoup

npi = "1003011800"
url = "https://www.abim.org/verify-physician?type=npi&npi="+npi 

r= requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")

print(soup.find("ul", "abim_voc-profile__certifications").text)


Endocrinology, Diabetes and Metabolism: Certified 
Participating in Maintenance of Certification: Yes



In [None]:
elems = soup.select('#verifyPhysicianForm > form > div.abim_voc-profile > ul.abim_voc-profile__certifications.abim_voc-profile_certifications--initial')  
elems[0].contents[3]

<li><span class="certification-name">Endocrinology, Diabetes and Metabolism</span>: 2007 </li>

In [None]:
soup.find_all('span', class_ = 'certification-name')

[<span class="certification-name">Endocrinology, Diabetes and Metabolism</span>,
 <span class="certification-name">Internal Medicine</span>,
 <span class="certification-name">Endocrinology, Diabetes and Metabolism</span>]

In [None]:
<ul class="abim_voc-profile__certifications">
                                    <li><span class="certification-name">Endocrinology, Diabetes and Metabolism</span>: <b><i>Certified</i></b> </li>
                                    <li><br><p>Participating in Maintenance of Certification: <b>Yes</b></p></li>
                        </ul>

In [None]:
<ul class="abim_voc-profile__certifications abim_voc-profile_certifications--initial">
                                <li><span class="certification-name">Internal Medicine</span>: 2005 </li>
                                <li><span class="certification-name">Endocrinology, Diabetes and Metabolism</span>: 2010 </li>
                        </ul>

In [None]:
#verifyPhysicianForm > form > div.abim_voc-profile > ul.abim_voc-profile__certifications.abim_voc-profile_certifications--initial

In [None]:
! pip install beautifulsoup4

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
executable_path = '/usr/lib/chromium-browser/chromedriver'
options = webdriver.ChromeOptions()
options.add_argument('--disable-extensions')
options.add_argument('--headless')
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')
options.add_argument('--remote-debugging-port=9222')
options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--disable-browser-side-navigation")
options.add_argument("--disable-gpu")
options.add_argument("--disable-setuid-sandbox")
browser = webdriver.Chrome(chrome_options=options, executable_path = executable_path)

  browser = webdriver.Chrome(chrome_options=options, executable_path=executable_path)
  browser = webdriver.Chrome(chrome_options=options, executable_path=executable_path)


In [None]:
from selenium.webdriver.common.by import By

In [None]:
url = "https://www.abim.org/verify-physician"
browser.get(url)
time.sleep(3)

In [None]:
from selenium import webdriver

In [None]:
# Chrome driver example
from selenium import webdriver
from selenium.webdriver.common.by import By

options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--disable-extensions')
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')
options.add_argument('--remote-debugging-port=9222')

# pass the options to the browser
browser = webdriver.Chrome(options=options)
npi = "1003011800"
browser.get("https://www.abim.org/verify-physician?type=npi&npi="+npi)
time.sleep(3)

from selenium.webdriver.common.by import By
form_elements = browser.find_elements(By.TAG_NAME, "form")

for form in form_elements:
    print(form.get_attribute("innerHTML"))


        <div class="constrain wrapper--padded">
            <label for="search-box-input">
                <i class="fa fa-search" aria-hidden="true"></i><span class="sr-only">Enter search terms and press enter to submit</span>
            </label>
            <input id="search-box-input" type="text" name="q" placeholder="Enter search terms..." tabindex="-1">
            <button id="search-box-close" type="button" tabindex="-1"><i class="fa fa-angle-right" aria-hidden="true"></i><i class="fa fa-angle-left" aria-hidden="true"></i><span class="sr-only">Close the Search Box</span></button>
        </div>
    
                <div class="abim_tool-module abim_tool-module--orange abim_tool-module--static abim_tool-module--open float-right">
                    <h2 class="abim_tool-module__title"><a href="#">Check a Physician's Certification</a></h2>
                    <div class="abim_tool-module__content abim_tool-module__content--voc">
                        <div class="form-group form

In [None]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [None]:
wait = WebDriverWait(browser, 10)
npi_input = wait.until(EC.presence_of_element_located((By.NAME, "npi")))
npi_input.send_keys("1003011800")
form_elements[0].submit()

In [None]:
npi_input

<selenium.webdriver.remote.webelement.WebElement (session="d4f63742bff9072a5fceea882d72d16b", element="d5531c49-e5fa-49d4-8026-977cd61b01d0")>

In [None]:
npi_input

<selenium.webdriver.remote.webelement.WebElement (session="d4f63742bff9072a5fceea882d72d16b", element="d5531c49-e5fa-49d4-8026-977cd61b01d0")>

In [None]:
scraped_data = browser.find_elements(By.XPATH, '//*[@class="certification-status"]')
print(len(scraped_data))
for element in scraped_data:
    print(element.text)

0


In [None]:
status = browser.find_elements(By.XPATH, '//*[@class="certification-status"]')

In [None]:
scraped_data = browser.find_elements(By.XPATH, '//*[@class="certification-status"]')
print(len(scraped_data))


0


In [None]:
#Locate the element containing the physician name 
name = browser.find_element_by_css_selector(".physician-name").text

#Locate the element containing the physician's certification status
status = browser.find_elements_by_css_selector(".certification-status")[0].text


AttributeError: ignored

In [None]:
from bs4 import BeautifulSoup

# extract the page source
html_source = browser.page_source

# parse the page source using BeautifulSoup
soup = BeautifulSoup(html_source, 'html.parser')

# locate the element containing the physician name and extract the text
name = soup.select_one(".physician-name").text

# locate the element containing the physician's certification status and extract the text
status = soup.select(".certification-status")[0].text
