In [1]:
import pandas as pd
from trino.dbapi import connect 
import numpy as np
from dateutil.relativedelta import relativedelta
from scipy.stats import gmean

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
def read_from_hive(sql_script):
    conn = connect(
            host='presto.bstis.com',
            port=8080,
            user='hadoop',
            catalog='hive',
            #schema='default'
            )
    cur = conn.cursor()
    cur.execute(sql_script)
    rows = cur.fetchall()
    columns = [desc[0] for desc in cur.description]
    df = pd.DataFrame(rows,columns=columns)
    return df

In [3]:
Top20 = pd.read_csv('Outputs/Top20Systems.csv', dtype={'billingprovidertaxid': str})
ZipMSA = pd.read_csv('Data Sources/ZipMSA.txt', sep='\t', dtype={'zipcode': str})

  Top20 = pd.read_csv('Outputs/Top20Systems.csv', dtype={'billingprovidertaxid': str})


In [4]:
tins_list = Top20['billingprovidertaxid'].unique()
tins_list = ', '.join(f"'{tin}'" for tin in tins_list)

# Pull Claims Data

In [5]:

    sSQL = f'''
        SELECT billingprovidername,
           billingprovidertaxid,
           billingprovidernpi,
           billingproviderid,
           billingproviderzip,
--           tenantid,
--           personid,
--           claimid,
           tpaclaimid,
           dosstart,
           dosend,
           innetworkflag,
           proceduretype,
           procedurecode,
           drg_code,
           drg_code_description,
           servicecategory_details,
           locationtype,
           facility_indicator,
           sum(amtbilled) as amtbilled,
           sum(amtcovered) as amtcovered,
           sum(amtallowed) as amtallowed
        FROM hive.bcbstx_nonev_prod.claims 
        WHERE  dosplanyear = '2023'
        AND   locationtype = '21'
        AND innetworkflag = 1
        and drg_code <> ''
        and facility_indicator = 'Facility'
        and billingprovidertaxid in ({tins_list})
        group by 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
    '''.format(tins_list=tins_list)

    df_claims = read_from_hive(sSQL)
    df_claims.to_csv(f'Data Sources/Claims_nonev_ip.gz', index=False, compression='gzip')

        
    

In [6]:
    sSQL = f'''
        SELECT billingprovidername,
           billingprovidertaxid,
           billingprovidernpi,
           billingproviderid,
           billingproviderzip,
--           tenantid,
--           personid,
--           claimid,
           tpaclaimid,
           dosstart,
           dosend,
           innetworkflag,
           proceduretype,
           procedurecode,
           drg_code,
           drg_code_description,
           servicecategory_details,
           locationtype,
           facility_indicator,
           sum(amtbilled) as amtbilled,
           sum(amtcovered) as amtcovered,
           sum(amtallowed) as amtallowed
        FROM hive.allbcbstxev.claims 
        WHERE  dosplanyear = '2023'
        AND   locationtype = '21'
        AND innetworkflag = 1
        and drg_code <> ''
        and facility_indicator = 'Facility'
        and billingprovidertaxid in ({tins_list})
        group by 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
    '''.format(tins_list=tins_list)

    df_claims = read_from_hive(sSQL)
    df_claims.to_csv(f'Data Sources/Claims_ev_ip.gz', index=False, compression='gzip')

# Inpatient Facility

In [5]:
df_claims_nonev = pd.read_csv('Data Sources/Claims_nonev_ip.gz', compression='gzip', low_memory=False, dtype={'billingproviderzip': str,
                                     'billingprovidertaxid': str,
                                     'billingprovidernpi': str,
                                     'locationtype': str,
                                     'innetworkflag': str,
                                     'tpaclaimid': str,
                                     'drg_code':str,
                                     'drg_code_description':str})

df_claims_ev = pd.read_csv('Data Sources/Claims_ev_ip.gz', compression='gzip', low_memory=False, dtype={'billingproviderzip': str,
                                     'billingprovidertaxid': str,
                                     'billingprovidernpi': str,
                                     'locationtype': str,
                                     'innetworkflag': str,
                                     'tpaclaimid': str,
                                     'drg_code':str,
                                     'drg_code_description':str})

df_claims = pd.concat([df_claims_nonev, df_claims_ev])
# df_claims = df_claims[(df_claims['billingprovidertaxid'] == '741100555') & (df_claims['billingprovidernpi'] == '1477643690') & (df_claims['facility_indicator'] == 'Facility') & (df_claims['drg_code'] == '790')]
# claims = df_claims['tpaclaimid'].nunique()
# claims

In [6]:
df_claims = df_claims.merge(ZipMSA[['zipcode', 'state', 'cbsa_name']], left_on='billingproviderzip', right_on='zipcode', how='left')

df_claims_ip = df_claims[df_claims.state == 'TX']
df_claims_ip.billingprovidertaxid = df_claims_ip.billingprovidertaxid.fillna(0).apply( lambda x : 0 if x == '00000PROV' else x )
df_claims_ip.billingprovidertaxid = df_claims_ip.billingprovidertaxid.fillna(0).apply( lambda x : 0 if x == '' else int(x) )

# Filter and convert data type
df_claims_ip = df_claims_ip[(df_claims_ip.billingprovidertaxid != 0) & (df_claims_ip.billingprovidertaxid != 111111111) & (df_claims_ip.billingprovidertaxid != 999999999)]
df_claims_ip = df_claims_ip.astype({'billingprovidertaxid': str})


# Calculate length of stay
LOS = df_claims_ip.copy()
LOS['dosstart'] = pd.to_datetime(LOS['dosstart'])
LOS['dosend'] = pd.to_datetime(LOS['dosend'])
LOS['AvgLengthOfStay'] = (LOS['dosend'] - LOS['dosstart']).dt.days + 1
LOS = LOS.dropna(subset=['AvgLengthOfStay'])
LOS['AvgLengthOfStay'] = LOS.groupby('drg_code')['AvgLengthOfStay'].transform(lambda x: gmean(x))
LOS = LOS.drop_duplicates(subset=['drg_code'])
LOS = LOS.groupby(['drg_code']).agg({'AvgLengthOfStay':'mean'}).reset_index()

df_covered = df_claims_ip.groupby(['tpaclaimid']).agg({'amtcovered': 'sum'})
df_covered = df_covered[df_covered.amtcovered > 0]
df_claims_ip = df_claims_ip[df_claims_ip.tpaclaimid.isin(df_covered.index)]

# Group and aggregate
df_claims_ip = df_claims_ip.groupby(['billingprovidername', 'billingprovidertaxid','billingprovidernpi', 'billingproviderid', 'servicecategory_details', 'facility_indicator', 'cbsa_name', 'drg_code']).agg({'tpaclaimid': 'nunique', 'amtcovered':'sum', 'amtallowed': 'sum'}).reset_index()

df_claims_ip = df_claims_ip.rename(columns={'tpaclaimid': 'claimcount'})


# Sort, merge and query
df_claims_ip = df_claims_ip.sort_values('amtallowed', ascending = False)
df_claims_ip = df_claims_ip.merge(Top20[['IDN','rank','billingprovidertaxid']].drop_duplicates(), on='billingprovidertaxid', how='left')
# # df_claims_ip = df_claims_ip.query("`Contract/Corp Entity Name`.notnull() and `Facility Type` == 'Acute Care Gen'")
df_claims_ip = df_claims_ip[df_claims_ip['amtallowed'] > 0]
df_claims_ip = df_claims_ip.merge(LOS, on='drg_code', how='left')

df_claims_ip['weight'] = df_claims_ip['amtallowed'] / df_claims_ip['amtallowed'].sum()
df_claims_ip['cumsum'] = df_claims_ip['weight'].cumsum()
df_claims_ip['IDNWeight'] = df_claims_ip.groupby('IDN')['weight'].transform('sum')

# df_claims_ip = df_claims_ip[(df_claims_ip['billingprovidertaxid'] == '741100555') & (df_claims_ip['billingprovidernpi'] == '1477643690') & (df_claims_ip['facility_indicator'] == 'Facility') & (df_claims_ip['drg_code'] == '790')]
# df_claims_ip = df_claims_ip[df_claims_ip['rank'] == 1]
df_claims_ip


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_claims_ip.billingprovidertaxid = df_claims_ip.billingprovidertaxid.fillna(0).apply( lambda x : 0 if x == '00000PROV' else x )
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_claims_ip.billingprovidertaxid = df_claims_ip.billingprovidertaxid.fillna(0).apply( lambda x : 0 if x == '' else int(x) )


Unnamed: 0,billingprovidername,billingprovidertaxid,billingprovidernpi,billingproviderid,servicecategory_details,facility_indicator,cbsa_name,drg_code,claimcount,amtcovered,amtallowed,IDN,rank,AvgLengthOfStay,weight,cumsum,IDNWeight
0,TEXAS CHILDREN'S HOSPITAL,741100555,1477643690,000H0HH022201,Inpatient - Hospital,Facility,"Houston-The Woodlands-Sugar Land, TX",790,87,42877027.02,26851271.69,Texas Childrens Hospital,9,14.617114,6.491317e-03,0.006491,0.044824
1,UNIVERSITY OF TEXAS M. D. ANDERSON CANCER CENTER,746001118,1174582050,000H0HH015701,Inpatient - Hospital,Facility,"Houston-The Woodlands-Sugar Land, TX",834,147,24762488.70,16089005.14,University of Texas Systems,4,13.457594,3.889530e-03,0.010381,0.050218
2,MEMORIAL HERMANN HEALTH SYSTEM,741152597,1982666111,000H0HH019601,Inpatient - Hospital,Facility,"Houston-The Woodlands-Sugar Land, TX",003,25,35270137.79,14991509.10,Memorial Hermann Hospital System,6,23.558717,3.624210e-03,0.014005,0.090547
3,THE METHODIST HOSPITAL,741180155,1548387418,000H0HH006601,Inpatient - Hospital,Facility,"Houston-The Woodlands-Sugar Land, TX",005,23,34844839.56,13869745.92,Methodist Hospital,5,21.677489,3.353022e-03,0.017358,0.092114
4,UNIVERSITY OF TEXAS M. D. ANDERSON CANCER CENTER,746001118,1174582050,000H0HH015701,Inpatient - Hospital,Facility,"Houston-The Woodlands-Sugar Land, TX",840,160,21751463.39,13640137.30,University of Texas Systems,4,8.717614,3.297514e-03,0.020656,0.050218
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31649,"CR EMERGENCY ROOM, LLC",262978009,1619308319,000H0HH113D01,Inpatient - Hospital,Facility,"Dallas-Fort Worth-Arlington, TX",638,1,0.76,4.93,Baylor Scott and White Health,2,3.711052,1.191832e-09,1.000000,0.112514
31650,CHI ST. LUKE'S HEALTH BAYLOR COLLEGE OF MEDICI...,741161938,1184622847,000H0HH032601,Inpatient - Hospital,Facility,"Houston-The Woodlands-Sugar Land, TX",261,1,0.50,4.86,CHI St Lukes Health,14,5.051969,1.174909e-09,1.000000,0.024967
31651,CHI ST. LUKE'S HEALTH BAYLOR COLLEGE OF MEDICI...,741161938,1184622847,000H0HH032601,Inpatient - Hospital,Facility,"Houston-The Woodlands-Sugar Land, TX",514,1,1.75,4.57,CHI St Lukes Health,14,3.303407,1.104801e-09,1.000000,0.024967
31652,"METHODIST HEALTHCARE SYSTEM OF SAN ANTONIO, LT...",742730328,1124074273,000H0HH067901,Inpatient - Hospital,Facility,"San Antonio-New Braunfels, TX",694,1,3.37,3.56,HCA Healthcare,1,2.500851,8.606329e-10,1.000000,0.215576


In [7]:
del df_claims, df_claims_nonev, df_claims_ev, LOS

In [8]:
tins_list = df_claims_ip['billingprovidertaxid'].unique()
tins_list = ', '.join(f"'{tin}'" for tin in tins_list)

npi_list = df_claims_ip['billingprovidernpi'].unique()
npi_list = ', '.join(f"'{npi}'" for npi in npi_list)

### MRF Search Stage Rates

In [14]:
# sSQL = '''
# SELECT *
# FROM 		
# hive.mrf.mrf_search_stage	
# WHERE 		
# plan_group_alias in ('bcbs_tx_ppo','uhc_choice_plus', 'aetna_open_access_managed', 'cigna_national_oap')
# and taxid in ({tins})
# and npi in ({npi})
# and billing_code_type = 'MS-DRG'
# '''.format(tins=tins_list, npi=npi_list)
# 
# 
# mrf_search_stage = read_from_hive(sSQL)
# 
# del sSQL
# 
# mrf_search_stage.to_csv('search_stage_ip.gz',index=False, compression='gzip')

### INN Provider Rates

In [15]:
# sSQL = '''
# SELECT
#     npi,
#     tin_value as taxid, 
#     r.plan_group_alias,	
#     ltrim(billing_code,'0') as billing_code,
#     billing_code_type,
#     negotiated_type,
# 	negotiated_rate as negotiated_rate_cal   				
# 
# FROM 				
# 	mrf.mrf_in_network_rates r 			
# JOIN				
# 	(			
# 		SELECT		
# 			 tin_value,
# 			 npi,	
# 			 group_id,	
# 			 plan_group_alias	
# 		 FROM 		
# 			 mrf.mrf_provider_references	
# 		 WHERE 		
# 			 plan_group_alias in ('bcbs_tx_ppo', 'uhc_choice_plus','aetna_open_access_managed', 'cigna_national_oap')	 -- uhc_option_ppo
# 			 and tin_value in ({tins})
# 			 and npi in ({npi})
# 		 GROUP BY		
# 			 1,2,3,4	
# 	) n			
# ON 				
# 	n.group_id = r.provider_reference			
# 	 and n.plan_group_alias = r.plan_group_alias			
# WHERE				
# 	 r.plan_group_alias in ('bcbs_tx_ppo', 'uhc_choice_plus','aetna_open_access_managed', 'cigna_national_oap')		
#    --  and negotiated_type in ('percentage','per diem')
#      and billing_code_type in ('MS-DRG')
# 	 and negotiated_rate <> 0
#      and billing_class = 'institutional'
# --Group by 1,2,3,4,5,6
# order by 1, 2, 3
# '''.format(tins=tins_list, npi=npi_list)
# 
# df_in_network = read_from_hive(sSQL) 
# del sSQL
# 
# df_in_network.to_csv('in_network_rates_ip.gz',index=False, compression='gzip')

KeyboardInterrupt: 

### Outlier Data

In [46]:
# sSQL = '''
# SELECT *
# FROM 		
# hive.mrf.outlier_result
# WHERE 		
# providerstate = 'TX'
# and billing_class = 'institutional'
# and billing_code_type = 'MS-DRG'
# and outlier_algorithm <> 'mrf_step_benchmark'
# and billing_code <> '807'
# '''
# 
# 
# outlier_rates = read_from_hive(sSQL)
# 
# del sSQL
# 
# outlier_rates.to_csv('outlier_drg_rates.gz',index=False, compression='gzip')

In [9]:
mrf_search_stage = pd.read_csv('search_stage_ip.gz', compression='gzip', low_memory=False, dtype={'taxid': str, 'npi': str})
mrf_search_stage = mrf_search_stage[['npi','taxid', 'plan_group_alias', 'billing_code', 'billing_code_type','negotiated_type', 'negotiated_rate']]
mrf_search_stage.sort_values(by=['npi','taxid','billing_code','billing_code_type'], inplace=True)

# # Define a custom sort order
# sort_order = {'negotiated': 1, 'fee schedule': 2, 'percentage': 3, 'per diem': 4}
# 
# # Create a new column 'sort_order' based on the 'negotiated_type' column
# mrf_search_stage['sort_order'] = mrf_search_stage['negotiated_type'].map(sort_order)
# 
# # Find duplicates
# duplicates = mrf_search_stage.duplicated(subset=['npi', 'taxid', 'plan_group_alias', 'billing_code', 'billing_code_type'], keep='first')
# 
# # Keep only non-duplicates
# mrf_search_stage = mrf_search_stage[~duplicates]
# 
# # Drop the 'sort_order' column as it's no longer needed
# mrf_search_stage = mrf_search_stage.drop(columns='sort_order')

outlier_rates = pd.read_csv('outlier_drg_rates.gz', compression='gzip', low_memory=False)

outlier_rates = outlier_rates[['billing_code_type', 'billing_code', 'negotiated_type', 'rec_lb_outlier', 'rec_ub_outlier']]
mrf_search_stage = mrf_search_stage.merge(outlier_rates, on=['billing_code', 'billing_code_type', 'negotiated_type'], how='left')
mrf_search_stage.rename(columns={'negotiated_rate':'negotiated_rate_cal'}, inplace=True)

def remove_outliers(row, column):
    if row['rec_ub_outlier'] < row[column]:
        return 0
    elif row['rec_lb_outlier'] > row[column]:
        return 0
    else:
        return row[column]

columns_to_update = ['negotiated_rate_cal']

for column in columns_to_update:
    mrf_search_stage[column] = mrf_search_stage.apply(remove_outliers, args=(column,), axis=1)

mrf_search_stage = mrf_search_stage[mrf_search_stage['negotiated_rate_cal'] != 0]
mrf_search_stage.drop(columns=['rec_lb_outlier', 'rec_ub_outlier'], inplace=True)
mrf_search_stage.replace('percentage','negotiated', inplace=True)

mrf_search_stage['source'] = 'mrf_search_stage'
# mrf_search_stage = mrf_search_stage[mrf_search_stage['taxid'] == '742730328']
# mrf_search_stage = mrf_search_stage[mrf_search_stage['npi'] == '1124074273']
# mrf_search_stage

Unnamed: 0,npi,taxid,plan_group_alias,billing_code,billing_code_type,negotiated_type,negotiated_rate_cal,source
0,1003344334,813935393,cigna_national_oap,1,MS-DRG,fee schedule,463142.17,mrf_search_stage
1,1003344334,813935393,cigna_national_oap,1,MS-DRG,fee schedule,734886.93,mrf_search_stage
2,1003344334,813935393,uhc_choice_plus,1,MS-DRG,negotiated,658739.87,mrf_search_stage
3,1003344334,813935393,cigna_national_oap,1,MS-DRG,fee schedule,690797.51,mrf_search_stage
4,1003344334,813935393,uhc_choice_plus,1,MS-DRG,negotiated,823445.16,mrf_search_stage
...,...,...,...,...,...,...,...,...
478201,1992707228,752559845,cigna_national_oap,987,MS-DRG,fee schedule,36110.43,mrf_search_stage
478202,1992707228,752559845,bcbs_tx_ppo,988,MS-DRG,negotiated,19294.44,mrf_search_stage
478203,1992707228,752559845,cigna_national_oap,988,MS-DRG,fee schedule,18147.72,mrf_search_stage
478204,1992707228,752559845,bcbs_tx_ppo,989,MS-DRG,negotiated,12810.53,mrf_search_stage


In [10]:
inn_rates = pd.read_csv('in_network_rates_ip.gz', compression='gzip', low_memory=False, dtype={'taxid': str, 'npi': str})
inn_rates.sort_values(by=['npi','taxid','billing_code','billing_code_type'], inplace=True)

# Define a custom sort order
sort_order = {'negotiated': 1, 'fee schedule': 2, 'percentage': 3, 'per diem': 4}

# # Create a new column 'sort_order' based on the 'negotiated_type' column
# inn_rates['sort_order'] = inn_rates['negotiated_type'].map(sort_order)
# 
# # Find duplicates
# duplicates = inn_rates.duplicated(subset=['npi', 'taxid', 'plan_group_alias', 'billing_code', 'billing_code_type'], keep='first')
# 
# # Keep only non-duplicates
# inn_rates = inn_rates[~duplicates]
# 
# # Drop the 'sort_order' column as it's no longer needed
# inn_rates = inn_rates.drop(columns='sort_order')

inn_rates = inn_rates.merge(outlier_rates, on=['billing_code', 'billing_code_type', 'negotiated_type'], how='left')

def remove_outliers(row, column):
    if row['rec_ub_outlier'] < row[column] and  row['negotiated_type'] != 'per diem' and row['negotiated_type'] != 'percentage':
        return 0
    elif row['rec_lb_outlier'] > row[column] and  row['negotiated_type'] != 'per diem' and row['negotiated_type'] != 'percentage':
        return 0
    else:
        return row[column]

columns_to_update = ['negotiated_rate_cal']

for column in columns_to_update:
    inn_rates[column] = inn_rates.apply(remove_outliers, args=(column,), axis=1)

inn_rates = inn_rates[inn_rates['negotiated_rate_cal'] != 0]
inn_rates.drop(columns=['rec_lb_outlier', 'rec_ub_outlier'], inplace=True)

inn_rates['source'] = 'mrf_in_network_rates'

# inn_rates = inn_rates[inn_rates['taxid'] == '753175630']
# inn_rates = inn_rates[inn_rates['npi'] == '1285798918']
inn_rates

Unnamed: 0,npi,taxid,plan_group_alias,billing_code,billing_code_type,negotiated_type,negotiated_rate_cal,source
0,1003215401,756001354,bcbs_tx_ppo,1,MS-DRG,percentage,35.00,mrf_in_network_rates
1,1003215401,756001354,bcbs_tx_ppo,1,MS-DRG,percentage,35.00,mrf_in_network_rates
2,1003215401,756001354,bcbs_tx_ppo,1,MS-DRG,percentage,35.00,mrf_in_network_rates
3,1003215401,756001354,bcbs_tx_ppo,1,MS-DRG,percentage,35.00,mrf_in_network_rates
4,1003215401,756001354,bcbs_tx_ppo,1,MS-DRG,per diem,925.00,mrf_in_network_rates
...,...,...,...,...,...,...,...,...
685057,1992747125,741152597,uhc_choice_plus,988,MS-DRG,negotiated,36344.65,mrf_in_network_rates
685058,1992747125,741152597,bcbs_tx_ppo,989,MS-DRG,negotiated,21787.73,mrf_in_network_rates
685059,1992747125,741152597,bcbs_tx_ppo,989,MS-DRG,per diem,1833.00,mrf_in_network_rates
685060,1992747125,741152597,uhc_choice_plus,989,MS-DRG,negotiated,23136.79,mrf_in_network_rates


In [11]:
# MCDallas = pd.read_csv('ProviderFiles/62-1682198_medical-city-dallas_standardcharges.csv', dtype={'taxid': str, 'npi': str})
# MCDallas = MCDallas[MCDallas['billing_code_type'] == 'DRG']
# CHCACL = pd.read_csv('ProviderFiles/621801360_hca-houston-clear-lake_standardcharges.csv', dtype={'taxid': str, 'npi': str})
# CHCACL = CHCACL[CHCACL['billing_code_type'] == 'DRG']
DellChildrens = pd.read_csv('ProviderFiles/dellchildrens_ip.csv', dtype={'taxid': str, 'npi': str})
Dell = pd.read_csv('ProviderFiles/dell_ip.csv', dtype={'taxid': str, 'npi': str})
SetonAustin = pd.read_csv('ProviderFiles/setonaustin_ip.csv', dtype={'taxid': str, 'npi': str})
CHRISTUSChildrens = pd.read_csv('ProviderFiles/CHRISTUSChi.csv', dtype={'taxid': str, 'npi': str})
# Hendrick = pd.read_csv('ProviderFiles/hendrick_ip.csv', dtype={'taxid': str, 'npi': str})
# MethodistHosp = pd.read_csv('ProviderFiles/MethodistHosp_IP.csv', dtype={'taxid': str, 'npi': str})

df_provider = pd.concat([DellChildrens, Dell, SetonAustin, CHRISTUSChildrens])

# # Define a custom sort order
# sort_order = {'negotiated': 1, 'fee schedule': 2, 'percentage': 3, 'per diem': 4}
# 
# # Create a new column 'sort_order' based on the 'negotiated_type' column
# df_provider['sort_order'] = df_provider['negotiated_type'].map(sort_order)
# 
# # Find duplicates
# duplicates = df_provider.duplicated(subset=['npi', 'taxid', 'plan_group_alias', 'billing_code', 'billing_code_type'], keep='first')
# 
# # Keep only non-duplicates
# df_provider = df_provider[~duplicates]
# 
# # Drop the 'sort_order' column as it's no longer needed
# df_provider = df_provider.drop(columns='sort_order')

# df_provider = df_provider.merge(outlier_rates, on=['billing_code', 'billing_code_type', 'negotiated_type'], how='left')
# 
# def remove_outliers(row, column):
#     if row['pct_80_claim_rate'] < row[column]:
#         return 0
#     elif row['pct_20_claim_rate'] > row[column]:
#         return 0
#     else:
#         return row[column]
# 
# columns_to_update = ['negotiated_rate_cal']
# 
# for column in columns_to_update:
#     df_provider[column] = df_provider.apply(remove_outliers, args=(column,), axis=1)

# df_provider = df_provider[df_provider['negotiated_rate_cal'] != 0]
# df_provider.drop(columns=['pct_20_claim_rate', 'pct_80_claim_rate'], inplace=True)

df_provider['source'] = 'ProviderFiles'
df_provider

Unnamed: 0,npi,taxid,plan_group_alias,billing_code,billing_code_type,negotiated_type,negotiated_rate_cal,source
0,1447355771,741109643,bcbs_tx_ppo,1,DRG,negotiated,220590.00,ProviderFiles
1,1447355771,741109643,bcbs_tx_ppo,1,DRG,percentage,67.00,ProviderFiles
2,1447355771,741109643,uhc_choice_plus,1,DRG,negotiated,147468.00,ProviderFiles
3,1447355771,741109643,bcbs_tx_ppo,2,DRG,negotiated,220590.00,ProviderFiles
4,1447355771,741109643,bcbs_tx_ppo,2,DRG,percentage,67.00,ProviderFiles
...,...,...,...,...,...,...,...,...
1250,1821004151,741109665,uhc_choice_plus,833,DRG,negotiated,7429.17,ProviderFiles
1251,1821004151,741109665,uhc_choice_plus,787,DRG,negotiated,14990.81,ProviderFiles
1252,1821004151,741109665,uhc_choice_plus,99211,CPT,negotiated,59.19,ProviderFiles
1253,1821004151,741109665,uhc_choice_plus,93303,CPT,negotiated,1373.79,ProviderFiles


In [12]:
df_rates = pd.concat([mrf_search_stage,inn_rates])
df_rates = df_rates.query("not (taxid.isin(['203749695', '260194016', '274434451', '412101361', '741161944', '741166904', '751008430', '751037591', '751777119', '751837454', '752586857', '752834135', '820551704']) and plan_group_alias == 'uhc_choice_plus')")
df_rates = pd.concat([df_rates, df_provider])


# Define a custom sort order for 'source' column
# Adjust this dictionary according to your specific needs
source_order = {'mrf_search_stage': 1, 'mrf_in_network_rates': 2, 'ProviderFiles': 3}

# Create new columns 'negotiated_type_sort_order' and 'source_sort_order' 
# based on the 'negotiated_type' and 'source' columns
df_rates['source_sort_order'] = df_rates['source'].map(source_order)

# Sort the DataFrame based on 'negotiated_type_sort_order' and 'source_sort_order'
df_rates.sort_values(['source_sort_order'], inplace=True)

# Drop duplicates based on 'npi', 'tin_value', 'plan', 'code', 'code_type', 'negotiated_type'
df_rates.drop_duplicates(subset=['npi', 'taxid', 'plan_group_alias', 'billing_code', 'billing_code_type', 'negotiated_type'], keep='first', inplace=True)

df_rates.sort_values(by=['npi','taxid','billing_code','billing_code_type'], inplace=True)
df_rates

Unnamed: 0,npi,taxid,plan_group_alias,billing_code,billing_code_type,negotiated_type,negotiated_rate_cal,source,source_sort_order
0,1003215401,756001354,bcbs_tx_ppo,1,MS-DRG,percentage,35.00,mrf_in_network_rates,2
4,1003215401,756001354,bcbs_tx_ppo,1,MS-DRG,per diem,925.00,mrf_in_network_rates,2
5,1003215401,756001354,uhc_choice_plus,1,MS-DRG,negotiated,306708.05,mrf_in_network_rates,2
7,1003215401,756001354,bcbs_tx_ppo,2,MS-DRG,percentage,35.00,mrf_in_network_rates,2
12,1003215401,756001354,uhc_choice_plus,2,MS-DRG,negotiated,146816.37,mrf_in_network_rates,2
...,...,...,...,...,...,...,...,...,...
685055,1992747125,741152597,bcbs_tx_ppo,988,MS-DRG,per diem,1833.00,mrf_in_network_rates,2
685056,1992747125,741152597,uhc_choice_plus,988,MS-DRG,negotiated,26651.39,mrf_in_network_rates,2
685058,1992747125,741152597,bcbs_tx_ppo,989,MS-DRG,negotiated,21787.73,mrf_in_network_rates,2
685059,1992747125,741152597,bcbs_tx_ppo,989,MS-DRG,per diem,1833.00,mrf_in_network_rates,2


In [13]:
match_0 = df_rates.pivot(index=['npi', 'taxid', 'billing_code', 'billing_code_type', 'negotiated_type'],
                columns=['plan_group_alias'],
                values=['negotiated_rate_cal']).reset_index()

match_0.columns =  [item[0] if item[1] == '' else item[1] for item in match_0.columns]
# del df_temp 
match_0


Unnamed: 0,npi,taxid,billing_code,billing_code_type,negotiated_type,aetna_open_access_managed,bcbs_tx_ppo,cigna_national_oap,uhc_choice_plus
0,1003215401,756001354,1,MS-DRG,negotiated,,,,306708.05
1,1003215401,756001354,1,MS-DRG,per diem,,925.00,,
2,1003215401,756001354,1,MS-DRG,percentage,,35.00,,
3,1003215401,756001354,2,MS-DRG,negotiated,,,,146816.37
4,1003215401,756001354,2,MS-DRG,per diem,,925.00,,
...,...,...,...,...,...,...,...,...,...
473930,1992747125,741152597,987,MS-DRG,per diem,,1833.00,,
473931,1992747125,741152597,988,MS-DRG,negotiated,,33088.80,,26651.39
473932,1992747125,741152597,988,MS-DRG,per diem,,1833.00,,
473933,1992747125,741152597,989,MS-DRG,negotiated,,21787.73,,23136.79


In [14]:
MemorialHermann = pd.read_csv('ProviderFiles/MemorialHermannTXMedCtr_IP.csv', dtype={'taxid': str, 'npi': str, 'aetna_open_access_managed': float, 'bcbs_tx_ppo': float, 'cigna_national_oap': float, 'uhc_choice_plus': float})
MemorialHermannSW = pd.read_csv('ProviderFiles/MemorialHermannSW_IP.csv', dtype={'taxid': str, 'npi': str, 'aetna_open_access_managed': float, 'bcbs_tx_ppo': float, 'cigna_national_oap': float, 'uhc_choice_plus': float})
MemorialHermannMC = pd.read_csv('ProviderFiles/MemorialHermannMemCity_IP.csv', dtype={'taxid': str, 'npi': str, 'aetna_open_access_managed': float, 'bcbs_tx_ppo': float, 'cigna_national_oap': float, 'uhc_choice_plus': float})
Laredo = pd.read_csv('ProviderFiles/laredo_ip.csv', dtype={'taxid': str, 'npi': str, 'aetna_open_access_managed': float, 'bcbs_tx_ppo': float, 'cigna_national_oap': float, 'uhc_choice_plus': float})
Longview = pd.read_csv('ProviderFiles/longview_ip.csv', dtype={'taxid': str, 'npi': str, 'aetna_open_access_managed': float, 'bcbs_tx_ppo': float, 'cigna_national_oap': float, 'uhc_choice_plus': float})
BSW = pd.read_csv('ProviderFiles/bsw_ip.csv', dtype={'taxid': str, 'npi': str, 'aetna_open_access_managed': float, 'bcbs_tx_ppo': float, 'cigna_national_oap': float, 'uhc_choice_plus': float})

match_0 = pd.concat([match_0, MemorialHermann, MemorialHermannSW, MemorialHermannMC, Laredo, Longview, BSW])
# match_0.drop_duplicates(columns=['npi','taxid','billing_code','billing_code_type','negotiated_type'], inplace=True)

In [16]:
df_claims_ip.drg_code = df_claims_ip.drg_code.astype(int)
match_0['billing_code'] = pd.to_numeric(match_0['billing_code'], errors='coerce')
match_0 = match_0.dropna(subset=['billing_code'])
match_0.billing_code = match_0.billing_code.astype(int)
match_1 = pd.merge(df_claims_ip, match_0, how='left',
                   left_on=[ 'drg_code','billingprovidertaxid', 'billingprovidernpi'],
                   right_on=['billing_code','taxid', 'npi'])
match_1 = match_1.fillna(0)


def calculate_percentage(row, column):
    if row['negotiated_type'] == 'percentage' and row[column] != 0:
        return round((row[column] / 100) * (row['amtcovered'] / row['claimcount']),2)
    else:
        return row[column]

columns_to_update = ['aetna_open_access_managed', 'bcbs_tx_ppo', 'cigna_national_oap', 'uhc_choice_plus']

for column in columns_to_update:
    match_1[column + '_derived'] = match_1.apply(calculate_percentage, args=(column,), axis=1)

def calculate_perdiem(row, column):
    if row['negotiated_type'] == 'per diem' and row[column] != 0:
        return round(row[column] * row['AvgLengthOfStay'],2)
    else:
        return row[column]

columns_to_update = ['aetna_open_access_managed_derived', 'bcbs_tx_ppo_derived', 'cigna_national_oap_derived', 'uhc_choice_plus_derived']

for column in columns_to_update:
    match_1[column] = match_1.apply(calculate_perdiem, args=(column,), axis=1)
# 
# outlier_rates = pd.read_csv('outlier_drg_rates.gz', compression='gzip', low_memory=False)
# outlier_rates.billing_code_type = outlier_rates.billing_code_type.replace('MS-DRG','DRG')
# outlier_rates = outlier_rates[['billing_code_type', 'billing_code', 'negotiated_type', 'rec_lb_outlier', 'rec_ub_outlier']]
# match_1 = match_1.merge(outlier_rates, on=['billing_code', 'billing_code_type', 'negotiated_type'], how='left')
# 
# 
# def remove_outliers(row, column):
#     if row['rec_ub_outlier'] < row[column] :
#         return 0
#     elif row['rec_lb_outlier'] > row[column] :
#         return 0
#     else:
#         return row[column]
# 
# columns_to_update = ['aetna_open_access_managed_derived', 'bcbs_tx_ppo_derived', 'cigna_national_oap_derived', 'uhc_choice_plus_derived']
# 
# for column in columns_to_update:
#     match_1[column] = match_1.apply(remove_outliers, args=(column,), axis=1)
# 
match_1.drop_duplicates(inplace=True)

match_1['negotiated_type'] = match_1['negotiated_type'].replace('fee schedule','negotiated')

def mean_positive(x):
    return x[x > 0].mean()

match_1 = match_1.groupby(['billingprovidername', 'billingprovidertaxid','billingprovidernpi', 'billingproviderid', 'cbsa_name', 'drg_code', 'IDN', 'rank', 'negotiated_type']).agg({'amtcovered':mean_positive, 'amtallowed': mean_positive,'aetna_open_access_managed': mean_positive, 'bcbs_tx_ppo': mean_positive, 'cigna_national_oap': mean_positive, 'uhc_choice_plus': mean_positive, 'aetna_open_access_managed_derived': mean_positive, 'bcbs_tx_ppo_derived': mean_positive, 'cigna_national_oap_derived': mean_positive, 'uhc_choice_plus_derived': mean_positive}, ).reset_index().sort_values('amtallowed', ascending = False)
match_1 = match_1.fillna(0)

match_1 = match_1[match_1['rank'] == 12]

# match_1 = match_1[match_1['billingprovidertaxid'] == '741109643']
# match_1 = match_1[match_1['billingprovidernpi'] == '1447355771']
match_1

Unnamed: 0,billingprovidername,billingprovidertaxid,billingprovidernpi,billingproviderid,cbsa_name,drg_code,IDN,rank,negotiated_type,amtcovered,amtallowed,aetna_open_access_managed,bcbs_tx_ppo,cigna_national_oap,uhc_choice_plus,aetna_open_access_managed_derived,bcbs_tx_ppo_derived,cigna_national_oap_derived,uhc_choice_plus_derived
17221,COOK CHILDREN'S MEDICAL CENTER,752051646,1891765178,000H0HH019401,"Dallas-Fort Worth-Arlington, TX",790,Cook Childrens Medical Ctr and Cook Childrens ...,12,percentage,14349711.08,11517358.10,0.0,84.0,0.0,0.0,0.0,388830.88,0.0,0.0
17235,COOK CHILDREN'S MEDICAL CENTER,752051646,1891765178,000H0HH019401,"Dallas-Fort Worth-Arlington, TX",834,Cook Childrens Medical Ctr and Cook Childrens ...,12,percentage,11084591.15,9276326.59,0.0,84.0,0.0,0.0,0.0,846459.69,0.0,0.0
17082,COOK CHILDREN'S MEDICAL CENTER,752051646,1891765178,000H0HH019401,"Dallas-Fort Worth-Arlington, TX",189,Cook Childrens Medical Ctr and Cook Childrens ...,12,percentage,5994288.35,4918130.77,0.0,84.0,0.0,0.0,0.0,55946.69,0.0,0.0
17098,COOK CHILDREN'S MEDICAL CENTER,752051646,1891765178,000H0HH019401,"Dallas-Fort Worth-Arlington, TX",228,Cook Childrens Medical Ctr and Cook Childrens ...,12,percentage,5933511.95,4906096.43,0.0,84.0,0.0,0.0,0.0,553794.45,0.0,0.0
17224,COOK CHILDREN'S MEDICAL CENTER,752051646,1891765178,000H0HH019401,"Dallas-Fort Worth-Arlington, TX",793,Cook Childrens Medical Ctr and Cook Childrens ...,12,percentage,4605004.24,3651266.03,0.0,84.0,0.0,0.0,0.0,133386.33,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17299,COOK CHILDREN'S MEDICAL CENTER - PROSPER,852354189,1215609896,000H0HH333D01,"Dallas-Fort Worth-Arlington, TX",395,Cook Childrens Medical Ctr and Cook Childrens ...,12,percentage,14645.05,12301.84,0.0,84.0,0.0,0.0,0.0,12301.84,0.0,0.0
17162,COOK CHILDREN'S MEDICAL CENTER,752051646,1891765178,000H0HH019401,"Dallas-Fort Worth-Arlington, TX",479,Cook Childrens Medical Ctr and Cook Childrens ...,12,percentage,108411.10,2530.65,0.0,84.0,0.0,0.0,0.0,91065.32,0.0,0.0
17307,COOK CHILDRENS HOME HEALTH,752896983,1336119619,000H0HH963101,"Dallas-Fort Worth-Arlington, TX",556,Cook Childrens Medical Ctr and Cook Childrens ...,12,0,765.00,650.25,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0
17309,COOK CHILDRENS HOME HEALTH,752896983,1336119619,000H0HH963101,"Dallas-Fort Worth-Arlington, TX",951,Cook Childrens Medical Ctr and Cook Childrens ...,12,0,255.00,216.75,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0


In [18]:
rank = df_claims_ip[df_claims_ip['rank'] == 12]
allowed = rank['amtallowed'].sum()
match_1['share'] = match_1['amtallowed'] / allowed

match_1['bcbstx_match'] = match_1['bcbs_tx_ppo_derived'] / match_1['bcbs_tx_ppo_derived'] * match_1['share']
match_1['uhc_match'] = match_1['uhc_choice_plus_derived'] / match_1['bcbs_tx_ppo_derived'] * match_1['share']
match_1['aetna_match'] = match_1['aetna_open_access_managed_derived'] / match_1['bcbs_tx_ppo_derived'] * match_1['share']
match_1['cigna_match'] = match_1['cigna_national_oap_derived'] / match_1['bcbs_tx_ppo_derived'] * match_1['share']

match_1 = match_1.fillna(0).replace([np.inf,-np.inf],0)
# match_1.to_csv('Data Sources/MatchedRates_ip.gz',index=False, compression='gzip')
# match_1

In [199]:
print('Any data',1-match_1.loc[match_1.billingprovidertaxid == 0,'share'].sum())

cond_1 =(match_1.bcbs_tx_ppo_derived != 0) & (match_1.uhc_choice_plus_derived != 0)
cond_2 = (match_1.bcbs_tx_ppo_derived != 0) & (match_1.aetna_open_access_managed_derived != 0)
cond_3 = (match_1.bcbs_tx_ppo_derived != 0) & (match_1.cigna_national_oap_derived != 0)
print('BCBS/UHC share',match_1.loc[cond_1,'share'].sum())
print('BCBS/Aetna share',match_1.loc[cond_2,'share'].sum())
print('BCBS/Cigna share',match_1.loc[cond_3,'share'].sum())

del cond_1
del cond_2
del cond_3


Any data 1.0
BCBS/UHC share 0.5112970910536034
BCBS/Aetna share 0.23709125149480464
BCBS/Cigna share 0.5755715866323228


In [200]:
 cond_uhc = match_1[(match_1.bcbstx_match != 0) & (match_1.uhc_match != 0)]
 cond_cigna = match_1[(match_1.bcbstx_match != 0) & (match_1.cigna_match != 0)]
# columns = ['billingprovidertaxid','share', 'bcbstx_matched', 'uhc_matched']
# grouped = ['billingprovidertaxid']

# a = match_1.loc[cond,columns].groupby(grouped).sum()
# a1 = dict(zip(a.index,a.share))
# a2 = dict(zip(a.index,a.uhc_match))
# 
# weights_prov['UHC_share'] = weights_prov['billingprovidertaxid'].map(a1)
# weights_prov['UHC_share'] = weights_prov['UHC_share'].fillna(0)
# weights_prov['UHC_weigthed'] = weights_prov['billingprovidertaxid'].map(a2)
# weights_prov['avail_share'] = weights_prov['UHC_share'] / weights_prov['share']
# weights_prov['matched_share'] = weights_prov['avail_share'] * weights_prov['share']
# weights_prov['avail_cum_share'] = weights_prov['UHC_share'].cumsum()
# weights_prov['UHC_factor'] = weights_prov['UHC_weigthed'] / weights_prov['UHC_share']
# weights_prov['UHC_factorX'] = weights_prov['UHC_factor'] * weights_prov['matched_share']
# weights_prov['npi'] = weights_prov['billingprovidertaxid'].astype(int)
# weights_prov['sharexfctr'] = weights_prov['matched_share'] * weights_prov['UHC_factor']
# 
# final_prov = weights_prov.loc[:,['billingprovidertaxid','allowed', 'share','avail_share','matched_share','UHC_factor', 'sharexfctr']]
# del cond, columns, grouped, a, a1, a2
# total = final_prov.sharexfctr.sum() / final_prov.matched_share.sum()
# print(total)
# final_prov
total_uhc = cond_uhc.bcbstx_match.sum() / cond_uhc.uhc_match.sum()
total_cigna = cond_cigna.bcbstx_match.sum() / cond_cigna.cigna_match.sum()
print(total_uhc)
print(total_cigna)

0.9150747722624398
0.851820617874312


In [13]:
df_rates = pd.read_csv('base_rates_ip.gz', compression='gzip', low_memory=False, dtype={'tin_value': str, 'npi': str})

if df_rates['billing_code_modifier'].dropna().unique().size > 0 :


    # Columns to analyse
    df_mod = df_rates.loc[:,['plan', 'code', 'code_type','billing_code_modifier','negotiated_rate']]
    df_mod['billing_code_modifier'] = df_mod['billing_code_modifier'].fillna('')
    df_mod['negotiated_rate'] = df_mod['negotiated_rate'].fillna(0)

    # Groups
    mod_groups = df_mod.groupby(['plan', 'code', 'code_type','billing_code_modifier'])

    # Calculations
    mod_calcs = mod_groups.agg(['mean','std','count'])


    # Pivot
    mod_piv = mod_calcs.reset_index()
    mod_piv.sort_index(axis=1, inplace=True)
    mod_piv = mod_piv.pivot_table(index=['plan', 'code', 'code_type'],
                                                columns='billing_code_modifier',
                                                values=[('negotiated_rate','mean'),('negotiated_rate','std'),('negotiated_rate','count')])

    # BASE RATE
    is_mod = mod_piv.xs(key='',level=2, axis=1).reset_index().sort_index()
    is_valid = mod_piv.reset_index().xs(key='',level=2, axis=1)['negotiated_rate']['count'] > 0
    mode_base = is_mod.loc[ is_valid , :].sort_index()
    mode_base.sort_index(axis=1, inplace=True)

    from scipy import stats

    def perform_t_test(row):
        mean1, std1, count1 = row[('negotiated_rate_mod','mean')],row[('negotiated_rate_mod','std')],row[('negotiated_rate_mod','count')]
        mean2, std2, count2 = row[('negotiated_rate_base','mean')],row[('negotiated_rate_base','std')],row[('negotiated_rate_base','count')]
        
        if np.sqrt((std1**2/count1)+(std2**2/count2)) == 0:
            t_stat = 0
        else:
            t_stat = (mean1-mean2) / np.sqrt((std1**2/count1)+(std2**2/count2))
        
        try:
            df = ((std1**2/count1 + std2**2/count2)**2) / (((std1**2/count1)**2 / (count1-1)) + ((std2**2/count2)**2 / (count2-1)))
        except: 
            df = 0

        # Calculate the p-value
        p_value = stats.t.sf(np.abs(t_stat), df) * 2  # Multiply by 2 for a two-tailed test
        return p_value


    modifiers = ['26','50','NU','RR','TC','UE']
    to_exclude = pd.DataFrame()

    for modif in modifiers:
        is_mod = mod_piv.xs(key=modif, level=2, axis=1).reset_index()
        is_valid = mod_piv.reset_index().xs(key=modif ,level=2, axis=1)['negotiated_rate']['count'] > 0
        temp = is_mod.loc[ is_valid , :].sort_index()

        temp.sort_index(axis=1, inplace=True)
        test = pd.merge(temp,mode_base,on=['plan','code','code_type'],how='left',suffixes=('_mod', '_base'))

        if 'std' not in test['negotiated_rate_mod'].columns:
        
            test['p-value'] = 0
            to_exclude_temp = test.copy()
            to_exclude_temp['billing_code_modifier'] = modif
            to_exclude_temp = to_exclude_temp.loc[:,['plan','code','code_type','billing_code_modifier']]
            
        else:
            test['p-value'] = test.apply(perform_t_test, axis=1)
            to_exclude_temp = test[test['p-value'] < 0.05].copy()
            to_exclude_temp['billing_code_modifier'] = modif
            to_exclude_temp = to_exclude_temp.loc[:,['plan','code','code_type','billing_code_modifier']]

            to_exclude = pd.concat([to_exclude_temp,to_exclude])

    # cond = (df_rates.code == '74177' ) & (df_rates.plan == 'uhc_option_ppo') 
    # a = df_rates.loc[cond,['plan', 'code', 'code_type','billing_code_modifier','negotiated_rate']]

    # # test [ test['negotiated_rate_mod']['count'] < 2 ]
    # test [ test['code'] == '74177' ]
        
    to_exclude.columns = [item[0] for item in to_exclude.columns]
    to_exclude['to_drop'] = True

    del is_mod, is_valid, temp, test, to_exclude_temp, modif


    df_rates_0 = pd.merge(df_rates,to_exclude,
                        on=['plan', 'code', 'code_type', 'billing_code_modifier'],
                        how='left')

    del mod_calcs, mod_groups, mod_piv, modifiers, mode_base, df_mod, to_exclude



    df_rates = df_rates_0[df_rates_0.to_drop != True]
    # df_rates = df_rates.drop(['to_drop','negotiated_rate_int','additional_information','billing_code_modifier','log_rate','billing_class'],axis=1)
    df_rates = df_rates.drop(['to_drop'],axis=1)


    del df_rates_0    

df_rates = df_rates.drop(['billing_code_modifier'],axis=1)    
df_rates['negotiated_type'] = df_rates['negotiated_type'].replace('fee schedule','negotiated')
df_rates

Unnamed: 0,npi,tin_value,plan,code,code_type,negotiated_type,negotiated_rate
0,1003192311,452750258,bcbs_tx_ppo,557,MS-DRG,negotiated,8384.65
1,1003192311,452750258,bcbs_tx_ppo,207,MS-DRG,per diem,585.00
2,1003192311,452750258,bcbs_tx_ppo,92,MS-DRG,per diem,585.00
3,1003192311,452750258,bcbs_tx_ppo,274,MS-DRG,per diem,585.00
4,1003192311,452750258,bcbs_tx_ppo,30,MS-DRG,per diem,585.00
...,...,...,...,...,...,...,...
1355198,1992796239,742557820,bcbs_tx_ppo,574,MS-DRG,negotiated,17141.40
1355199,1992796239,742557820,bcbs_tx_ppo,778,MS-DRG,per diem,1340.00
1355200,1992796239,742557820,bcbs_tx_ppo,802,MS-DRG,per diem,1340.00
1355201,1992796239,742557820,bcbs_tx_ppo,776,MS-DRG,per diem,1340.00


In [14]:
df_temp = df_rates.groupby(['npi','tin_value', 'plan', 'code', 'code_type','negotiated_type']).agg(['mean','count']).reset_index()
df_temp.columns = [item[0] if item[1] == '' else item[1] for item in df_temp.columns]

df_temp.code_type = df_temp.code_type.replace('MS-DRG','DRG')
df_temp = df_temp.loc[:,['npi', 'tin_value', 'plan', 'code', 'code_type', 'negotiated_type', 'mean', 'count']]

df_temp['mean'] = pd.to_numeric(df_temp['mean'])

df_temp = df_temp.groupby(['npi', 'tin_value','plan','code','code_type', 'negotiated_type']).agg({'count':'sum','mean':'mean'}).reset_index()

df_temp


Unnamed: 0,npi,tin_value,plan,code,code_type,negotiated_type,count,mean
0,1003192311,452750258,bcbs_tx_ppo,1,DRG,per diem,2,585.00
1,1003192311,452750258,bcbs_tx_ppo,1,DRG,percentage,7,50.00
2,1003192311,452750258,bcbs_tx_ppo,2,DRG,per diem,2,585.00
3,1003192311,452750258,bcbs_tx_ppo,2,DRG,percentage,7,50.00
4,1003192311,452750258,bcbs_tx_ppo,3,DRG,negotiated,1,108984.93
...,...,...,...,...,...,...,...,...
814738,1992796239,742557820,bcbs_tx_ppo,987,DRG,per diem,1,1340.00
814739,1992796239,742557820,bcbs_tx_ppo,988,DRG,negotiated,1,10302.60
814740,1992796239,742557820,bcbs_tx_ppo,988,DRG,per diem,1,1340.00
814741,1992796239,742557820,bcbs_tx_ppo,989,DRG,negotiated,1,6338.40


In [15]:
add_providers_data = True

In [16]:
# Baylor Univ -  TAX ID 751837454
if add_providers_data:
   
    provider_file = pd.read_csv(r'C:\Users\geoff.joe\PycharmProjects\Data-Services\PlanOptix - Analysis\Texas BCBSvsUHC\ProviderFiles\75-1837454_BAYLOR_UNIVERSITY_MEDICAL_CENTER_standardcharges.csv', encoding='latin1')

    provider_file = provider_file[['Patient Type', 'DRG', 'BCBS PPO', 'Aetna', 'Cigna', 'United']]
    # provider_file = provider_file.rename(columns={'code':'code_type'})
    # provider_file = provider_file.rename(columns={'id':'code','':''})
    provider_file['npi'] = 1447250253
    provider_file['tin_value'] = 751837454
    
    provider_file = provider_file.rename(columns={'United':'uhc_choice_plus','BCBS PPO':'bcbs_tx_ppo', 'Aetna': 'aetna_open_access_managed', 'Cigna': 'cigna_national_oap', 'DRG':'code'})
    provider_file['code_type'] = 'DRG'
    provider_file['negotiated_type'] = 'provider'
    provider_file = provider_file[provider_file['Patient Type'] == 'Inpatient']
    provider_file = provider_file.dropna()
    bcbstx = provider_file[provider_file['bcbs_tx_ppo'] != '**']
    uhc = provider_file[provider_file['uhc_choice_plus'] != '**']
    aetna = provider_file[provider_file['aetna_open_access_managed'] != '**']
    cigna = provider_file[provider_file['cigna_national_oap'] != '**']
    provider_file = pd.concat([bcbstx,uhc,aetna,cigna])


    provider_file = provider_file.melt(id_vars=['npi','tin_value','code','code_type', 'negotiated_type'],
                                    value_vars=['uhc_choice_plus','bcbs_tx_ppo', 'aetna_open_access_managed', 'cigna_national_oap'],
                                    var_name='plan',
                                    value_name='mean')

    provider_file = provider_file[provider_file['mean'] != '**']

    provider_file['count'] = 1
    provider_file['count'] = provider_file['count'].astype(int)
    provider_file['mean'] = provider_file['mean'].astype(float)
    provider_file['code'] = provider_file['code'].astype(int)
    # df_temp = df_temp[df_temp.tin_value != 741837454]
    # provider_file = provider_file.loc[:,df_temp.columns]
    # provider_file.loc[provider_file.code_type == 'DRG', 'code'] = provider_file.loc[provider_file.code_type == 'DRG', 'code'].astype(int).astype(str)    


provider_file

  provider_file = pd.read_csv(r'C:\Users\geoff.joe\PycharmProjects\Data-Services\PlanOptix - Analysis\Texas BCBSvsUHC\ProviderFiles\75-1837454_BAYLOR_UNIVERSITY_MEDICAL_CENTER_standardcharges.csv', encoding='latin1')


Unnamed: 0,npi,tin_value,code,code_type,negotiated_type,plan,mean,count
0,1447250253,751837454,3,DRG,provider,uhc_choice_plus,294693.0,1
1,1447250253,751837454,4,DRG,provider,uhc_choice_plus,199961.0,1
2,1447250253,751837454,11,DRG,provider,uhc_choice_plus,75216.0,1
3,1447250253,751837454,12,DRG,provider,uhc_choice_plus,56930.0,1
4,1447250253,751837454,13,DRG,provider,uhc_choice_plus,41186.0,1
...,...,...,...,...,...,...,...,...
11723,1447250253,751837454,982,DRG,provider,cigna_national_oap,23883.0,1
11724,1447250253,751837454,983,DRG,provider,cigna_national_oap,15904.0,1
11725,1447250253,751837454,987,DRG,provider,cigna_national_oap,31598.0,1
11726,1447250253,751837454,988,DRG,provider,cigna_national_oap,16153.0,1


In [17]:
df_temp = pd.concat([df_temp, provider_file])
df_temp['mean'] = pd.to_numeric(df_temp['mean'], errors='coerce')
df_temp = df_temp.groupby(['npi', 'tin_value','plan','code','code_type', 'negotiated_type']).agg({'count':'sum','mean':'mean'}).reset_index()