In [1]:
import numpy as np
import pandas as pd
pd.options.display.max_columns = None

In [2]:
# As we dont have all the columns in all files, we are setting required columns for the analysis and filter only those columns 
# from the files.
required_colmuns = ['BusinessYear', 'StateCode', 'IssuerId', 'StandardComponentId', 'PlanId', 'BenefitName', 'CopayInnTier1',
       'CopayInnTier2', 'CopayOutofNet', 'CoinsInnTier1', 'CoinsInnTier2',
       'CoinsOutofNet', 'IsEHB', 'IsCovered',
       'QuantLimitOnSvc','EHBVarReason']

In [3]:
Benefit_2014 = pd.read_csv('Exchange Data/2014/Benefits_Cost_Sharing_PUF.csv',low_memory=False)
Benefit_2014 = Benefit_2014[required_colmuns]
Benefit_2015 = pd.read_csv('Exchange Data/2015/Benefits_Cost_Sharing_PUF.csv',low_memory=False)
Benefit_2015 = Benefit_2015[required_colmuns]
Benefit_2016 = pd.read_csv('Exchange Data/2016/Benefits_Cost_Sharing_PUF.csv',low_memory=False, encoding='latin1')
Benefit_2016 = Benefit_2016[required_colmuns]
Benefit_2017 = pd.read_csv('Exchange Data/2017/Benefits_Cost_Sharing_PUF.csv',low_memory=False, encoding='latin1')
Benefit_2017 = Benefit_2017[required_colmuns]
Benefit_2018 = pd.read_csv('Exchange Data/2018/Benefits_Cost_Sharing_PUF.csv',low_memory=False, encoding='latin1')
Benefit_2018 = Benefit_2018[required_colmuns]
Benefit_2019 = pd.read_csv('Exchange Data/2019/Benefits_Cost_Sharing_PUF.csv',low_memory=False, encoding='latin1')
Benefit_2019 = Benefit_2019[required_colmuns]

In [4]:
benefit_all = pd.concat([Benefit_2014,Benefit_2015,Benefit_2016,Benefit_2017,Benefit_2018,Benefit_2019])

In [5]:
#We can see from here that Benefits were decreased in years but in 2019 it increased again
benefit_all['BusinessYear'].value_counts()

2015    2079286
2016    1774255
2017    1324275
2014    1164869
2019     967050
2018     829652
Name: BusinessYear, dtype: int64

In [6]:
benefit_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8139387 entries, 0 to 967049
Data columns (total 16 columns):
BusinessYear           int64
StateCode              object
IssuerId               int64
StandardComponentId    object
PlanId                 object
BenefitName            object
CopayInnTier1          object
CopayInnTier2          object
CopayOutofNet          object
CoinsInnTier1          object
CoinsInnTier2          object
CoinsOutofNet          object
IsEHB                  object
IsCovered              object
QuantLimitOnSvc        object
EHBVarReason           object
dtypes: int64(2), object(14)
memory usage: 1.0+ GB


In [7]:
#Copay and Coninsurance are not numeric so are not the part of describe as well.
benefit_all.describe()

Unnamed: 0,BusinessYear,IssuerId
count,8139387.0,8139387.0
mean,2016.181,51203.76
std,1.560375,25958.59
min,2014.0,10046.0
25%,2015.0,31274.0
50%,2016.0,44965.0
75%,2017.0,74313.0
max,2019.0,99969.0


In [8]:
# check columns with number of null or empty values
benefit_all.isnull().sum()

BusinessYear                 0
StateCode                    0
IssuerId                     0
StandardComponentId          0
PlanId                       0
BenefitName                  0
CopayInnTier1          1802865
CopayInnTier2          7103129
CopayOutofNet          1802867
CoinsInnTier1          1802865
CoinsInnTier2          7103129
CoinsOutofNet          1802865
IsEHB                  2599564
IsCovered               330884
QuantLimitOnSvc        5493110
EHBVarReason           5250104
dtype: int64

In [9]:
# make sure that number number of recodrs and features are correct.
# Total number of rows and columns: total_rows, total_columns:
total_rows, total_columns = benefit_all.shape
print("Total number of records:", total_rows)
print("Total number of columns:", total_columns)

Total number of records: 8139387
Total number of columns: 16


In [10]:
print("IsEHB array  " )
print(benefit_all['IsEHB'].unique())
#IsEHB needs to fill Nan with No

IsEHB array  
[nan 'Yes']


In [11]:
benefit_all['IsEHB'] = benefit_all['IsEHB'].fillna('No')
benefit_all['IsEHB'] = benefit_all['IsEHB'].astype('category')

In [12]:
print("IsCovered array  " )
print(benefit_all['IsCovered'].unique())
#IsCovered has Not Covered and Not covered

IsCovered array  
['Covered' nan 'Not Covered' 'Not covered']


In [13]:
benefit_all['IsCovered'] = benefit_all['IsCovered'].fillna('Not Covered')
benefit_all['IsCovered'] = benefit_all['IsCovered'].str.replace('Not covered', 'Not Covered')
benefit_all['IsCovered'] = benefit_all['IsCovered'].astype('category')

In [14]:
#Cleaning Copay
#NA needs to be filled with 'Not Covered'.
benefit_all['CopayInnTier1'] = benefit_all['CopayInnTier1'].fillna('Not Covered')
benefit_all['CopayInnTier2'] = benefit_all['CopayInnTier2'].fillna('Not Covered')
benefit_all['CopayOutofNet'] = benefit_all['CopayOutofNet'].fillna('Not Covered')
#Replace 'No Charge to $0'
benefit_all['CopayInnTier1'] = benefit_all['CopayInnTier1'].str.replace('No Charge', '$0')
benefit_all['CopayInnTier1'] = benefit_all['CopayInnTier1'].str.replace('0%', '$0')
benefit_all['CopayInnTier2'] = benefit_all['CopayInnTier2'].str.replace('No Charge', '$0')
benefit_all['CopayInnTier2'] = benefit_all['CopayInnTier2'].str.replace('0%', '$0')
benefit_all['CopayOutofNet'] = benefit_all['CopayOutofNet'].str.replace('No Charge', '$0')
benefit_all['CopayOutofNet'] = benefit_all['CopayOutofNet'].str.replace('0%', '$0')

#Divide them in 2 columns 'Value and unit'
benefit_all['CopayInnTier1Value'] = benefit_all['CopayInnTier1'].str.extract('(\d+)')
benefit_all['CopayInnTier1Unit'] = benefit_all['CopayInnTier1'].str.split('(\d+)').str.get(-1)
benefit_all['CopayInnTier2Value'] = benefit_all['CopayInnTier2'].str.extract('(\d+)')
benefit_all['CopayInnTier2Unit'] = benefit_all['CopayInnTier2'].str.split('(\d+)').str.get(-1)
benefit_all['CopayOutofNetValue'] = benefit_all['CopayOutofNet'].str.extract('(\d+)')
benefit_all['CopayOutofNetUnit'] = benefit_all['CopayOutofNet'].str.split('(\d+)').str.get(-1)
#'after deductible' needs to be changed to 'Copay after deductible' for homogenity
benefit_all['CopayInnTier1Unit'] = benefit_all['CopayInnTier1Unit'].str.replace(' Copay after deductible', ' Copay after deductible1')
benefit_all['CopayInnTier1Unit'] = benefit_all['CopayInnTier1Unit'].str.replace('  after deductible', ' Copay after deductible1')
benefit_all['CopayInnTier1Unit'] = benefit_all['CopayInnTier1Unit'].str.replace(' Copay after deductible1', ' Copay after deductible')

benefit_all['CopayInnTier2Unit'] = benefit_all['CopayInnTier2Unit'].str.replace(' Copay after deductible', ' Copay after deductible1')
benefit_all['CopayInnTier2Unit'] = benefit_all['CopayInnTier2Unit'].str.replace('  after deductible', ' Copay after deductible1')
benefit_all['CopayInnTier2Unit'] = benefit_all['CopayInnTier2Unit'].str.replace(' Copay after deductible1', ' Copay after deductible')

benefit_all['CopayOutofNetValue'] = benefit_all['CopayOutofNetValue'].str.replace(' Copay after deductible', ' Copay after deductible1')
benefit_all['CopayOutofNetValue'] = benefit_all['CopayOutofNetValue'].str.replace('  after deductible', ' Copay after deductible1')
benefit_all['CopayOutofNetValue'] = benefit_all['CopayOutofNetValue'].str.replace(' Copay after deductible1', ' Copay after deductible')

#Change the type of the data 
benefit_all['CopayInnTier1Value'] = benefit_all['CopayInnTier1Value'].astype('float')
benefit_all['CopayInnTier2Value'] = benefit_all['CopayInnTier2Value'].astype('float')
benefit_all['CopayOutofNetValue'] = benefit_all['CopayOutofNetValue'].astype('float')
benefit_all['CopayInnTier1Unit'] = benefit_all['CopayInnTier1Unit'].astype('category')
benefit_all['CopayInnTier2Unit'] = benefit_all['CopayInnTier2Unit'].astype('category')
benefit_all['CopayOutofNetUnit'] = benefit_all['CopayOutofNetUnit'].astype('category')

#handle the space values
benefit_all['CopayInnTier1Value'] = benefit_all['CopayInnTier1Value'].fillna('-1')
benefit_all['CopayInnTier1Unit'] = benefit_all['CopayInnTier1Unit'].replace('', ' Copay after deductible')
benefit_all['CopayInnTier2Value'] = benefit_all['CopayInnTier2Value'].fillna('-1')
benefit_all['CopayInnTier2Unit'] = benefit_all['CopayInnTier2Unit'].replace('', ' Copay after deductible')
benefit_all['CopayOutofNetValue'] = benefit_all['CopayOutofNetValue'].fillna('-1')
benefit_all['CopayOutofNetUnit'] = benefit_all['CopayOutofNetUnit'].replace('', ' Copay after deductible')

In [15]:
#Cleaning Coinsurance
#NA needs to be filled with 'Not Covered'.
benefit_all['CoinsInnTier1'] = benefit_all['CoinsInnTier1'].fillna('Not Covered')
benefit_all['CoinsInnTier2'] = benefit_all['CoinsInnTier2'].fillna('Not Covered')
benefit_all['CopayOutofNet'] = benefit_all['CoinsOutofNet'].fillna('Not Covered')
#Replace 'No Charge to $0'
benefit_all['CoinsInnTier1'] = benefit_all['CoinsInnTier1'].str.replace('No Charge', '0%')
benefit_all['CoinsInnTier1'] = benefit_all['CoinsInnTier1'].str.replace('$0', '0%')
benefit_all['CoinsInnTier2'] = benefit_all['CoinsInnTier2'].str.replace('No Charge', '0%')
benefit_all['CoinsInnTier2'] = benefit_all['CoinsInnTier2'].str.replace('$0', '0%')
benefit_all['CopayOutofNet'] = benefit_all['CoinsOutofNet'].str.replace('No Charge', '0%')
benefit_all['CopayOutofNet'] = benefit_all['CoinsOutofNet'].str.replace('$0', '0%')

#'after deductible' needs to be changed to 'Coinsurance after deductible' for homogenity
benefit_all['CoinsInnTier1'] = benefit_all['CoinsInnTier1'].str.replace('0% after deductible', '0% Coinsurance after deductible')
benefit_all['CoinsInnTier2'] = benefit_all['CoinsInnTier2'].str.replace('0% after deductible', '0% Coinsurance after deductible')
benefit_all['CoinsOutofNet'] = benefit_all['CoinsOutofNet'].str.replace('0% after deductible', '0% Coinsurance after deductible')

#Divide them in 2 columns 'Value and unit'
benefit_all['CoinsInnTier1Value'] = benefit_all['CoinsInnTier1'].str.extract('(\d+)')
benefit_all['CoinsInnTier1Unit'] = benefit_all['CoinsInnTier1'].str.split('%').str.get(1)

benefit_all['CoinsInnTier2Value'] = benefit_all['CoinsInnTier2'].str.extract('(\d+)')
benefit_all['CoinsInnTier2Unit'] = benefit_all['CoinsInnTier2'].str.split('%').str.get(1)

benefit_all['CoinsOutofNetValue'] = benefit_all['CoinsOutofNet'].str.extract('(\d+)')
benefit_all['CoinsOutofNetUnit'] = benefit_all['CoinsOutofNet'].str.split('%').str.get(1)

#handle the space values
benefit_all['CoinsInnTier1Value'] = benefit_all['CoinsInnTier1Value'].fillna(-1)
benefit_all['CoinsInnTier2Value'] = benefit_all['CoinsInnTier2Value'].fillna(-1)
benefit_all['CoinsOutofNetValue'] = benefit_all['CoinsOutofNetValue'].fillna(-1)

benefit_all['CoinsInnTier1Unit'] = benefit_all['CoinsInnTier1Unit'].replace('', ' Coinsurance after deductible')
benefit_all['CoinsInnTier2Unit'] = benefit_all['CoinsInnTier2Unit'].replace('', ' Coinsurance after deductible')
benefit_all['CoinsOutofNetUnit'] = benefit_all['CoinsOutofNetUnit'].replace('', ' Coinsurance after deductible')

benefit_all['CoinsInnTier1Unit'] = benefit_all['CoinsInnTier1Unit'].fillna(' Coinsurance after deductible')
benefit_all['CoinsInnTier2Unit'] = benefit_all['CoinsInnTier2Unit'].fillna(' Coinsurance after deductible')
benefit_all['CoinsOutofNetUnit'] = benefit_all['CoinsOutofNetUnit'].fillna(' Coinsurance after deductible')

#Change the type of the data 
benefit_all['CoinsInnTier1Value'] = benefit_all['CoinsInnTier1Value'].astype('float')
benefit_all['CoinsInnTier2Value'] = benefit_all['CoinsInnTier2Value'].astype('float')
benefit_all['CoinsOutofNetValue'] = benefit_all['CoinsOutofNetValue'].astype('float')
benefit_all['CoinsInnTier1Unit'] = benefit_all['CoinsInnTier1Unit'].astype('category')
benefit_all['CoinsInnTier2Unit'] = benefit_all['CoinsInnTier2Unit'].astype('category')
benefit_all['CoinsOutofNetUnit'] = benefit_all['CoinsOutofNetUnit'].astype('category')

#Make the coinsurance as fraction from Percentage

benefit_all['CoinsInnTier1Value'] = benefit_all['CoinsInnTier1Value']/100
benefit_all['CoinsInnTier2Value'] = benefit_all['CoinsInnTier2Value']/100
benefit_all['CoinsOutofNetValue'] = benefit_all['CoinsOutofNetValue']/100

In [16]:
# Clean the QuantLimitOnSvc 
# make yes and no homeogenous respect to case
# Impute No to all spaces.
benefit_all['QuantLimitOnSvc'] = benefit_all['QuantLimitOnSvc'].str.replace('yes', 'Yes')
benefit_all['QuantLimitOnSvc'] = benefit_all['QuantLimitOnSvc'].str.replace('NO', 'No')
benefit_all['QuantLimitOnSvc'] = benefit_all['QuantLimitOnSvc'].str.replace('no', 'No')
benefit_all['QuantLimitOnSvc'] = benefit_all['QuantLimitOnSvc'].fillna('No')
benefit_all['QuantLimitOnSvc'] = benefit_all['QuantLimitOnSvc'].astype('category')

In [17]:
# clean EHB var reason 
#remove \n from the data and make it homogenous respect to Case
benefit_all['EHBVarReason'] = benefit_all['EHBVarReason'].str.replace('\nAdditional EHB Benefit', 'Additional EHB Benefit')
benefit_all['EHBVarReason'] = benefit_all['EHBVarReason'].str.replace('additional EHB Benefit', 'Additional EHB Benefit')
benefit_all['EHBVarReason'] = benefit_all['EHBVarReason'].str.replace('above EHB', 'Above EHB')
benefit_all['EHBVarReason'] = benefit_all['EHBVarReason'].str.replace('Above Ehb', 'Above EHB')
benefit_all['EHBVarReason'] = benefit_all['EHBVarReason'].str.replace('Substantially equal', 'Substantially Equal')
benefit_all['EHBVarReason'] = benefit_all['EHBVarReason'].fillna(' ')

In [18]:
benefit_all_clean = benefit_all.drop(['CoinsInnTier1', 'CoinsInnTier2', 'CoinsOutofNet','CopayInnTier1','CopayInnTier2',
              'CopayOutofNet','CoinsInnTier1Unit','CoinsInnTier2Unit','CoinsOutofNetUnit'], axis=1)

In [20]:
benefit_all_clean.to_csv('merged_benefits.csv')