In [1]:
import numpy as np
import pandas as pd
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)

# PlanAttributes: 
Plan-level data on maximum out of pocket payments, deductibles, cost sharing, HSA eligibility, formulary ID, and other plan attributes.
I will use the columns of the Plan Attributes dataset listed below. Some additional columns are derived later, and each one is explained at the point where it is derived. This analysis needs both the Plan Attributes and the Rates datasets. For further information on the Plan Attributes columns, refer to http://www.nber.org/cciio/marketplace-puf/2018/plan2018.pdf 



|Variable Name|Data Type|Variable definition|
|--------------|----------|-------------------|
|PlanId|Text|Seventeen-character alpha-numeric code that identifies an insurance plan’s cost sharing reduction (CSR) variant within HIOS|
|MetalLevel|Text|Metal level, or coverage category, of insurance plan based on its actuarial value|
|TEHBInnTier1IndividualMOOP|Text|The dollar amount of the tier 1 in network, individual out-of-pocket cost limit for medical and drug EHB benefits|
|TEHBInnTier2IndividualMOOP|Text|The dollar amount of the tier 2 in network, individual out-of-pocket cost limit for medical and drug EHB benefits|
|TEHBOutOfNetIndividualMOOP|Text|The dollar amount of the out of network, individual out-of-pocket cost limit for medical and drug EHB benefits|
|IssuerID|Text|Five-digit numeric code that identifies the issuer organization in the Health Insurance Oversight System (HIOS)|

In [2]:
# Columns kept from every yearly Plan Attributes file.
required_colmuns = [
    # year / state / plan identification
    'BusinessYear',
    'StateCode',
    'PlanId',
    'MetalLevel',
    # individual maximum out-of-pocket (MOOP) limits
    'TEHBInnTier1IndividualMOOP',
    'TEHBInnTier2IndividualMOOP',
    'TEHBOutOfNetIndividualMOOP',
    # dental-only plan flag
    'DentalOnlyPlan',
]

In [3]:
def _load_plan_attributes(csv_path, **read_options):
    """Read one yearly Plan Attributes CSV and keep only the required columns.

    Parameters
    ----------
    csv_path : str
        Path of the Plan_Attributes_PUF.csv file to read.
    **read_options
        Extra keyword arguments forwarded to ``pd.read_csv`` (e.g. ``encoding``).

    Returns
    -------
    pandas.DataFrame
        The file's rows restricted to the columns listed in ``required_colmuns``.

    Raises
    ------
    KeyError
        If the file lacks any column named in ``required_colmuns``.
    """
    raw = pd.read_csv(csv_path, low_memory=False, **read_options)
    return raw[required_colmuns]


# 2014 and 2015 are read with the default encoding; 2016 onwards are read as latin1.
pa_2014 = _load_plan_attributes('Exchange Data/2014/Plan_Attributes_PUF.csv')
pa_2015 = _load_plan_attributes('Exchange Data/2015/Plan_Attributes_PUF.csv')
pa_2016 = _load_plan_attributes('Exchange Data/2016/Plan_Attributes_PUF.csv', encoding='latin1')
pa_2017 = _load_plan_attributes('Exchange Data/2017/Plan_Attributes_PUF.csv', encoding='latin1')
pa_2018 = _load_plan_attributes('Exchange Data/2018/Plan_Attributes_PUF.csv', encoding='latin1')
pa_2019 = _load_plan_attributes('Exchange Data/2019/Plan_Attributes_PUF.csv', encoding='latin1')

KeyError: "['TEHBInnTier1FamilyMOOP'] not in index"

In [None]:
# Stack the six per-year frames into one table covering 2014-2019.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it each
# year's own row labels are carried over, so the merged frame has duplicate
# index labels: label-based lookups return several rows, and the index column
# written by to_csv is ambiguous.
pa_all = pd.concat(
    [pa_2014, pa_2015, pa_2016, pa_2017, pa_2018, pa_2019],
    ignore_index=True,
)
pa_all.head()

In [None]:
# Print the column dtypes and non-null counts of the merged frame.
pa_all.info()

In [None]:
# List the distinct values found in the MetalLevel column.
pa_all['MetalLevel'].unique()

In [None]:
# Inspect the distinct raw values of the tier 1 in-network individual MOOP column.
pa_all['TEHBInnTier1IndividualMOOP'].unique()
# Cleaning still needed for this column:
#   - handle 'Not Applicable' and NaN entries
#   - convert the column to a numeric dtype
#   - replace all NaN values with 0

In [None]:
# Inspect the distinct raw values of the tier 2 in-network individual MOOP column.
pa_all['TEHBInnTier2IndividualMOOP'].unique()
# Cleaning still needed for this column:
#   - handle 'Not Applicable' and NaN entries
#   - convert the column to a numeric dtype
#   - replace all NaN values with 0

In [None]:
# Inspect the distinct raw values of the out-of-network individual MOOP column.
pa_all['TEHBOutOfNetIndividualMOOP'].unique()
# Cleaning still needed for this column:
#   - handle 'Not Applicable' and NaN entries
#   - convert the column to a numeric dtype
#   - replace all NaN values with 0

In [None]:
# MOOP columns that hold dollar amounts as text and must become floats.
cols_to_num = [
    'TEHBInnTier1IndividualMOOP',
    'TEHBInnTier2IndividualMOOP',
    'TEHBOutOfNetIndividualMOOP',
]

# Clean and convert in one pass:
#   1. strip '$' signs and thousands separators, and turn 'Not Applicable' into NaN
#      (the '$' pattern is a raw string: '\$' in a normal string literal is an
#      invalid escape sequence and triggers a warning on recent Python versions);
#   2. replace every NaN with 0;
#   3. cast the cleaned text to float.
# NOTE(review): filling with 0 makes a missing / 'Not Applicable' limit
# indistinguishable from a genuine $0 limit in later statistics - confirm this is intended.
pa_all[cols_to_num] = (
    pa_all[cols_to_num]
    .replace({r'\$': '', ',': '', 'Not Applicable': np.nan}, regex=True)
    .fillna(0)
    .astype('float')
)

In [None]:
# Print dtypes and non-null counts again, e.g. to confirm the MOOP columns are now float.
pa_all.info()

In [None]:
# Summary statistics for the numeric columns.
# NOTE(review): MOOP values that were missing or 'Not Applicable' were filled with 0
# in the conversion cell, so those zeros are included in these statistics.
pa_all.describe()

In [None]:
# Count the non-null values of every other column for each
# (MetalLevel, DentalOnlyPlan) combination, to see how the two columns relate.
pa_all.groupby(['MetalLevel','DentalOnlyPlan']).count()

In [None]:
# The grouped counts show that a dental plan can be inferred from MetalLevel
# being 'High' or 'Low', which makes the DentalOnlyPlan flag redundant,
# so the column is removed from the dataset.
pa_all = pa_all.drop(columns=['DentalOnlyPlan'])

In [None]:
# Write the merged plan attributes to a CSV file in the current working directory.
# NOTE(review): to_csv also writes the DataFrame index as the first (unnamed)
# column by default - confirm downstream readers expect that column.
pa_all.to_csv('merged_plan_attribute.csv')