# Read healthcare.gov CSVs

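This notebook reads the four CSV files located in `data/healthcare.gov`, displays the first five rows of each file, and then runs an exploratory analysis of `Rate_PUF.csv` (shape, dtypes, missing values, numeric summary, and top values per column).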

In [2]:
# Read and display first five rows of the four healthcare.gov CSVs
from pathlib import Path
import pandas as pd
from IPython.display import display

# Directory holding the healthcare.gov public use files (relative to the
# notebook's working directory).
base = Path("data/healthcare.gov")

# Short key -> CSV file name inside `base`.
files = {
    "benefits_and_cost_sharing": "benefits-and-cost-sharing-puf.csv",
    "plan_attributes": "Plan_Attributes_PUF.csv",
    "rate": "Rate_PUF.csv",
    "service_area": "service-area-puf.csv",
}

# Number of rows shown in each preview table.
PREVIEW_ROWS = 5

# Preview every file: print a banner, then render the first rows as a table.
# A missing or unreadable file is reported and skipped so that the remaining
# files are still previewed.
for key, fname in files.items():
    path = base / fname
    print(f"=== {fname} ===")
    if not path.exists():
        print(f"File not found: {path}\n")
        continue
    try:
        # Only the preview rows are parsed (`nrows`) instead of loading the
        # whole file just to call head(); these files can hold millions of
        # rows. NOTE: `df` therefore contains the preview rows only, and
        # dtypes are inferred from those rows alone.
        df = pd.read_csv(path, nrows=PREVIEW_ROWS, low_memory=False)
        display(df.head(PREVIEW_ROWS))
    except Exception as e:
        print(f"Error reading {path}: {e}\n")

# -------------------------------------------------
# Continued analysis specifically for Rate_PUF.csv
# -------------------------------------------------
# Loads the full rate file and prints/displays, in order: shape, first rows,
# DataFrame.info(), percent of missing values per column, a numeric summary,
# the top values of every object column, the largest rows for columns whose
# name contains "rate", and the rows with the most non-null numeric values.
# Any error raised during the analysis is reported as a single message.
rate_path = base / files['rate']
if rate_path.exists():
    try:
        print("\n=== Analysis: Rate_PUF.csv ===")
        df_rate = pd.read_csv(rate_path, low_memory=False)

        # Basic overview
        print("Shape:", df_rate.shape)
        print("\nFirst 5 rows:")
        display(df_rate.head(5))

        # Info (prints to stdout in notebooks)
        print("\nInfo:")
        df_rate.info()

        # Missing values (percent); only columns with at least one missing
        # value are shown, worst first.
        print("\nMissing values (percent):")
        missing = (df_rate.isna().mean() * 100).sort_values(ascending=False)
        display(missing[missing > 0].head(20))

        # Numeric summary
        numcols = df_rate.select_dtypes(include='number').columns.tolist()
        if numcols:
            print("\nNumeric summary (describe):")
            display(df_rate[numcols].describe().T)
        else:
            print("\nNo numeric columns found.")

        # Categorical top values (NaN is counted as its own value)
        objcols = df_rate.select_dtypes(include='object').columns.tolist()
        if objcols:
            print("\nTop value counts for object columns (showing up to 10):")
            for col in objcols:
                vc = df_rate[col].value_counts(dropna=False)
                print(f"\nColumn: {col} — unique={vc.size}")
                display(vc.head(10))
        else:
            print("\nNo object (categorical) columns found.")

        # Try to find any 'rate'-like numeric columns and show top rows.
        # The match is a plain substring test on the column name, so date
        # columns such as RateEffectiveDate are matched too; those take the
        # non-numeric branch below.
        rate_like = [c for c in df_rate.columns if 'rate' in c.lower()]
        if rate_like:
            print(f"\nFound rate-like columns: {rate_like}")
            # The first four columns are shown next to the ranked column
            # for context.
            lead_cols = df_rate.columns[:4].tolist()
            for c in rate_like:
                if c in numcols:
                    print(f"\nTop 5 rows by {c}:")
                    # Drop `c` from the context columns so the selection never
                    # contains the same label twice when the ranked column is
                    # itself one of the leading columns.
                    context_cols = [name for name in lead_cols if name != c]
                    display(df_rate.nlargest(5, columns=c)[[c] + context_cols])
                else:
                    print(f"\nColumn {c} found but it's not numeric; showing value counts:")
                    display(df_rate[c].value_counts(dropna=False).head(10))
        else:
            print("\nNo column name containing 'rate' found; showing top 3 numeric columns by max value instead:")
            if numcols:
                maxvals = {c: df_rate[c].max() for c in numcols}
                # An all-missing column has a NaN maximum; NaN does not order
                # consistently under sorted(), so such columns are left out of
                # the ranking.
                rankable = [c for c in numcols if pd.notna(maxvals[c])]
                sorted_cols = sorted(rankable, key=maxvals.get, reverse=True)[:3]
                for c in sorted_cols:
                    print(f"\nTop 5 by {c} (max={maxvals[c]}):")
                    display(df_rate.nlargest(5, columns=c)[[c]])

        # Sample rows with most numeric data present
        if numcols:
            # Per-row count of numeric cells that are not null.
            numeric_nonnull = df_rate[numcols].notnull().sum(axis=1)
            print('\nSample 5 rows with highest count of non-null numeric values:')
            # nlargest(5) already yields at most five index labels.
            display(df_rate.loc[numeric_nonnull.nlargest(5).index])

    except Exception as e:
        print(f"Error analyzing {rate_path}: {e}")
else:
    print(f"Rate file not found at {rate_path}")


=== benefits-and-cost-sharing-puf.csv ===


Unnamed: 0,BusinessYear,StateCode,IssuerId,SourceName,ImportDate,StandardComponentId,PlanId,BenefitName,CopayInnTier1,CopayInnTier2,...,IsEHB,IsCovered,QuantLimitOnSvc,LimitQty,LimitUnit,Exclusions,Explanation,EHBVarReason,IsExclFromInnMOOP,IsExclFromOonMOOP
0,2025,AK,21989,HIOS,2024-08-29 01:02:15,21989AK0030001,21989AK0030001-00,Accidental Dental,,,...,,,,,,,,,,
1,2025,AK,21989,HIOS,2024-08-29 01:02:15,21989AK0030001,21989AK0030001-00,Basic Dental Care - Adult,Not Applicable,,...,,Covered,Yes,1100.0,Dollars per Year,,See policy for other limits,Not EHB,Yes,Yes
2,2025,AK,21989,HIOS,2024-08-29 01:02:15,21989AK0030001,21989AK0030001-00,Basic Dental Care - Child,Not Applicable,,...,Yes,Covered,,,,,See policy for limits,Additional EHB Benefit,No,No
3,2025,AK,21989,HIOS,2024-08-29 01:02:15,21989AK0030001,21989AK0030001-00,Dental Check-Up for Children,Not Applicable,,...,Yes,Covered,,,,,See policy for limits,Additional EHB Benefit,No,No
4,2025,AK,21989,HIOS,2024-08-29 01:02:15,21989AK0030001,21989AK0030001-00,Major Dental Care - Adult,Not Applicable,,...,,Covered,Yes,1100.0,Dollars per Year,,See policy for other limits,Not EHB,Yes,Yes


=== Plan_Attributes_PUF.csv ===


Unnamed: 0,BusinessYear,StateCode,IssuerId,IssuerMarketPlaceMarketingName,SourceName,ImportDate,MarketCoverage,DentalOnlyPlan,StandardComponentId,PlanMarketingName,...,TEHBDedOutOfNetFamilyPerPerson,TEHBDedOutOfNetFamilyPerGroup,TEHBDedCombInnOonIndividual,TEHBDedCombInnOonFamilyPerPerson,TEHBDedCombInnOonFamilyPerGroup,IsHSAEligible,HSAOrHRAEmployerContribution,HSAOrHRAEmployerContributionAmount,URLForSummaryofBenefitsCoverage,PlanBrochure
0,2025,AK,21989,Delta Dental of Alaska,HIOS,2024-08-29 01:02:15,Individual,Yes,21989AK0030001,Delta Dental Premier Plan,...,,,,,,,,,,https://www.deltadentalak.com/-/media/deltaden...
1,2025,AK,21989,Delta Dental of Alaska,HIOS,2024-08-29 01:02:15,Individual,Yes,21989AK0030001,Delta Dental Premier Plan,...,,,,,,,,,,https://www.deltadentalak.com/-/media/deltaden...
2,2025,AK,21989,Delta Dental of Alaska,HIOS,2024-08-29 01:02:15,Individual,Yes,21989AK0050001,Delta Dental PPO 1000 Plan,...,,,,,,,,,,https://www.deltadentalak.com/-/media/deltaden...
3,2025,AK,21989,Delta Dental of Alaska,HIOS,2024-08-29 01:02:15,Individual,Yes,21989AK0050001,Delta Dental PPO 1000 Plan,...,,,,,,,,,,https://www.deltadentalak.com/-/media/deltaden...
4,2025,AK,21989,Delta Dental of Alaska,HIOS,2024-08-29 01:02:15,Individual,Yes,21989AK0050002,Delta Dental PPO 1500 Plan,...,,,,,,,,,,https://www.deltadentalak.com/-/media/deltaden...


=== Rate_PUF.csv ===


Unnamed: 0,BusinessYear,StateCode,IssuerId,SourceName,ImportDate,RateEffectiveDate,RateExpirationDate,PlanId,RatingAreaId,Tobacco,Age,IndividualRate,IndividualTobaccoRate,Couple,PrimarySubscriberAndOneDependent,PrimarySubscriberAndTwoDependents,PrimarySubscriberAndThreeOrMoreDependents,CoupleAndOneDependent,CoupleAndTwoDependents,CoupleAndThreeOrMoreDependents
0,2025,AK,21989,HIOS,2024-08-29 01:02:15,2025-01-01,2025-12-31,21989AK0030001,Rating Area 1,No Preference,0-14,65.0,,,,,,,,
1,2025,AK,21989,HIOS,2024-08-29 01:02:15,2025-01-01,2025-12-31,21989AK0030001,Rating Area 1,No Preference,15,65.0,,,,,,,,
2,2025,AK,21989,HIOS,2024-08-29 01:02:15,2025-01-01,2025-12-31,21989AK0030001,Rating Area 1,No Preference,16,65.0,,,,,,,,
3,2025,AK,21989,HIOS,2024-08-29 01:02:15,2025-01-01,2025-12-31,21989AK0030001,Rating Area 1,No Preference,17,65.0,,,,,,,,
4,2025,AK,21989,HIOS,2024-08-29 01:02:15,2025-01-01,2025-12-31,21989AK0030001,Rating Area 1,No Preference,18,65.0,,,,,,,,


=== service-area-puf.csv ===


Unnamed: 0,BusinessYear,StateCode,IssuerId,SourceName,ImportDate,ServiceAreaId,ServiceAreaName,CoverEntireState,County,PartialCounty,ZipCodes,PartialCountyJustification,MarketCoverage,DentalOnlyPlan
0,2025,AK,21989,HIOS,2024-08-29 01:02:15,AKS001,Alaska Premier,Yes,,,,,SHOP (Small Group),Yes
1,2025,AK,21989,HIOS,2024-08-29 01:02:15,AKS002,Alaska PPO,No,2020.0,No,,,SHOP (Small Group),Yes
2,2025,AK,21989,HIOS,2024-08-29 01:02:15,AKS002,Alaska PPO,No,2090.0,No,,,SHOP (Small Group),Yes
3,2025,AK,21989,HIOS,2024-08-29 01:02:15,AKS002,Alaska PPO,No,2170.0,No,,,SHOP (Small Group),Yes
4,2025,AK,21989,HIOS,2024-08-29 01:02:15,AKS003,Alaska Premier - Individual,Yes,,,,,Individual,Yes



=== Analysis: Rate_PUF.csv ===
Shape: (2418441, 20)

First 5 rows:
Shape: (2418441, 20)

First 5 rows:


Unnamed: 0,BusinessYear,StateCode,IssuerId,SourceName,ImportDate,RateEffectiveDate,RateExpirationDate,PlanId,RatingAreaId,Tobacco,Age,IndividualRate,IndividualTobaccoRate,Couple,PrimarySubscriberAndOneDependent,PrimarySubscriberAndTwoDependents,PrimarySubscriberAndThreeOrMoreDependents,CoupleAndOneDependent,CoupleAndTwoDependents,CoupleAndThreeOrMoreDependents
0,2025,AK,21989,HIOS,2024-08-29 01:02:15,2025-01-01,2025-12-31,21989AK0030001,Rating Area 1,No Preference,0-14,65.0,,,,,,,,
1,2025,AK,21989,HIOS,2024-08-29 01:02:15,2025-01-01,2025-12-31,21989AK0030001,Rating Area 1,No Preference,15,65.0,,,,,,,,
2,2025,AK,21989,HIOS,2024-08-29 01:02:15,2025-01-01,2025-12-31,21989AK0030001,Rating Area 1,No Preference,16,65.0,,,,,,,,
3,2025,AK,21989,HIOS,2024-08-29 01:02:15,2025-01-01,2025-12-31,21989AK0030001,Rating Area 1,No Preference,17,65.0,,,,,,,,
4,2025,AK,21989,HIOS,2024-08-29 01:02:15,2025-01-01,2025-12-31,21989AK0030001,Rating Area 1,No Preference,18,65.0,,,,,,,,



Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2418441 entries, 0 to 2418440
Data columns (total 20 columns):
 #   Column                                     Dtype  
---  ------                                     -----  
 0   BusinessYear                               int64  
 1   StateCode                                  object 
 2   IssuerId                                   int64  
 3   SourceName                                 object 
 4   ImportDate                                 object 
 5   RateEffectiveDate                          object 
 6   RateExpirationDate                         object 
 7   PlanId                                     object 
 8   RatingAreaId                               object 
 9   Tobacco                                    object 
 10  Age                                        object 
 11  IndividualRate                             float64
 12  IndividualTobaccoRate                      float64
 13  Couple                             

CoupleAndThreeOrMoreDependents               99.975935
CoupleAndTwoDependents                       99.975935
CoupleAndOneDependent                        99.975935
PrimarySubscriberAndThreeOrMoreDependents    99.975935
PrimarySubscriberAndTwoDependents            99.975935
PrimarySubscriberAndOneDependent             99.975935
Couple                                       99.975935
IndividualTobaccoRate                        48.977668
Tobacco                                       0.024065
dtype: float64


Numeric summary (describe):


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
BusinessYear,2418441.0,2025.0,0.0,2025.0,2025.0,2025.0,2025.0,2025.0
IssuerId,2418441.0,48134.579972,25817.022775,10046.0,28856.0,41047.0,68781.0,99969.0
IndividualRate,2418441.0,462.745531,571.506368,0.0,30.36,410.57,684.53,9999.0
IndividualTobaccoRate,1233945.0,814.14021,471.354819,130.3,486.96,672.54,1026.0,5877.03
Couple,582.0,84.195515,37.955132,20.67,58.87,73.71,121.39,165.19
PrimarySubscriberAndOneDependent,582.0,94.154691,44.527228,26.24,60.0,76.0,140.07,190.61
PrimarySubscriberAndTwoDependents,582.0,115.440619,40.824494,26.24,95.0,121.635,147.89,190.61
PrimarySubscriberAndThreeOrMoreDependents,582.0,115.60299,41.065196,26.24,95.0,121.635,147.9875,222.0
CoupleAndOneDependent,582.0,147.454777,63.686461,39.42,99.1425,133.76,208.93,284.32
CoupleAndTwoDependents,582.0,147.605979,63.731694,39.42,99.1425,133.76,209.41,284.32



Top value counts for object columns (showing up to 10):

Column: StateCode — unique=31


StateCode
FL    572526
MI    243464
OH    215016
SC    208641
TX    191454
WI    159283
IN    128112
NC    118014
AL     66020
IL     61200
Name: count, dtype: int64


Column: SourceName — unique=2


SourceName
HIOS     1671926
SERFF     746515
Name: count, dtype: int64


Column: ImportDate — unique=110


ImportDate
2024-09-19 01:01:32    191352
2024-09-14 01:01:31    189108
2024-08-14 20:01:41    151980
2025-01-11 00:01:52    106590
2024-10-11 01:02:00    105774
2024-08-15 01:01:23     99425
2024-08-16 01:01:20     81345
2024-10-31 01:01:26     71196
2024-08-12 20:01:40     69582
2024-08-13 20:01:38     67850
Name: count, dtype: int64


Column: RateEffectiveDate — unique=4


RateEffectiveDate
2025-01-01    2136507
2025-07-01      94250
2025-04-01      93842
2025-10-01      93842
Name: count, dtype: int64


Column: RateExpirationDate — unique=4


RateExpirationDate
2025-12-31    2136507
2025-06-30      94250
2025-09-30      93842
2025-03-31      93842
Name: count, dtype: int64


Column: PlanId — unique=5734


PlanId
92388TX0190001    5508
28856IN0190004    3468
86728OH0290001    3468
69051IN0110001    3468
86728OH0300004    3468
86728OH0300002    3468
86728OH0300001    3468
86728OH0290006    3468
86728OH0290005    3468
86728OH0290002    3468
Name: count, dtype: int64


Column: RatingAreaId — unique=67


RatingAreaId
Rating Area 1     175718
Rating Area 4     155983
Rating Area 3     154234
Rating Area 2     143280
Rating Area 5     136713
Rating Area 6     132727
Rating Area 7     106048
Rating Area 8      99571
Rating Area 10     94063
Rating Area 11     90340
Name: count, dtype: int64


Column: Tobacco — unique=3


Tobacco
Tobacco User/Non-Tobacco User    1233945
No Preference                    1183914
NaN                                  582
Name: count, dtype: int64


Column: Age — unique=52


Age
0-14    47409
15      47409
42      47409
43      47409
44      47409
45      47409
46      47409
47      47409
48      47409
49      47409
Name: count, dtype: int64


Found rate-like columns: ['RateEffectiveDate', 'RateExpirationDate', 'IndividualRate', 'IndividualTobaccoRate']

Column RateEffectiveDate found but it's not numeric; showing value counts:


RateEffectiveDate
2025-01-01    2136507
2025-07-01      94250
2025-04-01      93842
2025-10-01      93842
Name: count, dtype: int64


Column RateExpirationDate found but it's not numeric; showing value counts:


RateExpirationDate
2025-12-31    2136507
2025-06-30      94250
2025-09-30      93842
2025-03-31      93842
Name: count, dtype: int64


Top 5 rows by IndividualRate:


Unnamed: 0,IndividualRate,BusinessYear,StateCode,IssuerId,SourceName
124226,9999.0,2025,AZ,86830,HIOS
124227,9999.0,2025,AZ,86830,HIOS
124228,9999.0,2025,AZ,86830,HIOS
124229,9999.0,2025,AZ,86830,HIOS
124230,9999.0,2025,AZ,86830,HIOS



Top 5 rows by IndividualTobaccoRate:


Unnamed: 0,IndividualTobaccoRate,BusinessYear,StateCode,IssuerId,SourceName
2404619,5877.03,2025,WV,50328,SERFF
2399009,5863.64,2025,WV,50328,SERFF
2404874,5817.39,2025,WV,50328,SERFF
2399264,5804.14,2025,WV,50328,SERFF
2404618,5783.0,2025,WV,50328,SERFF



Sample 5 rows with highest count of non-null numeric values:


Unnamed: 0,BusinessYear,StateCode,IssuerId,SourceName,ImportDate,RateEffectiveDate,RateExpirationDate,PlanId,RatingAreaId,Tobacco,Age,IndividualRate,IndividualTobaccoRate,Couple,PrimarySubscriberAndOneDependent,PrimarySubscriberAndTwoDependents,PrimarySubscriberAndThreeOrMoreDependents,CoupleAndOneDependent,CoupleAndTwoDependents,CoupleAndThreeOrMoreDependents
510,2025,AK,21989,HIOS,2024-08-29 01:02:15,2025-07-01,2025-09-30,21989AK0080001,Rating Area 1,,Family Option,54.45,,113.26,130.69,130.69,130.69,194.95,194.95,194.95
511,2025,AK,21989,HIOS,2024-08-29 01:02:15,2025-04-01,2025-06-30,21989AK0080001,Rating Area 1,,Family Option,53.79,,111.88,129.1,129.1,129.1,192.57,192.57,192.57
512,2025,AK,21989,HIOS,2024-08-29 01:02:15,2025-01-01,2025-03-31,21989AK0080001,Rating Area 1,,Family Option,53.13,,110.5,127.5,127.5,127.5,190.2,190.2,190.2
513,2025,AK,21989,HIOS,2024-08-29 01:02:15,2025-10-01,2025-12-31,21989AK0080001,Rating Area 1,,Family Option,55.12,,114.65,132.28,132.28,132.28,197.32,197.32,197.32
514,2025,AK,21989,HIOS,2024-08-29 01:02:15,2025-10-01,2025-12-31,21989AK0080001,Rating Area 2,,Family Option,52.36,,108.92,125.67,125.67,125.67,187.46,187.46,187.46


推荐只使用：

IndividualRate

IndividualTobaccoRate

Age

RatingAreaId

Plan attributes（来自 Plan PUF）

家庭字段（Couple、PrimarySubscriberAnd…、CoupleAnd…）缺失率约为 99.98%，可以在报告中注明 “not widely used in ACA pricing”