# Read KFF Data Files

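This notebook reads every CSV file under the `2433_p3_data/KFF_data/` directory (skipping the `exports/` subfolder), summarizes the files, checks that their columns match, merges them into a single table with a `Year` column, exports the result, and displays the first 5 rows of the merged file.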

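KFF (Kaiser Family Foundation) data files contain healthcare-related statistics. The files read here hold one year of data each and list, per location, the average lowest-cost Bronze, Silver, and Gold premiums and the average benchmark premium.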

In [45]:
# Import required libraries
import pandas as pd
from pathlib import Path
from IPython.display import display
import warnings
# Silence all warnings for the rest of the session so cell output stays compact.
# NOTE(review): this blanket filter also hides any warnings pandas emits while
# parsing the CSV files — confirm that is intended.
warnings.filterwarnings('ignore')

# Widen pandas' rendering limits so previews of wide tables are not truncated.
for _option_name, _option_value in {
    'display.max_columns': 100,
    'display.width': 200,
    'display.max_colwidth': 50,
}.items():
    pd.set_option(_option_name, _option_value)

In [46]:
# Folder that holds the KFF CSV files, relative to the notebook's working directory.
KFF_DATA_DIR = Path('2433_p3_data/KFF_data')

# Report whether the folder is reachable; on success also show where it resolves on disk.
if KFF_DATA_DIR.exists():
    print(f"✓ Found data directory: {KFF_DATA_DIR}")
    print(f"  Absolute path: {KFF_DATA_DIR.absolute()}")
else:
    print(f"ERROR: directory {KFF_DATA_DIR} not found!")

✓ Found data directory: 2433_p3_data/KFF_data
  Absolute path: /Users/mac/Desktop/Lecture_2433/group_project_251117/2433_p3_data/KFF_data


In [47]:
# Containers filled by the reading loop: parsed DataFrames and error messages,
# both keyed by file name.
kff_data = {}
errors = {}

# Collect every .csv below KFF_DATA_DIR (recursively), leaving out any path that
# has an 'exports' component, and sort the result for a stable reading order.
data_files = sorted(
    csv_path
    for csv_path in KFF_DATA_DIR.rglob('*.csv')
    if 'exports' not in csv_path.parts
)
if data_files:
    print(f"Found {len(data_files)} CSV files to read (sample: {data_files[:5]} )")
else:
    print(f"⚠ No CSV files found under {KFF_DATA_DIR}. Check that files exist and path is correct.")

def robust_read_csv(path, header_scan_lines=40):
    """Read a CSV file that may start with metadata lines before the real header.

    The first ``header_scan_lines`` lines are scanned for the first line that
    contains the text ``Location`` or at least two commas. That line is treated
    as the header and every line before it is skipped.

    Parameters
    ----------
    path : str or os.PathLike
        File to read. The header scan opens it as UTF-8 text.
    header_scan_lines : int, default 40
        How many leading lines are inspected when looking for the header.

    Returns
    -------
    pandas.DataFrame
        The parsed table.

    Raises
    ------
    Exception
        Errors from ``open`` during the header scan, or from the final
        ``pandas.read_csv`` attempt, propagate to the caller.
    """
    header_idx = None
    with open(path, 'r', encoding='utf-8') as fh:
        # zip() ends after header_scan_lines lines or at EOF, whichever comes first.
        for line_no, line in zip(range(header_scan_lines), fh):
            # Header heuristic: mentions Location, or looks like a 3+ column row.
            if 'Location' in line or line.count(',') >= 2:
                header_idx = line_no
                break

    # A header below line 0 means there are leading metadata lines to skip.
    if header_idx:
        return pd.read_csv(path, skiprows=header_idx, low_memory=False)

    # Header on the first line, or not detected at all: try a plain read first.
    try:
        return pd.read_csv(path, low_memory=False)
    except Exception:
        # Fall back to a tab-delimited parse with the python engine.
        # `low_memory` is deliberately not passed: pandas rejects that option
        # for engine='python' with a ValueError, which would make this fallback
        # fail unconditionally and hide the original parsing error.
        # NOTE(review): a file without tabs will come back as a single column —
        # callers may want to sanity-check the column count.
        return pd.read_csv(path, sep='\t', engine='python')

# Parse each discovered file; successes go into kff_data, failures into errors,
# both keyed by the bare file name so one bad file does not stop the run.
for file_path in data_files:
    file_name = file_path.name
    print(f"Reading: {file_name}...", end=' ')
    try:
        df = robust_read_csv(file_path)
    except Exception as e:
        errors[file_name] = str(e)
        print(f"✗ Failed: {e}")
    else:
        kff_data[file_name] = df
        print(f"✓ Success (shape: {df.shape})")

print(f"\nTotal: Successfully read {len(kff_data)} files, {len(errors)} failed")

Found 9 CSV files to read (sample: [PosixPath('2433_p3_data/KFF_data/raw_data_2018.csv'), PosixPath('2433_p3_data/KFF_data/raw_data_2019.csv'), PosixPath('2433_p3_data/KFF_data/raw_data_2020.csv'), PosixPath('2433_p3_data/KFF_data/raw_data_2021.csv'), PosixPath('2433_p3_data/KFF_data/raw_data_2022.csv')] )
Reading: raw_data_2018.csv... ✓ Success (shape: (56, 5))
Reading: raw_data_2019.csv... ✓ Success (shape: (56, 5))
Reading: raw_data_2020.csv... ✓ Success (shape: (56, 5))
Reading: raw_data_2021.csv... ✓ Success (shape: (56, 5))
Reading: raw_data_2022.csv... ✓ Success (shape: (56, 5))
Reading: raw_data_2023.csv... ✓ Success (shape: (56, 5))
Reading: raw_data_2024.csv... ✓ Success (shape: (56, 5))
Reading: raw_data_2025.csv... ✓ Success (shape: (56, 5))
Reading: raw_data_2026.csv... ✓ Success (shape: (56, 5))

Total: Successfully read 9 files, 0 failed


## Data Overview Summary

In [48]:
# Build one summary record per file: dimensions, missing-cell statistics and
# in-memory footprint (rate and memory are stored as formatted strings).
summary_data = []

for file_name, df in sorted(kff_data.items()):
    n_rows, n_cols = df.shape
    missing_cells = df.isnull().sum().sum()
    memory_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
    summary_data.append({
        'Filename': file_name,
        'Rows': n_rows,
        'Columns': n_cols,
        'Total Missing': missing_cells,
        'Missing Rate(%)': f"{(missing_cells / (n_rows * n_cols) * 100):.2f}%",
        'Memory(MB)': f"{memory_mb:.2f}"
    })

summary_df = pd.DataFrame(summary_data)

# Banner, the table itself, then two aggregate figures across all files.
banner = "=" * 100
print(banner)
print("KFF Data Files Summary")
print(banner)
display(summary_df)

total_rows = summary_df['Rows'].sum()
mean_columns = summary_df['Columns'].mean()
print(f"\nTotal rows: {total_rows:,}")
print(f"Average columns: {mean_columns:.1f}")

KFF Data Files Summary


Unnamed: 0,Filename,Rows,Columns,Total Missing,Missing Rate(%),Memory(MB)
0,raw_data_2018.csv,56,5,16,5.71%,0.01
1,raw_data_2019.csv,56,5,16,5.71%,0.01
2,raw_data_2020.csv,56,5,16,5.71%,0.01
3,raw_data_2021.csv,56,5,16,5.71%,0.01
4,raw_data_2022.csv,56,5,16,5.71%,0.01
5,raw_data_2023.csv,56,5,16,5.71%,0.01
6,raw_data_2024.csv,56,5,16,5.71%,0.01
7,raw_data_2025.csv,56,5,16,5.71%,0.01
8,raw_data_2026.csv,56,5,16,5.71%,0.01



Total rows: 504
Average columns: 5.0


## Check Data Consistency

In [49]:
# Compare every file's set of column names against the first file's.
# Sets are used, so column order and duplicate names are not part of the check.
if kff_data:
    first_file = next(iter(kff_data))
    first_columns = set(kff_data[first_file].columns)

    print("Column structure consistency check:")
    all_same = True

    for file_name, df in kff_data.items():
        current_columns = set(df.columns)
        missing = first_columns - current_columns
        extra = current_columns - first_columns

        # Both differences empty <=> the two column sets are equal.
        if not (missing or extra):
            print(f"  ✓ {file_name}: Consistent column structure")
            continue

        all_same = False
        print(f"  ✗ {file_name}: Different column structure")
        if missing:
            print(f"    Missing columns: {missing}")
        if extra:
            print(f"    Extra columns: {extra}")

    if not all_same:
        print("\n⚠ Files have inconsistent column structure, alignment needed before merging.")
    else:
        print("\n✓ All files have consistent column structure, safe to merge!")

Column structure consistency check:
  ✓ raw_data_2018.csv: Consistent column structure
  ✓ raw_data_2019.csv: Consistent column structure
  ✓ raw_data_2020.csv: Consistent column structure
  ✓ raw_data_2021.csv: Consistent column structure
  ✓ raw_data_2022.csv: Consistent column structure
  ✓ raw_data_2023.csv: Consistent column structure
  ✓ raw_data_2024.csv: Consistent column structure
  ✓ raw_data_2025.csv: Consistent column structure
  ✓ raw_data_2026.csv: Consistent column structure

✓ All files have consistent column structure, safe to merge!


## Optional: Merge All Years Data

In [54]:
# Stack every per-year frame into one table, tagging each row with the year
# parsed from its source file name.
# NOTE(review): this cell does not look at the result of the consistency check;
# it concatenates whatever is in kff_data.
import re

try:
    df_list = []
    for file_name, df in kff_data.items():
        # First run of four digits in the name, e.g. "raw_data_2020.csv" -> 2020;
        # None when the name contains no such run.
        year_match = re.search(r'(\d{4})', file_name)
        year_value = int(year_match.group(1)) if year_match else None

        # Work on a copy so the frames stored in kff_data stay untouched.
        df_copy = df.copy()
        df_copy['Year'] = year_value
        df_list.append(df_copy)

    combined_df = pd.concat(df_list, ignore_index=True)
    n_rows, n_cols = combined_df.shape
    print(f"✓ Successfully merged all files")
    print(f"  Combined shape: {n_rows:,} rows × {n_cols} columns")

    # Rows contributed by each year, in chronological order.
    if 'Year' in combined_df.columns:
        print(f"\nYear distribution:")
        print(combined_df['Year'].value_counts().sort_index())

    # Publish the merged table under the notebook-level name used by later cells.
    kff_combined = combined_df
    print("\n✓ Merged data saved to variable 'kff_combined'")
    print("  Note: 'Year' column added to identify data source year")

except Exception as e:
    print(f"✗ Merge failed: {e}")
    print("  Files may have inconsistent column structure, check column structure first.")

✓ Successfully merged all files
  Combined shape: 504 rows × 6 columns

Year distribution:
Year
2018    56
2019    56
2020    56
2021    56
2022    56
2023    56
2024    56
2025    56
2026    56
Name: count, dtype: int64

✓ Merged data saved to variable 'kff_combined'
  Note: 'Year' column added to identify data source year


## Export Data (Optional)

In [55]:
# Write the merged table to the exports folder, provided the merge cell has
# already created `kff_combined` in this session.
if 'kff_combined' not in globals():
    print("⚠ kff_combined not found. Please run the merge cell first.")
else:
    output_dir = Path('2433_p3_data/KFF_data/exports')
    output_dir.mkdir(parents=True, exist_ok=True)

    output_file = output_dir.joinpath('kff_combined_2018_2026.csv')
    kff_combined.to_csv(output_file, index=False)

    # Size on disk in MB, measured from the file that was just written.
    file_size_mb = output_file.stat().st_size / 1024 / 1024
    n_rows, n_cols = kff_combined.shape
    print(f"✓ kff_combined exported to: {output_file}")
    print(f"  Rows: {n_rows:,}")
    print(f"  Columns: {n_cols}")
    print(f"  File size: {file_size_mb:.2f} MB")

✓ kff_combined exported to: 2433_p3_data/KFF_data/exports/kff_combined_2018_2026.csv
  Rows: 504
  Columns: 6
  File size: 0.02 MB


In [56]:
# Re-register the per-file dictionary under its notebook-level name and remind
# the reader how to reach the loaded data.
globals().update(kff_data=kff_data)
print(
    f"✓ All KFF data saved to variable 'kff_data' (dictionary type)",
    f"  Available files: {list(kff_data.keys())}",
    f"\nUsage examples:",
    "  - Access 2020 data: kff_data['raw_data_2020.csv']",
    "  - Access merged data: kff_combined (if merge has been executed)",
    sep="\n",
)

✓ All KFF data saved to variable 'kff_data' (dictionary type)
  Available files: ['raw_data_2018.csv', 'raw_data_2019.csv', 'raw_data_2020.csv', 'raw_data_2021.csv', 'raw_data_2022.csv', 'raw_data_2023.csv', 'raw_data_2024.csv', 'raw_data_2025.csv', 'raw_data_2026.csv']

Usage examples:
  - Access 2020 data: kff_data['raw_data_2020.csv']
  - Access merged data: kff_combined (if merge has been executed)


In [57]:
# Read the exported merged CSV back from disk and preview its first 5 rows,
# confirming that the export round-trips.
from pathlib import Path
combined_path = Path('2433_p3_data/KFF_data/exports/kff_combined_2018_2026.csv')

if not combined_path.exists():
    print(f"⚠ File not found: {combined_path}. If you haven't exported it yet, run the export cell above.")
else:
    df_combined = pd.read_csv(combined_path)
    print(f"✓ Loaded: {combined_path} (shape: {df_combined.shape})")
    display(df_combined.head(5))

✓ Loaded: 2433_p3_data/KFF_data/exports/kff_combined_2018_2026.csv (shape: (504, 6))


Unnamed: 0,Location,Average Lowest-Cost Bronze Premium,Average Lowest-Cost Silver Premium,Average Benchmark Premium,Average Lowest-Cost Gold Premium,Year
0,United States,$341,$456,$481,$526,2018
1,Alabama,$354,$516,$558,$583,2018
2,Alaska,$540,$699,$726,$779,2018
3,Arizona,$397,$487,$516,$629,2018
4,Arkansas,$296,$340,$364,$409,2018
