In [None]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Display tuning for wide tables: render up to 300 columns and let text
# output span 200 characters before wrapping.
pd.options.display.max_columns = 300
pd.options.display.width = 200

In [None]:
# ---------------------------
# 0. Configuration
# ---------------------------
# All input CSVs are resolved against DATA_DIR, which is itself relative to
# the working directory the notebook is run from.
DATA_DIR = Path("data")
BENEFICIARY_FILE = DATA_DIR.joinpath("Train_Beneficiarydata.csv")
INPATIENT_FILE = DATA_DIR.joinpath("Train_Inpatientdata.csv")
OUTPATIENT_FILE = DATA_DIR.joinpath("Train_Outpatientdata.csv")
LABELS_FILE = DATA_DIR.joinpath("Train_labels.csv")

In [None]:
# ---------------------------
# 1. Helpers
# ---------------------------
def safe_read_csv(path):
    """Read a CSV into a DataFrame, failing early with a clear message.

    Parameters
    ----------
    path : str | os.PathLike
        Location of the CSV file. Plain strings are accepted as well as
        ``pathlib.Path`` objects.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents (default ``pd.read_csv`` settings).

    Raises
    ------
    FileNotFoundError
        If nothing exists at ``path``.
    """
    # Coerce up front: the checks below rely on Path-only attributes
    # (.exists(), .name), which would raise AttributeError on a plain string.
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")
    df = pd.read_csv(path)
    print(f"Loaded '{path.name}' with shape {df.shape}")
    return df

def parse_dates(df, cols):
    """Convert the listed columns of ``df`` to datetimes, in place.

    Names in ``cols`` that are not columns of ``df`` are skipped. Values that
    cannot be parsed become ``NaT`` (``errors='coerce'``). Nothing is
    returned; the caller's DataFrame is modified directly.
    """
    present = [col_name for col_name in cols if col_name in df.columns]
    for col_name in present:
        converted = pd.to_datetime(df[col_name], errors='coerce')
        df[col_name] = converted

def print_missing_summary(df, name, top_n=10):
    """Print the columns of ``df`` with the highest share of missing values.

    The share is expressed as a percentage of rows, sorted from most to least
    missing, limited to the first ``top_n`` columns and rounded to two
    decimals. ``name`` only labels the printed header.
    """
    pct_missing = df.isnull().mean() * 100
    ranked = pct_missing.sort_values(ascending=False)
    worst = ranked.head(top_n)
    print(f"\nMissingness summary for {name} (top {top_n}):")
    print(worst.round(2))

def print_dup_counts(df, name):
    """Print how many rows of ``df`` are flagged as duplicates.

    ``name`` only labels the printed line.
    """
    # duplicated() yields a boolean mask; summing it counts the flagged rows.
    repeated_mask = df.duplicated()
    n_repeats = repeated_mask.sum()
    print(f"{name}: {n_repeats} duplicate rows")

def safe_info(df, name):
    """Print a compact health report for ``df``.

    The report consists of, in order: a header line containing ``name``, the
    condensed ``DataFrame.info`` summary, the missingness summary, and the
    duplicate-row count. Nothing is returned.
    """
    print(f"\n-- {name} info --")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so it must not be wrapped in print() (that would emit a stray "None").
    df.info(verbose=False)
    print_missing_summary(df, name)
    print_dup_counts(df, name)

In [None]:
# ---------------------------
# 2. Load Data
# ---------------------------
# Read the four source tables from the paths set in the configuration cell.
bene_df = safe_read_csv(BENEFICIARY_FILE)
inp_df = safe_read_csv(INPATIENT_FILE)
out_df = safe_read_csv(OUTPATIENT_FILE)
prov_df = safe_read_csv(LABELS_FILE)

# Quick inspection: first two rows of every table, in load order
for sample_title, sample_frame in (
    ("Beneficiary", bene_df),
    ("Inpatient", inp_df),
    ("Outpatient", out_df),
    ("Labels", prov_df),
):
    print(f"\n{sample_title} sample:")
    display(sample_frame.head(2))


In [None]:
# ---------------------------
# 3. Basic Validation & Date Parsing
# ---------------------------
# One shared list of candidate date columns is applied to every table; any
# name a given frame does not contain is simply skipped by parse_dates.
# Per the original note, inpatient/outpatient carry ClaimStartDt, ClaimEndDt,
# AdmissionDt and DischargeDt. DOB/DOD are presumably beneficiary columns
# (TODO confirm against the files).
date_cols_common = ['ClaimStartDt', 'ClaimEndDt', 'AdmissionDt', 'DischargeDt', 'DOB', 'DOD']
for dated_frame in (bene_df, inp_df, out_df):
    parse_dates(dated_frame, date_cols_common)

# Structural summary, missingness and duplicate counts for each table
for report_frame, report_label in (
    (bene_df, "Beneficiary"),
    (inp_df, "Inpatient"),
    (out_df, "Outpatient"),
    (prov_df, "Labels"),
):
    safe_info(report_frame, report_label)