In [None]:
# === STEP 1: Imports ===
import pandas as pd
import numpy as np
from datetime import timedelta
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report
import lightgbm as lgb

# === STEP 2: Load Data ===
# Expected columns: member_id, service_date, diagnosis_code, gender, dob
CLAIMS_CSV = "claims_data.csv"
claims = pd.read_csv(CLAIMS_CSV)



In [None]:

# === STEP 3: Preprocess Dates & Demographics ===
# Exclusive upper bound on age (in years) at time of service.
MAX_AGE = 100

claims['service_date'] = pd.to_datetime(claims['service_date'])
claims['dob'] = pd.to_datetime(claims['dob'])

# Age in completed calendar years on the service date. Dividing a day count
# by 365 ignores leap days and rounds people up to their next age shortly
# before a birthday, so compare calendar fields instead.
service_dt = claims['service_date'].dt
birth_dt = claims['dob'].dt
birthday_pending = (service_dt.month < birth_dt.month) | (
    (service_dt.month == birth_dt.month) & (service_dt.day < birth_dt.day)
)
claims['age'] = service_dt.year - birth_dt.year - birthday_pending.astype(int)

# Optional: cap age. Rows whose age is missing (unparseable/missing dates) are
# dropped as well, because NaN < MAX_AGE evaluates to False.
# .copy() makes the result an independent frame so that later column
# assignments do not raise SettingWithCopyWarning.
# NOTE(review): negative ages (dob after service_date) are still kept - confirm
# whether those rows should be excluded too.
claims = claims[claims['age'] < MAX_AGE].copy()


In [None]:
# === STEP 4: Define Chronic Condition (Label) ===
# ICD-10 category prefixes that define the condition of interest (diabetes).
DIABETES_CODES = {'E08', 'E09', 'E10', 'E11', 'E13'}

def has_diabetes(code):
    """Return True if ``code`` starts with one of the ``DIABETES_CODES`` prefixes.

    Parameters
    ----------
    code : object
        A diagnosis code. Strings are matched case-insensitively after
        stripping surrounding whitespace. Any non-string value (for example
        the NaN pandas uses for a missing cell, or None) is treated as
        "no diagnosis".

    Returns
    -------
    bool
        True when the normalized code begins with a diabetes prefix,
        otherwise False.
    """
    # Missing codes arrive as float NaN / None; they have no .startswith.
    if not isinstance(code, str):
        return False
    # str.startswith accepts a tuple of prefixes, so no explicit loop is needed.
    return code.strip().upper().startswith(tuple(DIABETES_CODES))

# Flag every claim row whose diagnosis code matches the condition definition.
claims['has_diabetes'] = claims['diagnosis_code'].map(has_diabetes)


In [None]:
# === STEP 5: Create Prediction Windows ===
# Define observation and prediction windows (lengths in calendar months)
obs_window_months = 12
pred_window_months = 6

# Earliest service date per member: one row per member_id.
min_dates = (
    claims.groupby('member_id')['service_date']
    .min()
    .rename('min_service_date')
    .reset_index()
)

# Attach each member's first service date to every claim row.
# Any 'min_service_date' column left from a previous run of this cell is
# dropped first; otherwise the merge would emit *_x / *_y suffixed columns and
# the lookup on the next line would raise KeyError when the cell is re-run.
claims = (
    claims
    .drop(columns=['min_service_date'], errors='ignore')
    .merge(min_dates, on='member_id')
)

# The index date closes the observation window and opens the prediction window.
claims['index_date'] = claims['min_service_date'] + pd.DateOffset(months=obs_window_months)
pred_end_date = claims['index_date'] + pd.DateOffset(months=pred_window_months)

# Observation window: everything up to and including the index date.
claims['in_obs_window'] = claims['service_date'] <= claims['index_date']
# Prediction window: strictly after the index date, up to and including its end.
claims['in_pred_window'] = (claims['service_date'] > claims['index_date']) & (
    claims['service_date'] <= pred_end_date)

In [None]:
# === STEP 6: Feature Engineering ===
# For each member, count diagnosis codes in observation window
obs_claims = claims[claims['in_obs_window']]

# One row per member: first non-null age and gender seen in the window.
demographics = obs_claims.groupby('member_id')[['age', 'gender']].first()

# LightGBM rejects object-dtype (string) columns but natively handles pandas
# 'category' columns, so encode gender when it is not already numeric/bool.
if not pd.api.types.is_numeric_dtype(demographics['gender']):
    demographics['gender'] = demographics['gender'].astype('category')

# Pivot table of diagnosis frequencies: rows = members, columns = codes.
dx_counts = obs_claims.groupby(['member_id', 'diagnosis_code']).size().unstack(fill_value=0)
# Members whose diagnosis codes are all missing are absent from the pivot;
# give them explicit zero counts (instead of NaN after the join) and keep the
# count columns integer-typed.
dx_counts = dx_counts.reindex(demographics.index, fill_value=0)

# NOTE(review): diabetes codes observed inside the observation window are
# included as features, and members already diagnosed there are not excluded -
# confirm this matches the intended prediction task.
features = demographics.join(dx_counts)

In [None]:
# === STEP 7: Create Labels ===
# A member is positive (1) when any claim inside the prediction window
# carries a diabetes diagnosis, otherwise 0.
pred_claims = claims.loc[claims['in_pred_window']]
diabetes_by_member = pred_claims.groupby('member_id')['has_diabetes']
labels = diabetes_by_member.max().fillna(0).astype(int)

In [None]:
# === STEP 8: Train/Test Split ===
X = features

# Align labels to the feature rows. Members with features but no claims in the
# prediction window are labelled 0. Passing fill_value (rather than
# reindex + fillna) avoids the intermediate NaN that would upcast the integer
# class labels to floats (0.0 / 1.0).
# NOTE(review): a member with no claims in the prediction window may simply
# lack follow-up data rather than be a true negative - confirm.
y = labels.reindex(features.index, fill_value=0)

# Stratified 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

In [None]:
# === STEP 9: Train Model ===
# Fit a LightGBM classifier with library-default hyperparameters;
# fit() returns the fitted estimator itself.
model = lgb.LGBMClassifier().fit(X_train, y_train)



In [None]:
# === STEP 10: Evaluate ===
# Second column of predict_proba is taken as the positive-class score.
class_proba = model.predict_proba(X_test)
y_pred = class_proba[:, 1]

# Threshold-free ranking quality.
auc = roc_auc_score(y_test, y_pred)
print("ROC AUC:", auc)

# Hard predictions at a 0.5 cut-off for the per-class report.
y_pred_class = (y_pred > 0.5).astype(int)
print(classification_report(y_test, y_pred_class))