# Health Insurance Member Risk Scoring Model

## Data Disclaimer
**All data, statistics, and examples in this notebook are synthetic and created for educational demonstration purposes only. No real member data, proprietary insurance company information, or actual healthcare outcomes are used.**

---

## Overview
This notebook demonstrates comprehensive health insurance member risk scoring using machine learning techniques to predict high-cost healthcare utilization and identify members requiring intervention.

## Business Applications
- **Care Management Targeting** for high-risk member identification
- **Cost Containment Strategies** through predictive analytics
- **Resource Allocation** based on risk stratification
- **Preventive Care Programs** for at-risk populations

## Key Methods
1. **Machine Learning Models** - Random Forest and Gradient Boosting ensembles, alongside Logistic Regression
2. **Feature Engineering** - Derived risk factors and utilization metrics
3. **Risk Stratification** - Multi-tier risk classification system
4. **Business Impact Analysis** - Cost concentration and intervention targeting

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
# Silence every warning category so the notebook output stays uncluttered.
warnings.filterwarnings('ignore')

# Seed NumPy's global generator so the synthetic data is identical on each run.
np.random.seed(42)

# Startup banner, printed one entry per line.
for banner_line in (
    "Health Insurance Risk Scoring Model",
    "=" * 50,
    "Synthetic Data Only - No Real Member Information",
    "Libraries imported successfully!",
):
    print(banner_line)

## 1. Synthetic Data Generation

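We'll create synthetic health insurance member data (demographics, chronic conditions, utilization metrics, and a binary `high_risk` label) that mimics realistic risk factors and utilization patterns: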

In [None]:
# Generate synthetic data representative of typical insurance claims data
def _age_banded_probability(ages, first_cutoff, second_cutoff, probabilities):
    """Return a per-member probability selected by age band.

    Members younger than ``first_cutoff`` receive ``probabilities[0]``,
    members younger than ``second_cutoff`` receive ``probabilities[1]``,
    and all remaining members receive ``probabilities[2]``.
    """
    youngest, middle, oldest = probabilities
    return np.where(ages < first_cutoff, youngest,
                    np.where(ages < second_cutoff, middle, oldest))


def generate_member_data(n_members=10000, *, random_state=None,
                         high_risk_percentile=85):
    """Generate a synthetic member-level dataset with a binary risk label.

    Parameters
    ----------
    n_members : int, default 10000
        Number of member rows to generate.
    random_state : None, int, or numpy.random.RandomState, optional
        Source of randomness. ``None`` (the default) draws from NumPy's
        global ``np.random`` state, so results then depend on any prior
        ``np.random.seed`` call. An int seeds a private ``RandomState``;
        an existing ``RandomState`` instance is used as-is.
    high_risk_percentile : float, default 85
        Percentile of the noisy risk score above which a member is labelled
        high risk (``high_risk == 1``).

    Returns
    -------
    pandas.DataFrame
        One row per member with the columns ``member_id``, ``age``,
        ``gender``, ``region``, ``plan_type``, ``employment_status``,
        ``prev_year_costs``, ``has_diabetes``, ``has_hypertension``,
        ``has_heart_disease``, ``has_copd``, ``has_mental_health``,
        ``primary_care_visits``, ``specialist_visits``, ``er_visits``,
        ``rx_count``, ``bmi``, ``is_smoker`` and ``high_risk``.
    """
    # Resolve the random source. The order of the draws below is kept fixed
    # so that a given seed always yields the same dataset.
    if random_state is None:
        rng = np.random
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Demographics
    ages = rng.normal(45, 18, n_members)
    ages = np.clip(ages, 18, 85).astype(int)

    genders = rng.choice(['M', 'F'], n_members, p=[0.48, 0.52])

    # Geographic regions (simplified)
    regions = rng.choice(['Northeast', 'Southeast', 'Midwest', 'West'],
                         n_members, p=[0.25, 0.28, 0.22, 0.25])

    # Plan types
    plan_types = rng.choice(['HMO', 'PPO', 'EPO', 'POS'],
                            n_members, p=[0.35, 0.40, 0.15, 0.10])

    # Employment status
    employment_status = rng.choice(
        ['Employed', 'Retired', 'Disabled', 'Unemployed'],
        n_members, p=[0.65, 0.20, 0.08, 0.07])

    # Previous year medical costs: log-normal draw, capped at 150,000
    prev_costs = rng.lognormal(7.5, 1.8, n_members)
    prev_costs = np.clip(prev_costs, 0, 150000)

    # Chronic conditions: Bernoulli draws with age-banded probabilities
    has_diabetes = rng.binomial(
        1, _age_banded_probability(ages, 40, 65, (0.05, 0.12, 0.25)))
    has_hypertension = rng.binomial(
        1, _age_banded_probability(ages, 40, 65, (0.08, 0.25, 0.45)))
    has_heart_disease = rng.binomial(
        1, _age_banded_probability(ages, 50, 70, (0.02, 0.08, 0.18)))
    has_copd = rng.binomial(
        1, _age_banded_probability(ages, 50, 70, (0.01, 0.05, 0.12)))
    has_mental_health = rng.binomial(
        1, _age_banded_probability(ages, 30, 60, (0.15, 0.12, 0.08)))

    # Healthcare utilization: Poisson counts whose rates rise with conditions
    primary_care_visits = rng.poisson(3, n_members)
    specialist_visits = rng.poisson(
        2 + has_diabetes + has_heart_disease + has_copd, n_members)
    er_visits = rng.poisson(
        0.5 + 0.3 * has_diabetes + 0.4 * has_heart_disease, n_members)

    # Pharmacy utilization
    rx_count = rng.poisson(
        5 + 3 * has_diabetes + 2 * has_hypertension +
        2 * has_heart_disease + has_mental_health, n_members)

    # BMI, clipped to the range [16, 50]
    bmi = rng.normal(28, 6, n_members)
    bmi = np.clip(bmi, 16, 50)

    # Smoking status (age-banded probability)
    is_smoker = rng.binomial(
        1, _age_banded_probability(ages, 30, 65, (0.18, 0.15, 0.08)))

    # Latent risk score: weighted sum of cost, age, conditions and utilization
    risk_score = (
        0.3 * (prev_costs / 10000) +
        0.2 * (ages / 100) +
        0.15 * has_diabetes +
        0.12 * has_heart_disease +
        0.08 * has_copd +
        0.05 * (er_visits / 5) +
        0.05 * is_smoker +
        0.03 * (bmi - 25) / 10 +
        0.02 * (rx_count / 10)
    )

    # Add Gaussian noise, then label members above the chosen percentile
    risk_score += rng.normal(0, 0.1, n_members)
    threshold = np.percentile(risk_score, high_risk_percentile)
    high_risk = (risk_score > threshold).astype(int)

    # Assemble the member-level table
    data = pd.DataFrame({
        'member_id': range(1, n_members + 1),
        'age': ages,
        'gender': genders,
        'region': regions,
        'plan_type': plan_types,
        'employment_status': employment_status,
        'prev_year_costs': prev_costs,
        'has_diabetes': has_diabetes,
        'has_hypertension': has_hypertension,
        'has_heart_disease': has_heart_disease,
        'has_copd': has_copd,
        'has_mental_health': has_mental_health,
        'primary_care_visits': primary_care_visits,
        'specialist_visits': specialist_visits,
        'er_visits': er_visits,
        'rx_count': rx_count,
        'bmi': bmi,
        'is_smoker': is_smoker,
        'high_risk': high_risk
    })

    return data

# Build the demonstration dataset and report its size and class balance
print("Generating synthetic member data...")
df = generate_member_data(n_members=10000)

high_risk_flags = df['high_risk']
print(f"Dataset created with {len(df)} members")
print(f"High-risk members: {high_risk_flags.sum()} ({high_risk_flags.mean()*100:.1f}%)")
print("\nDataset Overview:")
# Last expression in the cell: the notebook renders the first rows as a table
df.head()