In [1]:
import pandas as pd

# Load CSV
claim_df = pd.read_csv("Inpatient_Claim.csv")

# Show column names
print(claim_df.columns)

# View sample rows
claim_df[['CLM_PMT_AMT', 'CLM_DRG_CD', 'CLM_FROM_DT', 'CLM_THRU_DT']].head()


Index(['DESYNPUF_ID', 'CLM_ID', 'SEGMENT', 'CLM_FROM_DT', 'CLM_THRU_DT',
       'PRVDR_NUM', 'CLM_PMT_AMT', 'NCH_PRMRY_PYR_CLM_PD_AMT', 'AT_PHYSN_NPI',
       'OP_PHYSN_NPI', 'OT_PHYSN_NPI', 'CLM_ADMSN_DT', 'ADMTNG_ICD9_DGNS_CD',
       'CLM_PASS_THRU_PER_DIEM_AMT', 'NCH_BENE_IP_DDCTBL_AMT',
       'NCH_BENE_PTA_COINSRNC_LBLTY_AM', 'NCH_BENE_BLOOD_DDCTBL_LBLTY_AM',
       'CLM_UTLZTN_DAY_CNT', 'NCH_BENE_DSCHRG_DT', 'CLM_DRG_CD',
       'ICD9_DGNS_CD_1', 'ICD9_DGNS_CD_2', 'ICD9_DGNS_CD_3', 'ICD9_DGNS_CD_4',
       'ICD9_DGNS_CD_5', 'ICD9_DGNS_CD_6', 'ICD9_DGNS_CD_7', 'ICD9_DGNS_CD_8',
       'ICD9_DGNS_CD_9', 'ICD9_DGNS_CD_10', 'ICD9_PRCDR_CD_1',
       'ICD9_PRCDR_CD_2', 'ICD9_PRCDR_CD_3', 'ICD9_PRCDR_CD_4',
       'ICD9_PRCDR_CD_5', 'ICD9_PRCDR_CD_6', 'HCPCS_CD_1', 'HCPCS_CD_2',
       'HCPCS_CD_3', 'HCPCS_CD_4', 'HCPCS_CD_5', 'HCPCS_CD_6', 'HCPCS_CD_7',
       'HCPCS_CD_8', 'HCPCS_CD_9', 'HCPCS_CD_10', 'HCPCS_CD_11', 'HCPCS_CD_12',
       'HCPCS_CD_13', 'HCPCS_CD_14', 'HCPCS_CD_15', 

Unnamed: 0,CLM_PMT_AMT,CLM_DRG_CD,CLM_FROM_DT,CLM_THRU_DT
0,4000.0,217,20100312.0,20100313.0
1,26000.0,201,20090412.0,20090418.0
2,5000.0,750,20090831.0,20090902.0
3,5000.0,883,20090917.0,20090920.0
4,16000.0,983,20100626.0,20100701.0


In [2]:
# Convert date columns
claim_df['CLM_FROM_DT'] = pd.to_datetime(claim_df['CLM_FROM_DT'], format='%Y%m%d', errors='coerce')
claim_df['CLM_THRU_DT'] = pd.to_datetime(claim_df['CLM_THRU_DT'], format='%Y%m%d', errors='coerce')

# Create new feature
claim_df['stay_length'] = (claim_df['CLM_THRU_DT'] - claim_df['CLM_FROM_DT']).dt.days

# Define features and target
features = ['CLM_UTLZTN_DAY_CNT', 'CLM_DRG_CD', 'NCH_BENE_IP_DDCTBL_AMT',
            'NCH_BENE_PTA_COINSRNC_LBLTY_AM', 'NCH_BENE_BLOOD_DDCTBL_LBLTY_AM', 'stay_length']

df_model = claim_df[features + ['CLM_PMT_AMT']].dropna()
df_model['CLM_DRG_CD'] = df_model['CLM_DRG_CD'].astype(str)
df_model = pd.get_dummies(df_model, columns=['CLM_DRG_CD'])


In [3]:
# Step 4: Train the Random Forest Regressor (Faster Version)

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Split into features (X) and target (y)
X = df_model.drop('CLM_PMT_AMT', axis=1)
y = df_model['CLM_PMT_AMT']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ✅ Use a smaller sample of data for faster training
X_small = X_train[:3000]
y_small = y_train[:3000]

# ✅ Use fewer trees in the Random Forest
model = RandomForestRegressor(n_estimators=10, random_state=42)

# Train the model
model.fit(X_small, y_small)

# Evaluate performance
preds = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print(f"Fast RMSE: {rmse:.2f}")


Fast RMSE: 9637.23


In [4]:
def predict_claim_amount(input_features_dict):
    # Convert the dictionary to DataFrame
    input_df = pd.DataFrame([input_features_dict])

    # One-hot encode just like training
    input_df = pd.get_dummies(input_df)

    # Align columns to training data
    input_df = input_df.reindex(columns=X.columns, fill_value=0)

    # Predict using trained model
    return model.predict(input_df)[0]


In [5]:
import random

def analyze_claim(claim):
    risk_score = claim['amount'] / 10000
    if claim['type'] == 'inpatient':
        risk_score += 1.5
    return min(risk_score, 10)

def predict_costs(claim, risk_score):
    legal_cost = 1000 + 200 * risk_score
    customer_impact_cost = (10 - risk_score) * 100
    return legal_cost, customer_impact_cost

def optimize_settlement(claim, legal_cost, customer_cost):
    litigation_risk = random.uniform(0.3, 0.8)
    expected_litigation_cost = legal_cost + (litigation_risk * claim['amount'])
    recommended_settlement = claim['amount'] * 0.6
    if expected_litigation_cost > recommended_settlement + customer_cost:
        return "Settle", recommended_settlement, expected_litigation_cost
    else:
        return "Litigate", 0, expected_litigation_cost

def recommend_action(claim):
    risk = analyze_claim(claim)
    legal_cost, customer_cost = predict_costs(claim, risk)
    action, settlement_offer, expected_cost = optimize_settlement(claim, legal_cost, customer_cost)
    return {
        "Claim Amount": round(claim['amount'], 2),
        "Claim Type": claim['type'],
        "Risk Score": round(risk, 2),
        "Legal Cost Estimate": round(legal_cost, 2),
        "Customer Impact Cost": round(customer_cost, 2),
        "Expected Litigation Cost": round(expected_cost, 2),
        "Recommended Action": action,
        "Settlement Offer": round(settlement_offer, 2) if action == "Settle" else "N/A"
    }


In [6]:
# Take a sample claim from test set
sample_input = X_test.iloc[0].to_dict()

# Predict claim amount using ML model
predicted_amt = predict_claim_amount(sample_input)

# Run settlement recommendation
claim_data = {
    'amount': predicted_amt,
    'type': 'inpatient'
}

print("\n=== AI-Powered Settlement Recommendation ===")
result = recommend_action(claim_data)
for key, value in result.items():
    print(f"{key}: {value}")



=== AI-Powered Settlement Recommendation ===
Claim Amount: 20700.0
Claim Type: inpatient
Risk Score: 3.57
Legal Cost Estimate: 1714.0
Customer Impact Cost: 643.0
Expected Litigation Cost: 13761.53
Recommended Action: Settle
Settlement Offer: 12420.0
