In [None]:
import datetime
import random

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from faker import Faker

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
# create the 100000 sample dataset
def generate_fake_claims(n=10000):
    fake = Faker()
    data = []
    
    for _ in range(n):
        admit_date = fake.date_between(start_date='-2y', end_date='-1y')
        discharge_date = admit_date + datetime.timedelta(days=random.randint(1, 10))
        entry_date = discharge_date + datetime.timedelta(days=random.randint(1, 15))
        update_date = entry_date + datetime.timedelta(days=random.randint(1, 15))
        
        trans_amount = round(random.uniform(1000, 10000), 2)
        pay_amount = round(trans_amount * random.uniform(0.3, 0.9), 2)
        rev_amount = trans_amount - pay_amount
        delay = (update_date - entry_date).days > 10
        notes = fake.sentence(nb_words=10) + (" missing documents" if delay else " complete file")

        data.append({
            "AccountType": random.choice(["Inpatient", "Outpatient"]),
            "CurrentPrimaryInsurance": random.choice(["Insurer A", "Insurer B", "Insurer C"]),
            "CurrentFinancialClass": random.choice(["Private", "Govt", "Self-pay"]),
            "TransactionInsurance": random.choice(["Yes", "No"]),
            "BillType": random.choice(["Type A", "Type B", "Type C"]),
            "ClaimDeptResponsible": random.choice(["Cardiology", "Radiology", "Billing"]),
            "TransAmount": trans_amount,
            "PAYAmount": pay_amount,
            "REVAmount": rev_amount,
            "Notes": notes,
            "IsDelayed": int(delay)
        })
    return pd.DataFrame(data)

df = generate_fake_claims(100000)
df.head()
df["Notes"]

In [None]:
# the feature are the input data to the ml models

features = [
    "AccountType", "CurrentPrimaryInsurance", "CurrentFinancialClass",
    "TransactionInsurance", "BillType", "ClaimDeptResponsible",
    "TransAmount", "PAYAmount", "REVAmount", "Notes"
]

X = df[features]  # Features

# the output weather it was delay are not
y = df["IsDelayed"]  # Target


In [None]:
# columntransformer this will trainformthe colum to require colum type

preprocessor = ColumnTransformer([
    # stabdardsclare it was represending the numerical values
    ("num", StandardScaler(), ["TransAmount", "PAYAmount", "REVAmount"]),
    # when ever we use like option for the colums we can use the onwhorencoder
    ("cat", OneHotEncoder(), [
        "AccountType", "CurrentPrimaryInsurance", "CurrentFinancialClass",
        "TransactionInsurance", "BillType", "ClaimDeptResponsible"
    ]),
    #it will turn the nodes into the numicraical values
    ("txt", TfidfVectorizer(max_features=50), "Notes")
])


In [None]:
# Define and Train Models

# in the pipline we cotain the two main thing preproeceeor it will contain the streucture coilum we process the data in step 4 and classider contain the model to tarin

xgb_model = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42))
])

# it will contain the 80% of the data
xgb_model.fit(X_train, y_train)


In [None]:
# for randome forret method

rf_model = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=100, random_state=42))
])
rf_model.fit(X_train, y_train)


In [None]:
# for  logical regression method

log_model = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=500))
])
log_model.fit(X_train, y_train)


In [None]:
models = {
    "XGBoost": xgb_model,
    "Random Forest": rf_model,
    "Logistic Regression": log_model
}

for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"\n==== {name} ====")
    print(classification_report(y_test, y_pred))


In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

y_pred = xgb_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["On-Time", "Delayed"])
disp.plot(cmap='Blues')
plt.title("XGBoost - Confusion Matrix")
plt.show()


In [None]:
import joblib

# Saving the trained model (XGBoost, Random Forest, etc.)
joblib.dump(xgb_model, 'sample_model_xgboost.pkl')  # You can name your file whatever you want
