In [1]:
import pandas as pd
# Load the combined, cleaned dataset from its local CSV export.
final_df = pd.read_csv(
    filepath_or_buffer=r"C:\Users\Anuj Bohra\Desktop\ArogoAI\Severity\combined_cleaned_df_final.csv",
)

In [2]:
# Remove the index column that an earlier export wrote into the CSV.
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because 'Unnamed: 0' has already been dropped.
final_df = final_df.drop(columns='Unnamed: 0', errors='ignore')
from sklearn.preprocessing import LabelEncoder

# Integer-encode the gender column in place.
gender_encoder = LabelEncoder()

final_df['gender'] = gender_encoder.fit_transform(final_df['gender'])
# Keep only the rows whose diagnosis uses ICD version 9.
df_icd9 = final_df[final_df['icd_version'] == 9]

from sklearn.preprocessing import OneHotEncoder

# One-hot encode the ICD code; the generated columns are named
# "icd_code_<code>" and are used as the classification target.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
icd_encoded = encoder.fit_transform(df_icd9[['icd_code']])
icd_encoded_df = pd.DataFrame(
    icd_encoded,
    columns=encoder.get_feature_names_out(['icd_code'])
)
# Features: every remaining column except the identifiers and the target code.
# NOTE(review): 'icd_version' is not dropped here, so it stays in X as a
# constant column after the filter above - confirm that matches training.
X = df_icd9.drop(['hadm_id', 'icd_code', 'subject_id'], axis=1)
y = icd_encoded_df
# Name of the hot column for each row, e.g. "icd_code_<code>".
y_labels = y.idxmax(axis=1)
X.columns = X.columns.astype(str)

from sklearn.preprocessing import StandardScaler

# NOTE(review): the scaler is re-fit on this data rather than loaded from the
# training run - confirm the pre-trained models expect exactly this scaling.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

import numpy as np
# Integer class index per row (position of the hot column in y).
y_class = np.argmax(y.values, axis=1)

In [4]:
import numpy as np
import joblib
import torch
import torch.nn as nn
import torch.nn.functional as F

class SimpleNet(nn.Module):
    """Fully connected classifier: four Linear+ReLU stages followed by a Linear output layer.

    The layer attribute names (fc1..fc5, relu1..relu4) are kept as-is because
    they determine the state_dict keys used when loading saved weights.

    Args:
        input_size: Number of input features per sample.
        hidden1, hidden2, hidden3, hidden4: Widths of the four hidden layers.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size=25, hidden1=256, hidden2=512, hidden3=256, hidden4=128, num_classes=1203):
        super().__init__()
        # Hidden stack: each Linear is paired with its own ReLU module.
        self.fc1, self.relu1 = nn.Linear(input_size, hidden1), nn.ReLU()
        self.fc2, self.relu2 = nn.Linear(hidden1, hidden2), nn.ReLU()
        self.fc3, self.relu3 = nn.Linear(hidden2, hidden3), nn.ReLU()
        self.fc4, self.relu4 = nn.Linear(hidden3, hidden4), nn.ReLU()
        # Output head producing one logit per class.
        self.fc5 = nn.Linear(hidden4, num_classes)

    def forward(self, x):
        """Return raw logits for ``x``; no softmax is applied here."""
        hidden_stages = (
            (self.fc1, self.relu1),
            (self.fc2, self.relu2),
            (self.fc3, self.relu3),
            (self.fc4, self.relu4),
        )
        for linear, activation in hidden_stages:
            x = activation(linear(x))
        return self.fc5(x)

# ------------------------------------------------------------------------------
# 2. Load the pre-trained models
# ------------------------------------------------------------------------------

# XGBoost classifier, restored with joblib.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code -
# only point this at model files you trust.
xgb_model_path = r'C:\Users\Anuj Bohra\Desktop\ArogoAI\Severity\Model\xgb_model.pkl'  # Update with your actual file path
loaded_xgb = joblib.load(xgb_model_path)

# PyTorch network: use the GPU when one is available, rebuild the
# architecture, restore the saved weights, then switch to inference mode.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
pytorch_model_path = r'C:\Users\Anuj Bohra\Desktop\ArogoAI\Severity\Model\simple_net_state_dict.pth'  # Update with your actual file path
pytorch_model = SimpleNet(input_size=25, num_classes=1203)
pytorch_model = pytorch_model.to(device)
pytorch_model.load_state_dict(
    torch.load(pytorch_model_path, map_location=device)
)
pytorch_model.eval()

# ------------------------------------------------------------------------------
# 3. Define the ensemble prediction functions
# ------------------------------------------------------------------------------

def _ensemble_probabilities(X, weights=None):
    """Weighted sum of the XGBoost and PyTorch class-probability matrices for X."""
    if weights is None:
        weights = [1/2, 1/2]
    weights = tuple(weights)
    if len(weights) != 2:
        # Fail with a clear message instead of a bare tuple-unpacking error.
        raise ValueError(
            f"weights must contain exactly two values [w_xgb, w_pt], got {len(weights)}"
        )
    w_xgb, w_pt = weights

    # XGBoost: probability distribution per sample.
    xgb_probs = loaded_xgb.predict_proba(X)  # Shape: (n_samples, num_classes)

    # PyTorch: logits -> probabilities. np.asarray lets array-likes other than
    # ndarray through torch.from_numpy as well.
    X_tensor = torch.from_numpy(np.asarray(X)).float().to(device)
    with torch.no_grad():
        outputs = pytorch_model(X_tensor)
        pt_probs = F.softmax(outputs, dim=1).cpu().numpy()  # Shape: (n_samples, num_classes)

    if xgb_probs.shape != pt_probs.shape:
        raise ValueError(
            f"Model outputs disagree in shape: XGBoost {xgb_probs.shape} vs PyTorch {pt_probs.shape}"
        )

    return w_xgb * xgb_probs + w_pt * pt_probs


def ensemble_predict(X, weights=None):
    """
    Returns the ensemble's top-1 prediction for each sample.
    This is useful for computing overall accuracy.

    Args:
        X (np.array): Input feature array of shape (n_samples, n_features)
        weights (list or tuple): Weights for the two models [w_xgb, w_pt].
                                 Defaults to equal weighting if None.

    Returns:
        final_preds (np.array): Array of shape (n_samples,) with the predicted class labels.

    Raises:
        ValueError: If ``weights`` does not contain exactly two values, or the
            two models return probability matrices of different shapes.
    """
    ensemble_probs = _ensemble_probabilities(X, weights)

    # Class with the highest combined probability for each sample.
    return np.argmax(ensemble_probs, axis=1)


def ensemble_topk(X, top_k=3, weights=None):
    """
    Returns the top-k predictions (class indices and their probabilities) for each sample.

    Args:
        X (np.array): Input feature array of shape (n_samples, n_features)
        top_k (int): Number of top predictions to return. Values larger than the
                     number of classes are clamped to the number of classes.
        weights (list or tuple): Weights for the two models [w_xgb, w_pt].
                                 Defaults to equal weighting if None.

    Returns:
        top_k_indices (np.array): Array of shape (n_samples, top_k) with top-k class indices,
                                  ordered from most to least probable.
        top_k_probs (np.array): Array of shape (n_samples, top_k) with corresponding probabilities.

    Raises:
        ValueError: If ``top_k`` is smaller than 1, if ``weights`` does not
            contain exactly two values, or the model outputs differ in shape.
    """
    if top_k < 1:
        # A slice of [-0:] would silently return every class instead of none.
        raise ValueError(f"top_k must be at least 1, got {top_k}")

    ensemble_probs = _ensemble_probabilities(X, weights)
    top_k = min(top_k, ensemble_probs.shape[1])

    # Ascending argsort: take the last k columns, then reverse to descending.
    top_k_indices = np.argsort(ensemble_probs, axis=1)[:, -top_k:][:, ::-1]
    top_k_probs = np.take_along_axis(ensemble_probs, top_k_indices, axis=1)

    return top_k_indices, top_k_probs

# ------------------------------------------------------------------------------
# 4. Example usage
# ------------------------------------------------------------------------------

# Assuming X_scaled is your NumPy array of input features:
# For instance:
# X_scaled = np.load('X_scaled.npy')

# # Get top-1 predictions (for accuracy computation)
# final_predictions = ensemble_predict(X_scaled, weights=[0.7, 0.3])
# print("Final predictions (top-1) for each sample:\n", final_predictions)

# # Get top-3 predictions (with probabilities)
# top_k_indices, top_k_probs = ensemble_topk(X_scaled, top_k=3, weights=[0.7, 0.3])
# print("Top-3 class indices for each sample:\n", top_k_indices)
# print("Corresponding probabilities for top-3 predictions:\n", top_k_probs)


NameError: name '_C' is not defined

In [None]:
# Top-1 ensemble class index for every row, weighting XGBoost 0.7 and the
# network 0.3; used for the accuracy computation.
final_predictions = ensemble_predict(X=X_scaled, weights=[0.7, 0.3])

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of rows whose top-1 ensemble prediction equals the true class index.
accuracy = accuracy_score(y_true=y_class, y_pred=final_predictions)
print(f"Accuracy: {accuracy:.2f}")

In [None]:
# Column names from position 5 onward.
# NOTE(review): presumably these are the lab-test feature columns - confirm
# the offset against the CSV layout.
tests = final_df.columns[5:].tolist()
tests

In [None]:
# Display the one-hot ICD target frame (one "icd_code_<code>" column per class).
y

In [None]:
# Recover the bare ICD codes by stripping the 9-character "icd_code_" prefix
# that the one-hot encoder put on every target column name.
icd_codes = [column_name[9:] for column_name in y.columns]
icd_codes

In [None]:
# ICD diagnosis dictionary, restricted to ICD version 9 entries.
icd_df = (
    pd.read_csv(r'C:\Users\Anuj Bohra\Desktop\ArogoAI\Severity\d_icd_diagnoses.csv')
    .loc[lambda frame: frame['icd_version'] == 9]
)

In [None]:
# Show the last rows of the ICD-9 dictionary as a quick sanity check.
icd_df.tail()

In [None]:
# Top-3 ensemble predictions per row: class indices and their combined
# probabilities, weighting XGBoost 0.7 and the network 0.3.
top_k_indices, top_k_probs = ensemble_topk(X=X_scaled, top_k=3, weights=[0.7, 0.3])

In [None]:
# Preview the combined probabilities of the top-k classes for the first five rows.
top_k_probs[:5]

In [None]:
def format_ensemble_predictions(top_k_indices, y_labels, icd_df, top_k_probs, max_samples=1):
    """
    Describe the top-k predicted diagnoses of each sample in natural language.

    Each predicted class index is mapped to its ICD code through ``y_labels``,
    the code is looked up in ``icd_df`` to obtain its long title, and the
    result is rendered as a ranked, human-readable list with probabilities.

    Args:
        top_k_indices (np.array): Array of shape (n_samples, top_k) with predicted class indices.
        y_labels (list): List of ICD code strings, mapping model output indices to ICD codes.
        icd_df (pd.DataFrame): DataFrame with columns "icd_code" and "long_title".
        top_k_probs (np.array): Array of shape (n_samples, top_k) with the probability
            of each entry in ``top_k_indices``.
        max_samples (int or None): Maximum number of leading samples to format.
            Defaults to 1, which matches the previous behaviour of describing
            only the first sample. Pass None to format every sample.

    Returns:
        List[str]: One formatted string per described sample; empty when there
        are no samples.

    Raises:
        ValueError: If ``max_samples`` is negative.
    """
    n_samples, top_k = top_k_indices.shape

    if max_samples is None:
        n_to_format = n_samples
    elif max_samples < 0:
        raise ValueError(f"max_samples must be non-negative or None, got {max_samples}")
    else:
        # Never index past the available rows (also covers empty input).
        n_to_format = min(n_samples, max_samples)

    # Build the code -> title lookup once instead of scanning the frame for
    # every predicted code. setdefault keeps the first title for a duplicated
    # code, which is the row a filtered `.iloc[0]` would have returned.
    title_by_code = {}
    for known_code, known_title in zip(icd_df['icd_code'], icd_df['long_title']):
        title_by_code.setdefault(known_code, known_title)

    formatted_outputs = []
    for i in range(n_to_format):
        # Convert predicted indices to ICD codes.
        predicted_codes = [y_labels[idx] for idx in top_k_indices[i]]
        titles = [title_by_code.get(code, "Unknown condition") for code in predicted_codes]

        # Format a natural language string.
        formatted_str = f"Sample {i+1}: The top {top_k} predicted diagnoses are:\n"
        ranked = zip(predicted_codes, titles, top_k_probs[i])
        for rank, (code, title, prob) in enumerate(ranked, start=1):
            # Percentage rounded to two decimals; the float round-trip drops
            # trailing zeros (e.g. 50.0 rather than 50.00).
            prob_this = float("{:.2f}".format(prob*100))
            formatted_str += f"  {rank}. {title} (ICD Code: {code}) with Probability : {prob_this}%\n"

        formatted_outputs.append(formatted_str)

    return formatted_outputs

# Render the ensemble's top-k predictions as readable text. icd_codes maps each
# class index to its ICD code and icd_df supplies the matching long titles.
formatted_results = format_ensemble_predictions(
    top_k_indices=top_k_indices,
    y_labels=icd_codes,
    icd_df=icd_df,
    top_k_probs=top_k_probs,
)

# Emit each formatted description on its own.
for result in formatted_results:
    print(result)

In [None]:
# Long title of ICD-9 code 5723.
icd_df.loc[icd_df['icd_code'] == '5723', 'long_title']

In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix of the true class indices against the ensemble's
# highest-ranked prediction (first column of top_k_indices).
conf_matrix = confusion_matrix(y_class, top_k_indices[:, 0])
print("Confusion Matrix:")
print(conf_matrix)

# Top-k accuracy: a row counts as a hit when its true class appears anywhere
# among that row's top-k predicted indices.
top_k_accuracy = np.mean(np.any(top_k_indices == y_class[:, None], axis=1))
print(f"Top-3 Accuracy: {top_k_accuracy * 100:.2f}%")

# Top-1 accuracy is plain accuracy of the highest-ranked prediction.
top_1_accuracy = accuracy_score(y_class, top_k_indices[:, 0])
print(f"Top-1 Accuracy: {top_1_accuracy * 100:.2f}%")


In [None]:
import matplotlib.pyplot as plt
import xgboost as xgb

# Importance scores (importance_type='weight') keyed by the booster's own
# feature identifiers, sorted from most to least important.
xgb_importance = loaded_xgb.get_booster().get_score(importance_type='weight')
importances_df = (
    pd.DataFrame(list(xgb_importance.items()), columns=['Feature', 'Importance'])
    .sort_values(by='Importance', ascending=False)
)

# Horizontal bar chart of the 20 highest-scoring features, largest on top.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(importances_df['Feature'].head(20), importances_df['Importance'].head(20))
ax.set_xlabel("Feature Importance")
ax.set_ylabel("Feature")
ax.set_title("XGBoost Feature Importance")
ax.invert_yaxis()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Column names of the feature frame, used to label positional booster keys.
feature_names = X.columns
xgb_importance = loaded_xgb.get_booster().get_score(importance_type='weight')


def _resolve_feature_name(key):
    """Map a positional booster key such as 'f12' to X.columns[12]; return any other key unchanged."""
    # Require the literal 'f' prefix: a real column name that happens to be
    # numeric (e.g. '51265') must not be read as "position 1265".
    if key.startswith('f') and key[1:].isdecimal():
        position = int(key[1:])
        if position < len(feature_names):
            return feature_names[position]
    return key


# DataFrame for plotting, sorted from most to least important.
importances_df = pd.DataFrame({
    'Feature': [_resolve_feature_name(k) for k in xgb_importance.keys()],
    'Importance': list(xgb_importance.values())
})
importances_df = importances_df.sort_values(by='Importance', ascending=False)

# Plot the top 20 features, largest on top.
plt.figure(figsize=(10, 6))
plt.barh(importances_df['Feature'][:20], importances_df['Importance'][:20])
plt.xlabel("Feature Importance")
plt.ylabel("Feature")
plt.title("XGBoost Feature Importance")
plt.gca().invert_yaxis()
plt.show()


In [None]:
# Look up lab item 51265 in the lab-item dictionary.
lab_df = pd.read_csv(r'C:\Users\Anuj Bohra\Desktop\ArogoAI\Severity\d_labitems.csv')
lab_test_name = lab_df.loc[lab_df['itemid'] == 51265]
print(lab_test_name)

In [None]:
import captum

In [None]:
import shap
import torch
import numpy as np
import matplotlib.pyplot as plt
from captum.attr import IntegratedGradients

# Ensemble weights [w_xgb, w_pt]. Kept equal to the weights passed to
# ensemble_predict / ensemble_topk in this notebook so the explanation
# describes the same ensemble that produced the predictions.
w_xgb = 0.7
w_pt = 0.3

# Explain the first row of the scaled feature matrix.
single_sample = X_scaled[0:1]  # Shape: (1, num_features)
single_sample_tensor = torch.tensor(single_sample, dtype=torch.float32).to(device)

# Both models are multi-class, so an attribution only makes sense for one
# specific output class. Use the class the ensemble actually predicts.
target_class = int(ensemble_predict(single_sample, weights=[w_xgb, w_pt])[0])

# -------------------- 1. SHAP for XGBoost --------------------
explainer_xgb = shap.TreeExplainer(loaded_xgb)
raw_shap_values = explainer_xgb.shap_values(single_sample)
if isinstance(raw_shap_values, list):
    # Older SHAP releases: one (n_samples, n_features) array per class.
    shap_values_xgb = np.asarray(raw_shap_values[target_class])[0]
else:
    raw_shap_values = np.asarray(raw_shap_values)
    if raw_shap_values.ndim == 3:
        # Newer SHAP releases: (n_samples, n_features, n_classes).
        shap_values_xgb = raw_shap_values[0, :, target_class]
    else:
        # Single-output model: (n_samples, n_features).
        shap_values_xgb = raw_shap_values[0]
# shap_values_xgb now has shape (num_features,)

# -------------------- 2. Integrated Gradients for NN --------------------
ig = IntegratedGradients(pytorch_model)
baseline = torch.zeros_like(single_sample_tensor)  # Baseline (zero input)
# target must be given: the network emits one logit per class and Captum
# cannot take a gradient with respect to several outputs at once.
attr, _ = ig.attribute(
    single_sample_tensor,
    baselines=baseline,
    target=target_class,
    return_convergence_delta=True,
)
attr = attr.cpu().detach().numpy().flatten()  # Convert to NumPy, shape (num_features,)

# -------------------- 3. Weight & Combine Importance --------------------
# NOTE(review): the two attributions come from different methods and may sit
# on different scales, so treat the weighted sum as a heuristic ranking.
combined_importance = (w_xgb * shap_values_xgb) + (w_pt * attr)

# -------------------- 4. Waterfall Plot --------------------
feature_names = [f"Feature {i}" for i in range(single_sample.shape[1])]

shap.waterfall_plot(shap.Explanation(
    values=combined_importance,
    base_values=0,  # Adjust if needed
    feature_names=feature_names
))
plt.show()