In [8]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.utils.class_weight import compute_class_weight
import joblib

In [9]:
from google.colab import drive

# Make Google Drive visible under /content/drive so the CSV path below resolves.
drive.mount('/content/drive')

# Load the provider-level feature table and preview its first five rows.
provider_df = pd.read_csv('/content/drive/MyDrive/provider_features.csv')
provider_df.head(5)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,Provider,Inpatient_Claims,Inpatient_TotalAmount,Outpatient_Claims,Outpatient_TotalAmount,Avg_Patient_Age,Unique_Patients,Chronic_Patient_Perc,target
0,PRV51001,0.0,0.0,1.0,20.0,79.66872,1,100.0,0
1,PRV51003,4.0,36000.0,3.0,2800.0,70.131221,7,100.0,1
2,PRV51004,0.0,0.0,3.0,960.0,68.999316,3,100.0,0
3,PRV51005,0.0,0.0,36.0,9170.0,69.698076,10,100.0,1
4,PRV51007,1.0,6000.0,3.0,360.0,70.83436,2,100.0,0


In [10]:
# --- Feature / target split ---------------------------------------------------
# Columns that must not reach the models:
#   * "Provider"   - provider identifier string, not a numeric feature
#   * "target"     - the label itself
#   * "Unnamed: *" - row-index column(s) left in the CSV; the preview of
#                    provider_df shows "Unnamed: 0" holding 0, 1, 2, ...,
#                    which is just the row position and would otherwise be
#                    learned (and scaled) as if it were a real feature.
index_artifact_cols = [
    col for col in provider_df.columns if str(col).startswith("Unnamed:")
]
X = provider_df.drop(columns=["Provider", "target", *index_artifact_cols])
y = provider_df["target"]

# 80/20 hold-out split; stratify=y is passed so both parts are split by label,
# and random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# The scaler is fitted on the training rows only and then applied to the test
# rows, so no test-set statistics enter the transformation.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# "balanced" class weights computed from the training labels, stored as a
# {class_label: weight} dict for use by the estimators.
classes = np.unique(y_train)
class_weights_values = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = dict(zip(classes, class_weights_values))
print("Class weights:", class_weights)

Class weights: {np.int64(0): np.float64(0.5789256198347107), np.int64(1): np.float64(3.667539267015707)}


In [11]:
# OPTIONAL: SMOTE oversampling.
# Not used in the final model; included because the project requires it.
from imblearn.over_sampling import SMOTE

# Per-class row counts of the training labels before resampling.
print(f"Original class distribution: {np.bincount(y_train)}")

# Resample the (unscaled) training split with a fixed seed.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)

# Per-class row counts after resampling.
print(f"After SMOTE oversampling: {np.bincount(y_res)}")

# To experiment, train a model on (X_res, y_res), for example:
# rf_smote = RandomForestClassifier(random_state=42)
# rf_smote.fit(X_res, y_res)


Original class distribution: [2420  382]
After SMOTE oversampling: [2420 2420]


In [12]:
# Candidate classifiers keyed by display name. The name is used in the training
# log messages and to derive the saved file names; insertion order is the
# order in which the models are trained.
models = {
    # Single tree, weighted with the class_weights dict.
    "Decision Tree": DecisionTreeClassifier(
        class_weight=class_weights,
        random_state=42,
    ),
    # Forest with the same class weights; n_jobs=-1 is passed for parallelism.
    "Random Forest": RandomForestClassifier(
        class_weight=class_weights,
        n_jobs=-1,
        random_state=42,
    ),
    # Constructed without a class_weight argument.
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    # probability=True is requested in addition to the class weights.
    "SVM": SVC(
        class_weight=class_weights,
        probability=True,
        random_state=42,
    ),
}

In [13]:
import os

fitted_models = {}
drive_path = '/content/drive/MyDrive/trained_models/' # Define your desired path in Google Drive

# Create the directory if it doesn't exist
os.makedirs(drive_path, exist_ok=True)

# Per-row weights equivalent to the class_weights dict. They are used for any
# estimator that has no `class_weight` parameter (Gradient Boosting in the
# models dict), so that every model is trained with the same imbalance
# correction instead of one of them silently ignoring it.
balanced_sample_weight = y_train.map(class_weights).to_numpy()

# --- Train every candidate model ----------------------------------------------
for name, model in models.items():
    print(f"Training {name}...")
    # Only the SVM is fitted on the standardized features; the other models
    # are fitted on the raw training frame.
    train_features = X_train_scaled if name == "SVM" else X_train
    if "class_weight" in model.get_params():
        # Class weighting was already configured on the estimator itself.
        model.fit(train_features, y_train)
    else:
        model.fit(train_features, y_train, sample_weight=balanced_sample_weight)
    fitted_models[name] = model

# --- Persist the fitted models --------------------------------------------------
# File name pattern: "<lower_snake_case model name>_model.joblib".
for name, model in fitted_models.items():
    filename = f"{name.replace(' ', '_').lower()}_model.joblib"
    full_path = os.path.join(drive_path, filename)
    joblib.dump(model, full_path)
    print(f"Saved {name} model to {full_path}")

# The fitted scaler is saved next to the models because the SVM must receive
# inputs transformed with the same scaler at prediction time.
scaler_filename = "scaler.joblib"
full_scaler_path = os.path.join(drive_path, scaler_filename)
joblib.dump(scaler, full_scaler_path)
print(f"Saved feature scaler for SVM to {full_scaler_path}")

Training Decision Tree...
Training Random Forest...
Training Gradient Boosting...
Training SVM...
Saved Decision Tree model to /content/drive/MyDrive/trained_models/decision_tree_model.joblib
Saved Random Forest model to /content/drive/MyDrive/trained_models/random_forest_model.joblib
Saved Gradient Boosting model to /content/drive/MyDrive/trained_models/gradient_boosting_model.joblib
Saved SVM model to /content/drive/MyDrive/trained_models/svm_model.joblib
Saved feature scaler for SVM to /content/drive/MyDrive/trained_models/scaler.joblib
