In [4]:
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')

In [5]:

ids_file = "IDS_mapping.csv"
data_file = "diabetic_data.csv"

In [6]:
def load_data(ids_file, data_file):

    if not os.path.exists(ids_file) or not os.path.exists(data_file):
        raise FileNotFoundError(f"One or more input files not found: {ids_file}, {data_file}")

    ids_df = pd.read_csv(ids_file)
    df = pd.read_csv(data_file)
    print(df.head())  # Print the first few rows to check if the file is loaded correctly
    print(f"IDS DataFrame shape: {ids_df.shape}")
    print(f"Main DataFrame shape: {df.shape}")
    return ids_df, df


def split_ids_mapping(ids_df):
    admission_type_id_df = ids_df.iloc[:8]
    discharge_disposition_id_df = ids_df.iloc[10:40].rename(columns={'admission_type_id': 'discharge_disposition_id'})
    admission_source_id_df = ids_df.iloc[42:].rename(columns={'admission_type_id': 'admission_source_id'})
    return admission_type_id_df, discharge_disposition_id_df, admission_source_id_df


def clean_data(df):
    # Drop high null value columns
    df = df.drop(df[df['gender'] == 'Unknown/Invalid'].index)
    df.drop(['max_glu_serum', 'A1Cresult', 'weight', 'payer_code', 'medical_specialty', 'encounter_id', 'race', 'patient_nbr'], axis=1, inplace=True)
    df.replace('?', np.nan, inplace=True)
    df.dropna(inplace=True)

    # Check if the DataFrame is empty
    if df.empty:
        print("DataFrame is empty after cleaning. Please check the data source.")
        return None  # Return None to indicate an issue
    return df


def drop_low_variance_columns(df, threshold=0.999999999):
    """
    Drops low-variance categorical columns from the DataFrame and returns the modified DataFrame
    along with the list of dropped columns.

    Parameters:
        df (pd.DataFrame): Input DataFrame
        threshold (float): The maximum proportion of the most common category value
                           to consider as low variance.

    Returns:
        pd.DataFrame: DataFrame with low-variance columns removed.
        list: List of dropped column names.
    """
    # Identify categorical columns
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns

    # Find low-variance columns
    low_variance_cols = [
        col for col in categorical_columns
        if df[col].value_counts(normalize=True).max() > threshold
    ]

    print(f"Low-variance columns: {low_variance_cols}")

    # Drop low-variance columns from the DataFrame
    df = df.drop(low_variance_cols, axis=1)

    return df, low_variance_cols



def aggregate_service_utilization(df):
    required_columns = ['number_outpatient', 'number_emergency', 'number_inpatient']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(f"Missing required columns for aggregation: {missing_columns}")

    df['service_utilization'] = df['number_outpatient'] + df['number_emergency'] + df['number_inpatient']
    df.drop(['number_outpatient', 'number_emergency', 'number_inpatient'], axis=1, inplace=True)
    return df


def encode_drug_changes(df, keys):
    for col in keys:
        colname = f"{col}temp"
        df[colname] = df[col].apply(lambda x: 0 if (x == 'No' or x == 'Steady') else 1)

    df['numchange'] = 0
    for col in keys:
        colname = f"{col}temp"
        df['numchange'] += df[colname]
        df.drop(colname, axis=1, inplace=True)
    return df

def map_admission_and_discharge(df):
    # Map `admission_type_id`
    admission_type_mapping = {2: 1, 7: 1, 6: 5, 8: 5}
    df['admission_type_id'] = df['admission_type_id'].replace(admission_type_mapping)

    # Map `discharge_disposition_id`
    discharge_mapping = {
        6: 1, 8: 1, 9: 1, 13: 1,
        3: 2, 4: 2, 5: 2, 14: 2, 22: 2, 23: 2, 24: 2,
        12: 10, 15: 10, 16: 10, 17: 10,
        25: 18, 26: 18
    }
    df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(discharge_mapping)
    return df

def map_admission_source(df):
    admission_source_mapping = {
        2: 1, 3: 1,
        5: 4, 6: 4, 10: 4, 22: 4, 25: 4,
        15: 9, 17: 9, 20: 9, 21: 9,
        13: 11, 14: 11
    }
    df['admission_source_id'] = df['admission_source_id'].replace(admission_source_mapping)
    return df

def encode_categorical_columns(df):
    df['change'] = df['change'].replace({'Ch': 1, 'No': 0})
    df['gender'] = df['gender'].replace({'Male': 1, 'Female': 0})
    # df = df.drop(df[df['gender'] == 'Unknown/Invalid'].index)
    df['diabetesMed'] = df['diabetesMed'].replace({'Yes': 1, 'No': 0})
    return df

def encode_drugs(df, keys):
    for col in keys:
        df[col] = df[col].replace({'No': 0, 'Steady': 1, 'Up': 1, 'Down': 1})
    return df

def encode_age(df):
    for i in range(0, 10):
        df['age'] = df['age'].replace(f'[{10*i}-{10*(i+1)})', i+1)
    return df

def preprocess_diagnosis(df):
    for diag in ['diag_1', 'diag_2', 'diag_3']:
        level1_col = f'level1_{diag}'
        level2_col = f'level2_{diag}'
        df[level1_col] = df[diag].replace(r'[VE].*', 0, regex=True).astype(float)
        df[level2_col] = df[level1_col]
    df.drop(['diag_1', 'diag_2', 'diag_3'], axis=1, inplace = True)
    return df

def main_preprocess_fn():

    # Load data
    ids_df, df = load_data(ids_file, data_file)

    # Split mappings
    _, _, _ = split_ids_mapping(ids_df)

    # Clean and preprocess data
    print(f"Initial DataFrame shape: {df.shape}")
    df = clean_data(df)
    print(f"After cleaning: {df.shape}")

    df, dropped_columns = drop_low_variance_columns(df)
    print(f"Before aggregation, df type: {type(df)}, shape: {df.shape if df is not None else 'N/A'}")
    df = aggregate_service_utilization(df)
    print(f"After aggregation, df type: {type(df)}, shape: {df.shape if df is not None else 'N/A'}")
    # # Drug change encoding
    # keys = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
    #         'glipizide', 'glyburide', 'pioglitazone', 'rosiglitazone', 'acarbose',
    #         'miglitol', 'insulin', 'glyburide-metformin', 'tolazamide',
    #         'metformin-pioglitazone', 'glimepiride-pioglitazone', 'glipizide-metformin',
    #         'troglitazone', 'tolbutamide', 'acetohexamide']

    # Update keys based on remaining columns
    all_drug_columns = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed']
    keys = [col for col in all_drug_columns if col in df.columns]
    print(f"Updated drug keys: {keys}")

    df = encode_drug_changes(df, keys)

    # Map IDs
    df = map_admission_and_discharge(df)
    df = map_admission_source(df)

    # Encode categorical columns
    df = encode_categorical_columns(df)
    df = encode_drugs(df, keys)
    df = encode_age(df)

    # Process diagnosis columns
    df = preprocess_diagnosis(df)

    # Save preprocessed data to 'data' folder
    output_path = os.path.join("data", "preprocessed_data.csv")
    os.makedirs(os.path.dirname(output_path), exist_ok=True)  # Create the folder if it doesn't exist
    df.to_csv(output_path, index=False)
    print(f"Preprocessed data saved to: {output_path}")

if __name__ == "__main__":
    main_preprocess_fn()

   encounter_id  patient_nbr             race  gender      age weight  \
0       2278392      8222157        Caucasian  Female   [0-10)      ?   
1        149190     55629189        Caucasian  Female  [10-20)      ?   
2         64410     86047875  AfricanAmerican  Female  [20-30)      ?   
3        500364     82442376        Caucasian    Male  [30-40)      ?   
4         16680     42519267        Caucasian    Male  [40-50)      ?   

   admission_type_id  discharge_disposition_id  admission_source_id  \
0                  6                        25                    1   
1                  1                         1                    7   
2                  1                         1                    7   
3                  1                         1                    7   
4                  1                         1                    7   

   time_in_hospital  ... citoglipton insulin  glyburide-metformin  \
0                 1  ...          No      No                   No

In [7]:
# Use the downloaded preprocessed_data.csv here
df_clean = pd.read_csv('data/preprocessed_data.csv')

In [8]:
df_clean.head()

Unnamed: 0,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_diagnoses,...,diabetesMed,readmitted,service_utilization,numchange,level1_diag_1,level2_diag_1,level1_diag_2,level2_diag_2,level1_diag_3,level2_diag_3
0,0,2,1,1,7,3,59,0,18,9,...,1,>30,0,3,276.0,276.0,250.01,250.01,255.0,255.0
1,0,3,1,1,7,2,11,5,13,6,...,1,NO,3,1,648.0,648.0,250.0,250.0,0.0,0.0
2,1,4,1,1,7,2,44,1,16,7,...,1,NO,0,3,8.0,8.0,250.43,250.43,403.0,403.0
3,1,5,1,1,7,1,51,0,8,5,...,1,NO,0,2,197.0,197.0,157.0,157.0,250.0,250.0
4,1,6,1,1,1,3,31,6,16,9,...,1,>30,0,1,414.0,414.0,411.0,411.0,250.0,250.0


#### Test - Train split

In [9]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

X = df_clean.drop('readmitted', axis=1)
y = df_clean['readmitted']
# Map target labels to integers
label_mapping = {'<30': 1, 'NO': 0, '>30': -1}
y = y.map(label_mapping)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


# Apply SMOTE only to the training data
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Ensure the data is numeric (if needed)
X_resampled = pd.DataFrame(X_resampled).apply(pd.to_numeric, errors='coerce')
X_resampled = np.array(X_resampled, dtype=float)
y_resampled = np.array(y_resampled, dtype=int)

# Check for NaNs
if np.isnan(X_resampled).any():
    raise ValueError("SMOTE-generated data contains NaN values.")

# Compute mean and standard deviation for each feature in the training set
mean = np.mean(X_resampled, axis=0)
std = np.std(X_resampled, axis=0)

# Avoid division by zero in case of constant features
std[std == 0] = 1

# Scale the training set
X_train_scaled = (X_resampled - mean) / std
X_train_scaled = np.array(X_train_scaled, dtype=float)

# Scale the test set using the same mean and std as the training set
X_test_scaled = (X_test - mean) / std

# Ensure the scaled test data is numeric
X_test_scaled = np.array(X_test_scaled, dtype=float)

X_train_scaled_np = X_train_scaled.astype(np.float64)
y_resampled_np = y_resampled.astype(np.float64)


In [10]:
# Output to check
print("Class distribution in original y_train:")
print(y_train.value_counts())

print("\nClass distribution in resampled y_train:")
print(pd.Series(y_resampled).value_counts())


Class distribution in original y_train:
readmitted
 0    43054
-1    28138
 1     9000
Name: count, dtype: int64

Class distribution in resampled y_train:
 0    43054
-1    43054
 1    43054
Name: count, dtype: int64


### Logistic Regression

In [11]:
# Logistic Regression Implementation
class LogisticRegression:
    def __init__(self, learning_rate=0.01, iterations=1000):
        self.learning_rate = learning_rate
        self.iterations = iterations
        self.weights = None
        self.bias = None
        self.classes_ = None

    def sigmoid(self, z):
        z = np.clip(z, -500, 500)  # Clip values to avoid overflow
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        self.classes_ = np.unique(y)  # Store unique class labels
        if len(self.classes_) > 2:
            raise ValueError("This implementation supports only binary classification. Use one-vs-all for multiclass.")

        m, n = X.shape
        self.weights = np.zeros(n)
        self.bias = 0

        for _ in range(self.iterations):
            linear_model = np.dot(X, self.weights) + self.bias
            predictions = self.sigmoid(linear_model)
            error = predictions - y
            self.weights -= (self.learning_rate / m) * np.dot(X.T, error)
            self.bias -= (self.learning_rate / m) * np.sum(error)

    def predict_proba(self, X):
        linear_model = np.dot(X, self.weights) + self.bias
        probabilities = self.sigmoid(linear_model)
        return np.vstack([1 - probabilities, probabilities]).T  # Probability for each class

    def predict(self, X):
        probabilities = self.predict_proba(X)
        return np.argmax(probabilities, axis=1)  # Class with the highest probability


In [12]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def one_vs_all_custom(model_class, X_train, y_train, X_test, y_test):
    """
    Custom implementation of One-vs-All for multi-class classification with evaluation metrics.

    Parameters:
        model_class: A class that implements fit and predict_proba methods.
                     A new instance will be created for each class.
        X_train: Training features (numpy array or similar structure).
        y_train: Training labels (numpy array, one-dimensional).
        X_test: Test features for prediction (numpy array or similar structure).
        y_test: True labels for X_test (numpy array, one-dimensional).

    Returns:
        predictions: Array of predicted class labels for X_test.
        models: List of trained binary classifiers, one for each class.
    """
    classes = np.unique(y_train)  # Get all unique class labels
    print("Classes unique values:", classes)
    models = []  # To store trained models
    scores = np.zeros((X_test.shape[0], len(classes)))  # To store scores for each class

    # Train one model for each class
    for i, c in enumerate(classes):
        # Create binary labels for the current class (1 for current class, 0 otherwise)
        binary_y_train = (y_train == c).astype(int)

        # Train a new instance of the model for the current class
        model = model_class()
        model.fit(X_train, binary_y_train)
        models.append(model)

        # Get probabilities for the current class
        scores[:, i] = model.predict_proba(X_test)[:, 1]  # Take probability for class 1


    # Final predictions are the classes with the highest score
    predictions = np.argmax(scores, axis=1)

    # Calculate evaluation metrics for each class
    print("Evaluation Metrics (per class):")
    for i, c in enumerate(classes):
        binary_y_test = (y_test == c).astype(int)  # Create binary labels for y_test
        binary_predictions = (predictions == i).astype(int)  # Predicted labels for class i

        acc = accuracy_score(binary_y_test, binary_predictions)
        prec = precision_score(binary_y_test, binary_predictions, zero_division=0)
        rec = recall_score(binary_y_test, binary_predictions, zero_division=0)
        f1 = f1_score(binary_y_test, binary_predictions, zero_division=0)

        print(f"Class {c}: Accuracy={acc:.2f}, Precision={prec:.2f}, Recall={rec:.2f}, F1={f1:.2f}")

    return predictions, models

In [13]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def evaluate_all_classes(y_test, predictions):
    # Compute overall accuracy
    accuracy = accuracy_score(y_test, predictions)

    # Compute precision, recall, F1 for all classes
    precision, recall, f1, support = precision_recall_fscore_support(y_test, predictions, average=None)

    # Display the metrics for each class
    print(f"Overall Accuracy: {accuracy:.2f}")
    print("\nEvaluation Metrics (per class):")
    for i, class_label in enumerate(np.unique(y_test)):
        print(f"Class {class_label}: Precision={precision[i]:.2f}, Recall={recall[i]:.2f}, F1={f1[i]:.2f}, Support={support[i]}")


#### 1. Logistics Regression training - imbalanced data

* The train dataset is not balanced with equal weight for all the classes

In [14]:
# Use one-vs-all with Logistic Regression
predictions, models = one_vs_all_custom(LogisticRegression, X_train_scaled_np, y_resampled_np, X_test_scaled, y_test)

# Convert predictions to the same format as y_test (strings)
unique_classes = np.unique(y_test)  # Get unique class labels in y_test
predictions_str = [unique_classes[pred] for pred in predictions]  # Map the integer predictions to the corresponding class labels

# Now evaluate the classes
evaluate_all_classes(y_test, predictions_str)

Classes unique values: [-1.  0.  1.]
Evaluation Metrics (per class):
Class -1.0: Accuracy=0.63, Precision=0.45, Recall=0.21, F1=0.28
Class 0.0: Accuracy=0.57, Precision=0.59, Recall=0.60, F1=0.60
Class 1.0: Accuracy=0.67, Precision=0.13, Recall=0.35, F1=0.19
Overall Accuracy: 0.43

Evaluation Metrics (per class):
Class -1: Precision=0.45, Recall=0.21, F1=0.28, Support=7035
Class 0: Precision=0.59, Recall=0.60, F1=0.60, Support=10764
Class 1: Precision=0.13, Recall=0.35, F1=0.19, Support=2250


In [15]:
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))
print(np.unique(predictions, return_counts=True))

(array([-1,  0,  1]), array([28138, 43054,  9000]))
(array([-1,  0,  1]), array([ 7035, 10764,  2250]))
(array([0, 1, 2]), array([ 3252, 10885,  5912]))


#### 2. Logistics Regression training - resampled balanced data

* The train dataset is balanced with equal weight for all the classes using SMOTE()

In [16]:
# Use one-vs-all with Logistic Regression
predictions, models = one_vs_all_custom(LogisticRegression, X_train_scaled_np, y_resampled_np, X_test_scaled, y_test)

# Convert predictions to the same format as y_test (strings)
unique_classes = np.unique(y_test)  # Get unique class labels in y_test
predictions_str = [unique_classes[pred] for pred in predictions]  # Map the integer predictions to the corresponding class labels

# Now evaluate the classes
evaluate_all_classes(y_test, predictions_str)

Classes unique values: [-1.  0.  1.]
Evaluation Metrics (per class):
Class -1.0: Accuracy=0.63, Precision=0.45, Recall=0.21, F1=0.28
Class 0.0: Accuracy=0.57, Precision=0.59, Recall=0.60, F1=0.60
Class 1.0: Accuracy=0.67, Precision=0.13, Recall=0.35, F1=0.19
Overall Accuracy: 0.43

Evaluation Metrics (per class):
Class -1: Precision=0.45, Recall=0.21, F1=0.28, Support=7035
Class 0: Precision=0.59, Recall=0.60, F1=0.60, Support=10764
Class 1: Precision=0.13, Recall=0.35, F1=0.19, Support=2250


In [17]:
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))
print(np.unique(predictions, return_counts=True))

(array([-1,  0,  1]), array([28138, 43054,  9000]))
(array([-1,  0,  1]), array([ 7035, 10764,  2250]))
(array([0, 1, 2]), array([ 3252, 10885,  5912]))


### SVM

In [18]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from tqdm import tqdm
# from src.one_vs_all_method import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

class HardMarginSVM:
    def __init__(self, learning_rate: float, n_iter: int = 1000):
        self.learning_rate = learning_rate
        self.n_iter = n_iter
        self.w = None  # Weights, to be initialized during training
        self.b = 0  # Bias term


    def fit(self, X, y):
        """
        Train the SVM model using gradient descent.

        Parameters:
        - X: Input features (numpy array of shape [n_samples, n_features]).
        - y: Target labels (numpy array of shape [n_samples]).
        """
        # Convert to NumPy arrays if not already
        if isinstance(X, pd.DataFrame) or isinstance(X, pd.Series):
            X = X.to_numpy()
        if isinstance(y, pd.Series):
            y = y.to_numpy()

        # Ensure numeric types
        X = np.array(X, dtype=np.float64)
        y = np.array(y, dtype=np.int64)
        # print('#################################################')
        # print(f"X_resampled sample: {X[:5]}")
        # print(f"y_resampled sample: {y[:5]}")
        # print('#################################################')

        n_samples, n_features = X.shape
        self.w = np.zeros(n_features)  # Initialize weights
        self.b = 0  # Initialize bias

        # print(f"x_i dtype: {x_i.dtype}, x_i: {x_i}")
        print(f"self.w dtype: {self.w.dtype}, self.w: {self.w}")
        print(f"self.b dtype: {type(self.b)}, self.b: {self.b}")


        # Initialize the progress bar
        for iteration in tqdm(range(self.n_iter), desc="Training Progress", unit="iter"):
            for idx, x_i in enumerate(X):
                condition = y[idx] * (np.dot(x_i, self.w) - self.b) >= 1
                if condition:
                    self.w -= self.learning_rate * (2 * 1 / n_samples * self.w)
                else:
                    self.w -= self.learning_rate * (2 * 1 / n_samples * self.w - np.dot(x_i, y[idx]))
                    self.b -= self.learning_rate * y[idx]

            # Optionally, print the progress at specific intervals
            if iteration % 100 == 0:
                print(f"Iteration {iteration}/{self.n_iter}")




    def predict(self, X):
        # Decision function to classify based on the sign of the result
        pred = np.dot(X, self.w) + self.b
        return np.sign(pred)

    def decision_function(self, X):
        # Compute the distance from the hyperplane
        return np.dot(X, self.w) + self.b

    def evaluate(self, y_test, y_pred):
        # Calculate accuracy, precision, recall, and F1 score
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        conf_matrix = confusion_matrix(y_test, y_pred)

        # Print the metrics
        print("Accuracy:", accuracy)
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1 Score:", f1)
        print("Confusion Matrix:")
        print(conf_matrix)

        return accuracy, precision, recall, f1, conf_matrix



    def run_model_svm_hard_margin(self, X_train, X_test, y_train, y_test):
        # Fit the model
        self.fit(X_train, y_train)

        # Predict on test data
        y_pred = self.predict(X_test)

        # Evaluate model performance and return metrics
        return self.evaluate(y_test, y_pred)

In [19]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import pandas as pd
import numpy as np

import sys
import os


models = {}
predictions = []

# Train a separate model for each class
for target_class in label_mapping.values():
    # Create binary labels for the current class
    binary_y_train = (y_train == target_class).astype(int)

    # print(f"X_resampled dtype: {X_resampled.dtypes}")
    # print(f"y_resampled dtype: {y_resampled.dtypes}")
    # print('#################################################')
    # print(f"X_resampled dtype: {X_resampled.dtypes}, shape: {X_resampled.shape}")
    # print(f"y_resampled dtype: {y_resampled.dtypes}, shape: {y_resampled.shape}")
    # print('#################################################')


    # X_train_scaled_np = X_train_scaled.astype(np.float64)
    # y_resampled_np = y_resampled.astype(np.float64)
    # X_resampled_np = X_resampled.to_numpy(dtype=np.float64)
    # y_resampled_np = y_resampled.to_numpy(dtype=np.int64)

    # Debug info
    # print(f"Converted X_resampled dtype: {X_resampled_np.dtype}")
    # print(f"Converted y_resampled dtype: {y_resampled_np.dtype}")






    # Train the SVM model for the current class
    model = HardMarginSVM(learning_rate=0.01, n_iter=500)

    # Fit the model
    print("Fitting the model for target class", target_class)
    model.fit(X_train_scaled_np, y_resampled_np)
    print("Saving the model for target class", target_class)


    # Save the trained model
    models[target_class] = model

    # Predict scores for the test data
    print("Predicting the scores for target class", target_class)
    scores = model.decision_function(X_test_scaled)
    predictions.append(scores)
    print("Predicting the scores for target class done", target_class)

# # Combine predictions to determine the final class for each test instance
# predictions = np.array(predictions)
# final_predictions = np.argmax(predictions, axis=0) # Assign class with the highest score
# final_predictions = final_predictions.astype(int)  # Convert to integers if needed
# print("Final prediction of classes DONE!!!")
# print(f"y_test type: {type(y_test)}, y_test unique values: {np.unique(y_test)}")
# print(f"final_predictions type: {type(final_predictions)}, final_predictions unique values: {np.unique(final_predictions)}")




predictions = np.array(predictions)

# Ensure that predictions shape is (num_classes, num_samples) for np.argmax
if predictions.shape[0] == len(label_mapping):
    final_predictions = np.argmax(predictions, axis=0)  # Assign class with the highest score
else:
    # Handle case where predictions do not align with number of target classes
    raise ValueError("Mismatch between number of target classes and predictions")

final_predictions = final_predictions.astype(int)  # Convert to integers if needed
final_predictions = np.array(final_predictions).astype(int)
print("Final prediction of classes DONE!!!")
print(f"y_test type: {type(y_test)}, y_test unique values: {np.unique(y_test)}")
print(f"final_predictions type: {type(final_predictions)}, final_predictions unique values: {np.unique(final_predictions)}")
print('************************************************************************************')
print(f"final_predictions: {final_predictions}")


# print('***************** mapped final predictions', final_predictions)
print('************************************************************************************')
print("y_test unique values and counts:", y_test.value_counts())
print('************************************************************************************')
print("final_predictions unique values and counts:", np.unique(final_predictions, return_counts=True))
print('************************************************************************************')
# Evaluate the model
print('Evaluating the model')
accuracy = accuracy_score(y_test, final_predictions)
print('accuracy', accuracy)
precision = precision_score(y_test, final_predictions, average='weighted', zero_division=0)
recall = recall_score(y_test, final_predictions, average='weighted', zero_division=0)
f1 = f1_score(y_test, final_predictions, average='weighted', zero_division=0)
conf_matrix = confusion_matrix(y_test, final_predictions)

# Print metrics
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print("Confusion Matrix:")
print(conf_matrix)


Fitting the model for target class 1
self.w dtype: float64, self.w: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
self.b dtype: <class 'int'>, self.b: 0


Training Progress:   0%|                      | 1/500 [00:00<04:18,  1.93iter/s]

Iteration 0/500


Training Progress:  20%|████                | 101/500 [00:51<03:25,  1.94iter/s]

Iteration 100/500


Training Progress:  40%|████████            | 201/500 [01:42<02:34,  1.94iter/s]

Iteration 200/500


Training Progress:  60%|████████████        | 301/500 [02:47<04:11,  1.26s/iter]

Iteration 300/500


Training Progress:  80%|████████████████    | 401/500 [03:39<00:49,  1.99iter/s]

Iteration 400/500


Training Progress: 100%|████████████████████| 500/500 [04:28<00:00,  1.86iter/s]


Saving the model for target class 1
Predicting the scores for target class 1
Predicting the scores for target class done 1
Fitting the model for target class 0
self.w dtype: float64, self.w: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
self.b dtype: <class 'int'>, self.b: 0


Training Progress:   0%|                      | 1/500 [00:00<04:15,  1.96iter/s]

Iteration 0/500


Training Progress:  20%|████                | 101/500 [00:51<03:25,  1.94iter/s]

Iteration 100/500


Training Progress:  40%|████████            | 201/500 [01:42<02:31,  1.97iter/s]

Iteration 200/500


Training Progress:  60%|████████████        | 301/500 [02:32<01:40,  1.98iter/s]

Iteration 300/500


Training Progress:  80%|████████████████    | 401/500 [03:23<00:50,  1.98iter/s]

Iteration 400/500


Training Progress: 100%|████████████████████| 500/500 [04:13<00:00,  1.97iter/s]


Saving the model for target class 0
Predicting the scores for target class 0
Predicting the scores for target class done 0
Fitting the model for target class -1
self.w dtype: float64, self.w: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
self.b dtype: <class 'int'>, self.b: 0


Training Progress:   0%|                      | 1/500 [00:00<04:18,  1.93iter/s]

Iteration 0/500


Training Progress:  20%|████                | 101/500 [00:52<03:22,  1.97iter/s]

Iteration 100/500


Training Progress:  40%|████████            | 201/500 [01:43<02:31,  1.97iter/s]

Iteration 200/500


Training Progress:  60%|████████████        | 301/500 [02:37<01:44,  1.90iter/s]

Iteration 300/500


Training Progress:  80%|████████████████    | 401/500 [03:28<00:49,  1.98iter/s]

Iteration 400/500


Training Progress: 100%|████████████████████| 500/500 [04:18<00:00,  1.93iter/s]

Saving the model for target class -1
Predicting the scores for target class -1
Predicting the scores for target class done -1
Final prediction of classes DONE!!!
y_test type: <class 'pandas.core.series.Series'>, y_test unique values: [-1  0  1]
final_predictions type: <class 'numpy.ndarray'>, final_predictions unique values: [0]
************************************************************************************
final_predictions: [0 0 0 ... 0 0 0]
************************************************************************************
y_test unique values and counts: readmitted
 0    10764
-1     7035
 1     2250
Name: count, dtype: int64
************************************************************************************
final_predictions unique values and counts: (array([0]), array([20049]))
************************************************************************************
Evaluating the model
accuracy 0.5368846326500075
Accuracy: 0.5368846326500075
Precision: 0.2882451087757335
Recal




## Observations:

* Based on the results of the confusion matrix and final predictions, its evident that model is not able to classify all catergories properly. The model is being biased towards the category 'NO'.

* The accuracy we got is close to 54%, which seems to be in a satisfactory range. But based on the precision value of 0.288 and the output of the confusion matrix, the model is not classifying efficiently. Based on further examination we came to umderstand that all the prediction values of the model are 0 i.e., 'NO' category.

### Reasons for such performance:
* The dataset itself is biases towards the category 'NO'
* Many features have low variance in the dataset, leading to not enough information for the model to learn patterns and able to classify the target variable properly.
* Also, training the SVM for less number of epochs, due to lack of computational resources and limited time constraints.

### Further improvements for better results:
* Gather more data that is un-biased and have columns with high entropy values might help the model to identify and learn more insights.
* Leveraging additonal computational resources like powerful GPU's and training the model for huge number of epochs - 1000 or more - might improve the model performance.
* Apart from linear kernel usage of other kernals, such as polynomial or rbf kernels, could provide some better results.