### 1. CART (Classification and Regression Trees) - DecisionTree Classifier
- Sampling Technique - Train/Test Split (80:20)
- Classification Metrics - Accuracy

In [4]:
import pandas as pd
from sklearn.model_selection import RepeatedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss, confusion_matrix, classification_report

# Load the dataset
filename = './cirrhosis.csv'
dataframe = pd.read_csv(filename)

# Separate features and target variable
X = dataframe.drop('Status', axis=1)
Y = dataframe['Status']

# Encode the target variable (Status) using label encoding
le = LabelEncoder()
Y = le.fit_transform(Y)

# Display the mapping of labels to numerical values
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label Mapping:")
for label, value in label_mapping.items():
    print(f"Label: {label}, Numerical Value: {value}")

# Initialize the Decision Tree Classifier with hyperparameters
max_depth = 5  # You can adjust this value
min_samples_split = 2  # You can adjust this value
min_samples_leaf = 1  # You can adjust this value

model = DecisionTreeClassifier(
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf
)

# Repeated Random Train-Test splits using RepeatedKFold
n_splits = 5  # Number of splits
n_repeats = 10  # Number of repeats
random_seed = 50

rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_seed)

log_losses = []

for train_index, test_index in rkf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    model.fit(X_train, Y_train)

    # Evaluate the log loss
    y_pred_proba = model.predict_proba(X_test)
    log_loss_value = log_loss(Y_test, y_pred_proba)
    log_losses.append(log_loss_value)

# Calculate and print the mean log loss
mean_log_loss = sum(log_losses) / len(log_losses)
print("Mean Logarithmic Loss: %.3f" % mean_log_loss)

# Generate the confusion matrix for the last split
confusion_matrix_dt = confusion_matrix(Y_test, model.predict(X_test))
print("Confusion Matrix:")
print(confusion_matrix_dt)

# Generate the classification report for the last split
classification_report_dt = classification_report(Y_test, model.predict(X_test))
print("Classification Report:")
print(classification_report_dt)


Label Mapping:
Label: C, Numerical Value: 0
Label: CL, Numerical Value: 1
Label: D, Numerical Value: 2
Mean Logarithmic Loss: 2.995
Confusion Matrix:
[[48  1  7]
 [ 3  0  2]
 [ 8  0 14]]
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.86      0.83        56
           1       0.00      0.00      0.00         5
           2       0.61      0.64      0.62        22

    accuracy                           0.75        83
   macro avg       0.47      0.50      0.49        83
weighted avg       0.71      0.75      0.73        83



### 2. Gaussian Naive Bayes
- Sampling Technique - Train/Test Split (80:20)
- Classification Metrics - Accuracy

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RepeatedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Load the dataset
filename = './cirrhosis.csv'
dataframe = pd.read_csv(filename)

# Separate features and target variable
X = dataframe.drop('Status', axis=1)
Y = dataframe['Status']

# Impute missing values with the mean
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X = imputer.fit_transform(X)

# Encode the target variable (Status) using label encoding
le = LabelEncoder()
Y = le.fit_transform(Y)

# Display the mapping of labels to numerical values
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label Mapping:")
for label, value in label_mapping.items():
    print(f"Label: {label}, Numerical Value: {value}")

# Repeated Random Train-Test splits using RepeatedKFold
n_splits = 5  # Number of splits
n_repeats = 10  # Number of repeats
seed = 7

rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=seed)

log_losses = []

# Standardize the features before using Naive Bayes
scaler = StandardScaler()

for train_index, test_index in rkf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    # Standardize the training and testing data
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Create a Gaussian Naive Bayes classifier
    model = GaussianNB(priors=None, var_smoothing=1e-9)

    # Calibrate the classifier
    calibrated_model = CalibratedClassifierCV(estimator=model, method='sigmoid')
    calibrated_model.fit(X_train_scaled, Y_train)

    # Evaluate the log loss
    y_pred_proba = calibrated_model.predict_proba(X_test_scaled)
    log_loss_value = log_loss(Y_test, y_pred_proba)
    log_losses.append(log_loss_value)

# Calculate and print the mean log loss
mean_log_loss = sum(log_losses) / len(log_losses)
print("Mean Logarithmic Loss: %.3f" % mean_log_loss)

# Generate the confusion matrix for the last split
confusion_matrix_nb = confusion_matrix(Y_test, calibrated_model.predict(X_test_scaled))
print("Confusion Matrix:")
print(confusion_matrix_nb)

# Generate the classification report for the last split
classification_report_nb = classification_report(Y_test, calibrated_model.predict(X_test_scaled), zero_division=1)
print("Classification Report:")
print(classification_report_nb)


Label Mapping:
Label: C, Numerical Value: 0
Label: CL, Numerical Value: 1
Label: D, Numerical Value: 2
Mean Logarithmic Loss: 0.703
Confusion Matrix:
[[40  0  4]
 [ 6  0  1]
 [14  0 18]]
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.91      0.77        44
           1       1.00      0.00      0.00         7
           2       0.78      0.56      0.65        32

    accuracy                           0.70        83
   macro avg       0.82      0.49      0.47        83
weighted avg       0.74      0.70      0.66        83



### 3. Gradient Boosting Machines (AdaBoost)

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RepeatedKFold
from sklearn.ensemble import AdaBoostClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss, confusion_matrix, classification_report

# Load the dataset
filename = './cirrhosis.csv'
dataframe = pd.read_csv(filename)

# Separate features and target variable
X = dataframe.drop('Status', axis=1)
Y = dataframe['Status']

# Impute missing values with the mean
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X = imputer.fit_transform(X)

# Encode the target variable (Status) using label encoding
le = LabelEncoder()
Y = le.fit_transform(Y)

# Display the mapping of labels to numerical values
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label Mapping:")
for label, value in label_mapping.items():
    print(f"Label: {label}, Numerical Value: {value}")

# Repeated Random Train-Test splits using RepeatedKFold
n_splits = 5  # Number of splits
n_repeats = 10  # Number of repeats
seed = 7

rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=seed)

log_losses = []

for train_index, test_index in rkf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    # Create an AdaBoost classifier
    model = AdaBoostClassifier(n_estimators=50, random_state=seed)

    # Train the model on the training data
    model.fit(X_train, Y_train)

    # Evaluate the log loss
    y_pred_proba = model.predict_proba(X_test)
    log_loss_value = log_loss(Y_test, y_pred_proba)
    log_losses.append(log_loss_value)

# Calculate and print the mean log loss
mean_log_loss = sum(log_losses) / len(log_losses)
print("Mean Logarithmic Loss: %.3f" % mean_log_loss)

# Generate the confusion matrix for the last split
confusion_matrix_ab = confusion_matrix(Y_test, model.predict(X_test))
print("Confusion Matrix:")
print(confusion_matrix_ab)

# Generate the classification report for the last split
classification_report_ab = classification_report(Y_test, model.predict(X_test))
print("Classification Report:")
print(classification_report_ab)


Label Mapping:
Label: C, Numerical Value: 0
Label: CL, Numerical Value: 1
Label: D, Numerical Value: 2
Mean Logarithmic Loss: 0.858
Confusion Matrix:
[[35  1  8]
 [ 2  4  1]
 [ 7  0 25]]
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.80      0.80        44
           1       0.80      0.57      0.67         7
           2       0.74      0.78      0.76        32

    accuracy                           0.77        83
   macro avg       0.78      0.72      0.74        83
weighted avg       0.77      0.77      0.77        83



### 4. K-Nearest Neighbors (K-NN)
- Sampling Technique - Train/Test Split (80:20)
- Classification Metrics - Accuracy

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RepeatedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss, confusion_matrix, classification_report

# Load the dataset
filename = './cirrhosis.csv'
dataframe = pd.read_csv(filename)

# Separate features and target variable
X = dataframe.drop('Status', axis=1)
Y = dataframe['Status']

# Impute missing values with the mean
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X = imputer.fit_transform(X)

# Encode the target variable (Status) using label encoding
le = LabelEncoder()
Y = le.fit_transform(Y)

# Display the mapping of labels to numerical values
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label Mapping:")
for label, value in label_mapping.items():
    print(f"Label: {label}, Numerical Value: {value}")

# Repeated Random Train-Test splits using RepeatedKFold
n_splits = 5  # Number of splits
n_repeats = 10  # Number of repeats
seed = 7

rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=seed)

log_losses = []

for train_index, test_index in rkf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    # Create a K-Nearest Neighbors (K-NN) classifier
    model = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto')

    # Train the model on the training data
    model.fit(X_train, Y_train)

    # Evaluate the log loss
    y_pred_proba = model.predict_proba(X_test)
    log_loss_value = log_loss(Y_test, y_pred_proba)
    log_losses.append(log_loss_value)

# Calculate and print the mean log loss
mean_log_loss = sum(log_losses) / len(log_losses)
print("Mean Logarithmic Loss: %.3f" % mean_log_loss)

# Generate the confusion matrix for the last split
confusion_matrix_knn = confusion_matrix(Y_test, model.predict(X_test))
print("Confusion Matrix:")
print(confusion_matrix_knn)

# Generate the classification report for the last split with zero_division parameter
classification_report_knn = classification_report(Y_test, model.predict(X_test), zero_division=1)
print("Classification Report:")
print(classification_report_knn)


Label Mapping:
Label: C, Numerical Value: 0
Label: CL, Numerical Value: 1
Label: D, Numerical Value: 2
Mean Logarithmic Loss: 3.497
Confusion Matrix:
[[36  0  8]
 [ 7  0  0]
 [15  0 17]]
Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.82      0.71        44
           1       1.00      0.00      0.00         7
           2       0.68      0.53      0.60        32

    accuracy                           0.64        83
   macro avg       0.77      0.45      0.43        83
weighted avg       0.68      0.64      0.60        83



### 5. Logistic Regression
- Sampling Technique - Train/Test Split (80:20)
- Classification Metrics - Accuracy

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import log_loss, confusion_matrix, classification_report

# Load the dataset
filename = './cirrhosis.csv'
dataframe = pd.read_csv(filename)

# Separate features and target variable
X = dataframe.drop('Status', axis=1)
Y = dataframe['Status']

# Impute missing values with the mean
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X = imputer.fit_transform(X)

# Encode the target variable (Status) using label encoding
le = LabelEncoder()
Y = le.fit_transform(Y)

# Display the mapping of labels to numerical values
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label Mapping:")
for label, value in label_mapping.items():
    print(f"Label: {label}, Numerical Value: {value}")

# Repeated Random Train-Test splits using RepeatedKFold
n_splits = 5  # Number of splits
n_repeats = 10  # Number of repeats
seed = 7

rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=seed)

log_losses = []

for train_index, test_index in rkf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    # Standardize the data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Create a Logistic Regression model
    model = LogisticRegression(max_iter=1000, solver='lbfgs', C=1.0)

    # Train the model on the training data
    model.fit(X_train, Y_train)

    # Evaluate the log loss
    y_pred_proba = model.predict_proba(X_test)
    log_loss_value = log_loss(Y_test, y_pred_proba)
    log_losses.append(log_loss_value)

# Calculate and print the mean log loss
mean_log_loss = sum(log_losses) / len(log_losses)
print("Mean Logarithmic Loss: %.3f" % mean_log_loss)

# Generate the confusion matrix for the last split
confusion_matrix_lr = confusion_matrix(Y_test, model.predict(X_test))
print("Confusion Matrix:")
print(confusion_matrix_lr)

# Generate the classification report for the last split
classification_report_lr = classification_report(Y_test, model.predict(X_test))
print("Classification Report:")
print(classification_report_lr)


Label Mapping:
Label: C, Numerical Value: 0
Label: CL, Numerical Value: 1
Label: D, Numerical Value: 2
Mean Logarithmic Loss: 0.563
Confusion Matrix:
[[41  0  3]
 [ 6  1  0]
 [10  1 21]]
Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.93      0.81        44
           1       0.50      0.14      0.22         7
           2       0.88      0.66      0.75        32

    accuracy                           0.76        83
   macro avg       0.70      0.58      0.59        83
weighted avg       0.76      0.76      0.74        83



### 6. Multi-Layer Perceptron (MLP)
- Sampling Technique - Train/Test Split (80:20)
- Classification Metrics - Accuracy


In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RepeatedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss, confusion_matrix, classification_report

# Load the dataset
filename = './cirrhosis.csv'
dataframe = pd.read_csv(filename)

# Separate features and target variable
X = dataframe.drop('Status', axis=1)
Y = dataframe['Status']

# Impute missing values with the mean
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X = imputer.fit_transform(X)

# Encode the target variable (Status) using label encoding
le = LabelEncoder()
Y = le.fit_transform(Y)

# Display the mapping of labels to numerical values
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label Mapping:")
for label, value in label_mapping.items():
    print(f"Label: {label}, Numerical Value: {value}")

# Repeated Random Train-Test splits using RepeatedKFold
n_splits = 5  # Number of splits
n_repeats = 10  # Number of repeats
seed = 7

rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=seed)

log_losses = []

for train_index, test_index in rkf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    # Create an MLP-based model
    model = MLPClassifier(hidden_layer_sizes=(65, 32), activation='relu', solver='adam', max_iter=200, random_state=seed)

    # Train the model
    model.fit(X_train, Y_train)

    # Evaluate the log loss
    y_pred_proba = model.predict_proba(X_test)
    log_loss_value = log_loss(Y_test, y_pred_proba)
    log_losses.append(log_loss_value)

# Calculate and print the mean log loss
mean_log_loss = sum(log_losses) / len(log_losses)
print("Mean Logarithmic Loss: %.3f" % mean_log_loss)

# Generate the confusion matrix for the last split
confusion_matrix_mlp = confusion_matrix(Y_test, model.predict(X_test))
print("Confusion Matrix:")
print(confusion_matrix_mlp)

# Generate the classification report for the last split
classification_report_mlp = classification_report(Y_test, model.predict(X_test))
print("Classification Report:")
print(classification_report_mlp)


Label Mapping:
Label: C, Numerical Value: 0
Label: CL, Numerical Value: 1
Label: D, Numerical Value: 2
Mean Logarithmic Loss: 7.086
Confusion Matrix:
[[10  5 29]
 [ 1  1  5]
 [ 3  1 28]]
Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.23      0.34        44
           1       0.14      0.14      0.14         7
           2       0.45      0.88      0.60        32

    accuracy                           0.47        83
   macro avg       0.44      0.42      0.36        83
weighted avg       0.56      0.47      0.42        83



### 7. Perceptron
- Sampling Technique - Train/Test Split (80:20)
- Classification Metrics - Accuracy

In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Perceptron
from sklearn.calibration import CalibratedClassifierCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss, confusion_matrix, classification_report
from sklearn.utils.validation import check_is_fitted

# Load the dataset
filename = './cirrhosis.csv'
dataframe = pd.read_csv(filename)

# Separate features and target variable
X = dataframe.drop('Status', axis=1)
Y = dataframe['Status']

# Impute missing values with the mean
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X = imputer.fit_transform(X)

# Encode the target variable (Status) using label encoding
le = LabelEncoder()
Y = le.fit_transform(Y)

# Display the mapping of labels to numerical values
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label Mapping:")
for label, value in label_mapping.items():
    print(f"Label: {label}, Numerical Value: {value}")

# Repeated Random Train-Test splits using RepeatedKFold
n_splits = 5  # Number of splits
n_repeats = 10  # Number of repeats
seed = 7

rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=seed)

log_losses = []

for train_index, test_index in rkf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    # Create a Perceptron classifier
    model = Perceptron(max_iter=200, random_state=seed, eta0=1.0, tol=1e-3)

    # Fit the Perceptron model
    model.fit(X_train, Y_train)

    # Calibrate the classifier to get probability estimates
    calibrated_model = CalibratedClassifierCV(model, method='sigmoid', cv='prefit')

    # Check if the model is fitted
    check_is_fitted(model, attributes=["coef_", "intercept_"])

    calibrated_model.fit(X_train, Y_train)

    # Evaluate the log loss
    y_pred_proba = calibrated_model.predict_proba(X_test)
    log_loss_value = log_loss(Y_test, y_pred_proba)
    log_losses.append(log_loss_value)

# Calculate and print the mean log loss
mean_log_loss = sum(log_losses) / len(log_losses)
print("Mean Logarithmic Loss: %.3f" % mean_log_loss)

# Generate the confusion matrix for the last split
confusion_matrix_perceptron = confusion_matrix(Y_test, calibrated_model.predict(X_test))
print("Confusion Matrix:")
print(confusion_matrix_perceptron)

# Generate the classification report for the last split
classification_report_perceptron = classification_report(Y_test, calibrated_model.predict(X_test), zero_division='warn')
print("Classification Report:")
print(classification_report_perceptron)



Label Mapping:
Label: C, Numerical Value: 0
Label: CL, Numerical Value: 1
Label: D, Numerical Value: 2
Mean Logarithmic Loss: 0.700
Confusion Matrix:
[[40  0  4]
 [ 3  0  4]
 [12  0 20]]
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.91      0.81        44
           1       0.00      0.00      0.00         7
           2       0.71      0.62      0.67        32

    accuracy                           0.72        83
   macro avg       0.48      0.51      0.49        83
weighted avg       0.66      0.72      0.69        83



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 8. Random Forest
- Sampling Technique - Train/Test Split (80:20)
- Classification Metrics - Accuracy

In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RepeatedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss

# Load the dataset
filename = './cirrhosis.csv'
dataframe = pd.read_csv(filename)

# Separate features and target variable
X = dataframe.drop('Status', axis=1)
Y = dataframe['Status']

# Impute missing values with the mean
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X = imputer.fit_transform(X)

# Encode the target variable (Status) using label encoding
le = LabelEncoder()
Y = le.fit_transform(Y)

# Set the test size
test_size = 0.20  # Hyperparameter: Fraction of the dataset to use for testing
n_splits = 5      # Number of splits for repeated random train-test splits
n_repeats = 3     # Number of times to repeat the cross-validation process

# Initialize RepeatedKFold
rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=seed)

# Create a Random Forest classifier
rfmodel = RandomForestClassifier(n_estimators=100, random_state=seed, max_depth=None, min_samples_split=2, min_samples_leaf=1)

# Initialize lists to store log loss values
log_loss_values = []

# Perform repeated random train-test splits
for train_index, test_index in rkf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    # Train the model
    rfmodel.fit(X_train, Y_train)

    # Predict probabilities for log loss calculation
    Y_probabilities = rfmodel.predict_proba(X_test)

    # Calculate log loss
    loss = log_loss(Y_test, Y_probabilities)
    log_loss_values.append(loss)

# Calculate mean log loss
mean_log_loss = np.mean(log_loss_values)
print("Mean Logarithmic Loss: %.5f" % mean_log_loss)


Mean Logarithmic Loss: 0.63732


### 9. Support Vector Machines (SVM)
- Sampling Technique - Train/Test Split (80:20)
- Classification Metrics - Accuracy

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RepeatedKFold
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import log_loss, classification_report
from sklearn.exceptions import ConvergenceWarning
import warnings

# Load the dataset
filename = './cirrhosis.csv'
dataframe = pd.read_csv(filename)

# Separate features and target variable
X = dataframe.drop('Status', axis=1)
Y = dataframe['Status']

# Impute missing values with the mean
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X = imputer.fit_transform(X)

# Encode the target variable (Status) using label encoding
le = LabelEncoder()
Y = le.fit_transform(Y)

# Set the test size
test_size = 0.20  # Hyperparameter: Fraction of the dataset to use for testing
n_splits = 5      # Number of splits for repeated random train-test splits
n_repeats = 3     # Number of times to repeat the cross-validation process
seed = 7

# Initialize RepeatedKFold
rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=seed)

# Create an SVM classifier
model = SVC(kernel='linear', C=0.1, probability=True, random_state=seed, max_iter=10000)  # Increase max_iter further

# Initialize lists to store log loss values and classification reports
log_loss_values = []
classification_reports = []

# Suppress ConvergenceWarnings for now
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Perform repeated random train-test splits
for train_index, test_index in rkf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    # Scale features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Train the model
    model.fit(X_train, Y_train)

    # Predict probabilities for log loss calculation
    Y_probabilities = model.predict_proba(X_test)

    # Calculate log loss
    loss = log_loss(Y_test, Y_probabilities)
    log_loss_values.append(loss)

    # Generate classification report
    report = classification_report(Y_test, model.predict(X_test), zero_division=1)  # Adjust zero_division as needed
    classification_reports.append(report)

# Reset warning filters
warnings.resetwarnings()

# Calculate mean log loss
mean_log_loss = np.mean(log_loss_values)
print("Mean Logarithmic Loss: %.5f" % mean_log_loss)

# Print the classification reports for each split
for idx, report in enumerate(classification_reports):
    print(f"\nClassification Report for Split {idx + 1}:\n{report}")


Mean Logarithmic Loss: 0.56745

Classification Report for Split 1:
              precision    recall  f1-score   support

           0       0.77      0.94      0.85        47
           1       0.00      0.00      1.00         2
           2       0.92      0.69      0.79        35

    accuracy                           0.81        84
   macro avg       0.57      0.54      0.88        84
weighted avg       0.82      0.81      0.83        84


Classification Report for Split 2:
              precision    recall  f1-score   support

           0       0.68      1.00      0.81        42
           1       1.00      0.00      0.00        12
           2       0.86      0.63      0.73        30

    accuracy                           0.73        84
   macro avg       0.85      0.54      0.51        84
weighted avg       0.79      0.73      0.66        84


Classification Report for Split 3:
              precision    recall  f1-score   support

           0       0.79      0.94      0.86 