### 1. CART (Classification and Regression Trees) - DecisionTree Classifier
- Sampling Technique - Train/Test Split (80:20)
- Classification Metrics - Accuracy

In [56]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
# Load the dataset
filename = './cirrhosis.csv'
dataframe = pd.read_csv(filename)

# Separate features and target variable
X = dataframe.drop('Status', axis=1)
Y = dataframe['Status']

# Encode the target variable (Stage) using label encoding
le = LabelEncoder()
Y = le.fit_transform(Y)

# Display the mapping of labels to numerical values
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label Mapping:")
for label, value in label_mapping.items():
    print(f"Label: {label}, Numerical Value: {value}")
    
# Split the dataset into training and testing sets
test_size = 0.20
random_seed = 50

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=random_seed)

# Initialize and train the Decision Tree Classifier with hyperparameters
max_depth = 5  # You can adjust this value
min_samples_split = 2  # You can adjust this value
min_samples_leaf = 1  # You can adjust this value

model = DecisionTreeClassifier(
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    random_state=random_seed
)

model.fit(X_train, Y_train)

# Evaluate the accuracy
accuracy = model.score(X_test, Y_test)
print("Accuracy: %.3f%%" % (accuracy * 100.0))

# Generate the confusion matrix
confusion_matrix_dt = confusion_matrix(Y_test, model.predict(X_test))
print("Confusion Matrix:")
print(confusion_matrix_dt)

# Generate the classification report
classification_report_dt = classification_report(Y_test, model.predict(X_test))
print("Classification Report:")
print(classification_report_dt)


Label Mapping:
Label: C, Numerical Value: 0
Label: CL, Numerical Value: 1
Label: D, Numerical Value: 2
Accuracy: 76.190%
Confusion Matrix:
[[41  2  2]
 [ 3  0  0]
 [11  2 23]]
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.91      0.82        45
           1       0.00      0.00      0.00         3
           2       0.92      0.64      0.75        36

    accuracy                           0.76        84
   macro avg       0.56      0.52      0.52        84
weighted avg       0.79      0.76      0.76        84



### 2. Gaussian Naive Bayes
- Sampling Technique - Train/Test Split (80:20)
- Classification Metrics - Accuracy

In [51]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report

# Load the dataset
filename = './cirrhosis.csv'
dataframe = pd.read_csv(filename)

# Separate features and target variable
X = dataframe.drop('Status', axis=1)
Y = dataframe['Status']

# Impute missing values with the mean
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(X)
X = imputer.transform(X)

# Encode the target variable (Status) using label encoding
le = LabelEncoder()
Y = le.fit_transform(Y)

# Display the mapping of labels to numerical values
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label Mapping:")
for label, value in label_mapping.items():
  print(f"Label: {label}, Numerical Value: {value}")

# Set the test size
test_size = 0.20  # Hyperparameter: Fraction of the dataset to use for testing
seed = 7

# Split the dataset into test and train
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

# Create a StandardScaler object to standardize the features
scaler = StandardScaler()

# Standardize the training data
X_train_scaled = scaler.fit_transform(X_train)

# Standardize the testing data
X_test_scaled = scaler.transform(X_test)

# Create a Gaussian Naive Bayes classifier
model = GaussianNB(priors=None, var_smoothing=1e-9)  # Hyperparameters: priors and var_smoothing

# Train the model on the standardized training data
model.fit(X_train_scaled, Y_train)

# Evaluate the accuracy
result = model.score(X_test_scaled, Y_test)
print("Accuracy: %.3f%%" % (result * 100.0))

# Generate the confusion matrix and classification report
confusion_matrix_nb = confusion_matrix(Y_test, model.predict(X_test))
print("Confusion Matrix:")
print(confusion_matrix_nb)
classification_report_nb = classification_report(Y_test, model.predict(X_test))
print("Classification Report:")
print(classification_report_nb)




Label Mapping:
Label: C, Numerical Value: 0
Label: CL, Numerical Value: 1
Label: D, Numerical Value: 2
Accuracy: 73.810%
Confusion Matrix:
[[46  0  1]
 [ 2  0  0]
 [33  0  2]]
Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.98      0.72        47
           1       0.00      0.00      0.00         2
           2       0.67      0.06      0.11        35

    accuracy                           0.57        84
   macro avg       0.41      0.35      0.27        84
weighted avg       0.60      0.57      0.45        84



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 3. Gradient Boosting Machines (AdaBoost)

In [43]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier

# Load the dataset
filename = './cirrhosis.csv'
dataframe = pd.read_csv(filename)

# Separate features and target variable
X = dataframe.drop('Status', axis=1)
Y = dataframe['Status']

# Impute missing values with the mean
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(X)
X = imputer.transform(X)

# Encode the target variable (Status) using label encoding
le = LabelEncoder()
Y = le.fit_transform(Y)

# Display the mapping of labels to numerical values
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label Mapping:")
for label, value in label_mapping.items():
    print(f"Label: {label}, Numerical Value: {value}")

# Set the test size
test_size = 0.20  # Hyperparameter: Fraction of the dataset to use for testing
seed = 7

# Split the dataset into test and train
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

# Create an AdaBoost classifier
model = AdaBoostClassifier(n_estimators=50, random_state=seed)
# Hyperparameters:
# - n_estimators: The number of weak classifiers (base estimators) to train. You can adjust this to control the complexity of the ensemble.
# - random_state: The random seed for reproducibility. You can set this to a specific value if you want consistent results.

# Train the model on the training data
model.fit(X_train, Y_train)

# Evaluate the accuracy
result = model.score(X_test, Y_test)
print("Accuracy: %.3f%%" % (result * 100.0))


from sklearn.metrics import confusion_matrix, classification_report
# Generate the confusion matrix and classification report
confusion_matrix_nb = confusion_matrix(Y_test, model.predict(X_test))
print("Confusion Matrix:")
print(confusion_matrix_nb)
classification_report_nb = classification_report(Y_test, model.predict(X_test))
print("Classification Report:")
print(classification_report_nb)

Label Mapping:
Label: C, Numerical Value: 0
Label: CL, Numerical Value: 1
Label: D, Numerical Value: 2
Accuracy: 83.333%
Confusion Matrix:
[[42  1  4]
 [ 2  0  0]
 [ 6  1 28]]
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.89      0.87        47
           1       0.00      0.00      0.00         2
           2       0.88      0.80      0.84        35

    accuracy                           0.83        84
   macro avg       0.57      0.56      0.57        84
weighted avg       0.83      0.83      0.83        84



### 4. K-Nearest Neighbors (K-NN)
- Sampling Technique - Train/Test Split (80:20)
- Classification Metrics - Accuracy

In [44]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Load the dataset
filename = './cirrhosis.csv'
dataframe = pd.read_csv(filename)

# Separate features and target variable
X = dataframe.drop('Status', axis=1)
Y = dataframe['Status']

# Impute missing values with the mean
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(X)
X = imputer.transform(X)

# Encode the target variable (Status) using label encoding
le = LabelEncoder()
Y = le.fit_transform(Y)

# Display the mapping of labels to numerical values
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label Mapping:")
for label, value in label_mapping.items():
    print(f"Label: {label}, Numerical Value: {value}")

# Set the test size
test_size = 0.20  # Hyperparameter: Fraction of the dataset to use for testing
seed = 7

# Split the dataset into test and train
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

# Create a K-Nearest Neighbors (K-NN) classifier
model = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto')
# Hyperparameters:
# - n_neighbors: The number of nearest neighbors to consider when making predictions. You can adjust this to control the model's sensitivity to local patterns.
# - weights: Determines how the neighbors' contributions are weighted (e.g., 'uniform' or 'distance'). You can choose the appropriate weighting strategy.
# - algorithm: The algorithm used to compute the nearest neighbors ('auto', 'ball_tree', 'kd_tree', or 'brute'). You can choose the most suitable algorithm based on your data size and structure.

# Train the model on the training data
model.fit(X_train, Y_train)

# Evaluate the accuracy
result = model.score(X_test, Y_test)
print("Accuracy: %.3f%%" % (result * 100.0))

from sklearn.metrics import confusion_matrix, classification_report
# Generate the confusion matrix and classification report
confusion_matrix_nb = confusion_matrix(Y_test, model.predict(X_test))
print("Confusion Matrix:")
print(confusion_matrix_nb)
classification_report_nb = classification_report(Y_test, model.predict(X_test))
print("Classification Report:")
print(classification_report_nb)

Label Mapping:
Label: C, Numerical Value: 0
Label: CL, Numerical Value: 1
Label: D, Numerical Value: 2
Accuracy: 71.429%
Confusion Matrix:
[[41  0  6]
 [ 2  0  0]
 [16  0 19]]
Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.87      0.77        47
           1       0.00      0.00      0.00         2
           2       0.76      0.54      0.63        35

    accuracy                           0.71        84
   macro avg       0.48      0.47      0.47        84
weighted avg       0.71      0.71      0.70        84



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 5. Logistic Regression
- Sampling Technique - Train/Test Split (80:20)
- Classification Metrics - Accuracy

In [45]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Load the dataset
filename = './cirrhosis.csv'
dataframe = pd.read_csv(filename)

# Separate features and target variable
X = dataframe.drop('Status', axis=1)
Y = dataframe['Status']

# Impute missing values with the mean
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(X)
X = imputer.transform(X)

# Encode the target variable (Status) using label encoding
le = LabelEncoder()
Y = le.fit_transform(Y)

# Display the mapping of labels to numerical values
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label Mapping:")
for label, value in label_mapping.items():
    print(f"Label: {label}, Numerical Value: {value}")
# Set the test size
test_size = 0.20  # Hyperparameter: Fraction of the dataset to use for testing
seed = 7

# Split the dataset into test and train
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

# Create a Logistic Regression model
model = LogisticRegression(max_iter=300, solver='lbfgs', C=1.0)
# Hyperparameters:
# - max_iter: The maximum number of iterations for the solver to converge. You can adjust this if the model does not converge.
# - solver: The algorithm to use for optimization ('lbfgs', 'liblinear', etc.). Choose an appropriate solver for your data and problem.
# - C: Inverse of regularization strength. Smaller values increase regularization. You can adjust this to control the trade-off between fitting the data and preventing overfitting.

# Train the model on the training data
model.fit(X_train, Y_train)

# Evaluate the accuracy
result = model.score(X_test, Y_test)
print("Accuracy: %.3f%%" % (result * 100.0))

from sklearn.metrics import confusion_matrix, classification_report
# Generate the confusion matrix and classification report
confusion_matrix_nb = confusion_matrix(Y_test, model.predict(X_test))
print("Confusion Matrix:")
print(confusion_matrix_nb)
classification_report_nb = classification_report(Y_test, model.predict(X_test))
print("Classification Report:")
print(classification_report_nb)

Label Mapping:
Label: C, Numerical Value: 0
Label: CL, Numerical Value: 1
Label: D, Numerical Value: 2
Accuracy: 78.571%
Confusion Matrix:
[[41  0  6]
 [ 2  0  0]
 [10  0 25]]
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.87      0.82        47
           1       0.00      0.00      0.00         2
           2       0.81      0.71      0.76        35

    accuracy                           0.79        84
   macro avg       0.53      0.53      0.53        84
weighted avg       0.77      0.79      0.77        84



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 6. Multi-Layer Perceptron (MLP)
- Sampling Technique - Train/Test Split (80:20)
- Classification Metrics - Accuracy


In [54]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

# Load the dataset
filename = './cirrhosis.csv'
dataframe = pd.read_csv(filename)

# Separate features and target variable
X = dataframe.drop('Status', axis=1)
Y = dataframe['Status']

# Impute missing values with the mean
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(X)
X = imputer.transform(X)

# Encode the target variable (Status) using label encoding
le = LabelEncoder()
Y = le.fit_transform(Y)

# Display the mapping of labels to numerical values
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label Mapping:")
for label, value in label_mapping.items():
    print(f"Label: {label}, Numerical Value: {value}")

# Set the test size
test_size = 0.20  # Hyperparameter: Fraction of the dataset to use for testing
seed = 7

# Split the dataset into test and train
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

# Create an MLP-based model
model = MLPClassifier(hidden_layer_sizes=(65, 32), activation='relu', solver='adam', max_iter=200, random_state=seed)
# Hyperparameters:
# - hidden_layer_sizes: The number of neurons in each hidden layer. You can customize the architecture by adjusting this parameter.
# - activation: The activation function used in the hidden layers ('relu', 'tanh', etc.). Choose the appropriate one for your problem.
# - solver: The algorithm for weight optimization ('adam', 'lbfgs', etc.). Select the one that works best for your data.
# - max_iter: The maximum number of iterations for the solver to converge. You can adjust this if the model does not converge.
# - random_state: The random seed for reproducibility. Set this to a specific value for consistent results.

# Train the model
model.fit(X_train, Y_train)

# Evaluate the accuracy
result = model.score(X_test, Y_test)
print("Accuracy: %.3f%%" % (result * 100.0))

from sklearn.metrics import confusion_matrix, classification_report
# Generate the confusion matrix and classification report
confusion_matrix_nb = confusion_matrix(Y_test, model.predict(X_test))
print("Confusion Matrix:")
print(confusion_matrix_nb)
classification_report_nb = classification_report(Y_test, model.predict(X_test))
print("Classification Report:")
print(classification_report_nb)


Label Mapping:
Label: C, Numerical Value: 0
Label: CL, Numerical Value: 1
Label: D, Numerical Value: 2
Accuracy: 73.810%
Confusion Matrix:
[[41  0  6]
 [ 2  0  0]
 [14  0 21]]
Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.87      0.79        47
           1       0.00      0.00      0.00         2
           2       0.78      0.60      0.68        35

    accuracy                           0.74        84
   macro avg       0.50      0.49      0.49        84
weighted avg       0.73      0.74      0.72        84



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 7. Perceptron
- Sampling Technique - Train/Test Split (80:20)
- Classification Metrics - Accuracy

In [52]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron

# Load the dataset
filename = './cirrhosis.csv'
dataframe = pd.read_csv(filename)

# Separate features and target variable
X = dataframe.drop('Status', axis=1)
Y = dataframe['Status']

# Impute missing values with the mean
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(X)
X = imputer.transform(X)

# Encode the target variable (Status) using label encoding
le = LabelEncoder()
Y = le.fit_transform(Y)

# Display the mapping of labels to numerical values
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label Mapping:")
for label, value in label_mapping.items():
    print(f"Label: {label}, Numerical Value: {value}")

# Set the test size
test_size = 0.20  # Hyperparameter: Fraction of the dataset to use for testing
seed = 7

# Split the dataset into test and train
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

# Create a Perceptron classifier
model = Perceptron(max_iter=200, random_state=seed, eta0=1.0, tol=1e-3)
# Hyperparameters:
# - max_iter: The maximum number of iterations for the solver to converge. You can adjust this if the model does not converge.
# - random_state: The random seed for reproducibility. Set this to a specific value for consistent results.
# - eta0: The initial learning rate. You can control the step size for weight updates by adjusting this.
# - tol: The tolerance for stopping criterion. The model will stop training when the change in the average loss is smaller than this value.

# Train the model
model.fit(X_train, Y_train)

# Evaluate the accuracy
result = model.score(X_test, Y_test)
print("Accuracy: %.3f%%" % (result * 100.0))

from sklearn.metrics import confusion_matrix, classification_report
# Generate the confusion matrix and classification report
confusion_matrix_nb = confusion_matrix(Y_test, model.predict(X_test))
print("Confusion Matrix:")
print(confusion_matrix_nb)
classification_report_nb = classification_report(Y_test, model.predict(X_test))
print("Classification Report:")
print(classification_report_nb)

Label Mapping:
Label: C, Numerical Value: 0
Label: CL, Numerical Value: 1
Label: D, Numerical Value: 2
Accuracy: 44.048%
Confusion Matrix:
[[ 3  0 44]
 [ 0  0  2]
 [ 1  0 34]]
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.06      0.12        47
           1       0.00      0.00      0.00         2
           2       0.42      0.97      0.59        35

    accuracy                           0.44        84
   macro avg       0.39      0.35      0.24        84
weighted avg       0.60      0.44      0.31        84



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 8. Random Forest
- Sampling Technique - Train/Test Split (80:20)
- Classification Metrics - Accuracy

In [48]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Load the dataset
filename = './cirrhosis.csv'
dataframe = pd.read_csv(filename)

# Separate features and target variable
X = dataframe.drop('Status', axis=1)
Y = dataframe['Status']

# Impute missing values with the mean
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(X)
X = imputer.transform(X)

# Encode the target variable (Status) using label encoding
le = LabelEncoder()
Y = le.fit_transform(Y)

# Display the mapping of labels to numerical values
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label Mapping:")
for label, value in label_mapping.items():
    print(f"Label: {label}, Numerical Value: {value}")

# Set the test size
test_size = 0.20  # Hyperparameter: Fraction of the dataset to use for testing
seed = 7

# Split the dataset into test and train
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

# Create a Random Forest classifier
rfmodel = RandomForestClassifier(n_estimators=100, random_state=seed, max_depth=None, min_samples_split=2, min_samples_leaf=1)
# Hyperparameters:
# - n_estimators: The number of decision trees in the random forest. Adjust this to control the ensemble size.
# - random_state: The random seed for reproducibility. Set this to a specific value for consistent results.
# - max_depth: The maximum depth of the decision trees. You can limit tree depth to prevent overfitting.
# - min_samples_split: The minimum number of samples required to split a node. Adjust this to control tree node splitting.
# - min_samples_leaf: The minimum number of samples required in a leaf node. You can adjust this to control tree leaf size.

# Train the model
rfmodel.fit(X_train, Y_train)

# Evaluate the accuracy
result = rfmodel.score(X_test, Y_test)
print("Accuracy: %.3f%%" % (result * 100.0))

from sklearn.metrics import confusion_matrix, classification_report
# Generate the confusion matrix and classification report
confusion_matrix_nb = confusion_matrix(Y_test, model.predict(X_test))
print("Confusion Matrix:")
print(confusion_matrix_nb)
classification_report_nb = classification_report(Y_test, model.predict(X_test))
print("Classification Report:")
print(classification_report_nb)

Label Mapping:
Label: C, Numerical Value: 0
Label: CL, Numerical Value: 1
Label: D, Numerical Value: 2
Accuracy: 78.571%
Confusion Matrix:
[[ 3  0 44]
 [ 0  0  2]
 [ 1  0 34]]
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.06      0.12        47
           1       0.00      0.00      0.00         2
           2       0.42      0.97      0.59        35

    accuracy                           0.44        84
   macro avg       0.39      0.35      0.24        84
weighted avg       0.60      0.44      0.31        84



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 9. Support Vector Machines (SVM)
- Sampling Technique - Train/Test Split (80:20)
- Classification Metrics - Accuracy

In [49]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Load the dataset
filename = './cirrhosis.csv'
dataframe = pd.read_csv(filename)

# Separate features and target variable
X = dataframe.drop('Status', axis=1)
Y = dataframe['Status']

# Impute missing values with the mean
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(X)
X = imputer.transform(X)

# Encode the target variable (Status) using label encoding
le = LabelEncoder()
Y = le.fit_transform(Y)

# Display the mapping of labels to numerical values
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label Mapping:")
for label, value in label_mapping.items():
    print(f"Label: {label}, Numerical Value: {value}")

# Set the test size
test_size = 0.20  # Hyperparameter: Fraction of the dataset to use for testing
seed = 7

# Split the dataset into test and train
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

# Create an SVM classifier
model = SVC(kernel='linear', C=1.0, random_state=seed)
# Hyperparameters:
# - kernel: The type of kernel to use ('linear', 'poly', 'rbf', etc.). Choose the appropriate kernel for your problem.
# - C: The regularization parameter. Smaller values increase regularization. You can adjust this to control the trade-off between fitting the data and preventing overfitting.
# - random_state: The random seed for reproducibility. Set this to a specific value for consistent results.

# Train the model
model.fit(X_train, Y_train)

# Evaluate the accuracy
result = model.score(X_test, Y_test)
print("Accuracy: %.3f%%" % (result * 100.0))

from sklearn.metrics import confusion_matrix, classification_report
# Generate the confusion matrix and classification report
confusion_matrix_nb = confusion_matrix(Y_test, model.predict(X_test))
print("Confusion Matrix:")
print(confusion_matrix_nb)
classification_report_nb = classification_report(Y_test, model.predict(X_test))
print("Classification Report:")
print(classification_report_nb)

Label Mapping:
Label: C, Numerical Value: 0
Label: CL, Numerical Value: 1
Label: D, Numerical Value: 2
Accuracy: 78.571%
Confusion Matrix:
[[41  1  5]
 [ 1  1  0]
 [10  1 24]]
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.87      0.83        47
           1       0.33      0.50      0.40         2
           2       0.83      0.69      0.75        35

    accuracy                           0.79        84
   macro avg       0.65      0.69      0.66        84
weighted avg       0.79      0.79      0.79        84

