# Data Overview

In [None]:
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import precision_score

In [None]:
# Load the Parkinson's disease dataset from the Kaggle input directory into `df`
df = pd.read_csv("/kaggle/input/parkinsons-disease-dataset-analysis/parkinsons_disease_data.csv")

In [None]:
# Statistical summary (count, mean, std, min, quartiles, max) of the numerical features
df.describe()

In [None]:
# Check for the presence of missing values: number of nulls per column
df.isnull().sum()

In [None]:
# Column names, dtypes and non-null counts
df.info()

In [None]:
# NOTE(review): same call as the earlier describe() cell; likely redundant
df.describe()

# Data Preprocessing

In [None]:
# Number of rows that are exact duplicates of an earlier row
df.duplicated().sum()

In [None]:
# Do not truncate columns when displaying wide DataFrames
pd.set_option('display.max_columns', None)

In [None]:
# Remove exact duplicate rows, keeping the first occurrence of each
df = df.drop_duplicates()

In [None]:
# Drop the identifier column; it carries no predictive information.
# errors="ignore" keeps this cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
df = df.drop(columns=["PatientID"], errors="ignore")

# Exploratory Data Analysis (EDA)

In [None]:
import matplotlib.pyplot as plt

# Features (plus the target) whose pairwise correlations are visualised
columns_of_interest = ['Age', 'AlcoholConsumption', 'PhysicalActivity', 'DietQuality', 'Diagnosis']
df_subset = df.loc[:, columns_of_interest]

# Pairwise correlation matrix of the selected columns
corr_matrix = df_subset.corr()

# Annotated heatmap with the colour scale pinned to the full [-1, 1] range
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation Heatmap of Selected Features')
plt.show()

In [None]:
# Peek at the first row of the DataFrame
df.head(1)

In [None]:
# Discrete columns (including the target) summarised in this section
discrete_columns = [
    'Gender', 'Ethnicity', 'EducationLevel', 'Smoking', 'FamilyHistoryParkinsons',
    'TraumaticBrainInjury', 'Hypertension', 'Diabetes', 'Depression', 'Stroke',
    'Tremor', 'Rigidity', 'Bradykinesia', 'PosturalInstability', 'SpeechProblems',
    'SleepDisorders', 'Constipation', 'Diagnosis'
]

# Colour cycle used to give each bar within a subplot its own colour
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'orange', 'purple', 'brown', 'pink', 'gray', 'olive', 'cyan']

# One subplot per discrete column, stacked vertically
fig, axs = plt.subplots(nrows=len(discrete_columns), figsize=(10, 40))

for ax, col in zip(axs, discrete_columns):
    # Share of each category, expressed as a percentage of all rows
    percentages = df[col].value_counts(normalize=True).mul(100)
    palette = [colors[k % len(colors)] for k in range(len(percentages))]
    ax.bar(percentages.index.astype(str), percentages.values, color=palette)
    ax.set(title=f'{col} Distribution', xlabel=col, ylabel='Percentage')

plt.tight_layout()
plt.show()

In [None]:
# Pie chart of the category counts for every discrete column, one subplot each
fig, axs = plt.subplots(nrows=len(discrete_columns), figsize=(10, 40))

for ax, col in zip(axs, discrete_columns):
    category_counts = df[col].value_counts()
    ax.pie(category_counts, labels=category_counts.index.astype(str), autopct='%1.1f%%')
    ax.set_title(f'{col} Distribution')

plt.tight_layout()
plt.show()

# Understand The Outliers 

In [None]:
# Continuous features inspected for outliers
columns = [
    'Age', 'BMI', 'AlcoholConsumption', 'PhysicalActivity', 'DietQuality',
    'SleepQuality', 'SystolicBP', 'DiastolicBP', 'CholesterolTotal',
    'CholesterolLDL', 'CholesterolHDL', 'CholesterolTriglycerides',
    'UPDRS', 'MoCA', 'FunctionalAssessment',
]

# Independent copy of the continuous features together with the target column
subset_df = df[[*columns, 'Diagnosis']].copy()

# One boxplot per feature, split by Diagnosis, stacked vertically
fig, axs = plt.subplots(nrows=len(columns), figsize=(10, 60))

for ax, col in zip(axs, columns):
    sns.boxplot(data=subset_df, x='Diagnosis', y=col, ax=ax)
    ax.set(title=f'{col} by Diagnosis', xlabel='Diagnosis', ylabel=col)

plt.tight_layout()
plt.show()

In [None]:
# Independent copy of the continuous features together with the target column
subset_df = df[columns + ['Diagnosis']].copy()

# Pair plot of the selected columns, coloured by Diagnosis.
# sns.pairplot is a figure-level function that creates its own figure (sized
# through `height`), so no plt.figure() call is made beforehand; one would
# only produce an extra blank figure.
sns.pairplot(subset_df, hue='Diagnosis', height=6, plot_kws={'s': 50})
plt.show()

In [None]:
# Histogram (10 bins) of every continuous feature, one subplot each
fig, axs = plt.subplots(nrows=len(columns), figsize=(40, 80))

for ax, col in zip(axs, columns):
    ax.hist(subset_df[col], bins=10, alpha=0.75)
    ax.set(title=f'{col} Histogram', xlabel=col, ylabel='Frequency')

plt.tight_layout()
plt.show()

In [None]:
import scipy.stats as stats

# For every continuous feature: histogram with KDE on the left,
# normal Q-Q plot on the right
for col in columns:
    fig, (hist_ax, qq_ax) = plt.subplots(1, 2, figsize=(12, 6))

    sns.histplot(df[col], kde=True, ax=hist_ax)
    hist_ax.set_title(f'{col} Histogram')

    # probplot draws onto the given Axes; the title is set afterwards so it
    # replaces the default one
    stats.probplot(df[col], dist="norm", plot=qq_ax)
    qq_ax.set_title(f'{col} Q-Q Plot')

    fig.tight_layout()
    plt.show()

In [None]:
# Numeric view of the subset; values that cannot be parsed become NaN
dfs = subset_df.apply(pd.to_numeric, errors='coerce')

# Skewness coefficients for each numeric column
skewness_results = {}

for col in dfs.select_dtypes(include=[np.number]).columns:
    # Drop NaNs so a single missing value does not turn both statistics into NaN
    values = dfs[col].dropna()
    spread = values.std()
    if not spread > 0:
        # Constant or empty column (std is 0 or NaN): skewness is undefined
        continue

    # Pearson's second (median) skewness coefficient: 3 * (mean - median) / std
    median_skewness = 3 * (values.mean() - values.median()) / spread
    # Fisher-Pearson coefficient of skewness (third standardized moment)
    fisher_skewness = stats.skew(values)
    skewness_results[col] = {'Median Skewness': median_skewness, 'Fisher Skewness': fisher_skewness}

# Print results
for col, results in skewness_results.items():
    print(f"Column: {col}")
    print(f"Pearson's Second Skewness Coefficient (Median Skewness): {results['Median Skewness']:.4f}")
    print(f"Fisher's Skewness Coefficient: {results['Fisher Skewness']:.4f}")
    print()

In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import shapiro, levene,kruskal

# Continuous and discrete feature groups (same contents as the lists defined
# in the earlier EDA cells)
columns = [
    'Age', 'BMI', 'AlcoholConsumption', 'PhysicalActivity', 'DietQuality',
    'SleepQuality', 'SystolicBP', 'DiastolicBP', 'CholesterolTotal',
    'CholesterolLDL', 'CholesterolHDL', 'CholesterolTriglycerides',
    'UPDRS', 'MoCA', 'FunctionalAssessment',
]

discrete_columns = [
    'Gender', 'Ethnicity', 'EducationLevel', 'Smoking', 'FamilyHistoryParkinsons',
    'TraumaticBrainInjury', 'Hypertension', 'Diabetes', 'Depression', 'Stroke',
    'Tremor', 'Rigidity', 'Bradykinesia', 'PosturalInstability', 'SpeechProblems',
    'SleepDisorders', 'Constipation', 'Diagnosis',
]

# One-way ANOVA: Age explained by Diagnosis
formula = 'Age ~ C(Diagnosis)'
model = smf.ols(formula=formula, data=df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print("One-Way ANOVA Results:")
print(anova_table)

# Two-way ANOVA: Diagnosis, Gender and their interaction
formula = 'Age ~ C(Diagnosis) * C(Gender)'
model = smf.ols(formula=formula, data=df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print(anova_table)

# Residuals of the two-way model; `model` and `residuals` are reused by the
# assumption checks in the following cell
residuals = model.resid

# Randomness: residuals plotted in data order
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(residuals)
ax.set(title='Residuals vs. Order of Data', xlabel='Order', ylabel='Residuals')
plt.show()

# Normality: histogram of the residuals with a KDE overlay
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(residuals, kde=True, ax=ax)
ax.set_title('Histogram of Residuals')
plt.show()

# Normality: Q-Q plot against a standardized reference line
qq_fig = sm.qqplot(residuals, line='s')
qq_fig.axes[0].set_title('QQ Plot of Residuals')
plt.show()

In [None]:
# Normality of the residuals: Shapiro-Wilk test
shapiro_stat, shapiro_p = shapiro(residuals)
print(f'Shapiro-Wilk Test: W={shapiro_stat}, p-value={shapiro_p}')

# Homoscedasticity: residuals against fitted values, with a zero reference line
fitted_values = model.fittedvalues
fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(fitted_values, residuals)
ax.axhline(0, color='red', linestyle='--')
ax.set(title='Residuals vs. Fitted Values', xlabel='Fitted Values', ylabel='Residuals')
plt.show()

# Homogeneity of variances: Levene's test on the residuals of each Diagnosis group
group1 = residuals.loc[df['Diagnosis'].eq(0)]
group2 = residuals.loc[df['Diagnosis'].eq(1)]
levene_stat, levene_p = levene(group1, group2)
print(f'Levene’s Test: W={levene_stat}, p-value={levene_p}')

In [None]:
# Kruskal-Wallis test (independent groups): Age compared across the two Diagnosis groups
age_by_diagnosis = [df.loc[df['Diagnosis'] == label, 'Age'] for label in (0, 1)]
kruskal_test = kruskal(*age_by_diagnosis)
print(f'Kruskal-Wallis Test: H={kruskal_test.statistic}, p-value={kruskal_test.pvalue}')

# MODELING

# RANDOM FOREST

In [None]:
# Import necessary libraries for preprocessing
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, log_loss, precision_recall_curve, f1_score, recall_score
import seaborn as sns


# Define nominal, ordinal, and continuous columns.
# Each group is routed to its own preprocessing pipeline by the ColumnTransformer below.

# Nominal features: imputed with the most frequent value, then one-hot encoded
nominal_columns = [
    'Gender', 'Smoking', 'FamilyHistoryParkinsons',
    'TraumaticBrainInjury', 'Hypertension', 'Diabetes', 'Depression', 'Stroke',
    'Tremor', 'Rigidity', 'Bradykinesia', 'PosturalInstability', 'SpeechProblems',
    'SleepDisorders', 'Constipation'
]

# Ordinal features: imputed with the most frequent value, then ordinal-encoded.
# NOTE(review): 'Ethnicity' is presumably an unordered category; confirm whether
# it should be listed in nominal_columns instead.
ordinal_columns = [
    'Ethnicity', 'EducationLevel'
]

# Continuous features: median-imputed, then standardized
continuous_columns = [
    'Age', 'BMI', 'AlcoholConsumption', 'PhysicalActivity', 'DietQuality',
    'SleepQuality', 'SystolicBP', 'DiastolicBP', 'CholesterolTotal', 
    'CholesterolLDL', 'CholesterolHDL', 'CholesterolTriglycerides', 
    'UPDRS', 'MoCA', 'FunctionalAssessment'
]

# Define the target column
target_column = 'Diagnosis'

# Preprocessing pipeline for continuous features
continuous_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Preprocessing pipeline for nominal features
# handle_unknown='ignore': categories unseen during fit do not raise at transform time
nominal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocessing pipeline for ordinal features
ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder())
])

# Combine preprocessing pipelines.
# This single `preprocessor` object is referenced by every model pipeline
# built in the training functions later in the notebook.
preprocessor = ColumnTransformer(
    transformers=[
        ('cont', continuous_transformer, continuous_columns),
        ('nom', nominal_transformer, nominal_columns),
        ('ord', ordinal_transformer, ordinal_columns)
    ])

# Define the train_and_evaluate_random_forest function with preprocessing
def train_and_evaluate_random_forest(df, continuous_columns, nominal_columns, ordinal_columns, target_column):
    """Tune, fit and evaluate a Random Forest classifier.

    The data is split 80/20 (random_state=42), a grid search with 5-fold
    cross-validation selects the hyper-parameters on the training part, and
    the refitted best pipeline is evaluated on the held-out test part.
    The confusion matrix, the precision-recall curve (binary targets only)
    and the feature importances are plotted.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the feature columns and the target column.
    continuous_columns, nominal_columns, ordinal_columns : list of str
        Feature column names; their concatenation forms the feature matrix.
    target_column : str
        Name of the label column.

    Returns
    -------
    dict
        Keys 'test_accuracy', 'loss' (log loss), 'precision' and 'f1_score'.
        'precision' and 'f1_score' are None when the target is not binary.
    """
    # Features (X) and target labels (y)
    X = df[continuous_columns + nominal_columns + ordinal_columns]
    y = df[target_column]

    # Hold out 20% of the rows for the final evaluation
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Seeded so the search result and the reported metrics are reproducible
    clf = RandomForestClassifier(random_state=42)

    # Preprocessing + classifier; GridSearchCV clones this pipeline for every fit
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', clf)])

    # Hyper-parameter grid, addressed through the pipeline step name
    param_grid = {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [None, 10, 20],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4]
    }

    grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid,
                               cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    print("Best parameters for RandomForest:", best_params)

    # With the default refit=True the best pipeline has already been refitted
    # on the whole training set, so it is used directly instead of training again
    best_model = grid_search.best_estimator_

    # Evaluate the classifier's performance on the testing data
    test_accuracy = best_model.score(X_test, y_test)
    print(f"Accuracy for {target_column} label using RandomForest classifier (Test Accuracy):", test_accuracy)

    # Confusion matrix
    y_pred = best_model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=best_model.classes_, yticklabels=best_model.classes_)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.show()

    # Log loss from the predicted class probabilities
    y_pred_proba = best_model.predict_proba(X_test)
    loss = log_loss(y_test, y_pred_proba)
    print(f"Log Loss for {target_column} label using RandomForest classifier:", loss)

    # Scalar precision / F1 and the precision-recall curve (binary targets only)
    precision = None
    f1 = None
    if len(best_model.classes_) == 2:
        pr_precision, pr_recall, _ = precision_recall_curve(y_test, y_pred_proba[:, 1])
        plot_precision_recall_curve(pr_precision, pr_recall)
        precision = precision_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        print(f"Precision for {target_column} label using RandomForest classifier:", precision)
        print(f"F1 Score for {target_column} label using RandomForest classifier:", f1)

    # Feature importances, labelled with the names produced by the fitted preprocessor
    fitted_classifier = best_model.named_steps['classifier']
    if hasattr(fitted_classifier, 'feature_importances_'):
        feature_names = best_model.named_steps['preprocessor'].get_feature_names_out()
        importance_df = pd.DataFrame({'Feature': feature_names,
                                      'Importance': fitted_classifier.feature_importances_})
        importance_df = importance_df.sort_values(by='Importance', ascending=False)

        plt.figure(figsize=(10, 6))
        sns.barplot(x='Importance', y='Feature', data=importance_df)
        plt.xlabel('Importance')
        plt.ylabel('Feature')
        plt.title(f'Feature Importance for {target_column} label using RandomForest')
        plt.show()

    # Same result keys as the SVM and XGBoost functions
    results = {
        'test_accuracy': test_accuracy,
        'loss': loss,
        'precision': precision,
        'f1_score': f1
    }

    return results

# Cell 2: Define plot_precision_recall_curve function
def plot_precision_recall_curve(precision, recall):
    """Draw a precision-recall curve: recall on the x-axis, precision on the y-axis."""
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(recall, precision, marker='o', color='b')
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_title('Precision-Recall Curve')
    ax.grid(True)
    plt.show()

# Cell 3: Run the Random Forest workflow on `df`

# Store the returned metrics dictionary in results_random_forest
results_random_forest = train_and_evaluate_random_forest(df, continuous_columns, nominal_columns, ordinal_columns, target_column)

# Display or use results_random_forest as needed


# DECISION TREE

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, f1_score, precision_score
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

def train_and_evaluate_decision_tree(df, continuous_columns, nominal_columns, ordinal_columns, target_column):
    """Tune, fit and evaluate a Decision Tree classifier.

    The data is split 80/20 (random_state=42), a grid search with 5-fold
    cross-validation selects the hyper-parameters on the training part, and
    the refitted best pipeline is evaluated on the held-out test part.
    The confusion matrix and the feature importances are plotted.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the feature columns and the target column.
    continuous_columns, nominal_columns, ordinal_columns : list of str
        Feature column names; their concatenation forms the feature matrix.
    target_column : str
        Name of the label column.

    Returns
    -------
    dict
        Keys 'test_accuracy', 'loss' (log loss), 'precision' and 'f1_score'.
    """
    # Imported locally because this cell's own import lines do not include it
    from sklearn.metrics import log_loss

    # Features (X) and target labels (y)
    X = df[continuous_columns + nominal_columns + ordinal_columns]
    y = df[target_column]

    # Hold out 20% of the rows for the final evaluation
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Seeded so the search result and the reported metrics are reproducible
    clf = DecisionTreeClassifier(random_state=42)

    # Preprocessing + classifier; GridSearchCV clones this pipeline for every fit
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', clf)])

    # Hyper-parameter grid, addressed through the pipeline step name
    param_grid = {
        'classifier__max_depth': [None, 10, 20, 30],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4]
    }

    grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid,
                               cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    print("Best parameters for Decision Tree:", best_params)

    # With the default refit=True the best pipeline has already been refitted
    # on the whole training set, so it is used directly instead of training again
    best_model = grid_search.best_estimator_

    # Evaluate the classifier's performance on the testing data
    test_accuracy = best_model.score(X_test, y_test)
    print(f"Accuracy for {target_column} label using Decision Tree classifier (Test Accuracy):", test_accuracy)

    # Confusion matrix
    y_pred = best_model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=best_model.classes_, yticklabels=best_model.classes_)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.show()

    # DecisionTreeClassifier does implement predict_proba, so log loss can be
    # computed the same way as for the other models
    y_pred_proba = best_model.predict_proba(X_test)
    loss = log_loss(y_test, y_pred_proba)
    print(f"Log Loss for {target_column} label using Decision Tree classifier:", loss)

    # Precision and F1 score (default binary averaging)
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print(f"Precision for {target_column} label using Decision Tree classifier:", precision)
    print(f"F1 Score for {target_column} label using Decision Tree classifier:", f1)

    # Feature importances, labelled with the names produced by the fitted preprocessor
    fitted_classifier = best_model.named_steps['classifier']
    if hasattr(fitted_classifier, 'feature_importances_'):
        feature_names = best_model.named_steps['preprocessor'].get_feature_names_out()
        importance_df = pd.DataFrame({'Feature': feature_names,
                                      'Importance': fitted_classifier.feature_importances_})
        importance_df = importance_df.sort_values(by='Importance', ascending=False)

        plt.figure(figsize=(10, 6))
        sns.barplot(x='Importance', y='Feature', data=importance_df)
        plt.xlabel('Importance')
        plt.ylabel('Feature')
        plt.title(f'Feature Importance for {target_column} label using Decision Tree')
        plt.show()

    # 'loss' is an added key; the previously returned keys are unchanged
    results = {
        'test_accuracy': test_accuracy,
        'loss': loss,
        'precision': precision,
        'f1_score': f1
    }

    return results

# Run the Decision Tree workflow on `df` and store the returned metrics dictionary
results_decision_tree = train_and_evaluate_decision_tree(df, continuous_columns, nominal_columns, ordinal_columns, target_column)

# Display or use results_decision_tree as needed


# SVM

In [None]:
from sklearn.svm import SVC

def train_and_evaluate_svm(df, continuous_columns, nominal_columns, ordinal_columns, target_column):
    """Tune, fit and evaluate a support vector classifier.

    The data is split 80/20 (random_state=42), a grid search with 5-fold
    cross-validation selects the hyper-parameters on the training part, and
    the refitted best pipeline is evaluated on the held-out test part.
    The confusion matrix and the precision-recall curve (binary targets
    only) are plotted.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the feature columns and the target column.
    continuous_columns, nominal_columns, ordinal_columns : list of str
        Feature column names; their concatenation forms the feature matrix.
    target_column : str
        Name of the label column.

    Returns
    -------
    dict
        Keys 'test_accuracy', 'loss' (log loss), 'precision' and 'f1_score'.
        'precision' and 'f1_score' are scalars for a binary target and None
        otherwise.
    """
    # Features (X) and target labels (y)
    X = df[continuous_columns + nominal_columns + ordinal_columns]
    y = df[target_column]

    # Hold out 20% of the rows for the final evaluation
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # probability=True enables predict_proba (needed for log loss and the PR
    # curve); seeded because the probability calibration shuffles the data
    clf = SVC(probability=True, random_state=42)

    # Preprocessing + classifier; GridSearchCV clones this pipeline for every fit
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', clf)])

    # gamma has no effect on the linear kernel, so it is only searched for rbf;
    # this avoids fitting identical linear models several times
    c_values = [0.1, 1, 10, 100]
    param_grid = [
        {'classifier__kernel': ['rbf'],
         'classifier__C': c_values,
         'classifier__gamma': [1, 0.1, 0.01, 0.001]},
        {'classifier__kernel': ['linear'],
         'classifier__C': c_values},
    ]

    grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid,
                               cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    print("Best parameters for SVM:", best_params)

    # With the default refit=True the best pipeline has already been refitted
    # on the whole training set, so it is used directly instead of training again
    best_model = grid_search.best_estimator_

    # Evaluate the classifier's performance on the testing data
    test_accuracy = best_model.score(X_test, y_test)
    print(f"Accuracy for {target_column} label using SVM classifier (Test Accuracy):", test_accuracy)

    # Confusion matrix
    y_pred = best_model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=best_model.classes_, yticklabels=best_model.classes_)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.show()

    # Log loss from the predicted class probabilities
    y_pred_proba = best_model.predict_proba(X_test)
    loss = log_loss(y_test, y_pred_proba)
    print(f"Log Loss for {target_column} label using SVM classifier:", loss)

    # Defaults keep the results dict well-defined for non-binary targets
    precision = None
    f1 = None
    if len(best_model.classes_) == 2:
        # Curve arrays are used for plotting only; the reported precision is
        # the scalar score of the hard predictions
        pr_precision, pr_recall, _ = precision_recall_curve(y_test, y_pred_proba[:, 1])
        plot_precision_recall_curve(pr_precision, pr_recall)
        precision = precision_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        print(f"Precision for {target_column} label using SVM classifier:", precision)
        print(f"F1 Score for {target_column} label using SVM classifier:", f1)

    results = {
        'test_accuracy': test_accuracy,
        'loss': loss,
        'precision': precision,
        'f1_score': f1
    }

    return results

# Run the SVM workflow on `df` and store the returned metrics dictionary
results_svm = train_and_evaluate_svm(df, continuous_columns, nominal_columns, ordinal_columns, target_column)

# Display or use results_svm as needed


# XGBoost

In [None]:
from xgboost import XGBClassifier

def train_and_evaluate_xgboost(df, continuous_columns, nominal_columns, ordinal_columns, target_column):
    """Tune, fit and evaluate an XGBoost classifier.

    The data is split 80/20 (random_state=42), a grid search with 5-fold
    cross-validation selects the hyper-parameters on the training part, and
    the refitted best pipeline is evaluated on the held-out test part.
    The confusion matrix, the precision-recall curve (binary targets only)
    and the feature importances are plotted.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the feature columns and the target column.
    continuous_columns, nominal_columns, ordinal_columns : list of str
        Feature column names; their concatenation forms the feature matrix.
    target_column : str
        Name of the label column.

    Returns
    -------
    dict
        Keys 'test_accuracy', 'loss' (log loss), 'precision' and 'f1_score'.
        'precision' and 'f1_score' are scalars for a binary target and None
        otherwise.
    """
    # Features (X) and target labels (y)
    X = df[continuous_columns + nominal_columns + ordinal_columns]
    y = df[target_column]

    # Hold out 20% of the rows for the final evaluation
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # NOTE(review): use_label_encoder is kept from the original call; confirm it
    # is still accepted by the installed xgboost version
    clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

    # Preprocessing + classifier; GridSearchCV clones this pipeline for every fit
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', clf)])

    # Hyper-parameter grid, addressed through the pipeline step name
    param_grid = {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [3, 6, 9],
        'classifier__learning_rate': [0.01, 0.1, 0.2]
    }

    grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid,
                               cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    print("Best parameters for XGBoost:", best_params)

    # With the default refit=True the best pipeline has already been refitted
    # on the whole training set, so it is used directly instead of training again
    best_model = grid_search.best_estimator_

    # Evaluate the classifier's performance on the testing data
    test_accuracy = best_model.score(X_test, y_test)
    print(f"Accuracy for {target_column} label using XGBoost classifier (Test Accuracy):", test_accuracy)

    # Confusion matrix
    y_pred = best_model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=best_model.classes_, yticklabels=best_model.classes_)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.show()

    # Log loss from the predicted class probabilities
    y_pred_proba = best_model.predict_proba(X_test)
    loss = log_loss(y_test, y_pred_proba)
    print(f"Log Loss for {target_column} label using XGBoost classifier:", loss)

    # Defaults keep the results dict well-defined for non-binary targets
    precision = None
    f1 = None
    if len(best_model.classes_) == 2:
        # Curve arrays are used for plotting only; the reported precision is
        # the scalar score of the hard predictions
        pr_precision, pr_recall, _ = precision_recall_curve(y_test, y_pred_proba[:, 1])
        plot_precision_recall_curve(pr_precision, pr_recall)
        precision = precision_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        print(f"Precision for {target_column} label using XGBoost classifier:", precision)
        print(f"F1 Score for {target_column} label using XGBoost classifier:", f1)

    # Feature importances, labelled with the names produced by the fitted preprocessor
    fitted_classifier = best_model.named_steps['classifier']
    if hasattr(fitted_classifier, 'feature_importances_'):
        feature_names = best_model.named_steps['preprocessor'].get_feature_names_out()
        importance_df = pd.DataFrame({'Feature': feature_names,
                                      'Importance': fitted_classifier.feature_importances_})
        importance_df = importance_df.sort_values(by='Importance', ascending=False)

        plt.figure(figsize=(10, 6))
        sns.barplot(x='Importance', y='Feature', data=importance_df)
        plt.xlabel('Importance')
        plt.ylabel('Feature')
        plt.title(f'Feature Importance for {target_column} label using XGBoost')
        plt.show()

    results = {
        'test_accuracy': test_accuracy,
        'loss': loss,
        'precision': precision,
        'f1_score': f1
    }

    return results

# Run the XGBoost workflow on `df` and store the returned metrics dictionary
results_xgboost = train_and_evaluate_xgboost(df, continuous_columns, nominal_columns, ordinal_columns, target_column)

# Display or use results_xgboost as needed


# MODEL COMPARISON

In [None]:
import seaborn as sns

# Compare the models using the metrics actually returned by the training
# functions above. The results_* dictionaries are NOT overwritten with
# hand-typed numbers, so the charts reflect the real evaluation.

def _scalar_metric(results, key):
    """Return results[key] as a float, or NaN if it is missing or not a scalar."""
    value = results.get(key)
    if value is None or np.ndim(value) != 0:
        # Missing metric, or an array (e.g. a whole precision-recall curve)
        return np.nan
    return float(value)


# Display name -> results dictionary of each trained model
model_results = {
    'Random Forest': results_random_forest,
    'Decision Tree': results_decision_tree,
    'SVM': results_svm,
    'XGBoost': results_xgboost,
}

# Column label in the comparison table -> key in the results dictionaries
metric_keys = {
    'Test Accuracy': 'test_accuracy',
    'Log Loss': 'loss',
    'Precision': 'precision',
    'F1 Score': 'f1_score',
}

# One row per model, one float column per metric (NaN where not available)
model_comparison = pd.DataFrame({
    'Model': list(model_results),
    **{
        label: [_scalar_metric(res, key) for res in model_results.values()]
        for label, key in metric_keys.items()
    },
})

# Models that reported a log loss
model_comparison_log_loss = model_comparison.dropna(subset=['Log Loss'])

# One bar chart per metric; models without a value for that metric are left out
for label in metric_keys:
    plot_data = model_comparison.dropna(subset=[label])
    if plot_data.empty:
        continue
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x='Model', y=label, data=plot_data, ax=ax)
    ax.set(title=f'Model Comparison - {label}', xlabel='Model', ylabel=label)
    plt.show()
