<a href="https://colab.research.google.com/github/txusser/Master_IA_Sanidad/blob/main/Modulo_2/2_3_4_Modelado_y_evaluaci%C3%B3n_de_resultados.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Modeling and Performance Metrics


In [None]:
# Import the necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns
from rich.console import Console
# Shared rich console; used further down to draw a titled rule before each
# per-feature statistics table
console = Console()

# Visualization setup: apply the seaborn "darkgrid" theme first, then switch
# the default color palette to "pastel" for every plot that follows
sns.set_theme(style="darkgrid")
sns.set_palette("pastel")

In [None]:
# Build the working dataset from scikit-learn's bundled diabetes data
from sklearn.datasets import load_diabetes

# Fetch the dataset object and pull out the feature matrix, the raw target
# and the column names in one place
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target
feature_names = diabetes.feature_names

# Turn the raw target into a binary label: 1 when strictly above the median
# of the target, 0 otherwise. The rest of the notebook calls class 1
# "diabetes positive".
# NOTE(review): confirm against the dataset description what the raw target
# actually measures before interpreting class 1 clinically.
y_binary = (y > np.median(y)).astype(int)

# Tabular view: one column per feature plus the binary label under 'diabetes'
data = pd.DataFrame(X, columns=feature_names).assign(diabetes=y_binary)

In [None]:
# One boxplot panel per feature, each split by the binary class label
fig = plt.figure(figsize=(15, 10))

# Two panels per row; the row count is rounded up so an odd number of
# features still gets a slot
cols = 2
n_features = len(feature_names)
rows = (n_features + 1) // 2

# Panel positions are numbered from 1, matching matplotlib's subplot indexing
for i, feature in enumerate(feature_names, start=1):
    ax = fig.add_subplot(rows, cols, i)
    sns.boxplot(data=data, x='diabetes', y=feature, ax=ax)
    ax.set_title(f'Distribution of {feature} by Class')
    ax.set_xlabel('Diabetes (0=Negative, 1=Positive)')

fig.tight_layout()
plt.show()


In [None]:
# Display class balance
# Percentage of samples per class. value_counts() orders its result by
# frequency, so it is re-sorted by class label here: the tick labels and the
# percentage annotations below both assume that position 0 is class 0 and
# position 1 is class 1.
class_balance = (data['diabetes'].value_counts(normalize=True) * 100).sort_index()

plt.figure(figsize=(8, 6))
# `order` pins the bar order to the sorted class labels explicitly instead of
# relying on seaborn's default category ordering
sns.barplot(
    x=class_balance.index,
    y=class_balance.values,
    order=class_balance.index.tolist(),
)
plt.title('Class Balance in the Dataset')
plt.xlabel('Class')
plt.ylabel('Percentage')
plt.xticks([0, 1], ['Negative', 'Positive'])

# Add percentage labels on the bars; bar i holds the i-th entry of the
# label-sorted series, so each annotation sits above its own bar
for i, v in enumerate(class_balance.values):
    plt.text(i, v + 1, f'{v:.1f}%', ha='center')

plt.show()


In [None]:
# Report the share of each class; the series is looked up by class label
negative_pct = class_balance[0]
positive_pct = class_balance[1]
print("\nClass Balance Statistics:")
print(f"Class 0 (Negative): {negative_pct:.1f}%")
print(f"Class 1 (Positive): {positive_pct:.1f}%")

# Summary statistics for every feature, one table per feature, grouped by
# class; a titled console rule separates the tables
print("\nDescriptive Statistics by Class:")
by_class = data.groupby('diabetes')
for feature in feature_names:
    console.rule(feature.upper())
    print(by_class[feature].describe())


In [None]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_binary,
    test_size=0.2,
    random_state=42,
)

# Standardize the features: the scaling statistics are learned from the
# training split only, and that same fitted scaler is applied to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

def visualize_train_test_split(X_train, X_test, y_train, y_test):
    """
    Visualize the distribution of training and testing sets,
    including set sizes and class distributions.

    Prints the shapes of the four arrays, draws a 2x2 figure (set sizes,
    train/test proportion, class distribution of each split) and finally
    prints per-class counts and percentages for both splits.

    Args:
    X_train, X_test: Feature arrays of the two splits (only ``.shape`` is read)
    y_train, y_test: Label arrays of non-negative integers, as required by
        ``np.bincount``

    Returns:
    None
    """
    # Textual information
    print("Dataset Shapes:")
    print(f" - X_train: {X_train.shape}")
    print(f" - X_test: {X_test.shape}")
    print(f" - y_train: {y_train.shape}")
    print(f" - y_test: {y_test.shape}")

    # Set sizes
    sizes = {
        'Train': X_train.shape[0],
        'Test': X_test.shape[0]
    }

    # Class distribution. Both splits are counted over the same number of
    # classes, so a class that is absent from one split shows up as 0 there.
    # Without the shared length the two count arrays could differ in size:
    # a class missing from the test split raised IndexError in the summary
    # loop, and a class missing from the train split was silently dropped.
    n_classes = max(len(np.bincount(y_train)), len(np.bincount(y_test)))
    train_class_dist = np.bincount(y_train, minlength=n_classes)
    test_class_dist = np.bincount(y_test, minlength=n_classes)
    class_labels = [f'Class {i}' for i in range(n_classes)]

    plt.figure(figsize=(16, 12))

    # Bar plot for set sizes
    plt.subplot(2, 2, 1)
    sns.barplot(x=list(sizes.keys()), y=list(sizes.values()))
    plt.title('Set Sizes')
    plt.ylabel('Number of Instances')

    # Pie chart for Train/Test proportion
    plt.subplot(2, 2, 2)
    plt.pie(list(sizes.values()), labels=list(sizes.keys()), autopct='%1.1f%%', startangle=90)
    plt.title('Train/Test Proportion')

    # Pie chart for class distribution in Train
    plt.subplot(2, 2, 3)
    plt.pie(train_class_dist, labels=class_labels, autopct='%1.1f%%', startangle=90)
    plt.title('Class Distribution in Train')

    # Pie chart for class distribution in Test
    plt.subplot(2, 2, 4)
    plt.pie(test_class_dist, labels=class_labels, autopct='%1.1f%%', startangle=90)
    plt.title('Class Distribution in Test')

    plt.tight_layout()
    plt.show()

    # Additional information
    print("\nStatistics:")
    # Despite the label, the printed value is the training share of all
    # samples (Train / (Train + Test)), not Train divided by Test
    print(f"Train/Test Ratio: {sizes['Train'] / (sizes['Train'] + sizes['Test']):.2f}")
    print("\nClass Distribution:")
    train_total = train_class_dist.sum()
    test_total = test_class_dist.sum()
    for i in range(n_classes):
        print(f"Class {i}:")
        print(f" - Train: {train_class_dist[i]} ({train_class_dist[i] / train_total * 100:.1f}%)")
        print(f" - Test: {test_class_dist[i]} ({test_class_dist[i] / test_total * 100:.1f}%)")

# Visualize the training and testing split
visualize_train_test_split(X_train, X_test, y_train, y_test)


In [None]:
def visualize_train_test_distributions_for_features(X_train, X_test, feature_names):
    """
    Visualizes the distribution of features in the training and testing sets.

    Draws one panel per feature (two panels per row) with the kernel density
    estimate of that feature in the training split and in the testing split
    overlaid, then shows the figure.

    Args:
    X_train (array-like): Training dataset, 2-D with one column per feature
    X_test (array-like): Testing dataset with the same column layout
    feature_names (list): List of feature names, in column order

    Returns:
    None
    """
    # Accept anything convertible to an array (e.g. a DataFrame) so that the
    # positional column indexing below always works
    X_train = np.asarray(X_train)
    X_test = np.asarray(X_test)

    n_features = X_train.shape[1]
    n_rows = (n_features + 1) // 2  # Calculate the number of rows needed

    # squeeze=False keeps `axes` two-dimensional even when there is a single
    # row; otherwise the cleanup of the unused panel fails with an IndexError
    # for a one-feature input
    fig, axes = plt.subplots(n_rows, 2, figsize=(15, 4 * n_rows), squeeze=False)
    fig.suptitle('Feature Distributions in Training and Testing Sets', fontsize=16)

    # Row-major flat view: panel i is at row i // 2, column i % 2
    panels = axes.ravel()

    for i, feature in enumerate(feature_names):
        ax = panels[i]

        sns.kdeplot(X_train[:, i], ax=ax, label='Train', fill=True)
        sns.kdeplot(X_test[:, i], ax=ax, label='Test', fill=True)

        ax.set_title(f'Distribution of {feature}')
        ax.set_xlabel(feature)
        ax.set_ylabel('Density')
        ax.legend()

    # Remove every panel that did not receive a feature (the trailing one
    # when the number of features is odd)
    for unused_ax in panels[len(feature_names):]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.subplots_adjust(top=0.95)  # Adjust space for the main title
    plt.show()

# Call the function to visualize the feature distributions
visualize_train_test_distributions_for_features(X_train, X_test, feature_names)


In [None]:
# Fit a 100-tree random forest on the standardized training features; the
# seed makes the fitted model reproducible
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_scaled, y_train)

# Hard class predictions for the test set, plus column 1 of the predicted
# class probabilities (used later for the ROC curve and the histogram)
y_pred = rf_model.predict(X_test_scaled)
test_probabilities = rf_model.predict_proba(X_test_scaled)
y_pred_proba = test_probabilities[:, 1]

# Confusion matrix of true labels against predicted labels
cm = confusion_matrix(y_test, y_pred)

# Annotated heatmap of the confusion matrix counts
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix - Diabetes')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
plt.show()


In [None]:
# Text summary of the per-class metrics on the test set
print("Classification Report:")
report_text = classification_report(y_test, y_pred)
print(report_text)

# ROC curve points and the area under the curve, both computed from the
# predicted probabilities rather than the hard predictions
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Model curve in orange, with the dashed diagonal as a visual reference
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC Curve (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve - Diabetes Prediction')
ax.legend(loc="lower right")
plt.show()


In [None]:
# Unpack the 2x2 confusion matrix row by row: the first row holds the
# true-negative / false-positive counts, the second the false-negative /
# true-positive counts
(tn, fp), (fn, tp) = cm

# Metrics derived directly from the four counts
precision = tp / (tp + fp)
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
f1_score = 2 * (precision * sensitivity) / (precision + sensitivity)

# Compact numeric summary, one metric per line
metric_lines = [
    f"Precision: {precision:.2f}",
    f"Sensitivity (Recall): {sensitivity:.2f}",
    f"Specificity: {specificity:.2f}",
    f"F1-Score: {f1_score:.2f}",
]
print("\n".join(metric_lines))

# Plain-language reading of the same numbers, plus the AUC computed earlier
interpretation_lines = [
    f"- The model correctly identifies {sensitivity*100:.1f}% of positive diabetes cases (sensitivity).",
    f"- Of the cases predicted as diabetes positive by the model, {precision*100:.1f}% are actually positive (precision).",
    f"- The model correctly identifies {specificity*100:.1f}% of negative diabetes cases (specificity).",
    f"- The F1-Score of {f1_score:.2f} indicates a balance between precision and sensitivity.",
    f"- The AUC-ROC of {roc_auc:.2f} suggests the model's discriminative capability for diabetes.",
]
print("\n".join(interpretation_lines))


## Interpretation of Results
* The model correctly identifies 67.5% of positive diabetes cases (sensitivity).
* Of the cases predicted as diabetes positive, 69.2% are actually positive (precision).
* The model correctly identifies 75.5% of negative diabetes cases (specificity).
* The F1-Score of 0.68 indicates a balance between precision and sensitivity.
* The AUC-ROC of 0.81 suggests a good, though not excellent, discriminative capability for diabetes prediction.

## Conclusions

1. **Overall Performance**:
   - The model shows moderate overall performance in predicting diabetes: the AUC-ROC of 0.81 indicates good discriminative capability, while sensitivity (67.5%) and precision (69.2%) at the default threshold remain below 70%.

2. **Balance Between Sensitivity and Specificity**:
   - Sensitivity (67.5%) and specificity (75.5%) are relatively balanced, with a slight inclination towards correctly identifying negative cases.
   - This slight asymmetry suggests the model is somewhat conservative in predicting positive cases.

3. **F1-Score**:
   - The F1-Score of 0.68 confirms a reasonable balance between precision and sensitivity, though there is room for improvement.

## Suggestions for Improvement

1. **Threshold Adjustment**:
   - Experiment with different classification thresholds to achieve a better trade-off between sensitivity and specificity, especially if prioritizing the detection of positive cases.

2. **Feature Engineering**:
   - Create new features or transform existing ones to better capture predictive patterns for diabetes.

3. **Data Augmentation**:
   - If the dataset is small, consider data augmentation techniques to enhance model generalization.

4. **Class Imbalance Handling**:
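   - If there is significant class imbalance, consider using techniques such as SMOTE to oversample the minority class. (In this notebook the classes are defined by a median split, so they are approximately balanced by construction; this suggestion mainly applies when the labels come from another source.)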

5. **Cross-Validation**:
   - Use cross-validation to obtain a more robust estimate of model performance and to detect overfitting that a single train/test split might hide.

6. **Domain Knowledge Integration**:
   - Consult diabetes experts to incorporate domain-specific knowledge in feature selection and creation.

7. **Error Analysis**:
   - Conduct a detailed analysis of misclassified cases to identify patterns or subgroups where the model underperforms.

8. **Collect More Data**:
   - If feasible, collect more data or incorporate additional data sources to enrich the training set.


In [None]:
# Pair every feature name with the importance score stored on the fitted
# forest, then rank from most to least important
importance_table = pd.DataFrame(
    {'feature': feature_names, 'importance': rf_model.feature_importances_}
)
feature_importance = importance_table.sort_values(by='importance', ascending=False)

# Horizontal bar chart of the ranking
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=feature_importance, x='importance', y='feature', ax=ax)
ax.set_title('Feature Importance in Diabetes Prediction')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
plt.show()

# Same ranking as a plain-text table
print("\nFeature Importance:")
print(feature_importance)

## Probability Distribution Analysis
The distribution of predicted probabilities can help understand how the model is classifying cases. A bimodal distribution could indicate good separation between classes. If the distribution is skewed towards one end, it may be necessary to adjust the classification threshold.


In [None]:
# Histogram (30 bins, with a density curve overlaid) of the predicted
# probabilities on the test set
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(y_pred_proba, bins=30, kde=True, ax=ax)
ax.set_title('Predicted Probability Distribution for Diabetes')
ax.set_xlabel('Predicted Probability')
ax.set_ylabel('Frequency')
plt.show()

## Final Comments on Interpretation in the Context of Diabetes

- Sensitivity is crucial for correctly identifying patients at risk of diabetes.
- Specificity helps avoid false positives that could lead to unnecessary tests or treatments.
- The balance between sensitivity and specificity should be adjusted based on the consequences of false positives versus false negatives in the context of diabetes.
- Feature importance can guide medical professionals on which factors are most relevant to diabetes risk, aiding in the prevention and management of the disease.
- This model could serve as an initial screening tool but should not replace professional medical diagnosis.
- Given the critical nature of diabetes detection, the model could be adjusted to prioritize higher sensitivity, accepting a possible increase in false positives that can be ruled out through follow-up examinations.

Finally, it is crucial to validate any model improvement or update on independent test data and, ultimately, in a real clinical environment before deployment.
