In [1]:
import os
from pathlib import Path

# Run the notebook from the project root so the relative data paths used in later
# cells resolve. The location can be overridden with the TRABALHO_FINAL_DIR
# environment variable; when it is unset the original checkout path is used.
PROJECT_ROOT = Path(
    os.environ.get("TRABALHO_FINAL_DIR", "/home/me/workspace/det_remota/trabalho_final")
)
os.chdir(PROJECT_ROOT)

In [2]:
import pandas as pd
import numpy as np
from typing import Union, List, Optional, Dict


def _safe_divide(numerator, denominator):
    """Element-wise ``numerator / denominator`` that yields 0.0 wherever the denominator is not positive."""
    numerator = np.asarray(numerator, dtype=float)
    denominator = np.asarray(denominator, dtype=float)
    # `out` must be pre-filled: positions masked out by `where` are left untouched.
    result = np.zeros(denominator.shape, dtype=float)
    np.divide(numerator, denominator, out=result, where=denominator > 0)
    return result


def classification_report_from_confusion_matrix(
    confusion_matrix: Union[pd.DataFrame, np.ndarray, List[List[Union[int, float]]]],
    labels: Optional[List[Union[str, int]]] = None,
    digits: int = 2
) -> pd.DataFrame:
    """
    Compute classification metrics from a confusion matrix and return a DataFrame with the results.

    Parameters:
    ----------
    confusion_matrix : pd.DataFrame, np.ndarray or nested list
        A square confusion matrix.
        Rows should represent actual classes and columns should represent predicted classes.

    labels : List[Union[str, int]], optional
        List of class labels, one per class and in matrix order. If None, the column
        names are used for a DataFrame and ``0 .. n_classes - 1`` for any other input.

    digits : int, default=2
        Number of decimal places to round precision, recall, f1-score and accuracy to.

    Returns:
    -------
    pd.DataFrame
        One row per class followed by 'macro avg' and 'weighted avg' rows, with the
        columns 'precision', 'recall', 'f1-score' and 'support' (all float64).
        Metrics whose denominator is zero are reported as 0.
        ``attrs['accuracy']`` holds the rounded overall accuracy and
        ``attrs['accuracy_count']`` the string "<correct>/<total>".

    Raises:
    ------
    ValueError
        If the matrix is not a non-empty square 2-D array, or if the number of labels
        does not match the number of classes.
    """
    is_frame = isinstance(confusion_matrix, pd.DataFrame)
    cm = confusion_matrix.values if is_frame else np.asarray(confusion_matrix)

    if cm.ndim != 2 or cm.shape[0] != cm.shape[1] or cm.shape[0] == 0:
        raise ValueError(
            f"confusion_matrix must be a non-empty square matrix, got shape {cm.shape}"
        )
    n_classes = cm.shape[0]

    if labels is None:
        # Only a DataFrame carries column names; other inputs get positional labels.
        labels = confusion_matrix.columns.tolist() if is_frame else list(range(n_classes))
    else:
        labels = list(labels)
    if len(labels) != n_classes:
        raise ValueError(
            f"expected {n_classes} labels to match the confusion matrix, got {len(labels)}"
        )

    # Per-class counts: diagonal = true positives, row sums = actual samples (support),
    # column sums = samples predicted as the class.
    true_positives = np.diag(cm)
    support = cm.sum(axis=1)
    predicted = cm.sum(axis=0)
    total_samples = np.sum(cm)

    precision = _safe_divide(true_positives, predicted)
    recall = _safe_divide(true_positives, support)
    f1 = _safe_divide(2 * precision * recall, precision + recall)

    if total_samples > 0:
        accuracy = np.trace(cm) / total_samples
        weighted = np.array(
            [np.sum(metric * support) for metric in (precision, recall, f1)]
        ) / total_samples
    else:
        # An all-zero matrix has no defined accuracy or weighted mean; report 0 to
        # stay consistent with the per-class "0 when undefined" convention.
        accuracy = 0.0
        weighted = np.zeros(3)

    metric_rows = np.vstack([
        np.column_stack([precision, recall, f1]),
        [precision.mean(), recall.mean(), f1.mean()],  # macro averages
        weighted,                                      # support-weighted averages
    ])

    report_df = pd.DataFrame(
        np.round(metric_rows, digits),
        index=labels + ['macro avg', 'weighted avg'],
        columns=['precision', 'recall', 'f1-score'],
    )
    # Support stays float so the frame keeps a single float64 dtype.
    report_df['support'] = np.concatenate(
        [support, [total_samples, total_samples]]
    ).astype(float)

    # Accuracy is a single number, so it is attached as metadata instead of a row.
    report_df.attrs['accuracy'] = round(accuracy, digits)
    report_df.attrs['accuracy_count'] = f"{int(np.trace(cm))}/{total_samples}"

    return report_df


# Demo of the report helper on a small hand-written matrix
if __name__ == "__main__":
    # Toy 3-class confusion matrix: each row is a true label, each column a predicted label
    cm = pd.DataFrame(
        data=[
            [50, 10, 5],
            [8, 40, 12],
            [3, 7, 45],
        ],
        index=['Class A', 'Class B', 'Class C'],
        columns=['Class A', 'Class B', 'Class C'],
    )

    # Build the per-class / averaged metrics table
    report_df = classification_report_from_confusion_matrix(cm)

    # Heading and table go out on consecutive lines
    print("Classification Report as DataFrame:", report_df, sep="\n")

    # Overall accuracy is stored in the DataFrame's attrs, not in a row
    print(f"\nAccuracy: {report_df.attrs['accuracy']} ({report_df.attrs['accuracy_count']})")

Classification Report as DataFrame:
              precision  recall  f1-score  support
Class A            0.82    0.77      0.79     65.0
Class B            0.70    0.67      0.68     60.0
Class C            0.73    0.82      0.77     55.0
macro avg          0.75    0.75      0.75    180.0
weighted avg       0.75    0.75      0.75    180.0

Accuracy: 0.75 (135/180)


In [3]:
# Display the report built by the example above as this cell's output.
report_df

Unnamed: 0,precision,recall,f1-score,support
Class A,0.82,0.77,0.79,65.0
Class B,0.7,0.67,0.68,60.0
Class C,0.73,0.82,0.77,55.0
macro avg,0.75,0.75,0.75,180.0
weighted avg,0.75,0.75,0.75,180.0


In [8]:
# Directory with one "<model>.conf_matrix" file per model, relative to the working directory.
CONF_MATRIX_PATH = Path("data/qgis_outputs/dzetsaka_models")

# Maps the positional row/column index of each matrix to a class name.
IDX_CLASS_MAPPING = {
    0: 'aquaculture',
    1: 'river_lake_ocean',
    2: 'forest',
    3: 'farming',
    4: 'non_vegetated'
}

# model name -> confusion matrix labelled with class names
conf_matrix_dict = {}

# sorted(): glob order is not guaranteed, so sort for a reproducible dict/report order.
for conf_matrix_file in sorted(CONF_MATRIX_PATH.glob('*.conf_matrix')):
    # Path.stem drops the ".conf_matrix" suffix without assuming "/" as the separator.
    model_name = conf_matrix_file.stem
    conf_matrix_dict[model_name] = pd.read_csv(
        conf_matrix_file,
        skiprows=1,  # the first line is not matrix data (presumably a header line; TODO confirm)
        header=None,
    ).rename(
        columns=IDX_CLASS_MAPPING,
        index=IDX_CLASS_MAPPING,
    )

print(conf_matrix_dict['gaussian'].to_markdown())

|                  |   aquaculture |   river_lake_ocean |   forest |   farming |   non_vegetated |
|:-----------------|--------------:|-------------------:|---------:|----------:|----------------:|
| aquaculture      |          5456 |                396 |       15 |         0 |            8021 |
| river_lake_ocean |           660 |              13414 |        0 |         0 |            1568 |
| forest           |             0 |                 23 |    59120 |        86 |            1527 |
| farming          |             0 |                  0 |     1186 |     30727 |           36686 |
| non_vegetated    |            92 |                 78 |      238 |       238 |          194539 |


In [5]:
# Print the metrics table and overall accuracy of every loaded model
for model_name, conf_mat_df in conf_matrix_dict.items():
    report_df = classification_report_from_confusion_matrix(conf_mat_df)

    print("\n---------------------------------------------------\n")
    # Title line followed by the table itself
    print(f"Classification Report for model {model_name}:", report_df, sep="\n")

    # Accuracy and its raw count are stored in the DataFrame's attrs
    print(f"\nAccuracy: {report_df.attrs['accuracy']} ({report_df.attrs['accuracy_count']})")
    print("\n---------------------------------------------------\n")


---------------------------------------------------

Classification Report for model gaussian:
                  precision  recall  f1-score   support
aquaculture            0.88    0.39      0.54   13888.0
river_lake_ocean       0.96    0.86      0.91   15642.0
forest                 0.98    0.97      0.97   60756.0
farming                0.99    0.45      0.62   68599.0
non_vegetated          0.80    1.00      0.89  195185.0
macro avg              0.92    0.73      0.79  354070.0
weighted avg           0.88    0.86      0.84  354070.0

Accuracy: 0.86 (303256/354070)

---------------------------------------------------

