In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from glob import glob
import os


## Config Session
### This session contains static information essential to the code

In [3]:
PATH_FILE = 'data/acc'
PATH_FILE_OUTPUT = 'data/accuracy.csv'

SELECTOR = [
    'NEW_PROB', 'year', 'border', 'PESO_AMOS','POINTEDITE',
    'AMOSTRAS','REINSP','NEW_PROB','ADJ_FACTOR','PESO_VOT'
]

REMAP_CLASSES = {
    6: 3
}

# 0 - no, 1 - yes
ACCEPT_BORDER = 0

YEARS = list(range(2021, 2022, 1))

CLASSES = {
    3: 'Forest',
    4: 'Shrubland',
    6: 'Flooded Forest',
    11: 'Wetland',
    12: 'Natural Grassland',
    15: 'Pastureland',
    18: 'Cropland',
    27: 'Bareland and Impervious',
    29: 'Rock Outcrop',
    33: 'Water'
}


## Input Data
### This session we load the input data, files to perform calculations

In [4]:
df_samples = pd.concat([pd.read_csv(x) for x in glob(PATH_FILE + '/*')])
df_samples = df_samples[[
    'reference', 'classification', 'year', 'border',
    'BIOMA', 'AMOSTRAS']]

# filter samples
df_samples = df_samples.loc[df_samples['reference'].isin(CLASSES.keys())]
df_samples = df_samples.query(f'border == {ACCEPT_BORDER}')
df_samples = df_samples.query('AMOSTRAS != "Treinamento" & BIOMA == "Amazônia"')

## Helper Functions
### Auxiliar functions

In [5]:
def calculate_metrics(confusion_matrix, classes):

    accuracy_user = {}
    accuracy_producer = {}
    accuracy_overall = {}
    precision = {}
    recall = {}
    f1_score = {}

    for class_name in classes.values():
        class_index = df.index.get_loc(class_name)

        true_positive = confusion_matrix.iloc[class_index, class_index]
        false_positive = confusion_matrix.iloc[class_index, :].sum() - true_positive
        false_negative = confusion_matrix.iloc[:, class_index].sum() - true_positive

        accuracy_user[class_name] = true_positive / (true_positive + false_positive) if (true_positive + false_positive) != 0 else 0
        accuracy_producer[class_name] = true_positive / (true_positive + false_negative) if (true_positive + false_negative) != 0 else 0
        accuracy_overall[class_name] = (true_positive + confusion_matrix.values.sum() - confusion_matrix.iloc[class_index, :].sum() - confusion_matrix.iloc[:, class_index].sum()) / confusion_matrix.values.sum()

        precision[class_name] = true_positive / (true_positive + false_positive) if (true_positive + false_positive) != 0 else 0
        recall[class_name] = true_positive / (true_positive + false_negative) if (true_positive + false_negative) != 0 else 0
        f1_score[class_name] = 2 * (precision[class_name] * recall[class_name]) / (precision[class_name] + recall[class_name]) if (precision[class_name] + recall[class_name]) != 0 else 0

    return accuracy_user, accuracy_producer, accuracy_overall, precision, recall, f1_score



## Get Metrics of Accuracy
### overall acc, producer, user, precision, recall, f1-score

In [7]:

for year in YEARS:



    df_samples_year = df_samples.query(f'year == {year}')

    # get reference and predicted lists
    y_true = np.array(df_samples_year[['reference']].values).flatten()
    y_pred = np.array(df_samples_year[['classification']].values).flatten()



    # get confusion matrix
    matrix = confusion_matrix(y_true, y_pred)
    matrix_transposed = matrix.transpose()

    # get total samples of each class
    total_samples = [sum(row) for row in matrix]


    # calculate percentages for each element in the confusion matrix
    percentage_data = [[value / total_samples[i] * 100 for value in row] for i, row in enumerate(matrix)]



    # create DataFrame with percentage data
    df = pd.DataFrame(
        percentage_data, 
        columns=[CLASSES[key] for key in CLASSES.keys()], index=[CLASSES[key] for key in CLASSES.keys()]
    )


    accuracy_user, accuracy_producer, accuracy_overall, precision, recall, f1_score = calculate_metrics(df, CLASSES)
    
    # add accuracy and metrics columns to the DataFrame
    df['Accuracy User'] = [round(accuracy_user[class_name], 2) for class_name in df.index]
    df['Accuracy Producer'] = [round(accuracy_producer[class_name], 2) for class_name in df.index]
    df['Accuracy Overall'] = [round(accuracy_overall[class_name], 2) for class_name in df.index]
    df['Precision'] = [round(precision[class_name], 2) for class_name in df.index]
    df['Recall'] = [round(recall[class_name], 2) for class_name in df.index]
    df['F1-Score'] = [round(f1_score[class_name], 2) for class_name in df.index]





           Forest  Shrubland  Flooded Forest   Wetland  Natural Grassland  \
Forest  93.932722    0.12844        3.944954  0.189602           0.385321   

        Pastureland  Cropland  Bareland and Impervious  Rock Outcrop  \
Forest     1.351682  0.018349                      0.0      0.030581   

           Water  Accuracy User  Accuracy Producer  Accuracy Overall  \
Forest  0.018349           0.94               0.41              0.76   

        Precision  Recall  F1-Score  
Forest       0.94    0.41      0.57  
