In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Class responsible for processing anomaly scores
class AnomalyScoreProcessor:
    """Convert per-row outlier scores and anomaly-model outputs into two tables.

    * The *key* table has one column per score/model label; each column lists
      the ``id`` values in that label's ranking order.
    * The *value* table has one row per input row (in input order): the ``id``
      followed by one score column per label.

    Attributes:
        data: DataFrame read from ``input_file`` (``;``-separated).
        output_key_file: Destination passed to ``DataFrame.to_csv`` for the key table.
        output_value_file: Destination passed to ``DataFrame.to_csv`` for the value table.
    """

    # Raw score columns that are exported unchanged; the column name is also
    # used as the output label.
    SCORE_COLUMNS = ('OS1', 'OS2')

    # Output label -> (prediction column, score column) for each anomaly model.
    ANOMALY_MODELS = {
        'IsF': ('anomaly-IsF', 'scores-IsF'),
        'LOF': ('anomaly-Lof', 'scores-Lof'),
        'COV': ('anomaly-Cov', 'scores-Cov'),
        'SVM': ('anomaly-SVM', 'scores-SVM'),
    }

    def __init__(self, input_file, output_key_file, output_value_file):
        """Load the input table and remember where the results go.

        Args:
            input_file: Path or file-like object accepted by ``pd.read_csv``;
                parsed with ``;`` as the separator.
            output_key_file: Destination for the key table.
            output_value_file: Destination for the value table.
        """
        self.data = pd.read_csv(input_file, sep=';')
        self.output_key_file = output_key_file
        self.output_value_file = output_value_file

    def _create_score_df(self, score_col, label):
        """Build the key and value frames for a raw score column.

        Args:
            score_col: Name of the column in ``self.data`` holding the scores.
            label: Column name used in both returned frames.

        Returns:
            ``(df_key, df_valor)`` where ``df_key`` has a single column
            ``label`` listing the ids from highest to lowest score, and
            ``df_valor`` has columns ``id`` and ``label`` with the unmodified
            scores in input row order.
        """
        # Highest score first.
        df = pd.DataFrame({
            'id': self.data['id'],
            'pontuacao': self.data[score_col],
        }).sort_values(by='pontuacao', ascending=False)

        # .tolist() drops the shuffled index so the key frame is numbered 0..n-1.
        df_key = pd.DataFrame({label: df['id'].tolist()})

        # Values are taken straight from the input, so they keep input order.
        df_valor = pd.DataFrame({
            'id': self.data['id'],
            label: self.data[score_col],
        })
        return df_key, df_valor

    def _create_anomaly_df(self, pred_col, score_col, label):
        """Build the key and value frames for one anomaly model.

        Rows are sorted ascending by prediction and then by score, so rows
        predicted ``-1`` (anomaly) come before rows predicted ``1``. The first
        row in that order receives rank ``n``, the last receives rank ``1``,
        and each rank is divided by the largest rank.

        Args:
            pred_col: Name of the prediction column (``-1`` anomaly, ``1`` normal).
            score_col: Name of the model's score column.
            label: Column name used in both returned frames.

        Returns:
            ``(df_key, df_valor)`` where ``df_key`` has a single column
            ``label`` listing the ids in the sorted order described above, and
            ``df_valor`` has columns ``id`` and ``label`` holding each row's
            normalized rank.
        """
        df = pd.DataFrame({
            'id': self.data['id'],
            'predicao': self.data[pred_col],
            'pontuacao': self.data[score_col],
        }).sort_values(['predicao', 'pontuacao'])

        # Binary anomaly flag (1 = anomaly, 0 = normal). Intermediate only: it
        # is not part of either returned frame.
        df['y_pred'] = np.where(df['predicao'] == -1, 1, 0)

        # A range is assigned positionally, so ranks follow the sorted order.
        df['scores'] = range(len(df), 0, -1)

        # Scale ranks by the largest rank.
        df['score'] = df['scores'] / np.max(df['scores'])

        # Both Series are matched on their row index labels here, which puts
        # each normalized rank back next to the id of the row it was computed for.
        df_valor = pd.DataFrame({
            'id': self.data['id'],
            label: df['score'],
        })

        df_key = pd.DataFrame({label: df['id'].tolist()})
        return df_key, df_valor

    def process_scores(self):
        """Build the combined key and value tables and write them as CSV.

        Processes every column in ``SCORE_COLUMNS`` and every model in
        ``ANOMALY_MODELS``, then writes the key table to
        ``self.output_key_file`` and the value table to
        ``self.output_value_file`` (``;``-separated, no index column).

        The value table always has exactly one row per input row. Columns are
        combined by row index rather than by joining on ``id``: an id-keyed
        join matches every row sharing an id with every other such row, so a
        repeated id would multiply the row count with each added column.
        """
        key_frames = []
        df_valor = pd.DataFrame({'id': self.data['id']})

        for score_col in self.SCORE_COLUMNS:
            df_key_part, df_valor_part = self._create_score_df(score_col, score_col)
            key_frames.append(df_key_part)
            # Column assignment aligns on the row index shared with self.data.
            df_valor[score_col] = df_valor_part[score_col]

        for label, (pred_col, score_col) in self.ANOMALY_MODELS.items():
            df_key_part, df_valor_part = self._create_anomaly_df(pred_col, score_col, label)
            key_frames.append(df_key_part)
            df_valor[label] = df_valor_part[label]

        # Every key frame is numbered 0..n-1, so a column-wise concat lines
        # them up by rank position.
        df_key = pd.concat(key_frames, axis=1)

        df_key.to_csv(self.output_key_file, sep=';', index=False)
        df_valor.to_csv(self.output_value_file, sep=';', index=False)

# Main method to run the program
def main():
    """Run the score pipeline once for each capital dataset, in order."""
    # (input CSV, key output CSV, value output CSV)
    datasets = (
        # Human capital
        ('data/humanNet.csv', 'data/key_human.csv', 'data/value_human.csv'),
        # Social capital
        ('data/socialNet.csv', 'data/key_social.csv', 'data/value_social.csv'),
        # Mixed capital
        ('data/mixedNet.csv', 'data/key_mixed.csv', 'data/value_mixed.csv'),
    )

    for input_path, key_path, value_path in datasets:
        AnomalyScoreProcessor(input_path, key_path, value_path).process_scores()
    
# Entry-point guard: run the pipeline only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    # Process every dataset configured in main().
    main()

# Documentation for Anomaly Score Processor

This document provides a detailed explanation of the `AnomalyScoreProcessor` class and its associated methods for processing anomaly detection scores.

## Table of Contents
- [Overview](#overview)
- [Class: `AnomalyScoreProcessor`](#class-anomalyscoreprocessor)
  - [Method: `__init__`](#method-__init__)
  - [Method: `_create_score_df`](#method-_create_score_df)
  - [Method: `_create_anomaly_df`](#method-_create_anomaly_df)
  - [Method: `process_scores`](#method-process_scores)
- [Function: `main`](#function-main)
- [Execution](#execution)

---

## Overview

The `AnomalyScoreProcessor` class is designed to process anomaly detection scores from a `;`-separated CSV file. It generates dataframes to store keys (IDs sorted by score) and values (raw scores for `OS1`/`OS2`, rank-normalized scores for the anomaly models) and outputs them as CSV files.

---

## Class: `AnomalyScoreProcessor`

### Purpose:
Processes anomaly detection scores and predictions to generate key-value dataframes for various models.

### Attributes:
- `data`: A Pandas DataFrame loaded from the input file.
- `output_key_file`: Path for the output file containing sorted keys (IDs).
- `output_value_file`: Path for the output file containing score values.

---

### Method: `__init__`

#### Purpose:
Initializes the processor by loading the input data and setting output file paths.

#### Parameters:
- `input_file` (str): Path to the input CSV file.
- `output_key_file` (str): Path to save the key dataframe.
- `output_value_file` (str): Path to save the value dataframe.

---

### Method: `_create_score_df`

#### Purpose:
Generates dataframes for a given score column.

#### Parameters:
- `score_col` (str): Name of the column containing the scores.
- `label` (str): Label for the score type.

#### Returns:
- `df_key` (DataFrame): Dataframe containing sorted IDs as keys.
- `df_valor` (DataFrame): Dataframe containing IDs and their corresponding scores.

#### Details:
- Sorts the IDs by their scores in descending order.
- Creates a key dataframe with IDs and a value dataframe with the scores.

---

### Method: `_create_anomaly_df`

#### Purpose:
Processes anomaly predictions and scores to generate normalized dataframes.

#### Parameters:
- `pred_col` (str): Name of the column containing anomaly predictions.
- `score_col` (str): Name of the column containing anomaly scores.
- `label` (str): Label for the anomaly model.

#### Returns:
- `df_key` (DataFrame): Dataframe containing IDs ordered by prediction (anomalies, `-1`, first) and then by ascending anomaly score.
- `df_valor` (DataFrame): Dataframe containing IDs and their normalized scores.

#### Details:
- Maps anomalies (-1) to `1` and normal values (1) to `0`.
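- Sorts the rows by prediction and then by score (both ascending), assigns ranks from `n` down to `1` in that order, and divides each rank by the largest rank so that the normalized values lie in (0, 1].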

---

### Method: `process_scores`

#### Purpose:
Processes and combines multiple anomaly detection scores.

#### Workflow:
1. Processes scores for specific columns (`OS1`, `OS2`).
2. Iterates through predefined anomaly models:
   - `IsF`: Isolation Forest.
   - `LOF`: Local Outlier Factor.
   - `COV`: Covariance Estimator.
   - `SVM`: Support Vector Machine.
3. Combines dataframes for all models.
4. Outputs the combined results to CSV files.

#### Output:
- Saves `df_key` and `df_valor` as CSV files.

---

## Function: `main`

#### Purpose:
Executes the anomaly score processing for different datasets.

#### Workflow:
1. Processes human capital data from `data/humanNet.csv`.
2. Processes social capital data from `data/socialNet.csv`.
3. Processes mixed capital data from `data/mixedNet.csv`.

#### Output:
- Generates key and value CSV files for each dataset.

---

## Execution

The script checks if it is run directly using the `if __name__ == "__main__":` condition. When executed, the `main` function is called to process anomaly scores for different datasets.

---

## Notes

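- **Input Format**: The input files must be `;`-separated CSVs containing the columns `id`, `OS1`, `OS2`, and, for each anomaly model, a prediction column and a score column (`anomaly-IsF`/`scores-IsF`, `anomaly-Lof`/`scores-Lof`, `anomaly-Cov`/`scores-Cov`, `anomaly-SVM`/`scores-SVM`).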
- **Output Files**: The processed data is saved as `;`-separated CSVs for compatibility.
- **Dependencies**: Requires `pandas` and `numpy` libraries.