In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from scipy.spatial import distance
from sklearn.preprocessing import StandardScaler
import os
from IPython.display import display, HTML
import ipywidgets as widgets
import matplotlib.pyplot as plt
import networkx as nx

# Module-level state: file name of the network chosen in the UI (e.g. 'network1.csv').
# Stays None until NetworkProcessor._on_file_selected assigns the clicked button's tooltip.
network = None

# NetworkProcessor class handles selecting and processing network files
class NetworkProcessor:
    """Interactive selector and pre-processor for network edge-list files.

    One button is shown per ``network<i>.csv`` file.  Clicking a button stores
    the file name in the module-level ``network`` variable, derives the node
    list and the per-capital ``*Net.csv`` files inside ``directory``, plots the
    graph and finally runs ``process_capital`` for each capital type.
    """

    def __init__(self, directory='data', num_files=5):
        """Set up the file list and the widgets.

        Args:
            directory: Folder holding the network and attribute CSV files.
            num_files: Number of ``network<i>.csv`` files to offer (1-based).
        """
        self.directory = directory
        self.file_list = [f"network{i}.csv" for i in range(1, num_files + 1)]
        self.output = widgets.Output()  # Area where results of a selection are rendered
        self._create_widgets()

    def _create_widgets(self):
        """Create the (disabled) title button and one button per network file."""
        self.title_button = widgets.Button(
            description="Select the network you want to use:",
            disabled=True,  # Only used as a caption, never clickable
            layout=widgets.Layout(width='auto')
        )
        self.buttons = [self._create_button(filename) for filename in self.file_list]

    def _create_button(self, filename):
        """Return a button labelled with ``filename`` minus its ``.csv`` suffix."""
        display_name = filename.replace('.csv', '')
        button = widgets.Button(
            description=display_name,
            tooltip=filename,  # The handler reads the file name back from the tooltip
            layout=widgets.Layout(width='auto')
        )
        button.on_click(self._on_file_selected)
        return button

    def _on_file_selected(self, button):
        """Record the clicked file in the global ``network`` and process it."""
        global network
        network = button.tooltip
        with self.output:
            display(HTML(f"<p>You selected the file: <strong>{network}</strong></p>"))
            self._process_network_file()

    def display(self):
        """Render the styled buttons followed by the output area."""
        style = """
        <style>
            .widget-button {
                background-color: teal !important;
                color: white !important;
            }
        </style>
        """
        display(HTML(style))
        display(widgets.VBox([self.title_button] + self.buttons))
        display(self.output)

    def _read_edges(self, filename):
        """Load an edge list from ``self.directory`` with normalized column names.

        Args:
            filename: Name of a ';'-separated, two-column CSV inside ``directory``.

        Returns:
            DataFrame whose columns are renamed to ``Source`` and ``Target``.

        Raises:
            ValueError: If the file does not have exactly two columns (raised by
                pandas when the column names are assigned).
        """
        df_net = pd.read_csv(os.path.join(self.directory, filename), sep=';', low_memory=False)
        df_net.columns = ['Source', 'Target']
        return df_net

    def _process_network_file(self):
        """Derive node and per-capital files for the selected network, then analyse it."""
        global network
        if not network:
            return  # Nothing selected yet

        df_net = self._read_edges(network)

        # Every id that appears as either endpoint of an edge, in order of first appearance
        node_ids = pd.unique(df_net[['Source', 'Target']].values.ravel())
        df_unique = pd.DataFrame({'id': node_ids})
        df_unique.to_csv(os.path.join(self.directory, 'nodes.csv'), sep=';', index=False)

        for dataset in ('individuals', 'human', 'social', 'mixed'):
            self._merge_and_save(df_unique, dataset)

        # Reuse the edge list that is already loaded instead of reading the file again
        self._continue_processing(df_net)

    def _merge_and_save(self, df_unique, filename):
        """Keep only the rows of ``<filename>.csv`` whose ``id`` is a network node.

        The inner merge result is written to ``<filename>Net.csv`` in ``directory``.
        """
        df = pd.read_csv(os.path.join(self.directory, f'{filename}.csv'), sep=';')
        df_merged = df_unique.merge(df, on='id', how='inner')
        df_merged.to_csv(os.path.join(self.directory, f'{filename}Net.csv'), sep=';', index=False)

    def _continue_processing(self, df_net=None):
        """Build, report and plot the graph, then compute the capital scores.

        Args:
            df_net: Edge list with ``Source``/``Target`` columns.  When omitted the
                selected network file is loaded (and its columns renamed) from
                ``self.directory``.
        """
        if df_net is None:
            # Same loader as _process_network_file, so the column names always match
            df_net = self._read_edges(network)

        G = nx.from_pandas_edgelist(df_net, source='Source', target='Target')

        network_name = os.path.splitext(network)[0]

        print("Network:", network_name)
        print(f"Nodes: {G.number_of_nodes()}")
        print(f"Edges: {G.number_of_edges()}")

        pos = nx.kamada_kawai_layout(G)
        plot_graph(G, pos, "Network")

        # Persist the selected file name next to the data it refers to
        with open(os.path.join(self.directory, 'network.txt'), 'w') as file:
            file.write(network)

        # All capital types are computed from the same individuals table; they
        # differ only in which columns are excluded from the distance computation.
        individuals_file = os.path.join(self.directory, 'individualsNet.csv')
        capital_jobs = [
            ('humanNet.csv', ['id', 'Dc', 'Bc', 'Cc', 'CC', 'M']),
            ('socialNet.csv', ['id', 'nickname', 'gender', 'skin', 'hair', 'height', 'tattoo', 'age',
                               'weapon', 'arrested', 'convicted', 'rape', 'extortion', 'kidnapping', 'theft',
                               'homicide', 'arms_trafficking', 'drug_trafficking', 'faction']),
            ('mixedNet.csv', ['id']),
        ]
        for output_name, columns_to_remove in capital_jobs:
            process_capital(individuals_file, os.path.join(self.directory, output_name), columns_to_remove)


# DistanceCalculator class provides methods to calculate different distance metrics
class DistanceCalculator:
    """Pairwise distance helpers; each compares every row of a matrix with every other row."""

    @staticmethod
    def euclidean_distance(matrix):
        """Return the square matrix of Euclidean distances between the rows of ``matrix``."""
        pairwise = distance.cdist(matrix, matrix, 'euclidean')
        return pairwise

    @staticmethod
    def jensen_shannon_distance(matrix):
        """Return the square matrix of Jensen-Shannon distances between the rows of ``matrix``."""
        pairwise = distance.cdist(matrix, matrix, 'jensenshannon')
        return pairwise


# ProbabilityCalculator class to calculate the probability distribution from distance matrices
class ProbabilityCalculator:
    """Turns a pairwise distance matrix into one distance histogram per row."""

    @staticmethod
    def calculate_probability(distance_matrix):
        """Return the per-row distribution of distances to all *other* points.

        For every row the self-distance (diagonal entry) is dropped, the
        remaining ``N - 1`` distances are binned into ``k`` equal-width bins
        spanning that row's own minimum and maximum, and the counts are divided
        by ``N - 1`` so that each row sums to 1.

        Args:
            distance_matrix: Square ``N x N`` array-like of pairwise distances.

        Returns:
            ``numpy.ndarray`` of shape ``(N, k)`` with ``k = max(1, (N - 1) // 10)``.

        Raises:
            ValueError: If the input is not a square 2-D matrix or has fewer
                than two rows (no off-diagonal distances exist).
        """
        distances = np.asarray(distance_matrix, dtype=np.float64)
        if distances.ndim != 2 or distances.shape[0] != distances.shape[1]:
            raise ValueError("distance_matrix must be a square 2-D array")

        N = distances.shape[0]
        if N < 2:
            raise ValueError("distance_matrix must contain at least two points")

        # Drop the diagonal with a boolean mask; row-major order keeps each row's
        # N - 1 remaining values together, so the reshape restores the rows.
        off_diagonal = distances[~np.eye(N, dtype=bool)].reshape(N, N - 1)

        min_distances = off_diagonal.min(axis=1)
        max_distances = off_diagonal.max(axis=1)

        # Roughly ten points per bin, but never fewer than one bin: with k == 0
        # (N <= 10) the result would have no columns and hold no distribution.
        k = max(1, (N - 1) // 10)

        P = np.zeros((N, k), dtype=np.float64)
        for i in range(N):
            bins = np.linspace(min_distances[i], max_distances[i], k + 1)
            hist, _ = np.histogram(off_diagonal[i], bins=bins)
            P[i] = hist / (N - 1)
        return P


# DataProcessor class handles data normalization and saving results
class DataProcessor:
    """Loads an attribute table, standardizes it and stores computed scores.

    ``input_file`` is the ';'-separated CSV that is read and scaled,
    ``output_file`` is the ';'-separated CSV that receives the ``OS1`` and
    ``OS2`` columns, and ``columns_to_remove`` lists the columns left out of
    the scaling step.
    """

    def __init__(self, input_file, output_file, columns_to_remove):
        self.input_file = input_file
        self.output_file = output_file
        self.columns_to_remove = columns_to_remove
        # Both frames are filled by load_data()
        self.data = None
        self.normalized_data = None

    def load_data(self):
        """Read ``input_file`` into ``self.data`` and standardize it right away."""
        self.data = pd.read_csv(self.input_file, sep=';')
        self.normalize_data()

    def normalize_data(self):
        """Standardize every column that is not listed in ``columns_to_remove``.

        The scaled values are stored in ``self.normalized_data`` under the
        original column names.
        """
        excluded = self.columns_to_remove
        kept = [name for name in self.data.columns if name not in excluded]
        features = self.data[kept]
        scaled = StandardScaler().fit_transform(features)
        self.normalized_data = pd.DataFrame(scaled, columns=features.columns)

    def save_results(self, os1_probs, os2_probs):
        """Write ``OS1`` and ``OS2`` as columns of ``output_file`` (rewritten in place)."""
        existing = pd.read_csv(self.output_file, sep=';')
        updated = existing.assign(OS1=os1_probs, OS2=os2_probs)
        updated.to_csv(self.output_file, sep=';', index=False)




# Helpers and entry point for scoring one type of capital (human, social, mixed)
def _scale_by_max(values):
    """Divide ``values`` by its maximum, returning it unchanged when that maximum is 0.

    A maximum of 0 means every entry is already 0 for the non-negative
    distances used here; dividing would only turn them into NaN (0 / 0).
    """
    peak = values.max()
    if peak == 0:
        return values
    return values / peak


def process_capital(input_file, output_file, columns_to_remove):
    """Compute the OS1 and OS2 scores for one capital type and store them.

    Steps:
        1. Load ``input_file`` and standardize the columns not listed in
           ``columns_to_remove``.
        2. Build the pairwise Euclidean distance matrix and scale it by its maximum.
        3. Turn each row of that matrix into a distance histogram and compare the
           histograms pairwise with the Jensen-Shannon distance.
        4. OS1 / OS2 are the row sums of the Euclidean / Jensen-Shannon matrices,
           each scaled by the largest row sum.
        5. Write both score vectors as columns ``OS1`` and ``OS2`` of ``output_file``.

    Args:
        input_file: ';'-separated CSV with one row per individual.
        output_file: Existing ';'-separated CSV that receives the two columns.
        columns_to_remove: Column names excluded from the distance computation.
    """
    processor = DataProcessor(input_file, output_file, columns_to_remove)
    processor.load_data()

    mdisteuc = DistanceCalculator.euclidean_distance(processor.normalized_data)
    mdisteuc_norm = _scale_by_max(mdisteuc)

    P = ProbabilityCalculator.calculate_probability(mdisteuc_norm)

    mdistjen = DistanceCalculator.jensen_shannon_distance(P)
    mdistjen_norm = _scale_by_max(mdistjen)

    # Guarded scaling keeps the scores at 0 (instead of NaN) when all rows coincide
    os1_probs = _scale_by_max(mdisteuc_norm.sum(axis=1))
    os2_probs = _scale_by_max(mdistjen_norm.sum(axis=1))

    processor.save_results(os1_probs, os2_probs)


# Function to plot the network graph
def plot_graph(G, pos, title):
    """Draw ``G``, save the figure to ``figure/network.png`` and show it.

    The rendering adapts to the graph size:
        * up to 100 nodes: node size 100, labels at font size 10;
        * 101 to 249 nodes: node size 100, labels at font size 5;
        * 250 nodes or more: node size 5, no labels.

    Labels are the first three characters of each node id, centred on the node.

    Args:
        G: NetworkX graph to draw.
        pos: Mapping from node to its (x, y) position.
        title: Title placed above the plot.
    """
    num_nodes = G.number_of_nodes()

    plt.figure(figsize=(10, 8))

    # Only the node size and the label font size depend on the graph size
    if num_nodes < 250:
        node_size = 100
        label_font_size = 10 if num_nodes <= 100 else 5
    else:
        node_size = 100 * 0.05
        label_font_size = None  # Too dense for readable labels

    # Labels are drawn manually below, so the built-in labelling stays off
    # (font arguments to nx.draw would have no effect and are omitted).
    nx.draw(G, pos, with_labels=False, node_color='orchid', edge_color='black',
            node_size=node_size, width=0.5)

    if label_font_size is not None:
        for node in G.nodes():
            x, y = pos[node]
            plt.text(x, y, s=str(node)[:3], color='black', fontsize=label_font_size,
                     ha='center', va='center')

    # exist_ok avoids the check-then-create race of a separate existence test
    os.makedirs('figure', exist_ok=True)

    plt.title(title)
    plt.savefig('figure/network.png', dpi=300)
    plt.show()


# Main Method - Program Execution
def main():
    """Build the network selector with its default settings and render its UI."""
    NetworkProcessor().display()


# Entry point: only start the UI when this file is executed directly
if __name__ == "__main__":
    main()

# **Documentation for Network Analysis and Processing Script**

## **Overview**
This script provides tools for selecting, processing, and analyzing network data. It features interactive file selection, network visualization, and integration of human, social, and mixed capital data. The code leverages Python libraries such as **Pandas**, **NetworkX**, **Matplotlib**, and **SciPy** to handle data processing, distance metrics, and network graph visualization.

---

## **Code Structure**

### **1. Global Variables**
- **`network`**: A global variable that holds the file name of the selected network (e.g., `'network1.csv'`); it is `None` until a network is selected.

---

### **2. Classes**

#### **2.1 `NetworkProcessor`**
Handles selecting and processing network files, as well as integrating and merging related datasets.

- **Attributes**:
  - `directory`: Directory containing network files (default: `'data'`).
  - `file_list`: List of network file names (e.g., `'network1.csv'`).
  - `output`: Output widget for displaying messages.

- **Methods**:
  - **`__init__(directory, num_files)`**: Initializes file list and widgets for file selection.
  - **`_create_widgets()`**: Creates UI buttons for selecting network files.
  - **`_create_button(filename)`**: Creates an individual button for each network file.
  - **`_on_file_selected(button)`**: Event handler for selecting a file.
  - **`display()`**: Displays the UI components for file selection.
  - **`_process_network_file()`**: Processes the selected network file and generates auxiliary datasets.
  - **`_merge_and_save(df_unique, filename)`**: Merges unique nodes with additional data and saves results.
  - **`_continue_processing()`**: Proceeds with network graph analysis and plotting.

---

#### **2.2 `DistanceCalculator`**
Provides methods for calculating distance metrics between data points.

- **Static Methods**:
  - **`euclidean_distance(matrix)`**: Computes the Euclidean distance between rows.
  - **`jensen_shannon_distance(matrix)`**: Computes the Jensen-Shannon distance between rows.

---

#### **2.3 `ProbabilityCalculator`**
Calculates probability distributions based on distance metrics.

- **Static Methods**:
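  - **`calculate_probability(distance_matrix)`**: For each row of a distance matrix, drops the self-distance and builds a normalized histogram of the remaining distances, yielding one probability distribution per row.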

---

#### **2.4 `DataProcessor`**
Handles data normalization and saves processed results.

- **Attributes**:
  - `input_file`: Path to the input dataset.
  - `output_file`: Path to the output dataset.
  - `columns_to_remove`: List of columns to exclude during processing.
  - `data`: Loaded data.
  - `normalized_data`: Normalized data.

- **Methods**:
  - **`__init__(input_file, output_file, columns_to_remove)`**: Initializes file paths and column details.
  - **`load_data()`**: Loads and normalizes data.
  - **`normalize_data()`**: Standardizes every column not listed in `columns_to_remove` using scikit-learn's `StandardScaler`.
  - **`save_results(os1_probs, os2_probs)`**: Saves the computed probabilities to a CSV file.

---

### **3. Functions**

#### **3.1 `process_capital(input_file, output_file, columns_to_remove)`**
Processes a dataset based on the specified columns to remove and calculates OS1/OS2 probabilities.

#### **3.2 `plot_graph(G, pos, title)`**
Plots a network graph using **NetworkX** and **Matplotlib**, saves the visualization to `figure/network.png`, and displays it.

---

### **4. Main Program**

#### **`main()`**
The entry point of the program. Initializes the `NetworkProcessor` and displays the UI for selecting a network file.

---

## **How to Run**
1. Ensure the required files are in the `data` directory:
   - `networkX.csv` (network files)
   - `individuals.csv`
   - `human.csv`
   - `social.csv`
   - `mixed.csv`