In [25]:
import sys
# Extend the import search path with "./../" (resolved against the current working
# directory). Presumably this lets the notebook import the project packages
# (e.g. esg_classification, esg_rating) from the repository root — TODO confirm.
sys.path.append("./../")


In [26]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import pandas as pd

In [27]:
# from esg_classification.src.ESGPredictor import ESGPredictor
# from esg_rating.src.esg_sentiment_predictor import ESGSentimentPredictor


In [28]:
# Prediction-result directories, one per municipality and year, all under the same data root.
_csv_data_root = "../esg_rating/src/data/csv_data"
nyon2022_results_path = _csv_data_root + "/nyon_2022/prediction_results"
nyon2023_results_path = _csv_data_root + "/nyon_2023/prediction_results"
vevey2022_results_path = _csv_data_root + "/vevey_2022/prediction_results"
vevey2023_results_path = _csv_data_root + "/vevey_2023/prediction_results"

In [37]:
def plot_histograms(input_dir, column='sentiment_pred', title="Histograms", k_subplots=3, save_fig=None, display=False):
    """
    Plots one subplot per CSV file in a directory for a specified column.

    Files are processed in sorted name order so the subplot layout is reproducible
    between runs. Files that are empty or lack the requested column are reported and
    skipped; their subplot slot is left empty.

    Args:
    - input_dir (str): Directory containing CSV files.
    - column (str, optional): Column name for which the histogram is to be plotted. Defaults to 'sentiment_pred'.
    - title (str, optional): Title for the entire plot. Defaults to "Histograms".
    - k_subplots (int, optional): Number of subplots per row. Defaults to 3.
    - save_fig (str, optional): Directory to save the figure. If None, the figure is not saved. Defaults to None.
    - display (bool, optional): Whether to display the plot. If False the figure is closed. Defaults to False.

    Returns:
    - None. Prints a message and returns early when the directory has no CSV files.
    """
    # os.listdir returns entries in arbitrary order; sort for a stable subplot order.
    files = sorted(f for f in os.listdir(input_dir) if f.endswith(".csv"))

    if not files:
        print("No CSV files found in the directory.")
        return

    nrows, ncols = calculate_grid_dimensions(len(files), k_subplots)
    fig, axs = initialize_plot(nrows, ncols, title)

    for i, file in enumerate(files):
        file_path = os.path.join(input_dir, file)
        try:
            plot_file_data(axs, file_path, column, i, nrows, ncols)
        except pd.errors.EmptyDataError:
            print(f"File {file} is empty or invalid. Skipping...")
        except KeyError:
            print(f"Column {column} not found in the file {file}. Skipping...")

    # Hide the grid cells that have no file assigned to them.
    adjust_subplot_layout(nrows, ncols, len(files) - 1, axs)

    if save_fig:
        save_plot(fig, input_dir, column, save_fig)

    if display:
        plt.show()
    else:
        # Close this specific figure rather than whichever figure happens to be current.
        plt.close(fig)

def calculate_grid_dimensions(len_files, k_subplots):
    """
    Computes the subplot grid size needed to hold one subplot per file.

    Args:
    - len_files (int): Number of files to plot.
    - k_subplots (int): Number of subplots per row.

    Returns:
    - tuple: (nrows, ncols) where ncols equals k_subplots and nrows is the number of
      rows required, counting a partially filled last row.
    """
    full_rows, leftover = divmod(len_files, k_subplots)
    # A non-empty remainder needs one extra, partially filled row.
    row_count = full_rows + 1 if leftover > 0 else full_rows
    return row_count, k_subplots

def initialize_plot(nrows, ncols, title):
    """
    Creates a 15x15 inch figure with an nrows x ncols subplot grid and a super-title.

    Args:
    - nrows (int): Number of subplot rows.
    - ncols (int): Number of subplot columns.
    - title (str): Text used as the figure-level title.

    Returns:
    - tuple: (figure, axes) exactly as returned by plt.subplots.
    """
    figure, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(15, 15), sharex=False)
    figure.suptitle(title, fontsize=16)
    return figure, axes

def plot_file_data(axs, file_path, column, file_index, nrows, ncols):
    """
    Reads one UTF-16 encoded CSV file and draws its data on the matching subplot.

    For column "sentiment_pred" a 20-bin histogram is drawn; for any other column a
    bar chart of label counts is drawn.

    Args:
    - axs: Axes object or array of Axes as returned by plt.subplots.
    - file_path (str): Path of the CSV file to read.
    - column (str): Column to plot.
    - file_index (int): Position of the file; selects the subplot in row-major order.
    - nrows (int): Number of rows in the subplot grid (kept for interface compatibility).
    - ncols (int): Number of columns in the subplot grid (kept for interface compatibility).

    Raises:
    - pd.errors.EmptyDataError: Propagated from pd.read_csv for an empty file.
    - KeyError: If `column` is not present in the file.
    """
    df = pd.read_csv(file_path, encoding="utf-16")

    # Fail loudly for every column type so the caller's KeyError handler can report
    # the file instead of leaving a silently empty subplot.
    if column not in df.columns:
        raise KeyError(column)

    # plt.subplots squeezes its result: a 1x1 grid yields a bare Axes and a single
    # row/column yields a 1-D array. Flat indexing is row-major and covers all shapes.
    ax = axs.flat[file_index] if isinstance(axs, np.ndarray) else axs

    # Differentiate between sentiment_pred and classified data plotting
    if column == "sentiment_pred":
        # Assuming sentiment_pred is numerical and continuous
        df[column].plot(kind='hist', bins=20, ax=ax)
        customize_sentiment_pred_plot(ax)  # Apply specific customization for sentiment_pred
    else:
        # Handle categorical data for "esg_predictor"
        categories_order = ["non-esg", "environmental", "social", "governance"]
        # The prediction files spell this label the French way; map it onto the
        # expected category so those rows are counted instead of being dropped.
        label_aliases = {"environnemental": "environmental"}
        counts = df[column].replace(label_aliases).value_counts()
        # Keep any other unexpected label visible by appending it after the known ones.
        unexpected = [label for label in counts.index if label not in categories_order]
        counts.reindex(categories_order + unexpected, fill_value=0).plot(kind='bar', ax=ax)
        customize_esg_predictor_plot(ax)  # Apply specific customization for esg_predictor

    ax.set_title(os.path.basename(file_path))

def customize_sentiment_pred_plot(ax):
    """
    Applies the sentiment-histogram styling to one subplot.

    Sets the axis labels, fixes the x-axis to the 0-5 range, outlines the bars in
    black and re-runs tight_layout on the current figure.

    Args:
    - ax: Matplotlib Axes holding the histogram.
    """
    ax.set(xlabel='Sentiment Score', ylabel='Frequency')
    # Ensure x-axis aligns with the range of sentiment scores
    ax.set_xlim(0, 5)
    for bar in ax.patches:
        bar.set_edgecolor('black')
    # Adjust layout to prevent label overlap
    plt.tight_layout()



def customize_esg_predictor_plot(ax):
    """
    Applies the class-count bar chart styling to one subplot.

    Rotates the x tick labels by 45 degrees, then sets every bar to width 0.5 with a
    zero-width black edge, and re-runs tight_layout on the current figure.

    Args:
    - ax: Matplotlib Axes holding the bar chart.
    """
    for tick_label in ax.get_xticklabels():
        tick_label.set_rotation(45)
    for bar in ax.patches:
        bar.set_linewidth(0)
        bar.set_width(0.5)
        bar.set_edgecolor('black')
    plt.tight_layout()

def adjust_subplot_layout(nrows, ncols, last_file_index, axs):
    """
    Hides the subplots of the grid that come after the last plotted file.

    Args:
    - nrows (int): Number of rows in the subplot grid.
    - ncols (int): Number of columns in the subplot grid.
    - last_file_index (int): Flat (row-major) index of the last subplot in use.
    - axs: Array of Axes exposing a `.flat` iterator.
    """
    # Adjust for the case when there are fewer files than the number of subplots
    total_slots = nrows * ncols
    spare_index = last_file_index + 1
    while spare_index < total_slots:
        axs.flat[spare_index].set_visible(False)
        spare_index += 1

def save_plot(fig, input_dir, column, save_fig):
    """
    Saves the figure as "<dataset>_<column>_histograms.png" inside `save_fig`.

    The dataset name is the parent directory of `input_dir`
    (e.g. ".../nyon_2022/prediction_results" gives "nyon_2022"). The path is
    normalised first so a trailing separator or platform-specific separators do not
    change the result.

    Args:
    - fig: Figure object exposing `savefig(path)`.
    - input_dir (str): Directory the plotted CSV files were read from.
    - column (str): Name of the plotted column; becomes part of the file name.
    - save_fig (str): Directory in which the image is written; created if missing.
    """
    results_dir = os.path.normpath(input_dir)
    # Fall back to the directory's own name when it has no named parent.
    dataset_name = os.path.basename(os.path.dirname(results_dir)) or os.path.basename(results_dir)

    if save_fig:
        os.makedirs(save_fig, exist_ok=True)

    save_path = os.path.join(save_fig, f"{dataset_name}_{column}_histograms.png")
    fig.savefig(save_path)
    print(f"Figure saved to {save_path}")


    

In [38]:
# Sentiment-score histograms for every municipality/year; each figure is saved next to its data.
for results_path, pv_label in (
    (nyon2022_results_path, "Nyon 2022"),
    (nyon2023_results_path, "Nyon 2023"),
    (vevey2022_results_path, "Vevey 2022"),
    (vevey2023_results_path, "Vevey 2023"),
):
    plot_histograms(
        results_path,
        column="sentiment_pred",
        title=f"Rating distribution of {pv_label} PVs",
        k_subplots=3,
        save_fig=results_path,
    )

Figure saved to ../esg_rating/src/data/csv_data/nyon_2022/prediction_results\nyon_2022_sentiment_pred_histograms.png
Figure saved to ../esg_rating/src/data/csv_data/nyon_2023/prediction_results\nyon_2023_sentiment_pred_histograms.png
Figure saved to ../esg_rating/src/data/csv_data/vevey_2022/prediction_results\vevey_2022_sentiment_pred_histograms.png
Figure saved to ../esg_rating/src/data/csv_data/vevey_2023/prediction_results\vevey_2023_sentiment_pred_histograms.png


In [31]:
# Predicted-class bar charts for every municipality/year; each figure is saved next to its data.
for results_path, pv_label in (
    (nyon2022_results_path, "Nyon 2022"),
    (nyon2023_results_path, "Nyon 2023"),
    (vevey2022_results_path, "Vevey 2022"),
    (vevey2023_results_path, "Vevey 2023"),
):
    plot_histograms(
        results_path,
        column="esg_predictor",
        title=f"Class distribution of {pv_label} PVs",
        k_subplots=3,
        save_fig=results_path,
    )

Figure saved to ../esg_rating/src/data/csv_data/nyon_2022/prediction_results\nyon_2022_esg_predictor_histograms.png
Figure saved to ../esg_rating/src/data/csv_data/nyon_2023/prediction_results\nyon_2023_esg_predictor_histograms.png
Figure saved to ../esg_rating/src/data/csv_data/vevey_2022/prediction_results\vevey_2022_esg_predictor_histograms.png
Figure saved to ../esg_rating/src/data/csv_data/vevey_2023/prediction_results\vevey_2023_esg_predictor_histograms.png


In [32]:

def plot_average_sentiment(df, class_label_column='esg_predictor', sentiment_column='sentiment_pred', display=True, save_fig=None):
    """
    Plots the mean sentiment per class label as a bar chart, sorted ascending.

    Args:
    - df (pd.DataFrame): Data containing the class label and sentiment columns.
    - class_label_column (str, optional): Column to group by. Defaults to 'esg_predictor'.
    - sentiment_column (str, optional): Column to average. Defaults to 'sentiment_pred'.
    - display (bool, optional): Whether to display the plot. If False the figure is closed. Defaults to True.
    - save_fig (str, optional): Directory in which "avg_sentiment_by_class_label.png" is written.
      If None, the figure is not saved. Defaults to None.
    """
    # Calculate the mean sentiment for each class label, sorted by that mean
    avg_sentiment = df.groupby(class_label_column)[sentiment_column].mean().sort_values()

    # Plotting
    fig, ax = plt.subplots(figsize=(10, 6))
    avg_sentiment.plot(kind='bar', ax=ax, color='skyblue', edgecolor='black')
    ax.set_title('Average Sentiment Rating by Class Label')
    ax.set_xlabel('Class Label')
    ax.set_ylabel('Average Sentiment Rating')
    ax.grid(axis='y', linestyle='--', linewidth=0.7)

    plt.setp(ax.get_xticklabels(), rotation=45)
    fig.tight_layout()  # Adjust layout to make room for the rotated x-axis labels

    # Honour save_fig/display, which were previously accepted but ignored.
    if save_fig is not None:
        save_path = os.path.join(save_fig, "avg_sentiment_by_class_label.png")
        fig.savefig(save_path)
        print(f"Figure saved to {save_path}")

    if display:
        plt.show()
    else:
        plt.close(fig)

# df = pd.read_csv("../esg_rating/src/data/csv_data/nyon_2022/prediction_results/ccpv220530.csv", encoding="utf-16")


def plot_average_sentiment_all(input_dir, class_label_column='esg_predictor', sentiment_column='sentiment_pred', save_fig=None, display=False):
    """
    Plots, for every CSV file in a directory, the mean sentiment per class label.

    The result is one grouped bar chart with one bar per file within each class label.
    Files are processed in sorted name order; files that are empty or lack the
    required columns are reported and skipped.

    Args:
    - input_dir (str): Directory containing UTF-16 encoded CSV files.
    - class_label_column (str, optional): Column to group by. Defaults to 'esg_predictor'.
    - sentiment_column (str, optional): Column to average. Defaults to 'sentiment_pred'.
    - save_fig (str, optional): Directory to save the figure. If None, the figure is not saved. Defaults to None.
    - display (bool, optional): Whether to display the plot. If False the figure is closed. Defaults to False.

    Returns:
    - None. Prints a message and returns early when there is nothing to plot.
    """
    # os.listdir returns entries in arbitrary order; sort for a stable legend order.
    files = sorted(f for f in os.listdir(input_dir) if f.endswith(".csv"))

    if not files:
        print("No CSV files found in the directory.")
        return

    used_files = []
    per_file_means = []
    for file in files:
        try:
            df = pd.read_csv(os.path.join(input_dir, file), encoding="utf-16")
            file_means = df.groupby(class_label_column)[sentiment_column].mean()
        except pd.errors.EmptyDataError:
            print(f"File {file} is empty or invalid. Skipping...")
            continue
        except KeyError:
            print(f"Expected columns not found in the file {file}. Skipping...")
            continue
        used_files.append(file)
        per_file_means.append(file_means)

    if not per_file_means:
        print("No usable CSV files found in the directory.")
        return

    # One column per file, labelled with the file name; rows are the class labels.
    avg_sentiments = pd.concat(per_file_means, axis=1, keys=used_files)

    # Plotting
    fig, ax = plt.subplots(figsize=(10, 6))
    avg_sentiments.plot(kind='bar', ax=ax, color=sns.color_palette('tab10', n_colors=len(used_files)), edgecolor='black')
    ax.set_title('Average Sentiment Rating by Class Label')
    ax.set_xlabel('Class Label')
    ax.set_ylabel('Average Sentiment Rating')
    ax.grid(axis='y', linestyle='--', linewidth=0.7)
    ax.legend(title='File Name', loc='upper left', bbox_to_anchor=(1, 1))

    plt.setp(ax.get_xticklabels(), rotation=45)
    fig.tight_layout()  # Adjust layout to make room for the rotated x-axis labels

    if save_fig is not None:
        # Name the image after the parent of input_dir (e.g. "nyon_2022"); normalising
        # first keeps this correct with a trailing separator or other path separators.
        results_dir = os.path.normpath(input_dir)
        dataset_name = os.path.basename(os.path.dirname(results_dir)) or os.path.basename(results_dir)
        save_path = os.path.join(save_fig, f"{dataset_name}_avg_sentiment_by_class_label.png")
        fig.savefig(save_path)
        print(f"Figure saved to {save_path}")

    if display:
        plt.show()
    else:
        plt.close(fig)


    
    

In [33]:
# Average sentiment per class label for every municipality/year; figures are saved next to the data.
for results_path in (
    nyon2022_results_path,
    nyon2023_results_path,
    vevey2022_results_path,
    vevey2023_results_path,
):
    plot_average_sentiment_all(results_path, save_fig=results_path)

Figure saved to ../esg_rating/src/data/csv_data/nyon_2022/prediction_results\nyon_2022_avg_sentiment_by_class_label.png
Figure saved to ../esg_rating/src/data/csv_data/nyon_2023/prediction_results\nyon_2023_avg_sentiment_by_class_label.png
Figure saved to ../esg_rating/src/data/csv_data/vevey_2022/prediction_results\vevey_2022_avg_sentiment_by_class_label.png
Figure saved to ../esg_rating/src/data/csv_data/vevey_2023/prediction_results\vevey_2023_avg_sentiment_by_class_label.png


In [36]:
# Load a single Nyon 2023 prediction file to inspect its columns and values.
sample_results_file = "../esg_rating/src/data/csv_data/nyon_2023/prediction_results/ccpv231113.csv"
df = pd.read_csv(sample_results_file, encoding="utf-16")
df

Unnamed: 0,section_number,text,esg_predictor,sentiment_pred
0,1,1. \n\nAppel : \n\n87 Conseillères et Conseil...,non-esg,1.346
1,2,2. \n\nProcès-verbal de la séance du 2 octobre...,non-esg,1.936
2,3,3. \n\nApprobation de l’ordre du jour \n\nM. l...,environnemental,1.781
3,4,4. \n\nCommunications du Bureau \n\n• Le Bure...,environnemental,1.373
4,4,"Monge au sujet de l’hydrothermie, comme annonc...",environnemental,1.552
...,...,...,...,...
70,12,12. \n\nRapport-préavis N° 2023/122 \nRéponse ...,non-esg,1.728
71,13,13. \n\nPréavis N° 2023/123 \nPlace Saint-Mar...,non-esg,1.506
72,14,14. \n\nPréavis N° 2023/124 \nBudget 2024 - Bu...,non-esg,1.675
73,15,15. \n\nRéponse à l’interpellation de Mme Mari...,environnemental,2.247
