In [1]:
import pandas as pd
from transformers import pipeline
import matplotlib.pyplot as plt

## Sentiment Analysis by aggregating predictions

In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch


def normalize_label(label, score):
    """
    Convert a (label, confidence) pair into a confidence-weighted star value.

    Word labels are mapped onto the 1-5 star scale ('positive'/'pos' -> 5,
    'neutral' -> 3, 'negative'/'neg' -> 1). Any other label is expected to
    start with an integer star count, e.g. '4 stars' or '1 star'.
    Matching ignores letter case and surrounding whitespace.

    Args:
        label: class label emitted by a model.
        score: confidence attached to that label.

    Returns:
        The star value multiplied by `score`.

    Raises:
        ValueError: if the label is neither a known word label nor starts
            with an integer.

    NOTE(review): because the star value is multiplied by the confidence,
    the result is not confined to the 1-5 range (a '1 star' prediction with
    confidence 0.25 yields 0.25) -- confirm this weighting is intended.
    """
    label_map = {'positive': 5, 'pos': 5, 'neutral': 3, 'negative': 1, 'neg': 1}
    key = str(label).strip().lower()
    if key in label_map:
        return label_map[key] * score
    try:
        # Star labels look like "4 stars": the leading token is the rating.
        star_rating = int(key.split()[0])
    except (IndexError, ValueError):
        raise ValueError(f"Unrecognized sentiment label: {label!r}") from None
    return star_rating * score

def aggregate_predictions(predictions):
    """
    Combine the predictions of several models into a single score.

    `predictions` is an iterable of (label, score) pairs, one per model.
    Every pair is converted with `normalize_label`; the arithmetic mean of
    the converted values is returned, or None when there are no predictions.
    """
    weighted = []
    for label, score in predictions:
        weighted.append(normalize_label(label, score))

    # Nothing to average: signal it explicitly instead of dividing by zero.
    if not weighted:
        return None
    return sum(weighted) / len(weighted)

def load_model(model_name: str):
    """
    Load a sequence-classification checkpoint together with its tokenizer.

    `model_name` is handed unchanged to the `from_pretrained` constructors of
    `AutoTokenizer` and `AutoModelForSequenceClassification`.

    Returns a `(model, tokenizer)` tuple -- note the model comes first.
    """
    # The tokenizer is fetched first, then the model, as a single pair.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return AutoModelForSequenceClassification.from_pretrained(model_name), tokenizer

# Function to predict in batches
def predict_sentiment(texts ,model, tokenizer):
    """
    Classify one text or a batch of texts and return the top class of each.

    Args:
        texts: value passed straight to `tokenizer` (the notebook passes a
            single string per call); padding and truncation are enabled.
        model: called as `model(**inputs)`; its result must expose `.logits`
            and `model.config.id2label` must map class indices to labels.
        tokenizer: callable producing the keyword inputs for `model`.

    Returns:
        `(predicted_classes, probas)`: two lists with one entry per input
        row -- the label of the most probable class and that probability.
    """
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
    # One pass yields both the winning probability and its class index.
    top_probas, top_indices = probabilities.max(dim=1)
    predicted_classes = [model.config.id2label[idx.item()] for idx in top_indices]

    return predicted_classes, top_probas.tolist()




# Example usage: aggregate one word-label prediction and one star-label prediction
predictions = [
    ('positive', 0.8),  # From the sentiment model (label, confidence)
    ('4 stars', 0.9),  # From a 5-star rating model (label, confidence)
    # Add more as needed
]

# Mean of the per-model values produced by normalize_label
final_score = aggregate_predictions(predictions)
print(f"Aggregated Score: {final_score}")


Aggregated Score: 3.8


In [8]:
# Checkpoints to compare; their order must match "model_names" below.
model_checkpoints = ["nlptown/bert-base-multilingual-uncased-sentiment", "bardsai/finance-sentiment-fr-base", "cmarkea/distilcamembert-base-sentiment"]

# Load every checkpoint exactly once. load_model returns (model, tokenizer),
# so calling it separately for the "models" and "tokenizers" lists would
# load each model and tokenizer twice.
loaded_pairs = [load_model(checkpoint) for checkpoint in model_checkpoints]

# Parallel lists: index i of every entry refers to the same checkpoint.
models = {
    "model_paths":  model_checkpoints,
    "model_names": ["bert", "finance", "camembert"],
    "models": [pair[0] for pair in loaded_pairs],
    "tokenizers": [pair[1] for pair in loaded_pairs]
}


In [9]:
# Load the input table from CSV into `df`.
df = pd.read_csv("./data/pv2312_512_predictions.csv")

In [10]:
# Plain Python list holding the values of the 'text' column.
texts = df['text'].tolist()

In [7]:
# for m,t,model_name in zip(models["models"], models["tokenizers"], models["model_names"]):
#     print(f"Predicting with {model_name}")
#     df[model_name] = predict_sentiment(texts, m, t)

Predicting with bert
Predicting with finance
Predicting with camembert


In [16]:
from tqdm import tqdm 

# Run every registered model on every text. Results are collected in plain
# lists and written to `df` once at the end, rather than assigning into the
# frame cell by cell while iterating over it.
pred_columns = {f"{model_name}_pred": [] for model_name in models["model_names"]}
aggregate_scores = []

for row_text in tqdm(df["text"], total=len(df)):
    row_predictions = []
    for model_name, sa_model, sa_tokenizer in zip(
        models["model_names"], models["models"], models["tokenizers"]
    ):
        # A single text goes in, so the first (only) entry is the result.
        labels, probas = predict_sentiment(row_text, sa_model, sa_tokenizer)
        pred_columns[f"{model_name}_pred"].append(labels[0])
        row_predictions.append((labels[0], probas[0]))
    aggregate_scores.append(aggregate_predictions(row_predictions))

# One column per model ("<name>_pred") plus the combined score.
for column_name, column_values in pred_columns.items():
    df[column_name] = column_values
df["aggregate_prediction"] = aggregate_scores
    



100%|██████████| 266/266 [01:29<00:00,  2.98it/s]


In [17]:
# Write `df` to CSV without the index column.
# NOTE(review): presumably the ./results directory has to exist beforehand -- confirm.
df.to_csv("./results/pv2312_512_SA.csv", index=False)

In [19]:
# Spot check: run each of the three models on the first text only.
sample_text = df['text'][0]
sa_models = models["models"]
sa_tokenizers = models["tokenizers"]

bert_pred, bert_prob = predict_sentiment(sample_text, sa_models[0], sa_tokenizers[0])
finance_pred, finance_prob = predict_sentiment(sample_text, sa_models[1], sa_tokenizers[1])
camembert_pred, camembert_prob = predict_sentiment(sample_text, sa_models[2], sa_tokenizers[2])

# Each call returned one-element lists; show the label and its probability.
print(f"BERT: {bert_pred[0]} with confidence {bert_prob[0]}")
print(f"Finance: {finance_pred[0]} with confidence {finance_prob[0]}")
print(f"Camembert: {camembert_pred[0]} with confidence {camembert_prob[0]}")


BERT: 1 star with confidence 0.24373239278793335
Finance: neutral with confidence 0.9998511075973511
Camembert: 5 stars with confidence 0.23412476480007172


In [3]:

# NOTE(review): this rebinds `df` to a CSV at a different path than the
# ./data/... file loaded earlier in this notebook -- confirm which is intended.
df = pd.read_csv("../../esg_classification/notebooks/pv2312_512_predictions.csv")


# Preview the first rows.
df.head()

Unnamed: 0,section_number,text,cbl_pred,cbl_1024_pred
0,1,1. \n\nAppel : \n\n93 Conseillères et Conseil...,non-esg,gouvernance
1,2,2. \n\nProcès-verbal de la séance du 13 novemb...,non-esg,non-esg
2,3,3. \n\nApprobation de l’ordre du jour \n\nM. l...,non-esg,non-esg
3,4,4. \n\nCommunications du Bureau \n\n• M. le ...,non-esg,gouvernance
4,4,"Or, il a appris ce matin qu’un membre du Conse...",non-esg,gouvernance


In [4]:
# Build a text-classification pipeline from the cmarkea/distilcamembert-base-sentiment checkpoint.
nlp = pipeline("text-classification", model="cmarkea/distilcamembert-base-sentiment")

    PyTorch 2.1.1+cu121 with CUDA 1201 (you have 2.1.1+cpu)
    Python  3.11.6 (you have 3.11.2)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


In [9]:
# Sanity check on a short French sentence: keep the label of the first result.
res = nlp("je te hais")[0]['label']
print(res)
# The first character of the label is parsed as the integer rating.
int(res[0])

1 star


1

In [10]:
# Integer rating per row: first character of the label of the first result returned by `nlp`.
# NOTE(review): x[:512] keeps the first 512 characters, not 512 tokens -- confirm this is the intended truncation.
df["sentiment"] = df["text"].apply(lambda x: int(nlp(x[:512])[0]["label"][0])) 

In [11]:
# Preview the first rows of `df`.
df.head()

Unnamed: 0,section_number,text,cbl_pred,cbl_1024_pred,sentiment
0,1,1. \n\nAppel : \n\n93 Conseillères et Conseil...,non-esg,gouvernance,5
1,2,2. \n\nProcès-verbal de la séance du 13 novemb...,non-esg,non-esg,4
2,3,3. \n\nApprobation de l’ordre du jour \n\nM. l...,non-esg,non-esg,4
3,4,4. \n\nCommunications du Bureau \n\n• M. le ...,non-esg,gouvernance,5
4,4,"Or, il a appris ce matin qu’un membre du Conse...",non-esg,gouvernance,4


In [12]:
# Write `df` to CSV in the current directory, omitting the index column.
df.to_csv("./sentiment_res_cmarkea.csv", index=False)

In [13]:
# Count how many rows carry each value of the 'sentiment' column.
sentiment_count = df["sentiment"].value_counts()
sentiment_count

sentiment
4    76
3    75
2    55
5    52
1     8
Name: count, dtype: int64

In [15]:
# Load a second predictions CSV (the "1024" file, going by its name) into `df2`.
df2 = pd.read_csv("../../esg_classification/notebooks/pv2312_1024_predictions.csv")

In [16]:
# Preview the first rows of `df2`.
df2.head()

Unnamed: 0,section_number,text,cbl_pred,cbl_long_pred
0,1,1. \n\nAppel : \n\n93 Conseillères et Conseil...,non-esg,gouvernance
1,2,2. \n\nProcès-verbal de la séance du 13 novemb...,non-esg,non-esg
2,3,3. \n\nApprobation de l’ordre du jour \n\nM. l...,non-esg,non-esg
3,4,4. \n\nCommunications du Bureau \n\n• M. le ...,non-esg,gouvernance
4,5,5. \n\nCommunications municipales \n\nM. le M...,environnemental,non-esg


In [17]:
# Integer rating per row: first character of the label of the first result returned by `nlp`.
# NOTE(review): x[:512] keeps the first 512 characters, not 512 tokens -- confirm this is the intended truncation.
df2["sentiment"] = df2["text"].apply(lambda x: int(nlp(x[:512])[0]["label"][0])) 

In [18]:
# Preview the first rows of `df2`.
df2.head()

Unnamed: 0,section_number,text,cbl_pred,cbl_long_pred,sentiment
0,1,1. \n\nAppel : \n\n93 Conseillères et Conseil...,non-esg,gouvernance,5
1,2,2. \n\nProcès-verbal de la séance du 13 novemb...,non-esg,non-esg,4
2,3,3. \n\nApprobation de l’ordre du jour \n\nM. l...,non-esg,non-esg,4
3,4,4. \n\nCommunications du Bureau \n\n• M. le ...,non-esg,gouvernance,5
4,5,5. \n\nCommunications municipales \n\nM. le M...,environnemental,non-esg,3


In [20]:
# Save under its own file name: ./sentiment_res_cmarkea.csv is already
# written from `df` earlier in this notebook, and reusing that path here
# would overwrite those results.
df2.to_csv("./sentiment_res_cmarkea_1024.csv", index=False)
# Count how many rows carry each value of the 'sentiment' column.
sentiment_count = df2["sentiment"].value_counts()
sentiment_count


sentiment
4    36
3    34
5    30
2    27
1     2
Name: count, dtype: int64