In [8]:
import pandas as pd
import numpy as np
import torch
import json
from datetime import datetime
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F
from tqdm import tqdm

# === 1. Load the news articles ===
# Read the JSON news dump, keep only the records that have a title, a company
# label and a publication date, then normalise the company label
# (whitespace-trimmed, upper-case).
with open("actualitesIlboursa.json", "r", encoding="utf-8") as f:
    data = json.load(f)

df = pd.DataFrame(data).dropna(subset=["titre", "libelle", "datePublication"])
df = df.assign(libelle=df["libelle"].str.strip().str.upper())

# === 2. Load the RoBERTa model ===
# NOTE(review): going by its name, this checkpoint is an *emotion* classifier
# trained on English text, not a negative/neutral/positive sentiment model.
# Confirm on the model card that its output classes match what the scoring
# code expects, and that it suits the language of the articles being scored.
model_name = "j-hartmann/emotion-english-distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.eval()  # switch the module to evaluation (inference) mode

# === 3. Lexicons used to count positive / negative mentions ===
# Entries are lower-case French words or phrases.
lexique_positif = [
    "croissance",
    "hausse",
    "augmentation",
    "résultat positif",
    "bonne performance",
    "recommandation à l'achat",
    "profit",
    "dividende",
    "bénéfice",
    "optimisme",
    "amélioration",
]
lexique_negatif = [
    "baisse",
    "perte",
    "diminution",
    "résultat négatif",
    "alerte",
    "crise",
    "affaiblissement",
    "déclin",
    "recul",
    "mauvaise performance",
    "retrait",
]

# === 4. Fonction de prédiction ===
# Class names (lower-cased) that count as positive / negative / neutral mass.
# Covers both plain sentiment heads and emotion heads; any other class
# (e.g. "surprise") carries no polarity but still counts as non-neutral.
_POSITIVE_LABELS = frozenset({"positive", "pos", "joy"})
_NEGATIVE_LABELS = frozenset(
    {"negative", "neg", "anger", "disgust", "fear", "sadness"}
)
_NEUTRAL_LABELS = frozenset({"neutral", "neu"})


def _polarity_masses(probs, id2label):
    """Fold per-class probabilities into (positive, neutral, negative) mass."""
    pos = neu = neg = 0.0
    matched = False
    for idx, prob in enumerate(probs):
        name = str(id2label.get(idx, "")).strip().lower()
        if name in _POSITIVE_LABELS:
            pos += float(prob)
        elif name in _NEGATIVE_LABELS:
            neg += float(prob)
        elif name in _NEUTRAL_LABELS:
            neu += float(prob)
        else:
            continue
        matched = True
    if matched:
        return pos, neu, neg
    if len(probs) == 3:
        # Generic class names (e.g. LABEL_0..2): keep the historical
        # positional convention 0 = negative, 1 = neutral, 2 = positive.
        return float(probs[2]), float(probs[1]), float(probs[0])
    raise ValueError(
        f"Cannot derive a sentiment polarity from model labels: {id2label!r}"
    )


def get_sentiment(text):
    """Score one text with the module-level ``tokenizer`` / ``model``.

    The class probabilities are mapped to polarity through the label names in
    ``model.config.id2label`` instead of fixed indices. The previous version
    read indices 0/1/2 as negative/neutral/positive; per its model card
    (verify there), the checkpoint configured in this file is a 7-class
    emotion model, so those indices did not carry that meaning.

    Args:
        text: The text to classify. It is truncated to 512 tokens.

    Returns:
        A ``(score, subjectivity, label)`` tuple where ``score`` is the
        positive minus the negative probability mass, ``subjectivity`` is
        one minus the neutral mass, and ``label`` is ``"POS"`` when
        ``score > 0.05``, ``"NEG"`` when ``score < -0.05``, else ``"NEU"``.

    Raises:
        ValueError: If the model's label names are not recognised and the
            model does not have exactly three classes.
    """
    # No padding: a single sequence does not need to be padded to 512 tokens,
    # and padding only makes every forward pass slower.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**inputs).logits
    # Batch of one: take row 0 and convert to plain Python floats.
    probs = F.softmax(logits, dim=-1)[0].tolist()
    id2label = getattr(model.config, "id2label", None) or {}
    pos, neu, neg = _polarity_masses(probs, id2label)

    score = pos - neg
    subjectivity = 1.0 - neu
    if score > 0.05:
        label = "POS"
    elif score < -0.05:
        label = "NEG"
    else:
        label = "NEU"
    return score, subjectivity, label

# === 5. Score every news article ===
def _as_text(value):
    """Return *value* as a string, mapping missing values (None/NaN) to ''."""
    if isinstance(value, str):
        return value
    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
        return ""
    return str(value)


articles = []
for _, row in tqdm(df.iterrows(), total=len(df)):
    entreprise = row["libelle"]
    try:
        # Only the leading "YYYY-MM-DD" part of the timestamp is used.
        date = datetime.strptime(row["datePublication"][:10], "%Y-%m-%d").date()
    except (TypeError, ValueError):
        # Non-string or unparseable publication date: skip the article.
        # (Narrow catch so Ctrl-C and real bugs are no longer swallowed.)
        continue
    if date.year > 2100:
        # Presumably a guard against corrupt dates; confirm with the data owner.
        continue

    # A missing field must not end up as the literal text "nan" in the input.
    titre = _as_text(row.get("titre", ""))
    contenu = _as_text(row.get("contenu", ""))
    texte = f"{titre} {contenu}".strip().lower()
    if not texte:
        continue

    # NOTE: str.count matches substrings, so a lexicon entry is also counted
    # when it appears inside a longer word.
    nb_pos = sum(texte.count(mot) for mot in lexique_positif)
    nb_neg = sum(texte.count(mot) for mot in lexique_negatif)

    score, subj, label = get_sentiment(texte)
    articles.append({
        "Libelle_Long": entreprise,
        "Date_seance": date,
        # Plain floats keep the per-session score lists readable on export.
        "score_analysis": float(score),
        "subjectivity": float(subj),
        "sentiment_analysis": label,
        "Mentions_Positives": nb_pos,
        "Mentions_Négatives": nb_neg,
    })

# Explicit columns so the frame keeps its schema even when no article was kept.
df_articles = pd.DataFrame(
    articles,
    columns=[
        "Libelle_Long",
        "Date_seance",
        "score_analysis",
        "subjectivity",
        "sentiment_analysis",
        "Mentions_Positives",
        "Mentions_Négatives",
    ],
)
df_articles["Date_seance"] = pd.to_datetime(df_articles["Date_seance"])

# === 6. Export the per-article scores ===
# The same table is written twice: as CSV (encoded "utf-8-sig") and as an
# Excel workbook, both without the index column.
df_articles.to_csv("scores_par_article.csv", index=False, encoding="utf-8-sig")
df_articles.to_excel("scores_par_article.xlsx", index=False)

# === 7. Aggregate per trading session ===
# One row per (company, session date): mean and variance of the article
# scores, mean subjectivity, and the number of articles.
df_sentiments = (
    df_articles
    .groupby(["Libelle_Long", "Date_seance"])
    .agg(
        Sentiment_Moyen=("score_analysis", "mean"),
        Volatilite_Sentiment=("score_analysis", "var"),
        Subjectivite_Moyenne=("subjectivity", "mean"),
        nb_articles_par_seance=("sentiment_analysis", "count"),
    )
    .reset_index()
)

# === 8. Per-session list of article scores and mention totals ===
df_scores = (
    df_articles
    .groupby(["Libelle_Long", "Date_seance"])
    .agg({
        "score_analysis": list,
        "Mentions_Positives": "sum",
        "Mentions_Négatives": "sum",
    })
    .reset_index()
    .rename(columns={"score_analysis": "liste_scores_par_article"})
)

# === 9. Load the technical indicators ===
# Company labels are trimmed and upper-cased, and session dates are parsed
# to datetimes.
df_indic = pd.read_csv("indicateurs_techniques.csv")
df_indic = df_indic.assign(
    Libelle_Long=df_indic["Libelle_Long"].str.strip().str.upper(),
    Date_seance=pd.to_datetime(df_indic["Date_seance"]),
)

# === 10. Global merge ===
# Left joins keep every indicator row; sessions without news get NaN in the
# sentiment columns.
df_final = (
    df_indic
    .merge(df_sentiments, on=["Libelle_Long", "Date_seance"], how="left")
    .merge(df_scores, on=["Libelle_Long", "Date_seance"], how="left")
)

# === 11. Clean-up ===
def _coerce_score_list(value):
    """Return *value* as a list of scores; missing or invalid cells become []."""
    import ast

    if isinstance(value, list):
        return value
    if isinstance(value, str):
        # A list serialised as text: parse it as a literal only. eval() would
        # execute arbitrary code found in the cell.
        try:
            parsed = ast.literal_eval(value)
        except (ValueError, SyntaxError):
            return []
        return list(parsed) if isinstance(parsed, (list, tuple)) else []
    # NaN / None produced by the left joins for sessions without any article.
    return []


# Sessions without news: neutral sentiment, zero counts, empty score list.
df_final["Sentiment_Moyen"] = df_final["Sentiment_Moyen"].fillna(0.0)
df_final["Volatilite_Sentiment"] = df_final["Volatilite_Sentiment"].fillna(0.0)
df_final["Subjectivite_Moyenne"] = df_final["Subjectivite_Moyenne"].fillna(0.0)
df_final["nb_articles_par_seance"] = df_final["nb_articles_par_seance"].fillna(0).astype(int)
df_final["Mentions_Positives"] = df_final["Mentions_Positives"].fillna(0).astype(int)
df_final["Mentions_Négatives"] = df_final["Mentions_Négatives"].fillna(0).astype(int)
df_final["liste_scores_par_article"] = df_final["liste_scores_par_article"].apply(_coerce_score_list)

# === 12. Sentiment global + variation + tendance ===
def classify_sentiment(score):
    """Map a sentiment score to a label.

    Returns "POS" when the score is above 0.05, "NEG" when it is below -0.05,
    and "NEU" for anything else (including the two threshold values).
    """
    if score > 0.05:
        return "POS"
    if score < -0.05:
        return "NEG"
    return "NEU"

# Session-level label derived from the mean score.
df_final["sentiment_analysis"] = df_final["Sentiment_Moyen"].map(classify_sentiment)

# Order rows per company and date so the rolling mean and the difference
# below run over consecutive rows of the same company.
df_final = df_final.sort_values(by=["Libelle_Long", "Date_seance"])
df_final["Sentiment_Mobile_3j"] = (
    df_final
    .groupby("Libelle_Long")["Sentiment_Moyen"]
    .transform(lambda serie: serie.rolling(window=3, min_periods=1).mean())
)
df_final["Variation_Sentiment_J-1"] = (
    df_final.groupby("Libelle_Long")["Sentiment_Moyen"].diff(periods=1)
)

def tendance(row):
    """Classify the change in sentiment stored in ``row``.

    Reads ``row["Variation_Sentiment_J-1"]`` and returns "HAUSSE" when it is
    above 0.02, "BAISSE" when it is below -0.02, and "STABLE" otherwise,
    including when the value is missing.
    """
    delta = row["Variation_Sentiment_J-1"]
    if pd.isna(delta):
        return "STABLE"
    if delta > 0.02:
        return "HAUSSE"
    if delta < -0.02:
        return "BAISSE"
    return "STABLE"

# Label every row with its sentiment trend by applying tendance() row-wise.
df_final["Tendance_Sentiment"] = df_final.apply(tendance, axis=1)

# === 13. Full export ===
# The merged dataset is written as CSV (encoded "utf-8-sig") and as an Excel
# workbook, both without the index column.
df_final.to_csv("dataset_bourse_complet.csv", index=False, encoding="utf-8-sig")
df_final.to_excel("dataset_bourse_complet.xlsx", index=False)

print("✅ Fichier final prêt : dataset_bourse_complet.csv/.xlsx")


100%|██████████| 7386/7386 [23:01<00:00,  5.35it/s]


✅ Fichier final prêt : dataset_bourse_complet.csv/.xlsx
