In [2]:
import pandas as pd
import numpy as np
import yfinance as yf
from newsapi import NewsApiClient
from datetime import datetime, timedelta
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from random import sample
import spacy
from gdeltdoc import GdeltDoc, Filters
from newspaper import Article
import pandas as pd
from datetime import datetime, timedelta
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from concurrent.futures import ThreadPoolExecutor





  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# -------- Setter --------
ASSET = "BTC-USD"  # what asset to analyze fron Yahoo Finance
QUERIES = {
     "bitcoin","crypto","cryptocurrency","ethereum","blockchain",
    "binance","coinbase","solana","xrp","defi","web3","nft",
    "mining","hashrate","regulation","hack","exchange","stablecoin"
}



  # what to search in news from NewsAPI
N_DAYS = 30  # nb days for analysis
NEWSAPI_KEY = "5bcdfd2037114aa591e1cff0ccd93b81" 

# -------- 1. Get PRIX --------
end = datetime.utcnow()
start = end - timedelta(days=N_DAYS + 5)  # margin of 5 days to ensure enough data

prices = yf.download(ASSET, start=start, end=end, interval="1d", group_by="column")

if isinstance(prices.columns, pd.MultiIndex):
    prices.columns = prices.columns.get_level_values(0)
    
    
prices = prices[["Close"]].rename(columns={"Close": "price"})
prices["return"] = prices["price"].pct_change()

prices = prices.reset_index()    
prices["date"] = prices["Date"].dt.normalize()
prices_daily = prices[["date", "price", "return"]]

print(prices.head())


  end = datetime.utcnow()
[*********************100%***********************]  1 of 1 completed

Price       Date          price    return       date
0     2025-10-26  114472.445312       NaN 2025-10-26
1     2025-10-27  114119.328125 -0.003085 2025-10-27
2     2025-10-28  112956.164062 -0.010193 2025-10-28
3     2025-10-29  110055.304688 -0.025681 2025-10-29
4     2025-10-30  108305.546875 -0.015899 2025-10-30





In [4]:
#-------- 2. Get news from GDELT --------

gd = GdeltDoc()

start_date = (end - timedelta(days=N_DAYS)).strftime("%Y-%m-%d")
end_date   = end.strftime("%Y-%m-%d")

all_news = []

for q in QUERIES:
  

    f = Filters(
        keyword=q,
        start_date=start_date,
        end_date=end_date,
        num_records=250,
        language="English"
    )

    try:
        df_q = gd.article_search(f)
    except ValueError as e:
        print(f"Skip '{q}' → {e}")
        continue

    if df_q is None or df_q.empty:
        print(f"{q}: 0 articles")
        continue

    df_q["query"] = q
    print(f"{q}: {len(df_q)} articles")
    all_news.append(df_q)

news_raw = pd.concat(all_news, ignore_index=True)
news_raw = news_raw.drop_duplicates(subset="url")
print("Unique URLs:", len(news_raw))



bitcoin: 250 articles
cryptocurrency: 250 articles
cryptocurrency: 250 articles
ethereum: 250 articles
ethereum: 250 articles
Skip 'nft' → The query was not valid. The API error message was: The specified phrase is too short.
Skip 'nft' → The query was not valid. The API error message was: The specified phrase is too short.
solana: 250 articles
solana: 250 articles
regulation: 250 articles
regulation: 250 articles
crypto: 250 articles
crypto: 250 articles
Skip 'web3' → The query was not valid. The API error message was: The specified phrase is too short.
Skip 'web3' → The query was not valid. The API error message was: The specified phrase is too short.
Skip 'defi' → The query was not valid. The API error message was: The specified phrase is too short.
Skip 'defi' → The query was not valid. The API error message was: The specified phrase is too short.
Skip 'xrp' → The query was not valid. The API error message was: The specified phrase is too short.
Skip 'xrp' → The query was not valid

In [5]:
#-------- 3. Scrape full texts --------

def fetch_article(url):
    try:
        article = Article(url, language="en")
        article.download()
        article.parse()
        txt = article.text
        return txt if len(txt) > 100 else None
    except:
        return None

urls_sample = news_raw["url"].dropna().unique()[:100]  # 300 premiers

with ThreadPoolExecutor(max_workers=10) as ex:
    fulltexts = list(ex.map(fetch_article, urls_sample))

url_to_text = dict(zip(urls_sample, fulltexts))

news_raw["fulltext"] = news_raw["url"].map(url_to_text)

news_clean = news_raw.dropna(subset=["fulltext"])
print("Articles scraped:", len(news_clean))
print(news_clean["fulltext"].head())

Articles scraped: 95
0    PHOENIX, Oct. 31, 2025 (GLOBE NEWSWIRE) -- Fol...
1    “I just got some bitcoin! Have you heard about...
2    Strategy Max Keiser. Photo by BeInCrypto\n\nSt...
3    Bitcoin Hyper: Best Crypto to Buy as Scandinav...
4    The Bitcoin whitepaper, A Peer-to-Peer Electro...
Name: fulltext, dtype: object


In [6]:
#-------- 5. Text cleaning with SpaCy --------

nlp = spacy.load("en_core_web_sm", disable=["ner","parser"])

def clean_text_spacy(txt):
    doc = nlp(txt.lower())
    return " ".join([
        tok.lemma_
        for tok in doc
        if tok.is_alpha and not tok.is_stop and len(tok) > 2
    ])

news_clean["clean_text"] = news_clean["fulltext"].apply(clean_text_spacy)


In [7]:
docs = news_clean["clean_text"].tolist()

topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True)

topics, probs = topic_model.fit_transform(docs)

news_clean["topic"] = topics
news_clean["topic_probmax"] = [
    float(np.max(p)) if p is not None else np.nan for p in probs
]

print("haha",news_clean["topic"].value_counts())

2025-11-30 02:05:32,858 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 3/3 [00:03<00:00,  1.18s/it]
2025-11-30 02:05:39,583 - BERTopic - Embedding - Completed ✓
2025-11-30 02:05:39,584 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
Batches: 100%|██████████| 3/3 [00:03<00:00,  1.18s/it]
2025-11-30 02:05:39,583 - BERTopic - Embedding - Completed ✓
2025-11-30 02:05:39,584 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-30 02:05:55,289 - BERTopic - Dimensionality - Completed ✓
2025-11-30 02:05:55,292 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-30 02:05:55,289 - BERTopic - Dimensionality - Completed ✓
2025-11-30 02:05:55,292 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-30 02:05:55,324 - BERTopic - Cluster - Completed ✓
2025-11-30 02:05:55,334 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-30 0

haha topic
 0    64
-1    17
 1    14
Name: count, dtype: int64


In [8]:

news_df = news_clean.copy()

# nettoyer et convertir la date
news_df["datetime"] = pd.to_datetime(news_df["seendate"], format="%Y%m%dT%H%M%SZ")
news_df["date"] = news_df["datetime"].dt.date

# renommer proprement
news_df = news_clean.copy()

news_df["datetime"] = pd.to_datetime(news_df["seendate"], format="%Y%m%dT%H%M%SZ")
news_df["date"] = news_df["datetime"].dt.date

news_df = news_df.rename(columns={
    "domain": "source",
    "fulltext": "text",
})


news_df = news_df[["datetime", "date", "source", "title", "text", "clean_text", "topic", "topic_probmax"]]

print(news_df.head())
print("Nombre final d'articles :", len(news_df))
print(topic_model.get_topic_info())

df_time = news_df[["clean_text", "datetime", "topic"]].rename(columns={
    "clean_text": "Document",
    "datetime": "Timestamp",
    "topic": "Topic",
})

topics_over_time = topic_model.topics_over_time(
    docs=df_time["Document"].tolist(),
    topics=df_time["Topic"].tolist(),
    timestamps=df_time["Timestamp"].tolist()
)


             datetime        date             source  \
0 2025-10-31 14:30:00  2025-10-31  globenewswire.com   
1 2025-11-08 13:15:00  2025-11-08      newyorker.com   
2 2025-11-01 07:45:00  2025-11-01  finance.yahoo.com   
3 2025-10-31 15:45:00  2025-10-31   bravenewcoin.com   
4 2025-11-01 17:15:00  2025-11-01       coindesk.com   

                                               title  \
0  Fold and Steak  n Shake Bring Bitcoin Rewards ...   
1  What Conversations About Bitcoin Sound Like to Me   
2  Strategy Reports $2 . 8B Q3 Profit , Bitcoin T...   
3  Bitcoin Hyper as Best Crypto to Buy ? Scandina...   
4          BTC Whitepaper Published This Day in 2008   

                                                text  \
0  PHOENIX, Oct. 31, 2025 (GLOBE NEWSWIRE) -- Fol...   
1  “I just got some bitcoin! Have you heard about...   
2  Strategy Max Keiser. Photo by BeInCrypto\n\nSt...   
3  Bitcoin Hyper: Best Crypto to Buy as Scandinav...   
4  The Bitcoin whitepaper, A Peer-to-Peer Elec

83it [00:01, 82.96it/s]
83it [00:01, 82.96it/s]


In [9]:
# -------- 3. Evaluiation of Bertopic's cluster --------

topics_words = []
for topic_id in set(topics):
    if topic_id == -1: continue
    words = [w for w, _ in topic_model.get_topic(topic_id)]
    topics_words.append(words)
    
# mesure how words in topics are related
docs_tokenized = [doc.split() for doc in news_df["text"]]


MAX_DOCS = 300
if len(docs_tokenized) > MAX_DOCS:
    docs_tokenized_sample = sample(docs_tokenized, MAX_DOCS)
else:
    docs_tokenized_sample = docs_tokenized

dictionary = Dictionary(docs_tokenized)

#c_v most used coherence measure 
cm = CoherenceModel(
    topics=topics_words,
    texts=docs_tokenized,
    dictionary=dictionary,
    coherence='c_v',
    processes=4
)

coherence = cm.get_coherence()
print("Topic Coherence:", coherence)

#HDBSCAN cluster stability score (more stable clusters have higher persistence)

clusterer = topic_model.hdbscan_model
stabilities = clusterer.cluster_persistence_
print(stabilities)

# doesn't work because there are few topics topic_model.visualize_topics()

topic_model.visualize_topics_over_time(topics_over_time)
topic_model.visualize_barchart(top_n_topics=10)
topic_model.visualize_heatmap()


Topic Coherence: 0.44376931500037947
[0.2399409  0.32289267]


In [10]:
#-------- 4. Sentiment analysis with FinBERT --------

# Charger le modèle FinBERT
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

# Fonction sentiment
def finbert_sentiment(text):
    try:
        # éviter erreurs sur None ou texte trop court
        if not isinstance(text, str) or len(text.strip()) == 0:
            return (0, 1, 0)

        inputs = tokenizer(
            text[:512],                # tronquer pour éviter erreurs GPU/CPU
            return_tensors="pt",
            truncation=True
        )
        with torch.no_grad():
            logits = model(**inputs).logits
        probs = torch.softmax(logits, dim=1).numpy()[0]

        return float(probs[0]), float(probs[1]), float(probs[2])  # neg, neu, pos

    except Exception as e:
        print("Error processing text:", e)
        return (0, 1, 0)

# Appliquer sur ton DataFrame
sentiments = news_df["text"].apply(finbert_sentiment)

news_df["sent_neg"] = sentiments.apply(lambda x: x[0])
news_df["sent_neu"] = sentiments.apply(lambda x: x[1])
news_df["sent_pos"] = sentiments.apply(lambda x: x[2])



# ========= 1) FEATURES POUR LE RL =========

# Intensité du sentiment (plus c’est loin de neutre, plus c’est fort)
news_df["sent_dir"] = news_df["sent_pos"] - news_df["sent_neg"]

# Longueur normalisée du texte
news_df["length_norm"] = news_df["text"].str.len() / news_df["text"].str.len().max()

# Poids de la source (auto : moins un média publie souvent, plus il pèse)
source_freq = news_df["source"].value_counts()
source_importance = 1 / (1 + source_freq)
news_df["source_weight"] = news_df["source"].map(source_importance).fillna(0.5)

# Poids de récence
news_df["hours_since"] = (datetime.utcnow() - news_df["datetime"]).dt.total_seconds() / 3600
news_df["recency_weight"] = np.exp(- news_df["hours_since"] / 24)  # demi-vie ≈ 24h

# ========= 2) BOUCLE RL SUR LES PARAMS =========

# param initiaux
params = {
    "ws": 1.0,  # poids sent_strength
    "wt": 1.0,  # poids topic_probmax
    "wr": 1.0,  # poids recency_weight
    "wl": 1.0,  # poids length_norm
    "wb": 1.0   # poids source_weight
}

def compute_rl_sentiment(df, params):
    return (
        (1  +
        params["wt"] * df["topic_probmax"] +
        params["wr"] * df["recency_weight"] +
        params["wl"] * df["length_norm"] +
        params["wb"] * df["source_weight"]) * df["sent_dir"]
    )

lr = 0.01  # learning rate

# On s'assure que prices_daily['date'] est en datetime (comme news_df['date'])
prices_daily["date"] = pd.to_datetime(prices_daily["date"])
news_df["date"] = pd.to_datetime(news_df["date"])

for epoch in range(200):
    # 1) score par article
    news_df["sent_rl"] = compute_rl_sentiment(news_df, params)

    # 2) agrégation par jour
    sent_daily = (
        news_df
        .groupby("date")["sent_rl"]
        .sum()
        .reset_index()   # colonnes: date, sent_rl
    )

    # 3) merge avec les returns
    df_tmp = pd.merge(
        prices_daily[["date", "return"]],
        sent_daily,
        on="date",
        how="inner"
    )

    # 4) corrélation avec le return du lendemain
    corr = df_tmp["sent_rl"].shift(1).corr(df_tmp["return"])

    if pd.isna(corr):
        continue  # si pas assez de données pour corr, on skip

    # 5) "gradient ascent" : on pousse tous les params dans le sens de la corr
    for k in params:
        params[k] += lr * corr

print("params optimisés:", params)

# ========= 3) SCORE FINAL PAR ARTICLE ET PAR JOUR =========

# Recalcul une fois avec les params optimisés
news_df["sent_rl"] = compute_rl_sentiment(news_df, params)

# Par jour (somme = volume + intensité)
sent_daily_rl = (
    news_df
    .groupby("date")["sent_rl"]
    .sum()
    .reset_index()
)

print(sent_daily_rl.head())


print(news_df[["title", "sent_neg", "sent_neu", "sent_pos", "sent_rl"]].head())



datetime.datetime.utcnow() is deprecated and scheduled for removal in a future version. Use timezone-aware objects to represent datetimes in UTC: datetime.datetime.now(datetime.UTC).



params optimisés: {'ws': np.float64(1.1687022647901646), 'wt': np.float64(1.1687022647901646), 'wr': np.float64(1.1687022647901646), 'wl': np.float64(1.1687022647901646), 'wb': np.float64(1.1687022647901646)}
        date   sent_rl
0 2025-10-31 -2.943720
1 2025-11-01  1.628635
2 2025-11-02 -0.086572
3 2025-11-03 -2.357726
4 2025-11-04 -1.049533
                                               title  sent_neg  sent_neu  \
0  Fold and Steak  n Shake Bring Bitcoin Rewards ...  0.781254  0.007342   
1  What Conversations About Bitcoin Sound Like to Me  0.035933  0.038294   
2  Strategy Reports $2 . 8B Q3 Profit , Bitcoin T...  0.951961  0.021705   
3  Bitcoin Hyper as Best Crypto to Buy ? Scandina...  0.577928  0.049784   
4          BTC Whitepaper Published This Day in 2008  0.043647  0.022269   

   sent_pos   sent_rl  
0  0.211404 -1.206961  
1  0.925774  2.246670  
2  0.026334 -2.307645  
3  0.372289 -0.414515  
4  0.934084  2.177537  


In [11]:
# 1) Nombre de news par topic et par jour
topic_daily_counts = (
    news_df
    .groupby(["date", "topic"])
    .size()
    .reset_index(name="count")
)

# 2) Pivot: lignes = dates, colonnes = topics
topic_daily_pivot = topic_daily_counts.pivot(
    index="date", columns="topic", values="count"
).fillna(0)

topic_daily_pivot.columns = [f"topic_{c}" for c in topic_daily_pivot.columns]
topic_daily_pivot.index = pd.to_datetime(topic_daily_pivot.index)
topic_daily_pivot = topic_daily_pivot.reset_index()   # 'date' redevient une colonne

# 3) Sentiment moyen par jour
topic_daily_pivot["date"] = pd.to_datetime(topic_daily_pivot["date"])
sent_daily_rl["date"] = pd.to_datetime(sent_daily_rl["date"])

# 4) Regrouper topics + sentiment RL
daily_features = pd.merge(
    topic_daily_pivot,
    sent_daily_rl,      
    on="date",
    how="left"
).fillna(0)

# 5) Merge avec les prix
prices_daily["date"] = pd.to_datetime(prices_daily["date"]).dt.date
daily_features["date"] = pd.to_datetime(daily_features["date"]).dt.date

df_join = pd.merge(
    prices_daily,
    daily_features,
    on="date",
    how="left"
).fillna(0)

print(df_join.head(10))

# 6) Corrélations avec le retour
corr = df_join.corr(numeric_only=True)["return"].sort_values(ascending=False)
print(corr)

topic_id = 3
print(topic_model.get_topic(topic_id))


         date          price    return  topic_-1  topic_0  topic_1   sent_rl
0  2025-10-26  114472.445312  0.000000       0.0      0.0      0.0  0.000000
1  2025-10-27  114119.328125 -0.003085       0.0      0.0      0.0  0.000000
2  2025-10-28  112956.164062 -0.010193       0.0      0.0      0.0  0.000000
3  2025-10-29  110055.304688 -0.025681       0.0      0.0      0.0  0.000000
4  2025-10-30  108305.546875 -0.015899       0.0      0.0      0.0  0.000000
5  2025-10-31  109556.164062  0.011547       5.0      7.0      3.0 -2.943720
6  2025-11-01  110064.015625  0.004636       5.0     11.0      3.0  1.628635
7  2025-11-02  110639.625000  0.005230       0.0      3.0      0.0 -0.086572
8  2025-11-03  106547.523438 -0.036986       0.0      3.0      1.0 -2.357726
9  2025-11-04  101590.523438 -0.046524       1.0      5.0      0.0 -1.049533
return      1.000000
topic_-1    0.135314
topic_1     0.109540
topic_0     0.071356
price       0.023667
sent_rl    -0.059242
Name: return, dtype: float6


datetime.datetime.utcnow() is deprecated and scheduled for removal in a future version. Use timezone-aware objects to represent datetimes in UTC: datetime.datetime.now(datetime.UTC).



params optimisés: {'ws': np.float64(1.2173597676766725), 'wt': np.float64(1.2173597676766725), 'wr': np.float64(1.2173597676766725), 'wl': np.float64(1.2173597676766725), 'wb': np.float64(1.2173597676766725)}
        date    sent_rl
0 2025-10-30  13.438864
1 2025-10-31  22.010162
2 2025-11-01  25.470635
3 2025-11-02   9.018230
4 2025-11-03   2.058932
