In [35]:
import pandas as pd
import numpy as np
import yfinance as yf
from newsapi import NewsApiClient
from datetime import datetime, timedelta
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from random import sample
import spacy
from gdeltdoc import GdeltDoc, Filters
from newspaper import Article
import pandas as pd
from datetime import datetime, timedelta
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from concurrent.futures import ThreadPoolExecutor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


In [None]:
# -------- Settings --------
ASSET = "BTC-USD"  # asset to analyze from Yahoo Finance
QUERIES = {
     "bitcoin","crypto","cryptocurrency","ethereum","blockchain",
    "binance","coinbase","solana","xrp","defi","web3","nft",
    "mining","hashrate","regulation","hack","exchange","stablecoin"
}



  # keywords to search for in news sources
N_DAYS = 30  # number of days to analyze
NEWSAPI_KEY = "5bcdfd2037114aa591e1cff0ccd93b81" 

# -------- 1. Get Prices --------
end = datetime.utcnow()
start = end - timedelta(days=N_DAYS + 5)  # margin of 5 days to ensure enough data

prices = yf.download(ASSET, start=start, end=end, interval="1d", group_by="column")

if isinstance(prices.columns, pd.MultiIndex):
    prices.columns = prices.columns.get_level_values(0)
    
    
prices = prices[["Close"]].rename(columns={"Close": "price"})
prices["return"] = prices["price"].pct_change()

prices = prices.reset_index()    
prices["date"] = prices["Date"].dt.normalize()
prices_daily = prices[["date", "price", "return"]]

print(prices.head())


  end = datetime.utcnow()
[*********************100%***********************]  1 of 1 completed

Price       Date          price    return       date
0     2025-10-26  114472.445312       NaN 2025-10-26
1     2025-10-27  114119.328125 -0.003085 2025-10-27
2     2025-10-28  112956.164062 -0.010193 2025-10-28
3     2025-10-29  110055.304688 -0.025681 2025-10-29
4     2025-10-30  108305.546875 -0.015899 2025-10-30





In [14]:
#-------- 2. Get news from GDELT --------

gd = GdeltDoc()

start_date = (end - timedelta(days=N_DAYS)).strftime("%Y-%m-%d")
end_date   = end.strftime("%Y-%m-%d")

all_news = []

for q in QUERIES:
  

    f = Filters(
        keyword=q,
        start_date=start_date,
        end_date=end_date,
        num_records=250,
        language="English"
    )

    try:
        df_q = gd.article_search(f)
    except ValueError as e:
        print(f"Skip '{q}' → {e}")
        continue

    if df_q is None or df_q.empty:
        print(f"{q}: 0 articles")
        continue

    df_q["query"] = q
    print(f"{q}: {len(df_q)} articles")
    all_news.append(df_q)

news_raw = pd.concat(all_news, ignore_index=True)
news_raw = news_raw.drop_duplicates(subset="url")
print("Unique URLs:", len(news_raw))



blockchain: 250 articles
Skip 'nft' → The query was not valid. The API error message was: The specified phrase is too short.
regulation: 250 articles
Skip 'web3' → The query was not valid. The API error message was: The specified phrase is too short.
stablecoin: 250 articles
binance: 250 articles
mining: 250 articles
Skip 'hack' → The query was not valid. The API error message was: The specified phrase is too short.
bitcoin: 250 articles
exchange: 250 articles
Skip 'defi' → The query was not valid. The API error message was: The specified phrase is too short.
crypto: 250 articles
coinbase: 250 articles
hashrate: 130 articles
Skip 'xrp' → The query was not valid. The API error message was: The specified phrase is too short.
solana: 250 articles
cryptocurrency: 250 articles
ethereum: 250 articles
Unique URLs: 2842


In [None]:
#-------- 3. Scrape full texts --------

def fetch_article(url):
    try:
        article = Article(url, language="en")
        article.download()
        article.parse()
        txt = article.text
        return txt if len(txt) > 100 else None
    except:
        return None

urls_sample = news_raw["url"].dropna().unique()[:100]  # first 100 URLs

with ThreadPoolExecutor(max_workers=10) as ex:
    fulltexts = list(ex.map(fetch_article, urls_sample))

url_to_text = dict(zip(urls_sample, fulltexts))

news_raw["fulltext"] = news_raw["url"].map(url_to_text)

news_clean = news_raw.dropna(subset=["fulltext"])
print("Articles scraped:", len(news_clean))
print(news_clean["fulltext"].head())



Articles scraped: 73
0    COPENHAGEN, Denmark, Nov. 17, 2025 /PRNewswire...
1    New Delhi [India], November 17: Digital South ...
4    London, Nov. 06, 2025 (GLOBE NEWSWIRE) --\n\n\...
5    This content is provided by a sponsor\n\nAbu D...
6    Binance Mumbai Blockchain Yatra 2025\n\nIndia’...
Name: fulltext, dtype: object


In [16]:
#-------- 5. Text cleaning with SpaCy --------

nlp = spacy.load("en_core_web_sm", disable=["ner","parser"])

def clean_text_spacy(txt):
    doc = nlp(txt.lower())
    return " ".join([
        tok.lemma_
        for tok in doc
        if tok.is_alpha and not tok.is_stop and len(tok) > 2
    ])

news_clean["clean_text"] = news_clean["fulltext"].apply(clean_text_spacy)


In [17]:
docs = news_clean["clean_text"].tolist()

topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True)

topics, probs = topic_model.fit_transform(docs)

news_clean["topic"] = topics
news_clean["topic_probmax"] = [
    float(np.max(p)) if p is not None else np.nan for p in probs
]

print("haha",news_clean["topic"].value_counts())

2025-11-30 02:37:25,539 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 3/3 [00:02<00:00,  1.21it/s]
2025-11-30 02:37:30,072 - BERTopic - Embedding - Completed ✓
2025-11-30 02:37:30,073 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-30 02:37:30,155 - BERTopic - Dimensionality - Completed ✓
2025-11-30 02:37:30,157 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-30 02:37:30,176 - BERTopic - Cluster - Completed ✓
2025-11-30 02:37:30,181 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-30 02:37:30,899 - BERTopic - Representation - Completed ✓


haha topic
-1    45
 0    16
 1    12
Name: count, dtype: int64


In [None]:

news_df = news_clean.copy()

# Clean and convert the date field
news_df["datetime"] = pd.to_datetime(news_df["seendate"], format="%Y%m%dT%H%M%SZ")
news_df["date"] = news_df["datetime"].dt.date

# Rename columns for clarity
news_df = news_clean.copy()

news_df["datetime"] = pd.to_datetime(news_df["seendate"], format="%Y%m%dT%H%M%SZ")
news_df["date"] = news_df["datetime"].dt.date

news_df = news_df.rename(columns={
    "domain": "source",
    "fulltext": "text",
})


news_df = news_df[["datetime", "date", "source", "title", "text", "clean_text", "topic", "topic_probmax"]]

print(topic_model.get_topic_info())

df_time = news_df[["clean_text", "datetime", "topic"]].rename(columns={
    "clean_text": "Document",
    "datetime": "Timestamp",
    "topic": "Topic",
})

topics_over_time = topic_model.topics_over_time(
    docs=df_time["Document"].tolist(),
    topics=df_time["Topic"].tolist(),
    timestamps=df_time["Timestamp"].tolist()
)


   Topic  Count                                  Name  \
0     -1     45   -1_blockchain_market_digital_global   
1      0     16  0_payment_blockchain_stablecoin_user   
2      1     12   1_blockchain_digital_asset_platform   

                                      Representation  \
0  [blockchain, market, digital, global, technolo...   
1  [payment, blockchain, stablecoin, user, bank, ...   
2  [blockchain, digital, asset, platform, credit,...   

                                 Representative_Docs  
0  [pnn dubai uae november mark new era vision me...  
1  [alibaba partner jpmorgan launch tokenized pay...  
2  [lcpc launch drive blockchain platform intelli...  


70it [00:00, 89.30it/s]


In [None]:
# -------- 3. Evaluation of BERTopic clusters --------

topics_words = []
for topic_id in set(topics):
    if topic_id == -1: continue
    words = [w for w, _ in topic_model.get_topic(topic_id)]
    topics_words.append(words)
    
# Measure how words within topics relate to each other
docs_tokenized = [doc.split() for doc in news_df["text"]]


MAX_DOCS = 300
if len(docs_tokenized) > MAX_DOCS:
    docs_tokenized_sample = sample(docs_tokenized, MAX_DOCS)
else:
    docs_tokenized_sample = docs_tokenized

dictionary = Dictionary(docs_tokenized)

# 'c_v' is a commonly used topic coherence measure
cm = CoherenceModel(
    topics=topics_words,
    texts=docs_tokenized,
    dictionary=dictionary,
    coherence='c_v',
    processes=4
)

coherence = cm.get_coherence()
print("Topic Coherence:", coherence)

# HDBSCAN cluster stability (cluster persistence: higher means more stable)

clusterer = topic_model.hdbscan_model
stabilities = clusterer.cluster_persistence_
print(stabilities)

# Some visualizations may not work if there are too few topics

topic_model.visualize_topics_over_time(topics_over_time)
topic_model.visualize_barchart(top_n_topics=10)
topic_model.visualize_heatmap()


Topic Coherence: 0.34113429785470195
[0.05195506 0.02970905]


In [None]:
#-------- 4. Sentiment analysis with FinBERT --------

# Load the FinBERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

# Define a helper to compute sentiment probabilities using FinBERT
def finbert_sentiment(text):
    try:
        # avoid errors for None or very short texts
        if not isinstance(text, str) or len(text.strip()) == 0:
            return (0, 1, 0)

        inputs = tokenizer(
            text[:512],                # truncate to avoid GPU/CPU errors
            return_tensors="pt",
            truncation=True
        )
        with torch.no_grad():
            logits = model(**inputs).logits
        probs = torch.softmax(logits, dim=1).numpy()[0]

        return float(probs[0]), float(probs[1]), float(probs[2])  # neg, neu, pos

    except Exception as e:
        print("Error processing text:", e)
        return (0, 1, 0)

# Apply the sentiment function to each article text
sentiments = news_df["text"].apply(finbert_sentiment)

news_df["sent_neg"] = sentiments.apply(lambda x: x[0])
news_df["sent_neu"] = sentiments.apply(lambda x: x[1])
news_df["sent_pos"] = sentiments.apply(lambda x: x[2])



# ========= 1) FEATURES FOR REINFORCEMENT / SCORING =========

# Sentiment intensity (distance from neutral; larger magnitude = stronger sentiment)
news_df["sent_dir"] = news_df["sent_pos"] - news_df["sent_neg"]

# Normalized text length
news_df["length_norm"] = news_df["text"].str.len() / news_df["text"].str.len().max()

# Source weight: less frequent sources receive higher relative weight
source_freq = news_df["source"].value_counts()
source_importance = 1 / (1 + source_freq)
news_df["source_weight"] = news_df["source"].map(source_importance).fillna(0.5)

# Recency weight: exponential decay (half-life ≈ 24h)
news_df["hours_since"] = (datetime.utcnow() - news_df["datetime"]).dt.total_seconds() / 3600
news_df["recency_weight"] = np.exp(- news_df["hours_since"] / 24)  # half-life ≈ 24h

# ========= 2) RL LOOP TO OPTIMIZE FEATURE WEIGHTS =========

# initial parameters (weights for the feature components)
params = {
    "ws": 1.0,  # weight for sentiment strength
    "wt": 1.0,  # weight for topic probability max
    "wr": 1.0,  # weight for recency
    "wl": 1.0,  # weight for text length
    "wb": 1.0   # weight for source importance
}

def compute_rl_sentiment(df, params):
    return (
        (1  +
        params["wt"] * df["topic_probmax"] +
        params["wr"] * df["recency_weight"] +
        params["wl"] * df["length_norm"] +
        params["wb"] * df["source_weight"]) * df["sent_dir"]
    )

lr = 0.01  # learning rate for the simple parameter update

# Ensure `prices_daily['date']` and `news_df['date']` are datetime
prices_daily["date"] = pd.to_datetime(prices_daily["date"])
news_df["date"] = pd.to_datetime(news_df["date"])

for epoch in range(200):
    # 1) compute per-article RL score
    news_df["sent_rl"] = compute_rl_sentiment(news_df, params)

    # 2) aggregate scores by day (sum = volume * intensity)
    sent_daily = (
        news_df
        .groupby("date")["sent_rl"]
        .sum()
        .reset_index()   # columns: date, sent_rl
    )

    # 3) merge daily sentiment with price returns
    df_tmp = pd.merge(
        prices_daily[["date", "return"]],
        sent_daily,
        on="date",
        how="inner"
    )

    # 4) compute correlation between yesterday's sentiment and today's return
    corr = df_tmp["sent_rl"].shift(1).corr(df_tmp["return"])

    if pd.isna(corr):
        continue  # skip if there is not enough data to compute correlation

    # 5) simple 'gradient ascent': nudge all params in direction of the correlation
    for k in params:
        params[k] += lr * corr

print("optimized params:", params)

# ========= 3) FINAL SCORE PER ARTICLE AND PER DAY =========

# Recompute once with the optimized parameters
news_df["sent_rl"] = compute_rl_sentiment(news_df, params)

# Per day (sum = volume * intensity)
sent_daily_rl = (
    news_df
    .groupby("date")["sent_rl"]
    .sum()
    .reset_index()
)

print(sent_daily_rl.head())


print(news_df[["title", "sent_neg", "sent_neu", "sent_pos", "sent_rl"]].head())



datetime.datetime.utcnow() is deprecated and scheduled for removal in a future version. Use timezone-aware objects to represent datetimes in UTC: datetime.datetime.now(datetime.UTC).



params optimisés: {'ws': np.float64(0.8369760451530115), 'wt': np.float64(0.8369760451530115), 'wr': np.float64(0.8369760451530115), 'wl': np.float64(0.8369760451530115), 'wb': np.float64(0.8369760451530115)}
        date   sent_rl
0 2025-10-31 -4.047515
1 2025-11-01  0.896683
2 2025-11-02  4.138598
3 2025-11-03 -1.116493
4 2025-11-04 -1.979591
                                               title  sent_neg  sent_neu  \
0  Blockchain for Good Alliance ( BGA ) Recognize...  0.714756  0.009995   
1  Digital South Trust Launches India First & Lar...  0.857132  0.009265   
4  Credit Blockchain Launches Next - Generation A...  0.638664  0.011034   
5  Global Blockchain Show 2025 by VAP Group to be...  0.188980  0.008485   
6  Mumbai blockchain moment : Binance Yatra maps ...  0.120096  0.009419   

   sent_pos   sent_rl  
0  0.275249 -0.899104  
1  0.133603 -1.357305  
4  0.350302 -0.664776  
5  0.802535  1.086591  
6  0.870485  1.746366  


In [None]:
# 1) Number of news per topic per day
topic_daily_counts = (
    news_df
    .groupby(["date", "topic"])
    .size()
    .reset_index(name="count")
)

# 2) Pivot: rows = dates, columns = topics
topic_daily_pivot = topic_daily_counts.pivot(
    index="date", columns="topic", values="count"
).fillna(0)

topic_daily_pivot.columns = [f"topic_{c}" for c in topic_daily_pivot.columns]
topic_daily_pivot.index = pd.to_datetime(topic_daily_pivot.index)
topic_daily_pivot = topic_daily_pivot.reset_index()   # 'date' becomes a column again

# 3) Daily aggregated RL sentiment
topic_daily_pivot["date"] = pd.to_datetime(topic_daily_pivot["date"])
sent_daily_rl["date"] = pd.to_datetime(sent_daily_rl["date"])

# 4) Combine topic counts with RL sentiment features
daily_features = pd.merge(
    topic_daily_pivot,
    sent_daily_rl,      
    on="date",
    how="left"
).fillna(0)

# 5) Merge with price data
prices_daily["date"] = pd.to_datetime(prices_daily["date"]).dt.date
daily_features["date"] = pd.to_datetime(daily_features["date"]).dt.date

df_join = pd.merge(
    prices_daily,
    daily_features,
    on="date",
    how="left"
).fillna(0)

print(df_join.head(10))

# 6) Correlations with return
corr = df_join.corr(numeric_only=True)["return"].sort_values(ascending=False)
print(corr)

topic_id = 3
print(topic_model.get_topic(topic_id))


         date          price    return  topic_-1  topic_0  topic_1   sent_rl
0  2025-10-26  114472.445312  0.000000       0.0      0.0      0.0  0.000000
1  2025-10-27  114119.328125 -0.003085       0.0      0.0      0.0  0.000000
2  2025-10-28  112956.164062 -0.010193       0.0      0.0      0.0  0.000000
3  2025-10-29  110055.304688 -0.025681       0.0      0.0      0.0  0.000000
4  2025-10-30  108305.546875 -0.015899       0.0      0.0      0.0  0.000000
5  2025-10-31  109556.164062  0.011547       2.0      0.0      1.0 -4.047515
6  2025-11-01  110064.015625  0.004636       0.0      1.0      0.0  0.896683
7  2025-11-02  110639.625000  0.005230       2.0      2.0      0.0  4.138598
8  2025-11-03  106547.523438 -0.036986       1.0      0.0      0.0 -1.116493
9  2025-11-04  101590.523438 -0.046524       1.0      1.0      0.0 -1.979591
return      1.000000
topic_1     0.133762
sent_rl     0.123552
topic_0     0.040156
price       0.023667
topic_-1   -0.089742
Name: return, dtype: float6

In [None]:
# ====== 1. Prepare ML dataset ======
df_ml = df_join.copy()
df_ml = df_ml.sort_values("date")
df_ml["return_next"] = df_ml["return"].shift(-1)
df_ml["up"] = (df_ml["return_next"] > 0).astype(int)
df_ml = df_ml.dropna(subset=["return_next"])

topic_cols = [c for c in df_ml.columns if c.startswith("topic_")]
feat_cols = topic_cols + ["sent_rl"]

X = df_ml[feat_cols]
y = df_ml["up"]

# ====== 2. Time-based train/test split ======
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=False
)

# ====== 3. Model ======
clf = Pipeline([
    ("scaler", StandardScaler()),
    ("logreg", LogisticRegression(max_iter=2000, class_weight="balanced"))
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:,1]

# ====== 4. Metrics ======

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
print("AUC:", roc_auc_score(y_test, y_proba))

print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred))

# ====== 5. Feature importance ======

logreg = clf.named_steps["logreg"]

coef_df = pd.DataFrame({
    "feature": feat_cols,
    "coef": logreg.coef_[0]   # coefficient vector for class 1
}).sort_values("coef", ascending=False)

print("\n=== Feature Importance ===")
print(coef_df)


Accuracy: 0.18181818181818182
Precision: 0.0
Recall: 0.0
F1: 0.0
AUC: 0.07142857142857145

=== Classification Report ===
              precision    recall  f1-score   support

           0       0.33      0.29      0.31         7
           1       0.00      0.00      0.00         4

    accuracy                           0.18        11
   macro avg       0.17      0.14      0.15        11
weighted avg       0.21      0.18      0.20        11


=== Feature Importance ===
    feature      coef
0  topic_-1  0.751712
1   topic_0  0.339026
2   topic_1  0.126523
3   sent_rl -1.035602
