In [2]:
import pandas as pd
import numpy as np
import yfinance as yf
from newsapi import NewsApiClient
from datetime import datetime, timedelta
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from random import sample
import spacy
from gdeltdoc import GdeltDoc, Filters
from newspaper import Article
from datetime import datetime, timedelta
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from concurrent.futures import ThreadPoolExecutor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


In [None]:
# -------- Settings --------
ASSET = "BTC-USD"  
QUERIES = {
     "bitcoin","crypto","cryptocurrency","ethereum","blockchain",
    "binance","coinbase","solana","xrp","defi","web3","nft",
    "mining","hashrate","regulation","hack","exchange","stablecoin"
}



  # keywords to search for in news sources
N_DAYS = 30 
NEWSAPI_KEY = "5bcdfd2037114aa591e1cff0ccd93b81" 

# -------- 1. Get Prices --------
end = datetime.utcnow()
start = end - timedelta(days=N_DAYS + 5)  

prices = yf.download(ASSET, start=start, end=end, interval="1d", group_by="column")

if isinstance(prices.columns, pd.MultiIndex):
    prices.columns = prices.columns.get_level_values(0)
    
    
prices = prices[["Close"]].rename(columns={"Close": "price"})
prices["return"] = prices["price"].pct_change()

prices = prices.reset_index()    
prices["date"] = prices["Date"].dt.normalize()
prices_daily = prices[["date", "price", "return"]]

print(prices.head())


  end = datetime.utcnow()
[*********************100%***********************]  1 of 1 completed

Price       Date          price    return       date
0     2025-10-28  112956.164062       NaN 2025-10-28
1     2025-10-29  110055.304688 -0.025681 2025-10-29
2     2025-10-30  108305.546875 -0.015899 2025-10-30
3     2025-10-31  109556.164062  0.011547 2025-10-31
4     2025-11-01  110064.015625  0.004636 2025-11-01






Get news from GDELT Database 

In [4]:

gd = GdeltDoc()

start_date = (end - timedelta(days=N_DAYS)).strftime("%Y-%m-%d")
end_date   = end.strftime("%Y-%m-%d")

all_news = []

for q in QUERIES:
  

    f = Filters(
        keyword=q,
        start_date=start_date,
        end_date=end_date,
        num_records=250,
        language="English"
    )

    try:
        df_q = gd.article_search(f)
    except ValueError as e:
        print(f"Skip '{q}' → {e}")
        continue

    if df_q is None or df_q.empty:
        print(f"{q}: 0 articles")
        continue

    df_q["query"] = q
    print(f"{q}: {len(df_q)} articles")
    all_news.append(df_q)

news_raw = pd.concat(all_news, ignore_index=True)
news_raw = news_raw.drop_duplicates(subset="url")
print("Unique URLs:", len(news_raw))



ethereum: 250 articles
Skip 'hack' → The query was not valid. The API error message was: The specified phrase is too short.
Skip 'hack' → The query was not valid. The API error message was: The specified phrase is too short.
regulation: 250 articles
regulation: 250 articles
mining: 250 articles
mining: 250 articles
bitcoin: 250 articles
bitcoin: 250 articles
solana: 250 articles
solana: 250 articles
Skip 'xrp' → The query was not valid. The API error message was: The specified phrase is too short.
Skip 'xrp' → The query was not valid. The API error message was: The specified phrase is too short.
hashrate: 138 articles
hashrate: 138 articles
binance: 250 articles
binance: 250 articles
coinbase: 250 articles
coinbase: 250 articles
crypto: 250 articles
crypto: 250 articles
cryptocurrency: 250 articles
cryptocurrency: 250 articles
Skip 'nft' → The query was not valid. The API error message was: The specified phrase is too short.
Skip 'nft' → The query was not valid. The API error message w

We succeeded getting around 2900 financials articles

Get news from GDELT Database gave us links to 2900 articles about finance, we now need to scrape the content of these articles. We only scrape a sample of 100 articles for now to test our pipeline. Indeed, scraping all articles may take a lot of time and resources. 


In [14]:
def fetch_article(url):
    try:
        article = Article(url, language="en")
        article.download()
        article.parse()
        txt = article.text
        return txt if len(txt) > 100 else None
    except:
        return None

urls_sample = news_raw["url"].dropna().unique()[:500]  

with ThreadPoolExecutor(max_workers=10) as ex:
    fulltexts = list(ex.map(fetch_article, urls_sample))

url_to_text = dict(zip(urls_sample, fulltexts))

news_raw["fulltext"] = news_raw["url"].map(url_to_text)

news_clean = news_raw.dropna(subset=["fulltext"])
print("Articles scraped:", len(news_clean))
print(news_clean["fulltext"].head())

Articles scraped: 436
0    The Fusaka upgrade to Ethereum, expected to go...
1    alfernec / Shutterstock.com\n\nQuick Read\n\nB...
2    Key Points\n\nEthereum surged more than 12% ov...
3    Key Points\n\nEthereum has benefited from inst...
4    Coming out of weeks of downtrend, the Ethereum...
Name: fulltext, dtype: object


We used Spacy to preprocess the text data, including tokenization, lemmatization, and removal of stop words. This will help us prepare the data for further analysis and modeling.

In [15]:

nlp = spacy.load("en_core_web_sm", disable=["ner","parser"])

def clean_text_spacy(txt):
    doc = nlp(txt.lower())
    return " ".join([
        tok.lemma_
        for tok in doc
        if tok.is_alpha and not tok.is_stop and len(tok) > 2
    ])

news_clean["clean_text"] = news_clean["fulltext"].apply(clean_text_spacy)


In [18]:
docs = news_clean["clean_text"].tolist()

topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True)

topics, probs = topic_model.fit_transform(docs)

news_clean["topic"] = topics
news_clean["topic_probmax"] = [
    float(np.max(p)) if p is not None else np.nan for p in probs
]

print("haha",news_clean["topic"].value_counts())

2025-12-02 15:43:27,011 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 14/14 [00:13<00:00,  1.08it/s]
2025-12-02 15:43:42,797 - BERTopic - Embedding - Completed ✓
2025-12-02 15:43:42,797 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
Batches: 100%|██████████| 14/14 [00:13<00:00,  1.08it/s]
2025-12-02 15:43:42,797 - BERTopic - Embedding - Completed ✓
2025-12-02 15:43:42,797 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-12-02 15:43:43,312 - BERTopic - Dimensionality - Completed ✓
2025-12-02 15:43:43,314 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-12-02 15:43:43,312 - BERTopic - Dimensionality - Completed ✓
2025-12-02 15:43:43,314 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-12-02 15:43:43,353 - BERTopic - Cluster - Completed ✓
2025-12-02 15:43:43,358 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-12-

haha topic
 0    233
 1    193
-1     10
Name: count, dtype: int64


In [19]:

news_df = news_clean.copy()

news_df["datetime"] = pd.to_datetime(news_df["seendate"], format="%Y%m%dT%H%M%SZ")
news_df["date"] = news_df["datetime"].dt.date

news_df = news_clean.copy()

news_df["datetime"] = pd.to_datetime(news_df["seendate"], format="%Y%m%dT%H%M%SZ")
news_df["date"] = news_df["datetime"].dt.date

news_df = news_df.rename(columns={
    "domain": "source",
    "fulltext": "text",
})


news_df = news_df[["datetime", "date", "source", "title", "text", "clean_text", "topic", "topic_probmax"]]

df_time = news_df[["clean_text", "datetime", "topic"]].rename(columns={
    "clean_text": "Document",
    "datetime": "Timestamp",
    "topic": "Topic",
})

topics_over_time = topic_model.topics_over_time(
    docs=df_time["Document"].tolist(),
    topics=df_time["Topic"].tolist(),
    timestamps=df_time["Timestamp"].tolist()
)

print(topics_over_time.head())


0it [00:00, ?it/s]

376it [00:06, 60.46it/s]

   Topic                                              Words  Frequency  \
0      1  starlink, feldstein, country, pfeiffer, satellite          1   
1      0       ethereum, accumulation, eth, breakout, thief          1   
2      0     bhutan, ethereum, digital, miyaguchi, national          1   
3      0                prescient, fund, ametf, jse, market          1   
4      0            price, ethereum, market, liquidity, eth          1   

            Timestamp  
0 2025-11-02 00:00:00  
1 2025-11-02 01:00:00  
2 2025-11-02 17:00:00  
3 2025-11-02 22:30:00  
4 2025-11-02 23:30:00  





In [20]:

topics_words = []
for topic_id in set(topics):
    if topic_id == -1: continue
    words = [w for w, _ in topic_model.get_topic(topic_id)]
    topics_words.append(words)
    
docs_tokenized = [doc.split() for doc in news_df["text"]]


MAX_DOCS = 300
if len(docs_tokenized) > MAX_DOCS:
    docs_tokenized_sample = sample(docs_tokenized, MAX_DOCS)
else:
    docs_tokenized_sample = docs_tokenized

dictionary = Dictionary(docs_tokenized)


cm = CoherenceModel(
    topics=topics_words,
    texts=docs_tokenized,
    dictionary=dictionary,
    coherence='c_v',
    processes=4
)

coherence = cm.get_coherence()
print("Topic Coherence:", coherence)

clusterer = topic_model.hdbscan_model
stabilities = clusterer.cluster_persistence_
print(stabilities)

topic_model.visualize_topics_over_time(topics_over_time)
topic_model.visualize_barchart(top_n_topics=10)
topic_model.visualize_heatmap()


Topic Coherence: 0.33745202238708905
[0.27609365 0.28762192]


Sentiment analysis with FinBERT

In [None]:


tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

def finbert_sentiment(text):
    try:
        if not isinstance(text, str) or len(text.strip()) == 0:
            return (0, 1, 0)

        inputs = tokenizer(
            text[:512],                
            return_tensors="pt",
            truncation=True
        )
        with torch.no_grad():
            logits = model(**inputs).logits
        probs = torch.softmax(logits, dim=1).numpy()[0]

        return float(probs[0]), float(probs[1]), float(probs[2])  

    except Exception as e:
        print("Error processing text:", e)
        return (0, 1, 0)


sentiments = news_df["text"].apply(finbert_sentiment)

news_df["sent_neg"] = sentiments.apply(lambda x: x[0])
news_df["sent_neu"] = sentiments.apply(lambda x: x[1])
news_df["sent_pos"] = sentiments.apply(lambda x: x[2])


news_df["sent_dir"] = news_df["sent_pos"] - news_df["sent_neg"]


news_df["length_norm"] = news_df["text"].str.len() / news_df["text"].str.len().max()

source_freq = news_df["source"].value_counts()
source_importance = 1 / (1 + source_freq)
news_df["source_weight"] = news_df["source"].map(source_importance).fillna(0.5)

news_df["hours_since"] = (datetime.utcnow() - news_df["datetime"]).dt.total_seconds() / 3600
news_df["recency_weight"] = np.exp(- news_df["hours_since"] / 24)


params = {
    "ws": 1.0, 
    "wt": 0.8,  
    "wr": 0.9,  
    "wl": 0.5,  
    "wb": 1.0   
}

def compute_rl_sentiment(df, params):
    return (
        (1  +
        params["wt"] * df["topic_probmax"] +
        params["wr"] * df["recency_weight"] +
        params["wl"] * df["length_norm"] +
        params["wb"] * df["source_weight"]) * df["sent_dir"]
    )

lr = 0.01  

prices_daily["date"] = pd.to_datetime(prices_daily["date"])
news_df["date"] = pd.to_datetime(news_df["date"])

for epoch in range(200):
    news_df["sent_rl"] = compute_rl_sentiment(news_df, params)

    sent_daily = (
        news_df
        .groupby("date")["sent_rl"]
        .sum()
        .reset_index() 
    )

    df_tmp = pd.merge(
        prices_daily[["date", "return"]],
        sent_daily,
        on="date",
        how="inner"
    )

    corr = df_tmp["sent_rl"].shift(1).corr(df_tmp["return"])

    if pd.isna(corr):
        continue  

    for k in params:
        params[k] += lr * corr

print("optimized params:", params)


news_df["sent_rl"] = compute_rl_sentiment(news_df, params)

sent_daily_rl = (
    news_df
    .groupby("date")["sent_rl"]
    .sum()
    .reset_index()
)

print(sent_daily_rl.head())


print(news_df[["title", "sent_neg", "sent_neu", "sent_pos", "sent_rl"]].head())


In [None]:
topic_daily_counts = (
    news_df
    .groupby(["date", "topic"])
    .size()
    .reset_index(name="count")
)

topic_daily_pivot = topic_daily_counts.pivot(
    index="date", columns="topic", values="count"
).fillna(0)

topic_daily_pivot.columns = [f"topic_{c}" for c in topic_daily_pivot.columns]
topic_daily_pivot.index = pd.to_datetime(topic_daily_pivot.index)
topic_daily_pivot = topic_daily_pivot.reset_index()  

topic_daily_pivot["date"] = pd.to_datetime(topic_daily_pivot["date"])
sent_daily_rl["date"] = pd.to_datetime(sent_daily_rl["date"])

daily_features = pd.merge(
    topic_daily_pivot,
    sent_daily_rl,      
    on="date",
    how="left"
).fillna(0)

prices_daily["date"] = pd.to_datetime(prices_daily["date"]).dt.date
daily_features["date"] = pd.to_datetime(daily_features["date"]).dt.date

df_join = pd.merge(
    prices_daily,
    daily_features,
    on="date",
    how="left"
).fillna(0)

print(df_join.head(10))

corr = df_join.corr(numeric_only=True)["return"].sort_values(ascending=False)
print(corr)

topic_id = 3
print(topic_model.get_topic(topic_id))


         date          price    return  topic_-1  topic_0  topic_1   sent_rl
0  2025-10-26  114472.445312  0.000000       0.0      0.0      0.0  0.000000
1  2025-10-27  114119.328125 -0.003085       0.0      0.0      0.0  0.000000
2  2025-10-28  112956.164062 -0.010193       0.0      0.0      0.0  0.000000
3  2025-10-29  110055.304688 -0.025681       0.0      0.0      0.0  0.000000
4  2025-10-30  108305.546875 -0.015899       0.0      0.0      0.0  0.000000
5  2025-10-31  109556.164062  0.011547       2.0      0.0      1.0 -4.047515
6  2025-11-01  110064.015625  0.004636       0.0      1.0      0.0  0.896683
7  2025-11-02  110639.625000  0.005230       2.0      2.0      0.0  4.138598
8  2025-11-03  106547.523438 -0.036986       1.0      0.0      0.0 -1.116493
9  2025-11-04  101590.523438 -0.046524       1.0      1.0      0.0 -1.979591
return      1.000000
topic_1     0.133762
sent_rl     0.123552
topic_0     0.040156
price       0.023667
topic_-1   -0.089742
Name: return, dtype: float6

In [None]:
df_ml = df_join.copy()
df_ml = df_ml.sort_values("date")
df_ml["return_next"] = df_ml["return"].shift(-1)
df_ml["up"] = (df_ml["return_next"] > 0).astype(int)
df_ml = df_ml.dropna(subset=["return_next"])

topic_cols = [c for c in df_ml.columns if c.startswith("topic_")]
feat_cols = topic_cols + ["sent_rl"]

X = df_ml[feat_cols]
y = df_ml["up"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=False
)

clf = Pipeline([
    ("scaler", StandardScaler()),
    ("logreg", LogisticRegression(max_iter=2000, class_weight="balanced"))
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:,1]


print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
print("AUC:", roc_auc_score(y_test, y_proba))

print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred))


logreg = clf.named_steps["logreg"]

coef_df = pd.DataFrame({
    "feature": feat_cols,
    "coef": logreg.coef_[0]  
}).sort_values("coef", ascending=False)

print("\n=== Feature Importance ===")
print(coef_df)


Accuracy: 0.18181818181818182
Precision: 0.0
Recall: 0.0
F1: 0.0
AUC: 0.07142857142857145

=== Classification Report ===
              precision    recall  f1-score   support

           0       0.33      0.29      0.31         7
           1       0.00      0.00      0.00         4

    accuracy                           0.18        11
   macro avg       0.17      0.14      0.15        11
weighted avg       0.21      0.18      0.20        11


=== Feature Importance ===
    feature      coef
0  topic_-1  0.751712
1   topic_0  0.339026
2   topic_1  0.126523
3   sent_rl -1.035602
