In [1]:
import os
import sys
import pandas as pd

# Add project root so you can import modules if needed
project_root = os.path.abspath("..")
sys.path.append(project_root)

In [2]:
import kagglehub
import json

# Download dataset from Kaggle
path = kagglehub.dataset_download("rdolphin/financial-news-with-ticker-level-sentiment")
print("Dataset path:", path)

# Load the JSON file (same as your friend)
file_path = f"{path}/polygon_news_sample.json"

with open(file_path, "r") as f:
    data = json.load(f)

df = pd.DataFrame(data)
df.head()

  from .autonotebook import tqdm as notebook_tqdm


Dataset path: /Users/arnaudthomas/.cache/kagglehub/datasets/rdolphin/financial-news-with-ticker-level-sentiment/versions/1


Unnamed: 0,article_url,author,description,id,image_url,amp_url,keywords,published_utc,publisher,tickers,insights,title
0,https://www.zacks.com/stock/news/2114673/alleg...,Zacks.com,Allegiant Travel (ALGT) is a fast-moving stock...,db221630f08d9064b6539534cc9957ecd7ba2a626927c7...,https://staticx-tuner.zacks.com/images/default...,,"[Allegiant Travel, momentum investing, fast-pa...",2023-06-29T12:50:06Z,"{'name': 'Zacks Investment Research', 'homepag...",[ALGT],"[{'ticker': 'ALGT', 'sentiment': 'positive', '...",Allegiant Travel (ALGT) Is Attractively Priced...
1,https://www.zacks.com/stock/news/2085677/appli...,Zacks.com,Applied Industrial Technologies (AIT) reported...,bb7e1725949a7254ae18e8d149c3c19af050c0ac05f18f...,https://staticx-tuner.zacks.com/images/default...,,"[earnings, revenues, estimates, industrial pro...",2023-04-27T11:55:14Z,"{'name': 'Zacks Investment Research', 'homepag...","[AIT, NPO]","[{'ticker': 'AIT', 'sentiment': 'positive', 's...",Applied Industrial Technologies (AIT) Q3 Earni...
2,https://www.globenewswire.com/news-release/202...,,"Apollo Commercial Real Estate Finance, Inc. (A...",a49c53ef44092946950dfb3f33852c9ef07d7c7dc6c1ea...,https://ml.globenewswire.com/Resource/Download...,,"[commercial real estate, financing, mortgage l...",2023-03-06T13:30:00Z,"{'name': 'GlobeNewswire Inc.', 'homepage_url':...","[ARI, SAN]","[{'ticker': 'ARI', 'sentiment': 'positive', 's...","Apollo Commercial Real Estate Finance, Inc. Cl..."
3,https://www.globenewswire.com/news-release/202...,,"Maravai LifeSciences, a global provider of lif...",be4f5174307cd0f3309ee931ab4ec4fc2451af056769ca...,https://ml.globenewswire.com/Resource/Download...,,"[Maravai LifeSciences, investor conferences, f...",2023-11-09T13:15:00Z,"{'name': 'GlobeNewswire Inc.', 'homepage_url':...",[MRVI],"[{'ticker': 'MRVI', 'sentiment': 'positive', '...",Maravai LifeSciences Announces November 2023 I...
4,https://www.zacks.com/stock/news/2069321/dht-h...,Zacks Equity Research,"DHT Holdings, an independent oil tanker compan...",29bea2bb15df75a10fd940c2dc705d21d4c413fb45c17a...,https://staticx-tuner.zacks.com/images/default...,,"[DHT Holdings, oil tanker, earnings, revenue, ...",2023-03-22T22:00:25Z,"{'name': 'Zacks Investment Research', 'homepag...",[DHT],"[{'ticker': 'DHT', 'sentiment': 'neutral', 'se...",DHT Holdings (DHT) Stock Moves -1.33%: What Yo...


In [3]:
def extract_sentiments(df):
    df[["sentiment", "sentiment_reasoning"]] = df["insights"].apply(
        lambda x: pd.Series({
            "sentiment": x[0]["sentiment"],
            "sentiment_reasoning": x[0]["sentiment_reasoning"]
        })
    )

    df = df[["description", "sentiment", "sentiment_reasoning", "title"]]
    return df

df = extract_sentiments(df)
df.head()


Unnamed: 0,description,sentiment,sentiment_reasoning,title
0,Allegiant Travel (ALGT) is a fast-moving stock...,positive,The article highlights Allegiant Travel's fast...,Allegiant Travel (ALGT) Is Attractively Priced...
1,Applied Industrial Technologies (AIT) reported...,positive,The company reported better-than-expected earn...,Applied Industrial Technologies (AIT) Q3 Earni...
2,"Apollo Commercial Real Estate Finance, Inc. (A...",positive,The article highlights that the company has se...,"Apollo Commercial Real Estate Finance, Inc. Cl..."
3,"Maravai LifeSciences, a global provider of lif...",positive,The article highlights Maravai LifeSciences' p...,Maravai LifeSciences Announces November 2023 I...
4,"DHT Holdings, an independent oil tanker compan...",neutral,The article provides a neutral assessment of D...,DHT Holdings (DHT) Stock Moves -1.33%: What Yo...


In [4]:
df["text"] = df["title"].fillna("") + ". " + df["description"].fillna("")
df["text"] = df["text"].str.strip()
df = df[["text", "sentiment"]]
df.head()

Unnamed: 0,text,sentiment
0,Allegiant Travel (ALGT) Is Attractively Priced...,positive
1,Applied Industrial Technologies (AIT) Q3 Earni...,positive
2,"Apollo Commercial Real Estate Finance, Inc. Cl...",positive
3,Maravai LifeSciences Announces November 2023 I...,positive
4,DHT Holdings (DHT) Stock Moves -1.33%: What Yo...,neutral


In [5]:
label2id = {"positive": 0, "negative": 1, "neutral": 2}
df["labels"] = df["sentiment"].map(label2id)
df = df[["text", "labels"]]
df.head()


Unnamed: 0,text,labels
0,Allegiant Travel (ALGT) Is Attractively Priced...,0.0
1,Applied Industrial Technologies (AIT) Q3 Earni...,0.0
2,"Apollo Commercial Real Estate Finance, Inc. Cl...",0.0
3,Maravai LifeSciences Announces November 2023 I...,0.0
4,DHT Holdings (DHT) Stock Moves -1.33%: What Yo...,2.0


In [6]:
from datasets import Dataset

dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.1)
train_ds = dataset["train"]
val_ds = dataset["test"]

In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)

train_ds = train_ds.map(tokenize, batched=True)
val_ds = val_ds.map(tokenize, batched=True)

train_ds = train_ds.remove_columns(["text"])
val_ds = val_ds.remove_columns(["text"])

train_ds.set_format("torch")
val_ds.set_format("torch")

Map: 100%|██████████| 4993/4993 [00:00<00:00, 16426.67 examples/s]
Map: 100%|██████████| 555/555 [00:00<00:00, 17782.79 examples/s]


In [8]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "ProsusAI/finbert",
    num_labels=3,
)

In [10]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./finbert_finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=20,
)

ValueError: Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.