<a href="https://colab.research.google.com/github/alyssa-tsh/CS3244_ML_Project/blob/main/A0258887B_Assignment1_DSA4265.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
%pip install datasets pandas numpy scikit-learn transformers torch evaluate accelerate matplotlib seaborn tqdm




Collecting datasets
  Using cached datasets-4.5.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.10.0,>=2023.1.0 (from fsspec[http]<=2025.10.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.10.0-py3-none-any.whl.metadata (10 kB)
Collecting huggingface-hub<2.0,>=0.25.0 (from datasets)
  Using cached huggingface_hub-1.4.1-py3-none-any.whl.metadata (13 kB)
Using cached datasets-4.5.0-py3-none-any.whl (515 kB)
Downloading fsspec-2025.10.0-py3-none-any.whl (200 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m201.0/201.0 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached huggingface_hub-1.4.1-py3-none-any.whl (553 kB)
Installing collected packages: fsspec, huggingface-hub, datasets
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2025.3.0 requires fsspec==2025.3.0, but you have fsspec 2025.10.0 which is incompati

In [1]:
import pandas as pd
import numpy as np
import datasets
import torch
import sklearn
import transformers
import evaluate
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm

# Problem Statement

## **Project Definition**

**Task:** Classify financial news articles on two dimensions:

1. **Market-Moving Impact** (High/Low Impact)
2. **Direction & Sector Impact** (Positive for Sector, Negative for Sector, Neutral/Mixed)

## **Project Workflow**

1. Direction Prediction --> fine-tune BERT on
*   FiQA Sentiment with labels - identify sentences/semantic group which indicate bullish/bearish conditions
*   Financial News Data

2. Impact Labeling
*   Proxy-labelling: Generate market-moving labels from Reuters via keyword/event rules

[*Due to the absence of observable price-based impact labels, we construct market-moving labels using event-type heuristics derived from financial domain knowledge. This constitutes proxy labelling, where corporate events such as earnings releases, mergers, and regulatory actions are treated as textual indicators of potential market materiality.*]

* Golden samples - manually label ~50
* Synthetic labelling using AI to generate labels and retrain model (human in the loop to verify labels)

3. Domain-Adaptive Fine-Tuning
* Fine-tune the direction-aware BERT on the market-moving task



# **Data**
### **1. Reuters News Dataset**

- **Source:** https://huggingface.co/datasets/reuters21578
- **Size:** 21,578 news articles
- **Labels:** No sentiment/binary classes, only topic categories
- **Usage:** Train the baseline model; topic categories are mapped to High/Low Impact binary labels for market-moving detection
- **Example mapping:** "earn" → Earnings (market-moving), "trade" → Sector-specific


### **2. FiQA Sentiment & Relevance**

- **Source:** https://huggingface.co/datasets/pauri32/fiqa-2018
- **Size:** 1,213 sentences as loaded below (961 train / 102 validation / 150 test)
- **Usage:** Detect direction based on target, sentiment_score, aspects involved with target stock/company info --> map to sector
- **Labels:** Sentiment scores, bullish, bearish

### **3. Financial News Data**

- **Source:** https://huggingface.co/datasets/dilkasithari-IT/sentiment_analysis_financial_news_data
- **Size:** ~22,740 entries
- **Usage:** Detect direction based on sentiment in the news description, in a field-focused context --> map to sector
- **Labels:** Positive, Negative, Neutral


# **Data Processing**

In [13]:
from datasets import load_dataset
from google.colab import userdata
# Hugging Face access token, fetched through google.colab's userdata under the key 'huggingface'
hf_token = userdata.get('huggingface')


# ---------------------------
# FiQA: download every split and stack them into one DataFrame
# ---------------------------
print("Loading FIQA dataset...")
fiqa = load_dataset("pauri32/fiqa-2018", token=hf_token)

fiqa_dfs = []
for split_name, split_dataset in fiqa.items():
    # Tag each row with the split it came from so the splits stay recoverable after concatenation
    df = split_dataset.to_pandas().assign(split=split_name)
    print(f"FIQA {split_name} split: {len(df)} rows")
    fiqa_dfs.append(df)

df_fiqa = pd.concat(fiqa_dfs, ignore_index=True)

# ---------------------------
# Financial News: same procedure as for FiQA
# ---------------------------
print("Loading Financial News dataset...")
financial_news = load_dataset("dilkasithari-IT/sentiment_analysis_financial_news_data", token=hf_token)

fn_dfs = []
for split_name, split_dataset in financial_news.items():
    df = split_dataset.to_pandas().assign(split=split_name)
    print(f"Financial News {split_name} split: {len(df)} rows")
    fn_dfs.append(df)

df_financial_news = pd.concat(fn_dfs, ignore_index=True)

# ---------------------------
# Persist both combined frames locally, as CSV and as Parquet
# ---------------------------
df_fiqa.to_csv("fiqa_full.csv", index=False)
df_fiqa.to_parquet("fiqa_full.parquet", index=False)
print("Saved FIQA dataset to fiqa_full.csv and fiqa_full.parquet")

df_financial_news.to_csv("financial_news_full.csv", index=False)
df_financial_news.to_parquet("financial_news_full.parquet", index=False)
print("Saved Financial News dataset to financial_news_full.csv and financial_news_full.parquet")

# ---------------------------
# Preview the first rows of each frame in the notebook
# ---------------------------
pd.options.display.max_colwidth = 300  # widen text columns so long sentences are not truncated
print("FIQA sample:")
display(df_fiqa.head(5))

print("Financial News sample:")
display(df_financial_news.head(5))

Loading FIQA dataset...
FIQA train split: 961 rows
FIQA validation split: 102 rows
FIQA test split: 150 rows
Loading Financial News dataset...
Financial News train split: 16372 rows
Financial News validation split: 1820 rows
Financial News test split: 4548 rows
Saved FIQA dataset to fiqa_full.csv and fiqa_full.parquet
Saved Financial News dataset to financial_news_full.csv and financial_news_full.parquet
FIQA sample:


Unnamed: 0,sentence,snippets,target,sentiment_score,aspects,format,label,split
0,"Still short $LNG from $11.70 area...next stop could be down through $9.00. Someone slammed it hard with 230,000 shs this am! More to follow",['Still short $LNG from $11.70 area...next stop could be down through $9.00.'],LNG,-0.543,['Stock/Price Action/Volatility/Short Selling'],post,2,train
1,$PLUG bear raid,['bear raid'],PLUG,-0.48,['Stock/Price Action/Bearish'],post,2,train
2,How Kraft-Heinz Merger Came Together in Speedy 10 Weeks,['Merger Came Together in Speedy 10 Weeks'],Kraft,0.214,['Corporate/M&A/M&A'],headline,0,train
3,Slump in Weir leads FTSE down from record high,['down from record high'],Weir,-0.827,['Market/Volatility/Volatility'],headline,2,train
4,"$AAPL bounces off support, it seems",['bounces off support'],AAPL,0.443,['Stock/Price Action/Bullish/Bullish Behavior'],post,0,train


Financial News sample:


Unnamed: 0,Headline,Date,Related Field,News Description,combined_text,sentiment,sentiment_score,split
0,Sri Lanka poverty being addressed by Aswesuma safety net: Minister,4/4/2024,Economic Crisis,"Rise in poverty in a country that experienced negative 8 percent growth, and bankruptcy is normal, Gunawardana said","Sri Lanka poverty being addressed by Aswesuma safety net: Minister Rise in poverty in a country that experienced negative 8 percent growth, and bankruptcy is normal, Gunawardana said",negative,-1,train
1,Sri Lanka's daily COVID-19 cases may be three times what tests suggest: Dr Fernandopulle,5/20/2021,Health,"“We recommend that people be confined to their homes for at least two weeks to contain the spread.""","Sri Lanka's daily COVID-19 cases may be three times what tests suggest: Dr Fernandopulle “We recommend that people be confined to their homes for at least two weeks to contain the spread.""",negative,-1,train
2,Sri Lanka solar power costs down 20-pct as rupee strengthens,5/7/2024,Energy,Demand for solar power is coming back as the Return on Investment is now higher,Sri Lanka solar power costs down 20-pct as rupee strengthens Demand for solar power is coming back as the Return on Investment is now higher,positive,1,train
3,"Sri Lanka's ETI Finance, Swarnamahal depositors to be paid from July 25: Corrected",7/22/2020,Banking,"Sri Lanka's central bank said compensation of up to 600,000 rupees will be paid to depositors of the failed ETI Finance ...","Sri Lanka's ETI Finance, Swarnamahal depositors to be paid from July 25: Corrected Sri Lanka's central bank said compensation of up to 600,000 rupees will be paid to depositors of the failed ETI Finance ...",negative,-1,train
4,Isolated/locked down areas in Sri Lanka to control Coronavirus (May 21),5/18/2021,Health,Mullativu added from 2300 hours May 17,Isolated/locked down areas in Sri Lanka to control Coronavirus (May 21) Mullativu added from 2300 hours May 17,negative,-1,train


In [None]:
import re
import pandas as pd
import numpy as np
from typing import List, Dict, Any
import emoji
from bs4 import BeautifulSoup
from config import config

class TextCleaner:
    """Clean and normalize financial text data.

    Both helpers are static, so they can be called on the class or on an
    instance.
    """

    @staticmethod
    def clean_text(text: str, remove_emojis: bool = True) -> str:
        """Return a lowercased, whitespace-normalized copy of *text*.

        Processing order: URLs are removed, then ``<...>`` spans, then
        (optionally) emojis; every character that is not a word character,
        whitespace or one of ``$ . € £ ¥ -`` is replaced by a space; runs of
        whitespace are collapsed; the result is lowercased.

        Args:
            text: Raw text. Missing values (``None``/NaN) yield ``""``;
                other non-string values are converted with ``str``.
            remove_emojis: When true, strip emojis via ``emoji.replace_emoji``.

        Returns:
            The cleaned text. Note that characters such as ``%`` and ``,``
            are not in the keep-list and are therefore replaced by spaces.
        """
        if pd.isna(text):
            return ""

        text = str(text)

        # Remove URLs (an "https..." link is already matched by the "http" branch)
        text = re.sub(r'http\S+|www\S+', '', text)

        # Remove HTML tags
        text = re.sub(r'<.*?>', '', text)

        # Remove emojis
        if remove_emojis:
            text = emoji.replace_emoji(text, '')

        # Remove special characters but keep financial symbols
        text = re.sub(r'[^\w\s\$.€£¥-]', ' ', text)

        # Collapse whitespace runs (this also trims both ends), then lowercase
        return ' '.join(text.split()).lower()

    @staticmethod
    def preserve_tickers(text: str) -> str:
        """Return *text* with every ``$``-prefixed ticker symbol uppercased.

        A ticker is a ``$`` followed directly by one or more ASCII letters
        (``$aapl`` -> ``$AAPL``). Dollar amounts such as ``$11.70`` do not
        match and are left alone.

        The whole ``$word`` token is rewritten in a single regex pass. This
        restores tickers in text that ``clean_text`` has lowercased, and a
        short ticker can never rewrite the prefix of a longer one.
        """
        return re.sub(r'\$[A-Za-z]+', lambda match: match.group(0).upper(), text)

class AspectMapper:
    """Translate free-form financial topics/aspects into a fixed set of labels."""

    # keyword -> standardized label; insertion order matters because the
    # first keyword found inside an aspect decides the label
    ASPECT_MAPPING = {
        # Price action
        'stock price': 'price_action',
        'share price': 'price_action',
        'volatility': 'volatility',
        'trading': 'trading_volume',
        # Financial results
        'earnings': 'earnings',
        'revenue': 'revenue',
        'profit': 'profitability',
        'loss': 'profitability',
        # Corporate actions
        'merger': 'ma',
        'acquisition': 'ma',
        'dividend': 'dividend',
        'buyback': 'buyback',
        # Market events
        'ipo': 'ipo',
        'bankruptcy': 'bankruptcy',
        'lawsuit': 'legal',
        'regulation': 'regulatory',
        # Analyst actions
        'upgrade': 'analyst_rating',
        'downgrade': 'analyst_rating',
        'target price': 'price_target',
    }

    @classmethod
    def map_aspect(cls, aspect: str) -> str:
        """Return the standardized label for *aspect*.

        The aspect is lowercased and stripped, then scanned for the mapping
        keywords as substrings in dictionary order. Missing values and
        aspects containing no known keyword map to ``'other'``.
        """
        if pd.isna(aspect):
            return 'other'

        normalized = aspect.lower().strip()
        hits = (
            label
            for keyword, label in cls.ASPECT_MAPPING.items()
            if keyword in normalized
        )
        return next(hits, 'other')

class DataLoader:
    """Load and preprocess all datasets.

    Each ``load_*`` method reads a CSV file, adds cleaned-text (and, where
    applicable, label) columns and returns the resulting DataFrame.
    """

    def __init__(self):
        self.cleaner = TextCleaner()

    @staticmethod
    def _pick_column(df: pd.DataFrame, candidates, required: bool = True):
        """Return the first name in *candidates* that is a column of *df*.

        Raises:
            KeyError: if no candidate is present and *required* is true.
                When *required* is false, ``None`` is returned instead.
        """
        for name in candidates:
            if name in df.columns:
                return name
        if required:
            raise KeyError(
                f"None of the columns {list(candidates)} found; "
                f"available columns: {list(df.columns)}"
            )
        return None

    def load_financial_news_sentiment(self, path: str) -> pd.DataFrame:
        """Load and preprocess the Financial News Sentiment dataset.

        Accepts both the lowercase column names (``headline``,
        ``description``, ``sentiment_label``) and the names present in the
        Hugging Face export (``Headline``, ``News Description``,
        ``sentiment``).

        Adds ``cleaned_headline``, optionally ``cleaned_description``, and
        ``direction_label`` (positive=0, neutral=1, negative=2), then drops
        rows whose absolute ``sentiment_score`` is below
        ``config.SENTIMENT_THRESHOLD``.

        Raises:
            KeyError: if the headline, sentiment or ``sentiment_score``
                column is missing.
        """
        df = pd.read_csv(path)

        # Clean text fields
        headline_col = self._pick_column(df, ('headline', 'Headline'))
        df['cleaned_headline'] = df[headline_col].apply(self.cleaner.clean_text)

        description_col = self._pick_column(
            df, ('description', 'News Description'), required=False
        )
        if description_col is not None:
            df['cleaned_description'] = df[description_col].apply(self.cleaner.clean_text)

        # Map sentiment labels; values outside the mapping become NaN
        label_col = self._pick_column(df, ('sentiment_label', 'sentiment'))
        df['direction_label'] = df[label_col].map({
            'positive': 0,
            'neutral': 1,
            'negative': 2
        })

        # Filter low confidence scores; copy so callers get an independent
        # frame rather than a slice of the unfiltered one
        confident = df['sentiment_score'].abs() >= config.SENTIMENT_THRESHOLD
        return df[confident].copy()

    def load_fiqa(self, path: str) -> pd.DataFrame:
        """Load and preprocess the FiQA dataset.

        The text is read from ``post_text`` when present, otherwise from
        ``sentence`` (the column name in the Hugging Face export). Adds
        ``cleaned_text`` and ``direction_label`` (a copy of ``label``).

        Raises:
            KeyError: if neither text column, or ``label``, is present.
        """
        df = pd.read_csv(path)

        text_col = self._pick_column(df, ('post_text', 'sentence'))
        df['cleaned_text'] = df[text_col].apply(self.cleaner.clean_text)
        df['direction_label'] = df['label']  # 0/1/2 already

        return df

    def load_reuters(self, path: str) -> pd.DataFrame:
        """Load and preprocess the Reuters dataset.

        ``full_text`` is overwritten with ``headline + ' ' + full_text`` and
        cleaned into ``cleaned_text``. If a ``topics`` column exists, each
        cell is parsed as a Python list literal and every topic is mapped
        through ``AspectMapper.map_aspect`` into a new ``aspects`` column
        (missing cells yield ``[]``).

        Raises:
            KeyError: if ``headline`` or ``full_text`` is missing.
            ValueError, SyntaxError: if a ``topics`` cell is not a valid
                Python literal.
        """
        # Function-scope import: the surrounding import block does not provide ast
        import ast

        df = pd.read_csv(path)

        # Combine headline and full text. Missing parts are treated as empty
        # strings; otherwise NaN + str would turn the whole row into NaN and
        # the available half of the text would be lost.
        df['full_text'] = df['headline'].fillna('') + ' ' + df['full_text'].fillna('')
        df['cleaned_text'] = df['full_text'].apply(self.cleaner.clean_text)

        # Process aspects/topics
        if 'topics' in df.columns:
            # NOTE(security): this previously used eval(), which executes
            # arbitrary code found in the CSV. literal_eval only accepts
            # Python literals such as "['earn', 'acq']".
            df['aspects'] = df['topics'].apply(
                lambda raw: [AspectMapper.map_aspect(topic) for topic in ast.literal_eval(raw)]
                if pd.notna(raw) else []
            )

        return df

# **Labelling**

# **Model Initialization and Training**

# **Model Training**

In [None]:
from transformers import pipeline,  AutoTokenizer, AutoModelForSequenceClassification

In [None]:
# Checkpoint id shared by the tokenizer, the model and the pipeline
FINBERT_CHECKPOINT = "ProsusAI/finbert"

# Load the tokenizer and the classification model once...
tokenizer = AutoTokenizer.from_pretrained(FINBERT_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(FINBERT_CHECKPOINT)

# ...and hand those objects to the pipeline instead of letting it load the
# same checkpoint a second time. `pipe` therefore wraps the very same `model`
# object, so later changes to `model` are visible through `pipe`.
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)