In [None]:
import os
import json
from google.cloud import storage
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset, Dataset
import torch
import pandas as pd
from tqdm import tqdm
import re
import csv
import hashlib

In [None]:
# Point the Hugging Face caches and the temp directory at /mnt/disks/data.
# NOTE(review): this cell runs after the imports cell; `datasets` and
# `transformers` may read their cache settings at import time -- confirm these
# values take effect, or move this cell above the imports.
os.environ.update({
    "HF_DATASETS_CACHE": "/mnt/disks/data/hf_cache",
    "HF_DATASETS_OFFLINE": "1",  # Optional: if you're only working locally
    "TRANSFORMERS_CACHE": "/mnt/disks/data/transformers_cache",
    "TMPDIR": "/mnt/disks/data/tmp",
})

In [None]:
# Detect number of GPUs and pick the device used for inference.
gpus = torch.cuda.device_count()
print(f"Number of GPUs available: {gpus}")
if torch.cuda.is_available() and gpus > 0:
    # Only query the device name when a CUDA device exists; the call raises
    # on CPU-only hosts, which would defeat the CPU fallback below.
    print(torch.cuda.get_device_name(0))
else:
    print("No CUDA device detected; falling back to CPU")
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Setup: notebook-wide configuration constants.
BUCKET_NAME = "diss_market_data"  # GCS bucket passed to download_from_gcs
MODEL_PREFIX = "deberta_news/"  # object-name prefix listed inside that bucket
MODEL_LOCAL_DIR = "./deberta_news"  # local directory the model/tokenizer are loaded from
DATASET = "danidanou/Bloomberg_Financial_News"  # dataset identifier passed to load_dataset
OUTPUT_CSV_PATH = "./Data/news_sentiment_scores.csv"  # CSV the per-article results are appended to
# NOTE(review): CHUNK_SIZE and CHUNK_STRIDE are not referenced by the visible
# code; get_section_chunks declares its own max_len=512 / stride=128 defaults.
CHUNK_SIZE = 512
CHUNK_STRIDE = 128

In [None]:
# ---------------------- GCS Download Helpers ---------------------
def download_from_gcs(bucket_name, prefix, local_dir):
    """Copy every object under ``prefix`` in a GCS bucket into ``local_dir``.

    Args:
        bucket_name: Name of the bucket to read from.
        prefix: Object-name prefix used to list blobs; each blob's local path
            is its name made relative to this prefix.
        local_dir: Directory the blobs are written under. Intermediate
            directories are created as needed.
    """
    gcs_bucket = storage.Client().bucket(bucket_name)

    for blob in gcs_bucket.list_blobs(prefix=prefix):
        # Names ending in "/" are treated as folder entries, not files.
        if not blob.name.endswith('/'):
            destination = os.path.join(local_dir, os.path.relpath(blob.name, prefix))
            os.makedirs(os.path.dirname(destination), exist_ok=True)
            blob.download_to_filename(destination)

# Download model if not present.
# Accept either weights file name: checking only for "pytorch_model.bin" would
# trigger a full re-download on every run when the model is stored as
# "model.safetensors".
_weight_files = ("pytorch_model.bin", "model.safetensors")
if not any(os.path.exists(os.path.join(MODEL_LOCAL_DIR, name)) for name in _weight_files):
    download_from_gcs(BUCKET_NAME, MODEL_PREFIX, MODEL_LOCAL_DIR)

In [None]:
# ------------------------ Load Model -----------------------------
# Tokenizer and classifier are both read from the local model directory.
tokenizer = AutoTokenizer.from_pretrained(MODEL_LOCAL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_LOCAL_DIR)
model = model.to(DEVICE)  # place the weights on the selected device
# Switch to inference mode; as the last expression this also shows the model.
model.eval()

In [None]:
def get_section_chunks(text, max_len=512, stride=128, tok=None):
    """Split ``text`` into overlapping token windows and return them as strings.

    Args:
        text: The text to split.
        max_len: Maximum number of content tokens per window.
        stride: Number of tokens shared by consecutive windows (the window
            start advances by ``max_len - stride``). Must satisfy
            ``0 <= stride < max_len``.
        tok: Optional tokenizer exposing ``__call__`` and ``decode``; defaults
            to the notebook-level ``tokenizer``.

    Returns:
        A list of decoded chunk strings; empty for empty/whitespace-only text.

    Raises:
        ValueError: If ``stride`` is negative or not smaller than ``max_len``.
    """
    if stride < 0 or stride >= max_len:
        # A non-positive step would either crash range() or silently yield nothing.
        raise ValueError(
            f"stride must satisfy 0 <= stride < max_len (got stride={stride}, max_len={max_len})"
        )
    if tok is None:
        tok = tokenizer
    if not text or not text.strip():
        return []

    # Window over content tokens only, so special tokens neither take up
    # window slots nor end up inside a window.
    # NOTE(review): the decoded chunks get re-tokenized later; if that step
    # re-adds special tokens under the same length limit, a full window may
    # lose its last couple of tokens to truncation -- confirm acceptable.
    token_ids = tok(text, add_special_tokens=False, truncation=False, padding=False)['input_ids']

    step = max_len - stride
    chunks = []
    for start in range(0, len(token_ids), step):
        chunks.append(tok.decode(token_ids[start:start + max_len], skip_special_tokens=True))
        # Stop once a window reaches the end; any further window would only
        # repeat tokens already covered and over-weight the tail of the text.
        if start + max_len >= len(token_ids):
            break
    return chunks

In [None]:
def batched_predict(chunks, batch_size=16):
    """Classify text chunks and return a signed confidence score per chunk.

    Args:
        chunks: A list of strings (a single string is treated as one chunk).
        batch_size: Maximum number of chunks per forward pass. Bounds memory
            use so a long article is not pushed through the model at once.

    Returns:
        A list of ``(scaled_score, label)`` tuples in input order. The score is
        the top class probability, positive for 'positive', negative for
        'negative', and 0 for 'neutral'. Empty input returns ``[]``.
    """
    if isinstance(chunks, str):
        chunks = [chunks]
    if not chunks:
        return []

    # NOTE(review): assumes the model's class indices are ordered
    # negative/neutral/positive -- verify against model.config.id2label.
    label_map = ['negative', 'neutral', 'positive']
    direction = {'positive': 1, 'negative': -1, 'neutral': 0}

    results = []
    for start in range(0, len(chunks), batch_size):
        batch = chunks[start:start + batch_size]
        # Pad to the longest chunk in the batch instead of always padding to
        # 512 tokens; padded positions are masked out either way.
        inputs = tokenizer(batch, padding=True, truncation=True, max_length=512, return_tensors="pt")
        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}  # ensure proper device
        with torch.no_grad():
            outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=1).cpu().numpy()
        for row in probs:
            label = label_map[row.argmax()]
            results.append((row.max() * direction[label], label))
    return results

In [None]:
# Load the dataset named by DATASET; the result is indexed by split name
# further down (e.g. dataset['train']).
dataset = load_dataset(DATASET)

In [None]:
# Show the train split as the cell output.
dataset['train']

In [None]:
def compute_id(row):
    """Return the MD5 hex digest identifying a record by its headline.

    Args:
        row: A mapping (dict or pandas Series) holding the headline under
            'Headline', or under 'Article' -- the column the results CSV
            stores the headline in.

    Returns:
        The hex digest of the stripped, UTF-8 encoded headline. Stripping
        matches how process_record derives its record id.

    Raises:
        KeyError: If the row has neither a 'Headline' nor an 'Article' key.
    """
    if 'Headline' in row:
        headline = row['Headline']
    elif 'Article' in row:
        headline = row['Article']
    else:
        raise KeyError('Headline')
    # Treat missing values (None, or NaN from a CSV read) as an empty headline.
    if headline is None or headline != headline:
        headline = ''
    return hashlib.md5(str(headline).strip().encode('utf-8')).hexdigest()

# Load existing processed records so a restarted run can skip them.
done_ids = set()
if os.path.exists(OUTPUT_CSV_PATH) and os.path.getsize(OUTPUT_CSV_PATH) > 0:
    _output_columns = ['Article', 'Date', 'Sentiment Score', 'Sentiment Label']
    # The file may or may not start with a header row, so read it without one
    # and name the columns explicitly. Everything is kept as text so an empty
    # headline is not turned into NaN.
    done_df = pd.read_csv(
        OUTPUT_CSV_PATH,
        header=None,
        names=_output_columns,
        dtype=str,
        keep_default_na=False,
        encoding='utf-8',
    )
    # Drop the header row when the file does have one.
    if len(done_df) and done_df.iloc[0].tolist() == _output_columns:
        done_df = done_df.iloc[1:]
    # The results file stores the headline in its 'Article' column.
    done_ids = {compute_id({'Headline': headline}) for headline in done_df['Article']}

In [None]:
def process_record(record):
    """Score one dataset record and return its result row(s).

    Args:
        record: A mapping with 'Headline', 'Article' and 'Date' entries.
            Missing or None values are treated as empty.

    Returns:
        A list with a single dict ('Article' holding the headline, 'Date',
        'Sentiment Score', 'Sentiment Label'), or an empty list when the
        record has no article text, yields no chunks, or its headline hash is
        already in ``done_ids``.
    """
    # `or ''` also covers keys that are present but hold None.
    headline = (record.get('Headline') or '').strip()
    text = (record.get('Article') or '').strip()
    date = record.get('Date', '')

    # Bail out before hashing or tokenizing when there is nothing to score.
    if not text:
        return []

    record_id = hashlib.md5(headline.encode('utf-8')).hexdigest()
    if record_id in done_ids:
        return []

    chunks = get_section_chunks(text)
    if not chunks:
        return []

    sentiments = batched_predict(chunks)
    scores = [score for score, _ in sentiments]
    labels = [label for _, label in sentiments]

    avg_score = sum(scores) / len(scores)
    # Iterate labels in first-seen order so ties resolve the same way on every
    # run (iterating a set of strings is not stable across interpreter runs).
    majority_label = max(dict.fromkeys(labels), key=labels.count)

    return [{
        'Article': headline,
        'Date': date,
        'Sentiment Score': avg_score,
        'Sentiment Label': majority_label
    }]

In [None]:
# # ------------------ Load & Stream Dataset -------------------------
# splits = ["train"]
# all_results = []

# for split in splits:
#     ds = dataset[split]
#     for record in tqdm(ds, desc=f"Processing {split}"):
#         results = process_record(record)
#         all_results.extend(results)

In [None]:
# # ------------------ Save Results ----------------------------------
# df = pd.DataFrame(all_results)
# df.to_csv(OUTPUT_CSV_PATH, index=False)

In [None]:
# # Create CSV and write header once
# with open(OUTPUT_CSV_PATH, mode='w', newline='', encoding='utf-8') as f:
#     writer = csv.DictWriter(f, fieldnames=[
#         'Article', 'Date', 'Sentiment Score', 'Sentiment Label'
#     ])
#     writer.writeheader()

In [None]:
_output_fields = ['Article', 'Date', 'Sentiment Score', 'Sentiment Label']
_flush_every = 1000  # number of buffered rows that triggers a write


def _append_rows(rows):
    """Append result rows to OUTPUT_CSV_PATH, writing the header if the file is new or empty."""
    needs_header = not os.path.exists(OUTPUT_CSV_PATH) or os.path.getsize(OUTPUT_CSV_PATH) == 0
    with open(OUTPUT_CSV_PATH, mode='a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=_output_fields)
        if needs_header:
            writer.writeheader()
        writer.writerows(rows)


# Create the output directory up front so a missing folder fails fast rather
# than after the first batch of records has already been scored.
_output_dir = os.path.dirname(OUTPUT_CSV_PATH)
if _output_dir:
    os.makedirs(_output_dir, exist_ok=True)

buffer = []
splits = ["train"]
try:
    for split in splits:
        ds = dataset[split]
        for record in tqdm(ds, desc=f"Processing {split}"):
            results = process_record(record)
            if results:
                buffer.extend(results)
            if len(buffer) >= _flush_every:
                _append_rows(buffer)
                buffer = []  # clear buffer
finally:
    # Final flush; also runs when the loop is interrupted, so rows that were
    # already scored are not lost.
    if buffer:
        _append_rows(buffer)
        buffer = []