In [1]:
# Sanity check: print the path of the Python interpreter this kernel is running on.
import sys
print(sys.executable)


/Users/nataliasurzhak/tf-env/bin/python


In [2]:
# 1. Necessary imports and setup
import pandas as pd
import numpy as np
import torch
from transformers import pipeline
from pathlib import Path
#import matplotlib.pyplot as plt


# One shared seed for every random number generator seeded in this notebook.
RANDOM_SEED = 42
# Seed NumPy's legacy global generator.
np.random.seed(RANDOM_SEED)
# Seed PyTorch; as the cell's last expression, the returned Generator is echoed as output.
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x12315d190>

In [3]:
# 2. Data loading - read only the necessary columns from the downloaded CSV files.

ARCHIVE_DIR = Path("~/Desktop/SentiStock/archive").expanduser()

djia_file = ARCHIVE_DIR / "djia_news copy.csv"
nasdaq_file = ARCHIVE_DIR / "nasdaq.csv"

# Both files are read with the same two columns of interest.
news_columns = ["Ticker", "Headline"]

# Tag every row with the file it came from so the origin survives the merge below.
df_djia = pd.read_csv(djia_file, usecols=news_columns).assign(Source="DJIA")
df_nasdaq = pd.read_csv(nasdaq_file, usecols=news_columns).assign(Source="NASDAQ")

# Stack both sources into a single frame with a fresh 0..n-1 index.
df_all = pd.concat([df_djia, df_nasdaq], ignore_index=True)

In [4]:
# 3. Inspect the loaded data
# Show the first rows of each source frame under a single heading each
# (the DJIA heading used to be printed twice by a copy-pasted line).
print("DJIA news sample:")
display(df_djia.head())

print("NASDAQ news sample:")
display(df_nasdaq.head())

DJIA news sample:
DJIA news sample:


Unnamed: 0,Ticker,Headline,Source
0,MMM,Employer who stole nearly $3M in wages from 15...,DJIA
1,MMM,Huge new Facebook data leak exposed intimate d...,DJIA
2,MMM,A campaign has accelerated to turn a disused r...,DJIA
3,MMM,Google launches global human trafficking helpl...,DJIA
4,MMM,Over 3m Saudi Women Don’t Have ID Cards; Saudi...,DJIA


NASDAQ news sample:


Unnamed: 0,Ticker,Headline,Source
0,A,@TotesTravel : Airline shares tumble as New Yo...,NASDAQ
1,A,@TotesTravel : American United call off Hong K...,NASDAQ
2,A,@TotesTravel : U.S. airline stocks hit highest...,NASDAQ
3,A,@TotesTravel : American Airlines reaches deal ...,NASDAQ
4,A,@TotesTravel : US airlines Treasury Department...,NASDAQ


In [5]:
# 4. Goal: Filter headlines to only those mentioning healthcare-related terms
hc_terms = [
    "healthcare", "patient", "medicine", "hospital", "clinic",
    "doctor", "nurse", "pharma", "vaccine", "treatment"
]
# Regex alternation of all terms. Matching is by substring, so "pharma" also
# hits "pharmaceutical" and "clinic" also hits "clinical".
pattern = "|".join(hc_terms)

# Lower-case the headline so the lower-case terms match regardless of casing.
# na=False: a missing Headline would otherwise produce NaN in the mask, and
# pandas refuses to index with a boolean mask that contains NaN.
is_healthcare = (
    df_all["Headline"].str.lower().str.contains(pattern, regex=True, na=False)
)

# .copy() so that columns added to df_hc later do not write into a view of df_all.
df_hc = df_all[is_healthcare].copy()

display(df_hc.head())

Unnamed: 0,Ticker,Headline,Source
48,MMM,Industry Cites 3M experiment that exposed canc...,DJIA
75,MMM,Canadian PM Justin Trudeau has said he will no...,DJIA
181,AMGN,Amgen exits neuroscience R&amp;D as pharma pul...,DJIA
187,AMGN,From Amgen to Gilead drugmakers are sitting on...,DJIA
229,AAPL,France To Take Legal Action Against Facebook G...,DJIA


In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# 5. Load tokenizer and model manually (instead of pipeline)
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# .to() and .eval() each return the module itself, so the calls can be chained:
# load the weights, move them to the chosen device, switch to inference mode.
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device).eval()

# 6. Tokenize texts and build PyTorch Dataset
class FinBERTDataset(Dataset):
    """Dataset over a list of texts that are tokenized once, up front.

    The tokenizer is called a single time on the whole list with truncation
    and padding enabled, ``max_length=max_len`` and ``return_tensors="pt"``.
    The result is stored on ``self.encodings``; each item is a dict holding
    the ``idx``-th row of every entry in that encoding.
    """

    def __init__(self, texts, tokenizer, max_len=128):
        """Tokenize ``texts`` with ``tokenizer`` and keep the encoded batch."""
        tokenizer_options = dict(
            truncation=True,
            padding=True,
            max_length=max_len,
            return_tensors="pt",
        )
        self.encodings = tokenizer(texts, **tokenizer_options)

    def __len__(self):
        """Return the number of encoded rows (first dimension of ``input_ids``)."""
        return self.encodings["input_ids"].shape[0]

    def __getitem__(self, idx):
        """Return a dict mapping each encoding key to its ``idx``-th row."""
        sample = {}
        for field_name, field_tensor in self.encodings.items():
            sample[field_name] = field_tensor[idx]
        return sample

# 6a. Collect the filtered headlines and wrap them in a batched loader.
# DataLoader is created without shuffling, so scores come back in the same
# order as the rows of df_hc.
texts = df_hc["Headline"].tolist()
dataset = FinBERTDataset(texts, tokenizer)
dataloader = DataLoader(dataset, batch_size=32)

# 6b. Predict sentiment logits and apply softmax
# The meaning of each logit column is defined by the checkpoint, not by this
# notebook: read it from model.config.id2label instead of assuming a
# NEGATIVE/NEUTRAL/POSITIVE order. ProsusAI/finbert's published config lists
# positive, negative, neutral, so a hardcoded order mislabels the scores.
id2label = {
    int(class_idx): str(class_name).upper()
    for class_idx, class_name in model.config.id2label.items()
}

# One entry per headline: a list of {"label", "score"} dicts, one per class.
all_scores = []

with torch.no_grad():  # inference only, no gradients needed
    for batch in tqdm(dataloader, desc="Scoring"):
        # Move every tensor of the batch onto the same device as the model.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        # Softmax over the class dimension turns logits into probabilities.
        probs = torch.nn.functional.softmax(outputs.logits, dim=1)
        # Bring the whole batch back as plain Python floats in one transfer.
        for row in probs.cpu().tolist():
            all_scores.append([
                {"label": id2label[class_idx], "score": class_prob}
                for class_idx, class_prob in enumerate(row)
            ])

def to_scale_1_10(scores):
    """
    Collapse per-label scores into a single value on a 1-to-10 scale.

    ``scores`` is an iterable of dicts with a "label" and a "score" key.
    Labels are compared case-insensitively; a missing NEGATIVE or POSITIVE
    entry counts as 0.0 and NEUTRAL carries no weight.

    The net sentiment is POSITIVE minus NEGATIVE. It is then mapped linearly
    so that -1 becomes 1, 0 becomes 5.5 and +1 becomes 10.
    """
    # Index the scores by upper-cased label; a repeated label keeps its last score.
    by_label = {}
    for entry in scores:
        by_label[entry["label"].upper()] = entry["score"]

    positive = by_label.get("POSITIVE", 0.0)
    negative = by_label.get("NEGATIVE", 0.0)
    net = positive - negative

    # Shift [-1, 1] to [0, 2], stretch to [0, 9], then offset to [1, 10].
    return float((net + 1) * 4.5 + 1)
    
# Convert each headline's label scores to the 1-10 scale and store them as a new column.
df_hc["Sentiment_1_10"] = list(map(to_scale_1_10, all_scores))


Scoring: 100%|██████████████████████████████████| 21/21 [00:13<00:00,  1.54it/s]


In [8]:
# 7. Save the healthcare-only sentiment scores to CSV
# The file is written next to the input CSVs; index=False leaves the row index out.
output_file = ARCHIVE_DIR.joinpath("healthcare_sentiment2.csv")
df_hc.to_csv(output_file, index=False)
print(f"Saved healthcare sentiment data to {output_file}")

Saved healthcare sentiment data to /Users/nataliasurzhak/Desktop/SentiStock/archive/healthcare_sentiment2.csv
