In [1]:
#Author: Yiying Jiao
#Step 3 Use FinBert to calculate sentiment scores for each headlines

In [2]:
import os
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np


In [5]:
meta_df = pd.read_csv("22-23_main_df.csv.csv")
#get the correct format for date for both df
meta_df['date'] = pd.to_datetime(meta_df['date'], format='%Y%m%d')

meta_df = meta_df.sort_values("date")

In [8]:

# Load FinBERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")

In [9]:

# Extract headline from URL
import re
from urllib.parse import urlparse

headlines = []
urls = meta_df['url']

for url in urls:
    # Parse the path from the URL
    path = urlparse(url).path  

    # Remove leading numbers, trailing file extensions, or random alphanumeric codes
    path = re.sub(r'^\d{4}(/\d{2})?(/\d{2})?|news/\d+', '', path)  # Remove dates or numbers before words
    path = re.sub(r'/[A-Za-z0-9]+$', '', path)  # Remove trailing alphanumeric IDs

    # Replace dashes/underscores with spaces
    headline = re.sub(r"[-_]", " ", path).strip()

    # If headline is still empty, use 'Unknown'
    if not headline:
        headline = "Unknown"

    headlines.append(headline)  

# Add cleaned headlines to DataFrame
meta_df['headline'] = headlines


In [10]:
print(meta_df['headline'])

0        /business/2022/01/02/just how big in media doe...
16       /pmn/business pmn/is apple worth 3 trillion bu...
14       /dining around apple valleys pampa 172859691.html
13                             //01/03/apple fast facts 2/
12       /2022/01/03/analysis is apple worth 3 trillion...
                               ...                        
17034    /2023/12/29/news/florida family uses apple air...
17035    /article/technology/tech news technology/tech ...
17036    /2023/12/union minister chandrasekhar says rep...
17029    /news/india/rajeev chandrashekhar rebuts washi...
17058    /smart phone/apple cleared to resume flagship ...
Name: headline, Length: 17059, dtype: object


In [11]:
# Function to get sentiment probabilities
def get_sentiment(headline):
    inputs = tokenizer(headline, return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)
    logits = outputs.logits.detach().numpy()[0]
    probs = np.exp(logits) / np.sum(np.exp(logits))  # Softmax
    return probs

# Example DataFrame

meta_df["sentiment_probs"] = meta_df["headline"].apply(get_sentiment)

print(meta_df)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


            date                                                url  \
0     2022-01-03  https://www.economist.com/business/2022/01/02/...   
16    2022-01-03  https://montrealgazette.com/pmn/business-pmn/i...   
14    2022-01-03  https://news.yahoo.com/dining-around-apple-val...   
13    2022-01-03  https://ktvz.com/news/2022/01/03/apple-fast-fa...   
12    2022-01-03  https://wtbx.com/2022/01/03/analysis-is-apple-...   
...          ...                                                ...   
17034 2023-12-29  https://nypost.com/2023/12/29/news/florida-fam...   
17035 2023-12-29  https://indianexpress.com/article/technology/t...   
17036 2023-12-29  https://yourstory.com:443/2023/12/union-minist...   
17029 2023-12-29  https://economictimes.indiatimes.com/news/indi...   
17058 2023-12-29  https://www.telecomlead.com/smart-phone/apple-...   

                                                headline  \
0      /business/2022/01/02/just how big in media doe...   
16     /pmn/business pmn/is

In [12]:
#break sentiment probs array into 3 elements
meta_df["pos"] = meta_df["sentiment_probs"].apply(lambda x: x[0])
meta_df["neut"] = meta_df["sentiment_probs"].apply(lambda x: x[1])
meta_df["neg"] = meta_df["sentiment_probs"].apply(lambda x: x[2])
meta_df["sentiment_score"] = meta_df["sentiment_probs"].apply(lambda x: -x[2] + x[0])


In [13]:
probs_df=meta_df.drop(['url','headline','sentiment_probs'], axis=1)

probs_df.to_csv("/Users/jiao/projects/test22_23_probs_df.csv", index=False)