In [1]:
import pandas as pd
import re
from tqdm import tqdm

# Load the two raw inputs: the article spreadsheet and the VOO price history CSV.
# Both paths are relative to the notebook's working directory.
df_articles = pd.read_excel(io="article_columns_filtered.xlsx")
df_etf = pd.read_csv(filepath_or_buffer="VOO_historical_data.csv")

In [3]:
# The original code was generated with ChatGPT and then adapted to our use case.

def extract_info(llm_output):
    """Parse the labelled fields out of one LLM response.

    The response is searched for lines of the form ``Summary: ...``,
    ``Overall sentiment: ...``, ``Relevance to company: ...`` and
    ``Company Name: ...``. Each field is captured up to the end of its own
    line, regardless of the order the fields appear in and of whether the
    final line ends with a newline. Surrounding whitespace (including a stray
    ``\\r`` from Windows line endings) is stripped from every captured value.

    Parameters
    ----------
    llm_output : str or missing
        Raw LLM text. Missing values (``None``/``NaN``) and any other
        non-string value yield four ``None`` entries instead of raising.

    Returns
    -------
    pd.Series
        Four positional values in the order
        ``[summary, company, sentiment, relevance]``; a field that is not
        found is ``None``.
    """
    # Covers None/NaN as well as unexpected non-text cells (e.g. numbers read
    # from the spreadsheet), which `re.search` would reject with a TypeError.
    if not isinstance(llm_output, str):
        return pd.Series([None, None, None, None])

    def _field(label):
        # `.` never matches a newline, so `(.*)` stops at the end of the line
        # without requiring a trailing "\n" to be present.
        match = re.search(label + r": (.*)", llm_output)
        return match.group(1).strip() if match else None

    summary = _field("Summary")
    sentiment = _field("Overall sentiment")
    relevance = _field("Relevance to company")
    company = _field("Company Name")

    return pd.Series([summary, company, sentiment, relevance])

# Normalise both date columns to plain `datetime.date` values so they can be
# used as the join key. Values are parsed as UTC first, so the calendar date
# taken is the UTC date.
# NOTE(review): for timezone-aware inputs the UTC date can differ from the
# local (exchange) date — confirm this is the intended alignment.
df_articles['pub_date'] = pd.to_datetime(df_articles['pub_date'], utc=True).dt.date
df_etf['Date'] = pd.to_datetime(df_etf['Date'], utc=True).dt.date

# Inner join on the calendar date: articles whose date has no ETF row
# (and ETF rows with no article) are dropped.
merged_df = pd.merge(df_articles, df_etf, left_on='pub_date', right_on='Date', how='inner')

# Parse the LLM output into four new columns. `extract_info` only needs the
# `LLM_Output` value, so it is applied to that column rather than to whole rows.
tqdm.pandas()
info_columns = ['Summary', 'Company Name', 'Overall Sentiment', 'Relevance to company']
merged_df[info_columns] = merged_df['LLM_Output'].progress_apply(extract_info)

# Keep only rows where a summary could be extracted. `.copy()` makes the result
# an independent frame so the column assignment below does not raise
# SettingWithCopyWarning.
merged_df = merged_df[merged_df['Summary'].notna()].copy()

# Create the label column: POSITIVE when the ETF closed above its open,
# NEGATIVE otherwise (a flat day, Close == Open, is therefore NEGATIVE).
merged_df['label'] = (
    merged_df['Close']
    .gt(merged_df['Open'])
    .map({True: 'POSITIVE', False: 'NEGATIVE'})
)

# Select relevant columns for the final dataset
final_df = merged_df[['pub_date', 'full_article', 'Summary', 'Company Name', 'Overall Sentiment', 'Relevance to company', 'Open', 'Close', 'label']]


100%|██████████████████████████████████| 30740/30740 [00:02<00:00, 12173.56it/s]


In [7]:
# Persist the labelled dataset. With pandas' default `index=True` the row index
# is written as an extra, unnamed first column.
# NOTE(review): pass `index=False` if downstream readers do not need that column.
final_df.to_csv(path_or_buf="Full_Data_Sentiment_Analysis_LLM_Output_ETF_value_Label.csv")