In [1]:
import pandas as pd
import yfinance as yf
import datetime
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('sp500_headlines_2008_2024.csv')

In [3]:
def is_headline_informative(headline):
    headline_lower = str(headline).lower()
    generic_phrases = [
        'stock market news for',
        'stocks making the biggest moves',
        'what to watch in the week ahead',
        'market update',
        'stocks to watch'
    ]
    if any(phrase in headline_lower for phrase in generic_phrases):
        return False
    if len(headline.split()) < 4:
        return False
    return True

In [6]:
original_size = len(df)
df = df[df['Title'].apply(is_headline_informative)].copy()
print(f"Filtered dataset now has {len(df)} headlines (removed {original_size - len(df)} generic headlines).")

Filtered dataset now has 18590 headlines (removed 0 generic headlines).


In [7]:
df['Date'] = pd.to_datetime(df['Date'])
start_date = df['Date'].min()
end_date = df['Date'].max()
start_date_download = (pd.to_datetime(start_date) - datetime.timedelta(days=75)).strftime('%Y-%m-%d')

In [8]:
vix_data = yf.download('^VIX', start=start_date_download, end=end_date)
vix_data.rename(columns={'Close': 'VIX_Close'}, inplace=True)
vix_data.columns = vix_data.columns.get_level_values(0)

  vix_data = yf.download('^VIX', start=start_date_download, end=end_date)
[*********************100%***********************]  1 of 1 completed


In [9]:
sp500_data = yf.download('^GSPC', start=start_date_download, end=end_date)
sp500_data.columns = sp500_data.columns.get_level_values(0)

  sp500_data = yf.download('^GSPC', start=start_date_download, end=end_date)
[*********************100%***********************]  1 of 1 completed


In [10]:
sp500_data.head()

Price,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2007-10-19,1500.630005,1540.0,1500.26001,1540.0,4160970000
2007-10-22,1506.329956,1508.060059,1490.400024,1497.790039,3471830000
2007-10-23,1519.589966,1520.01001,1503.609985,1509.300049,3309120000
2007-10-24,1515.880005,1517.22998,1489.560059,1516.609985,4003300000
2007-10-25,1514.400024,1523.23999,1500.459961,1516.150024,4183960000


In [11]:
sp500_data['MA50'] = sp500_data['Close'].rolling(window=50).mean()


In [12]:
sp500_data_clean = sp500_data.loc[start_date:]

In [13]:
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values(by='Date').reset_index(drop=True)

In [14]:
df = pd.merge(df, vix_data[['VIX_Close']], on='Date', how='left')
df = pd.merge(df, sp500_data[['MA50']], on='Date', how='left')

In [15]:
df.head()

Unnamed: 0,Title,Date,CP,VIX_Close,MA50
0,"JPMorgan Predicts 2008 Will Be ""Nothing But Net""",2008-01-02,1447.16,23.17,1479.511599
1,Dow Tallies Biggest First-session-of-year Poin...,2008-01-02,1447.16,23.17,1479.511599
2,2008 predictions for the S&P 500,2008-01-02,1447.16,23.17,1479.511599
3,"U.S. Stocks Higher After Economic Data, Monsan...",2008-01-03,1447.16,22.49,1478.328201
4,U.S. Stocks Climb As Hopes Increase For More F...,2008-01-07,1416.18,23.790001,1474.175002


In [16]:
df['Quarter'] = df['Date'].dt.quarter
df['Is_weekend'] = df['Date'].dt.dayofweek.isin([5, 6])
df['Is_month_start'] = df['Date'].dt.is_month_start
df['Is_month_end'] = df['Date'].dt.is_month_end

In [17]:
df['Price_vs_MA50'] = (df['CP'] - df['MA50']) / df['MA50'] * 100

In [18]:
df['Price_Change_Pct'] = df['CP'].pct_change() * 100
def categorize_movement(pct_change):
    if pct_change > 2.0: return 'Drastic Rise'
    elif 0.5 < pct_change <= 2.0: return 'Rise'
    elif -0.5 <= pct_change <= 0.5: return 'Stable'
    elif -2.0 <= pct_change < -0.5: return 'Fall'
    elif pct_change < -2.0: return 'Drastic Fall'
    else: return None
df['Ground_Truth'] = df['Price_Change_Pct'].apply(categorize_movement)

In [19]:
df.head()

Unnamed: 0,Title,Date,CP,VIX_Close,MA50,Quarter,Is_weekend,Is_month_start,Is_month_end,Price_vs_MA50,Price_Change_Pct,Ground_Truth
0,"JPMorgan Predicts 2008 Will Be ""Nothing But Net""",2008-01-02,1447.16,23.17,1479.511599,1,False,False,False,-2.18664,,
1,Dow Tallies Biggest First-session-of-year Poin...,2008-01-02,1447.16,23.17,1479.511599,1,False,False,False,-2.18664,0.0,Stable
2,2008 predictions for the S&P 500,2008-01-02,1447.16,23.17,1479.511599,1,False,False,False,-2.18664,0.0,Stable
3,"U.S. Stocks Higher After Economic Data, Monsan...",2008-01-03,1447.16,22.49,1478.328201,1,False,False,False,-2.108341,0.0,Stable
4,U.S. Stocks Climb As Hopes Increase For More F...,2008-01-07,1416.18,23.790001,1474.175002,1,False,False,False,-3.934065,-2.140745,Drastic Fall


In [20]:
def create_input_text(row):
    features = [
        f"Quarter: Q{row['Quarter']}",
        f"VIX: {row['VIX_Close']:.2f}",
        f"Price vs 50-Day Avg: {row['Price_vs_MA50']:.2f}%"
    ]
    if row['Is_weekend']: features.append("Weekend")
    if row['Is_month_start']: features.append("Month_Start")
    if row['Is_month_end']: features.append("Month_End")

    feature_string = ", ".join(features)
    return f"Context: [{feature_string}] | Headline: {row['Title']}"

df['Input_Text'] = df.apply(create_input_text, axis=1)

In [21]:
final_df = df.dropna(subset=['Ground_Truth', 'Input_Text'])
final_df = final_df[['Date', 'Input_Text', 'Ground_Truth']]

In [22]:
final_df.head()

Unnamed: 0,Date,Input_Text,Ground_Truth
1,2008-01-02,"Context: [Quarter: Q1, VIX: 23.17, Price vs 50...",Stable
2,2008-01-02,"Context: [Quarter: Q1, VIX: 23.17, Price vs 50...",Stable
3,2008-01-03,"Context: [Quarter: Q1, VIX: 22.49, Price vs 50...",Stable
4,2008-01-07,"Context: [Quarter: Q1, VIX: 23.79, Price vs 50...",Drastic Fall
5,2008-01-09,"Context: [Quarter: Q1, VIX: 24.12, Price vs 50...",Stable


In [23]:
from sklearn.model_selection import train_test_split
study_set_size = 200
test_proportion = study_set_size / len(final_df)
train_val_df, study_df = train_test_split(final_df, test_size=test_proportion, stratify=final_df['Ground_Truth'], random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.1, stratify=train_val_df['Ground_Truth'], random_state = 42)

In [25]:
train_df.to_csv('training_dataset.csv', index=False)
val_df.to_csv('validation_dataset.csv', index=False)
study_df.to_csv('study_dataset.csv', index=False)