In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf
import plotly.graph_objects as go
from sklearn.utils import class_weight
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense

In [8]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Load your CSV
df = pd.read_csv("sp500_headlines_2008_2024.csv")

# Initialize VADER
analyzer = SentimentIntensityAnalyzer()

# Apply VADER to each headline
df['Sentiment'] = df['Title'].apply(lambda x: analyzer.polarity_scores(str(x))['compound'])

# Now you have a sentiment score from -1 (very negative) to +1 (very positive)
print(df[['Title', 'Sentiment']].head(100))


                                                Title  Sentiment
0    JPMorgan Predicts 2008 Will Be "Nothing But Net"     0.0000
1   Dow Tallies Biggest First-session-of-year Poin...    -0.2732
2                    2008 predictions for the S&P 500     0.0000
3   U.S. Stocks Higher After Economic Data, Monsan...     0.0000
4   U.S. Stocks Climb As Hopes Increase For More F...     0.6249
5   How Investing in Intangibles -- Like Employee ...     0.6597
6          Head And Shoulders Top Bodes Ill For Bulls    -0.2500
7   U.S. Stocks Zigzag Higher As Bernanke Speech S...     0.0000
8   It's a Black Monday as stock markets tank in e...     0.0000
9       U.S. Stocks Largely Recover From Early Plunge     0.0000
10     U.S. Stocks Sink; Dow Off More Than 180 Points     0.0000
11  Marriage - Tracy Collier and John Helvey | Com...     0.0000
12  Former Connecticut hedge fund exec sues in son...     0.0000
13                    Super Bowl Ideas That Can Score     0.5994
14                   New 

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load FinBERT model and tokenizer
model_name = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def get_finbert_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=1)
    labels = ['negative', 'neutral', 'positive']
    predicted = labels[torch.argmax(probs)]
    score = probs[0].tolist()
    return predicted, score[0], score[2]  # label, neg, pos

# Apply to your DataFrame
df = pd.read_csv("sp500_headlines_2008_2024.csv").head(100)
results = df['Title'].apply(lambda x: get_finbert_sentiment(str(x)))
df[['FinBERT_Label', 'FinBERT_Neg', 'FinBERT_Pos']] = pd.DataFrame(results.tolist(), index=df.index)

print(df[['Title', 'FinBERT_Label', 'FinBERT_Pos']].head(100))


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


                                                Title FinBERT_Label  \
0    JPMorgan Predicts 2008 Will Be "Nothing But Net"      negative   
1   Dow Tallies Biggest First-session-of-year Poin...       neutral   
2                    2008 predictions for the S&P 500      negative   
3   U.S. Stocks Higher After Economic Data, Monsan...       neutral   
4   U.S. Stocks Climb As Hopes Increase For More F...       neutral   
..                                                ...           ...   
95         Stocks Have Miserable Day After House Vote      positive   
96  U.S. Stocks Plunge As Global Credit Crisis Spr...      positive   
97                         Burned by the Dow? Hang on      negative   
98                   S&P 500 Review - The Big Picture      negative   
99  U.S. Stocks Fall Sharply For Fifth Day; S&P 50...      positive   

     FinBERT_Pos  
0   7.417979e-03  
1   4.706256e-01  
2   3.724472e-04  
3   4.519555e-08  
4   2.223808e-05  
..           ...  
95  9.956962e-

In [6]:
pd.set_option('display.max_rows', None)  # Show all rows
print(df[['Title', 'FinBERT_Label', 'FinBERT_Pos']])

                                                Title FinBERT_Label  \
0    JPMorgan Predicts 2008 Will Be "Nothing But Net"      negative   
1   Dow Tallies Biggest First-session-of-year Poin...       neutral   
2                    2008 predictions for the S&P 500      negative   
3   U.S. Stocks Higher After Economic Data, Monsan...       neutral   
4   U.S. Stocks Climb As Hopes Increase For More F...       neutral   
5   How Investing in Intangibles -- Like Employee ...      negative   
6          Head And Shoulders Top Bodes Ill For Bulls      positive   
7   U.S. Stocks Zigzag Higher As Bernanke Speech S...      negative   
8   It's a Black Monday as stock markets tank in e...      negative   
9       U.S. Stocks Largely Recover From Early Plunge       neutral   
10     U.S. Stocks Sink; Dow Off More Than 180 Points      positive   
11  Marriage - Tracy Collier and John Helvey | Com...      negative   
12  Former Connecticut hedge fund exec sues in son...      negative   
13    

In [None]:
# bra för att kunna jämföra

import fear_and_greed

fear_and_greed.get()

In [9]:
# Load the S&P 500 index data
tickers = yf.Tickers('^GSPC')
tickers.tickers['^GSPC'].info
yf.download(['^GSPC'], period='6mo')

  yf.download(['^GSPC'], period='6mo')
[*********************100%***********************]  1 of 1 completed


Price,Close,High,Low,Open,Volume
Ticker,^GSPC,^GSPC,^GSPC,^GSPC,^GSPC
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2025-01-02,5868.549805,5935.089844,5829.529785,5903.259766,3621680000
2025-01-03,5942.470215,5949.339844,5888.660156,5891.069824,3667340000
2025-01-06,5975.379883,6021.040039,5960.009766,5982.810059,4940120000
2025-01-07,5909.029785,6000.680176,5890.680176,5993.259766,4517330000
2025-01-08,5918.25,5927.890137,5874.779785,5910.660156,4441740000
2025-01-10,5827.040039,5890.350098,5807.779785,5890.350098,4751930000
2025-01-13,5836.220215,5838.609863,5773.310059,5782.02002,4421200000
2025-01-14,5842.910156,5871.919922,5805.419922,5859.27002,4142280000
2025-01-15,5949.910156,5960.609863,5905.209961,5905.209961,4544570000
2025-01-16,5937.339844,5964.689941,5930.720215,5963.609863,4285810000


In [None]:
def load_data():
    df = yf.download("^GSPC", start="2024-01-01", end="2024-12-31")
    features = df[['Close', 'Open', 'Volume', 'High', 'Low']].dropna()
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(features)
    return scaled, scaler

data, scaler = load_data()

In [None]:
def create_directional_sequences(data, seq_length=60):
    X, y = [], []
    for i in range(seq_length, len(data) - 1):
        X.append(data[i - seq_length:i])
        
        # Compare only the 'Close' price, which is column index 0
        label = 1 if data[i + 1][0] > data[i][0] else 0
        y.append(label)
        
    return np.array(X), np.array(y)

X, y = create_directional_sequences(data)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)


In [None]:
def build_classifier(input_shape):
    model = Sequential([
        SimpleRNN(50, activation='relu', input_shape=input_shape, return_sequences=True),
        SimpleRNN(20, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


model = build_classifier((X.shape[1], X.shape[2]))


In [None]:
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weights_dict = dict(enumerate(class_weights))

# Fit with class weights
model.fit(X_train, y_train, epochs=10, batch_size=32, class_weight=class_weights_dict)


In [None]:
# Take the last 60 rows and keep all features
latest_sequence = data[-60:].reshape(1, 60, data.shape[1])  # <- fix here

# Predict
prob_up = model.predict(latest_sequence)[0][0]

# Output
direction = "UP" if prob_up > 0.5 else "DOWN"
print(f"Prediction for tomorrow: {direction} (probability: {prob_up:.2f})")



In [None]:
y_probs = model.predict(X_test).flatten()
y_pred = (y_probs > 0.5).astype(int)  # Convert probabilities to 0/1


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Accuracy
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")

# Detailed Report
print(classification_report(y_test, y_pred, target_names=["Down", "Up"]))

# Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 4))
plt.plot(y_test, label='Actual Direction', alpha=0.6)
plt.plot(y_pred, label='Predicted Direction', alpha=0.6)
plt.title("Predicted vs. Actual Market Direction")
plt.xlabel("Time")
plt.ylabel("Direction (1 = Up, 0 = Down)")
plt.legend()
plt.grid(True)
plt.show()
