In [1]:
import requests
import pandas as pd
import yfinance as yf
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn


In [2]:
# Load the FinBERT tone checkpoint once; the tokenizer and the classifier
# must come from the same checkpoint so vocabulary and weights agree.
FINBERT_CHECKPOINT = 'yiyanghkust/finbert-tone'
tokenizer = BertTokenizer.from_pretrained(FINBERT_CHECKPOINT)
model = BertForSequenceClassification.from_pretrained(FINBERT_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

In [3]:
# Get sentiment score
def get_sentiment(text):
    """Score one piece of text as P(positive) - P(negative) using FinBERT.

    Args:
        text: Headline (or other short text) to classify.

    Returns:
        float in [-1, 1]. Missing or blank text returns 0.0 (neutral)
        instead of crashing the tokenizer.

    The class indices are read from ``model.config.id2label`` rather than
    hard-coded. The previous code used index 2 as "positive" and index 0 as
    "negative"; the finbert-tone model card documents the order as
    0=neutral, 1=positive, 2=negative, which is used here as the fallback
    when the config carries no usable label names.
    """
    is_nan = isinstance(text, float) and text != text
    if text is None or is_nan or (isinstance(text, str) and not text.strip()):
        return 0.0

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)

    # Resolve label positions by name (case-insensitive) from the model config.
    id2label = getattr(model.config, 'id2label', None) or {}
    index_by_label = {str(label).lower(): int(idx) for idx, label in id2label.items()}
    positive_idx = index_by_label.get('positive', 1)
    negative_idx = index_by_label.get('negative', 2)

    return (probs[0][positive_idx] - probs[0][negative_idx]).item()

In [4]:

def fetch_news(api_key, company, from_date, to_date):
    """Fetch English-language articles about ``company`` from NewsAPI.

    Args:
        api_key: NewsAPI key.
        company: Search query (sent as the ``q`` parameter).
        from_date: Start of the date range, passed through as ``from``.
        to_date: End of the date range, passed through as ``to``.

    Returns:
        DataFrame with columns ``date`` (first 10 characters of
        ``publishedAt``), ``headline`` and ``url``. Articles without a
        ``publishedAt`` or ``title`` are skipped. An empty frame with the
        same columns is returned when there are no articles.

    Raises:
        Whatever ``response.raise_for_status()`` raises on an HTTP error
        status, so a rejected key or bad request is not silently treated
        as "no news".
    """
    # Let requests build the query string so spaces/'&' in the query are
    # URL-encoded instead of corrupting the request.
    params = {
        'q': company,
        'from': from_date,
        'to': to_date,
        'language': 'en',
        'sortBy': 'publishedAt',
        'apiKey': api_key,
    }
    # A timeout prevents the call from hanging indefinitely.
    response = requests.get("https://newsapi.org/v2/everything", params=params, timeout=30)
    response.raise_for_status()
    articles = response.json().get('articles') or []

    rows = [
        (article['publishedAt'][:10], article['title'], article.get('url'))
        for article in articles
        if article.get('publishedAt') and article.get('title')
    ]
    return pd.DataFrame(rows, columns=['date', 'headline', 'url'])

In [5]:
# Aggregate sentiment per date
def sentiment_pipeline(news_df):
    """Score every headline and aggregate to one row per date.

    Args:
        news_df: DataFrame with ``date``, ``headline`` and ``url`` columns.

    Returns:
        DataFrame with columns ``date``, ``sentiment_score`` (mean score of
        that date's headlines) and ``url`` (list of that date's URLs).

    The input frame is not modified: scores are attached to a copy, so
    re-running the cell never sees a frame altered by an earlier run.
    """
    scores = news_df['headline'].apply(get_sentiment).astype(float)
    scored = news_df.assign(sentiment_score=scores)
    grouped = (
        scored.groupby('date')
        .agg({'sentiment_score': 'mean', 'url': list})
        .reset_index()
    )
    return grouped


In [6]:

# Get macro indices
def get_macro_indices(start, end):
    """Download daily closes for the Dow, NASDAQ Composite and S&P 500.

    Args:
        start: Start date passed to ``yf.download``.
        end: End date passed to ``yf.download``.

    Returns:
        DataFrame with a string ``Date`` column (``YYYY-MM-DD``) and one
        close-price column per index: ``DJI``, ``NASDAQ``, ``SP500``.
        Only dates present for all three indices are kept (inner merge).
    """
    indices = {'DJI': '^DJI', 'NASDAQ': '^IXIC', 'SP500': '^GSPC'}
    macro_data = None
    for name, symbol in indices.items():
        raw = yf.download(symbol, start=start, end=end)
        # yfinance can return two-level (Price, Ticker) columns even for a
        # single symbol; keep only the price-field level so 'Close' can be
        # selected and renamed as a plain column.
        if isinstance(raw.columns, pd.MultiIndex):
            raw = raw.copy()
            raw.columns = list(raw.columns.get_level_values(0))
        df = raw.reset_index()[['Date', 'Close']].rename(columns={'Close': name})
        df['Date'] = pd.to_datetime(df['Date']).dt.strftime('%Y-%m-%d')
        # Seed with the first frame (tracked via None, not `.empty`, so an
        # empty first download cannot silently drop its column).
        macro_data = df if macro_data is None else pd.merge(macro_data, df, on='Date')
    return macro_data

In [7]:
def merge_full_data(stock_df, sentiment_df, macro_df):
    """Left-join stock rows with per-date sentiment and macro index data.

    NOTE: a later cell in this notebook defines another ``merge_full_data``;
    whichever definition is executed last is the one in effect.

    Args:
        stock_df: Price frame with a ``Date`` column; columns may be a
            two-level MultiIndex.
        sentiment_df: Frame with a string ``date`` column to join on.
        macro_df: Frame with a string ``Date`` column to join on.

    Returns:
        Merged DataFrame keyed on the string ``Date`` (``YYYY-MM-DD``) with
        every missing value replaced by 0. MultiIndex columns are flattened
        by joining their non-empty levels with ``_`` (``('Close', 'TSLA')``
        becomes ``Close_TSLA``, ``('Date', '')`` becomes ``Date``).

    The inputs are not modified; all work happens on copies.
    """

    def _flatten(frame):
        # Copy first so the caller's frame keeps its original columns.
        flat = frame.copy()
        if isinstance(flat.columns, pd.MultiIndex):
            # Skip empty level values; joining them produced names such as
            # 'Date_', which made the 'Date' lookup below fail.
            flat.columns = [
                '_'.join(str(part) for part in col if str(part) != '')
                if isinstance(col, tuple) else col
                for col in flat.columns
            ]
        return flat

    stock = _flatten(stock_df)
    sentiment = _flatten(sentiment_df)
    macro = _flatten(macro_df)

    # Standardize the join key; to_datetime makes this work whether 'Date'
    # arrives as datetime64 or as date strings.
    stock['Date'] = pd.to_datetime(stock['Date']).dt.strftime('%Y-%m-%d')

    merged = pd.merge(stock, sentiment, left_on='Date', right_on='date', how='left')
    merged = pd.merge(merged, macro, on='Date', how='left')

    # Dates without news or macro data get 0 in every missing cell.
    return merged.fillna(0)

In [8]:

# Preprocess for LSTM
def preprocess_lstm(data, feature_cols, target_col, sequence_length=10):
    """Scale the selected columns and cut them into sliding windows.

    Args:
        data: DataFrame containing ``feature_cols`` and ``target_col``.
        feature_cols: List of feature column names.
        target_col: Name of the column to predict; it is placed last in
            the scaled matrix.
        sequence_length: Number of consecutive rows per window.

    Returns:
        Tuple ``(sequences, labels, scaler)``: ``sequences`` stacks every
        window of ``sequence_length`` scaled rows (features and target),
        ``labels`` holds the scaled target of the row right after each
        window, and ``scaler`` is the fitted ``MinMaxScaler``.
    """
    scaler = MinMaxScaler()
    ordered_cols = feature_cols + [target_col]
    scaled = scaler.fit_transform(data[ordered_cols])

    # One window per start offset that still leaves a following row to predict.
    starts = range(len(scaled) - sequence_length)
    sequences = [scaled[start:start + sequence_length] for start in starts]
    labels = [scaled[start + sequence_length, -1] for start in starts]
    return np.array(sequences), np.array(labels), scaler


In [9]:
# Define LSTM
class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear layer producing one value per sequence."""

    def __init__(self, input_size, hidden_size=50, num_layers=2):
        """Build the recurrent stack and the single-output head.

        Args:
            input_size: Number of features per time step.
            hidden_size: Width of each LSTM layer.
            num_layers: Number of stacked LSTM layers.
        """
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, x):
        """Return the prediction computed from the last time step's LSTM output.

        ``x`` is indexed as (batch, time, feature) because the LSTM is built
        with ``batch_first=True``.
        """
        sequence_output, _state = self.lstm(x)
        final_step = sequence_output[:, -1, :]
        return self.fc(final_step)

In [10]:


# Train LSTM
def train_lstm_model(X, y, input_size, epochs=30, batch_size=32):
    """Fit a fresh ``LSTMModel`` to the windows in ``X`` with MSE loss and Adam.

    Args:
        X: Array of input windows, converted to a float32 tensor.
        y: Array of targets, converted to float32 and reshaped to one column.
        input_size: Feature count handed to ``LSTMModel``.
        epochs: Number of passes over the data.
        batch_size: Mini-batch size for the shuffled loader.

    Returns:
        The trained model (left in training mode).

    After each epoch the loss of that epoch's final mini-batch is printed.
    """
    net = LSTMModel(input_size)

    inputs = torch.tensor(X, dtype=torch.float32)
    targets = torch.tensor(y, dtype=torch.float32).view(-1, 1)
    loader = DataLoader(TensorDataset(inputs, targets), batch_size=batch_size, shuffle=True)

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=0.001)

    net.train()
    for epoch in range(epochs):
        for batch_X, batch_y in loader:
            # Clear stale gradients, then forward / backward / update.
            optimizer.zero_grad()
            loss = criterion(net(batch_X), batch_y)
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")
    return net

In [21]:
def buy_sell_signal(sentiment_score, buy_threshold=0.2, sell_threshold=-0.2):
    """Map a sentiment score to a trading signal.

    Args:
        sentiment_score: Numeric sentiment value.
        buy_threshold: Scores strictly above this yield "Buy".
        sell_threshold: Scores strictly below this yield "Sell".

    Returns:
        "Buy", "Sell" or "Hold". Scores equal to a threshold, between the
        thresholds, or NaN (which fails both comparisons) yield "Hold".

    The thresholds default to the previously hard-coded +/-0.2, so existing
    one-argument calls behave exactly as before.
    """
    if sentiment_score > buy_threshold:
        return "Buy"
    if sentiment_score < sell_threshold:
        return "Sell"
    return "Hold"

def merge_full_data(stock_df, sentiment_df, macro_df):
    """Join stock prices with forward-filled sentiment and macro index data.

    Args:
        stock_df: Price frame with a ``Date`` column (or ``Date`` as its
            index). Two-level yfinance columns are accepted.
        sentiment_df: Per-date frame with ``date``, ``sentiment_score`` and
            ``url`` (list of URLs) columns.
        macro_df: Frame with a ``Date`` column plus index-close columns.

    Returns:
        One row per stock row, keyed on a datetime ``Date`` column.
        Sentiment rows are aligned to the stock dates with forward fill, so
        a date without news inherits the most recent earlier sentiment and
        URL list. Dates before the first news item get
        ``sentiment_score`` 0 and an empty ``url`` list.

    The inputs are not modified; all work happens on copies.
    """

    def _flat_copy(frame):
        # Copy so the caller's frame is untouched, then collapse two-level
        # columns: use the first level when it is unambiguous, otherwise
        # join the non-empty level values with '_'.
        flat = frame.copy()
        if isinstance(flat.columns, pd.MultiIndex):
            top_level = flat.columns.get_level_values(0)
            if top_level.is_unique:
                flat.columns = list(top_level)
            else:
                flat.columns = [
                    '_'.join(str(part) for part in col if str(part) != '')
                    for col in flat.columns
                ]
        return flat

    stock = _flat_copy(stock_df)
    sentiment = _flat_copy(sentiment_df)
    macro = _flat_copy(macro_df)

    # Only move the index into a column when 'Date' actually lives there;
    # an unconditional reset_index() would add a meaningless 'index' column.
    if 'Date' not in stock.columns:
        stock = stock.reset_index()

    # Ensure datetime join keys on all three frames.
    stock['Date'] = pd.to_datetime(stock['Date'])
    sentiment['date'] = pd.to_datetime(sentiment['date'])
    macro['Date'] = pd.to_datetime(macro['Date'])

    # Align sentiment to the stock dates. Forward fill needs a sorted index.
    sentiment = sentiment.set_index('date').sort_index()
    stock_dates = pd.DatetimeIndex(stock['Date'], name='Date')
    aligned = sentiment.reindex(stock_dates, method='ffill').reset_index()

    merged = pd.merge(stock, aligned, on='Date', how='left')
    merged = pd.merge(merged, macro, on='Date', how='left')

    # Fill the gaps left before the first news date.
    merged['sentiment_score'] = merged['sentiment_score'].fillna(0)
    # Test the cell type directly: pd.isnull() on a list returns an array,
    # whose truth value is ambiguous and raises for multi-URL lists.
    merged['url'] = merged['url'].apply(lambda urls: urls if isinstance(urls, list) else [])
    return merged


# End-to-end run. This code used to sit, indented, after the `return` of
# merge_full_data (so it never executed) and called the merge before the
# data had been fetched; it now runs at top level in dependency order.
if __name__ == "__main__":
    import os

    # Inputs
    # Prefer a key from the environment; the placeholder keeps the old default.
    api_key = os.environ.get('NEWSAPI_KEY', 'YOUR_NEWSAPI_KEY')
    company = 'Tesla'
    ticker = 'TSLA'
    start_date = '2024-04-01'
    end_date = '2024-04-20'

    print("Fetching news...")
    news_df = fetch_news(api_key, company, start_date, end_date)

    print("Analyzing sentiment...")
    sentiment_df = sentiment_pipeline(news_df)

    print("Fetching stock data...")
    stock_df = yf.download(ticker, start=start_date, end=end_date).reset_index()

    print("Fetching macro indices...")
    macro_df = get_macro_indices(start_date, end_date)

    print("Merging all datasets...")
    final_df = merge_full_data(stock_df, sentiment_df, macro_df)

    print("Preparing data for LSTM...")
    feature_cols = ['Open', 'High', 'Low', 'Volume', 'sentiment_score', 'DJI', 'NASDAQ', 'SP500']
    target_col = 'Close'
    X, y, scaler = preprocess_lstm(final_df, feature_cols, target_col)

    print("Training LSTM model...")
    model = train_lstm_model(X, y, input_size=X.shape[2])

    # Display last few merged records with URLs
    print("\nSample data:")
    print(final_df[['Date', 'Close', 'sentiment_score', 'url', 'DJI', 'NASDAQ', 'SP500']].tail())


In [24]:
# Preview the first rows of the downloaded price frame. The output below
# shows a two-level column header (Price / Ticker).
# NOTE(review): `stock_df` is only assigned in the pipeline code of an
# earlier cell — confirm that code has actually run on a fresh kernel.
stock_df.head()

Price,Date,Close,High,Low,Open,Volume
Ticker,Unnamed: 1_level_1,TSLA,TSLA,TSLA,TSLA,TSLA
0,2024-04-01,175.220001,176.75,170.210007,176.169998,81562100
1,2024-04-02,166.630005,167.690002,163.429993,164.75,116650600
2,2024-04-03,168.380005,168.820007,163.279999,164.020004,82950100
3,2024-04-04,171.110001,177.190002,168.009995,170.070007,123162000
4,2024-04-05,164.899994,170.860001,160.509995,169.080002,141250700


