In [34]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

# Get the list of S&P 500 tickers
sp500 = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]
tickers = sp500['Symbol'].tolist()

vix = yf.Ticker('^VIX')
vix_df = vix.history(period="15y")
vix_df = vix_df['Close']

# Create an empty list to store the data
data = []
max_days = 0  # Keep track of the maximum number of trading days

# Retrieve market capitalization and P/E ratio for all tickers
market_cap = []
pe_ratio = []
for ticker in tqdm(tickers, desc="Retrieving data"):
    try:
        stock = yf.Ticker(ticker)
        market_cap.append(stock.info['marketCap'])
        pe_ratio.append(stock.info['trailingPE'])
    except KeyError:
        market_cap.append(np.nan)
        pe_ratio.append(np.nan)

# Filter out tickers with missing data
valid_tickers = [ticker for i, ticker in enumerate(tickers) if not np.isnan(market_cap[i]) and not np.isnan(pe_ratio[i])]
market_cap = [cap for i, cap in enumerate(market_cap) if not np.isnan(cap)]
pe_ratio = [ratio for i, ratio in enumerate(pe_ratio) if not np.isnan(ratio)]

# Loop through each valid ticker and download the data
for ticker in tqdm(valid_tickers, desc="Processing tickers"):
    try:
        # Download the data
        stock = yf.Ticker(ticker)
        df = stock.history(period="15y")
        
        # Calculate RSI
        delta = df['Close'].diff()
        gain = delta.where(delta > 0, 0)
        loss = -delta.where(delta < 0, 0)
        avg_gain = gain.rolling(14).mean()
        avg_loss = loss.rolling(14).mean()
        rs = avg_gain / avg_loss
        rsi = 100 - (100 / (1 + rs))
        df['RSI'] = rsi
        
        # Calculate Exponential Moving Averages
        df['EMA_12'] = df['Close'].ewm(span=12, adjust=False).mean()
        df['EMA_26'] = df['Close'].ewm(span=26, adjust=False).mean()
        
        # Add the market capitalization and P/E ratio
        df['Market Cap'] = market_cap[valid_tickers.index(ticker)]
        df['P/E Ratio'] = pe_ratio[valid_tickers.index(ticker)]
        
        # Restructure the data into a tensor format
        tensor_data = df[['Open', 'High', 'Low', 'Close', 'Volume', 'RSI', 'EMA_12', 'EMA_26', 'VIX', 'Market Cap', 'P/E Ratio']].values
        tensor_data = np.expand_dims(tensor_data, axis=1)
        
        # Update the maximum number of trading days
        max_days = max(max_days, tensor_data.shape[2])
        
        data.append(tensor_data)
    except:
        # Skip any tickers that don't have data
        continue

Retrieving data: 100%|██████████| 503/503 [01:21<00:00,  6.18it/s]
Processing tickers: 100%|██████████| 438/438 [01:53<00:00,  3.87it/s]


In [38]:
# tensor structure -> (samples, time steps, features), where samples is the no. of stocks, time steps is the trading days, and features is the data for each day. 

# Assuming the dimensions for uniform_tensor were intended to be:
# (number_of_stocks, max_days, number_of_features)
# and each stock_data in data has shape (days, 1, features) before squeeze

number_of_stocks = len(tensor_data)  # 438 based on your update
number_of_features = 11  # Based on the features you've described
# Ensure max_days is correctly calculated as the maximum days across all datasets

# Reinitialize uniform_tensor in case there's been a mistake in its setup
uniform_tensor = np.full((number_of_stocks, max_days, number_of_features), np.nan)

# Loop through each stock's data in 'data'
for i, stock_data in enumerate(tensor_data):
    # Squeeze to remove the singleton dimension, resulting in shape (days, features)
    stock_data_squeezed = stock_data.squeeze()
    # Number of days corresponds to the first dimension of the squeezed data
    num_days = stock_data_squeezed.shape[0]

    # This operation should work given the shapes involved
    uniform_tensor[i, :num_days, :] = stock_data_squeezed

# Verify the shape of uniform_tensor to ensure it's as expected
print(f"Shape of uniform_tensor: {uniform_tensor.shape}")


Shape of uniform_tensor: (2799, 11, 11)


2799


In [None]:
# Collecting data from FRED

import pandas_datareader as pdr
import datetime

# Download the data
start = datetime.datetime(2000, 1, 1)
end = datetime.datetime(2021, 1, 1)
gdp = pdr.get_data_fred('GDP', start, end)
unemployment = pdr.get_data_fred('UNRATE', start, end)
cpi = pdr.get_data_fred('CPIAUCSL', start, end)
consumer_confidence = pdr.get_data_fred('UMCSENT', start, end)
m1 = pdr.get_data_fred('M1', start, end)
m2 = pdr.get_data_fred('M2', start, end)
ten_year = pdr.get_data_fred('GS10', start, end)
thirty_year = pdr.get_data_fred('GS30', start, end)

# Federal Reserve Bank of St. Louis: Wilshire 5000 Price Index
# Federal Reserve Bank of St. Louis: US Gross Domestic Product