# Import Libraries

In [1]:
import pandas as pd
import requests
import yfinance as yf
import numpy as np
from pathlib import Path

# Get Data
### We get the quarterly financial and stock price data from yahoo finance.
### We will only keep the companies in the S&P500 with data since 2020-01-01

In [3]:
# URL of the Wikipedia page containing the S&P 500 companies
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies";

# Define HTTP headers to simulate a real browser
# This helps avoid being blocked by the website (HTTP 403)
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    )
}

# Send an HTTP GET request with custom headers
response = requests.get(url, headers=headers)

# Raise an exception if the request failed (status code not 200)
response.raise_for_status()

# Parse all HTML tables from the downloaded page
# pandas.read_html can parse tables from a string of HTML
tables = pd.read_html(response.text)

# The first table on the page contains the S&P 500 companies
sp500_table = tables[0]

# Extract the 'Symbol' column to get the list of tickers
tickers = sp500_table['Symbol'].tolist()

# Print the first 10 tickers as a sanity check
print(tickers[:10])
print (f'Number of Tickers: {len(tickers)}')


  tables = pd.read_html(response.text)


['MMM', 'AOS', 'ABT', 'ABBV', 'ACN', 'ADBE', 'AMD', 'AES', 'AFL', 'A']
Number of Tickers: 503


In [4]:
#We download closing price of all tickers since 2020-01-01
data = yf.download(tickers, start='2020-01-01')['Close']

print(f'data shape: {data.shape}')

#We keep only tickers that have data since 2020-01-01
data = data.dropna(axis=1)

#We store the tickers in the DataFrame 'data'
tickers = data.columns.tolist()

print(f'data shape: {data.shape}')

print(f'Number of tickers with closing price data since 2020-01-01 to current date: {len(tickers)}')

#We stack the values in 'data' to reorganize the DataFrame so we have columns 'Date', 'Ticker', 'close_stock_price'
data = pd.DataFrame(data.stack()).reset_index().sort_values(['Ticker', 'Date'])
data.rename(columns={0: 'close_stock_price'}, inplace=True)

print(f'data shape: {data.shape}')

  data = yf.download(tickers, start='2020-01-01')['Close']
[*********************100%***********************]  503 of 503 completed

2 Failed downloads:
['BRK.B']: YFTzMissingError('possibly delisted; no timezone found')
['BF.B']: YFPricesMissingError('possibly delisted; no price data found  (1d 2020-01-01 -> 2025-12-20)')


data shape: (1501, 503)
data shape: (1501, 451)
Number of tickers with closing price data since 2020-01-01 to current date: 451
data shape: (676951, 3)


In [5]:
# We create a financials dataframe with all quarterly financial statements of tickers with closing price data since 2020-01-01

financials = pd.DataFrame()

for tkr in tickers:
    ticker = yf.Ticker(tkr)
    # Financial statements (quarterly)
    income_q = ticker.quarterly_income_stmt     # Income statement, quarterly
    balance_q = ticker.quarterly_balance_sheet  # Balance sheet, quarterly
    cashflow_q = ticker.quarterly_cashflow      # Cash flow, quarterly

    #We combine all the financial data transposed (dates as row index and financials as columns)
    financials_tkr = pd.concat([income_q.T, balance_q.T, cashflow_q.T], axis = 1)

    financials_tkr.reset_index(inplace=True)        #We reset the index
    financials_tkr.columns.values[0] = 'Date'       #We rename the 1st column (old index) 'Date'
    financials_tkr.insert(0, 'Ticker', tkr)         # Insert a column 'Ticker' at the beginning of the DataFrame to identify the company
    
    financials = pd.concat([financials, financials_tkr], axis=0)

print(f'financials DataFrame shape: {financials.shape}')

financials DataFrame shape: (3118, 333)


In [6]:
#We merge the DataFrames 'financials' and 'data' to add the 'close_stock_price' info to the 'financials' data
#First we get sure that data is in the correct format
financials['Date'] = pd.to_datetime(financials['Date'])
data['Date'] = pd.to_datetime(data['Date'])
financials['Ticker'] = financials['Ticker'].astype(str)
data['Ticker'] = data['Ticker'].astype(str)

#We merge both dataframes
financials = financials.merge(
    data,  # columnas que quieres traer
    on=['Ticker', 'Date'],  # columnas clave
    how='left'              # left join para mantener todas las filas de financials
)

# We rename the index with ticker and date info so each observation is easier to track in the future
financials = financials.set_index(
    financials['Ticker'] + "_" + financials['Date'].dt.strftime('%Y-%m-%d')
)

# We save the data as a pickle in the folder 'data'

In [7]:
#We save the 'financials' DataFrame as a pickle with the name raw_financials.pkl in the 'data' folder
#We save the data folder path or create it if it doesn't exist
project_path = Path('..')          
data_folder = project_path / 'data' 
data_folder.mkdir(exist_ok=True) 

#We save the DataFrame in pikle format
file_path = data_folder / 'raw_financials.pkl'
financials.to_pickle(file_path)

print(f"DataFrame saved successfully in {file_path}")


DataFrame saved successfully in ../data/raw_financials.pkl
