# Import Libraries

In [None]:

import yfinance as yf
import pandas as pd
import numpy as np
from typing import Dict, Optional
from tqdm import tqdm

# Define Functions

In [2]:
def get_tickers(url):
    """Fetches base data from a given URL and returns it as a DataFrame.
    Args:
        url (str): The URL to fetch the data from.
    Returns:
        pd.DataFrame: A DataFrame containing the fetched data.
    """
    
    df = pd.read_html(url)[0]
    df = df.rename(columns={'Symbol': 'Ticker', 'Security': 'Company_Name', 'GICS Sector': 'Sector', 'GICS Sub-Industry': 'Industry', 'Founded': 'Founded_Year'}).drop(['Date added', 'CIK'], axis=1)
    df.columns = df.columns.str.replace(' ', '_').str.replace('/', '_').str.replace('-', '_')
    df['Ticker'] = df['Ticker'].str.upper()
    
    print(f"Fetched {len(df)} rows from {url}")
    return df['Ticker'].unique().tolist()

In [None]:


def get_stock_data(ticker_symbol: str, period: str = "5y", benchmark_data: Optional[pd.DataFrame] = None) -> Dict:
    """
    Calculate comprehensive stock metrics for a given ticker.
    
    Args:
        ticker_symbol (str): Stock ticker symbol (e.g., 'AAPL')
        period (str): Historical data period (default: '5y')
        benchmark_data (pd.DataFrame): Pre-loaded S&P 500 data for beta calculation
        
    Returns:
        Dict: Dictionary containing all calculated metrics for the ticker
    """
    try:
        ticker = yf.Ticker(ticker_symbol)
        
        # Get historical data
        hist = ticker.history(period=period)
        if hist.empty:
            raise ValueError(f"No data found for {ticker_symbol}")
        
        hist = hist.reset_index()
        hist['Date'] = pd.to_datetime(hist['Date'])
        hist['Year'] = hist['Date'].dt.year
        
        # Get ticker info
        info = ticker.info
        current_year = hist['Year'].max()
        
        # Initialize results dictionary
        stock_data = {'Ticker': ticker_symbol}
        
        # Price-based metrics
        stock_data.update(_calculate_price_metrics(hist))
        
        # Return calculations
        stock_data.update(_calculate_returns(hist, current_year))
        
        # Risk metrics (pass benchmark_data to avoid reloading)
        stock_data.update(_calculate_risk_metrics(hist, benchmark_data))
        
        # Market data
        stock_data.update(_get_market_data(info))
        
        return stock_data
        
    except Exception as e:
        print(f"Error processing {ticker_symbol}: {e}")

def _calculate_price_metrics(hist: pd.DataFrame) -> Dict:
    """Calculate price-related metrics."""
    closing_price = hist['Close'].iloc[-1]
    all_time_high = hist['Close'].max()
    
    # Calculate moving average
    hist['200_Day_Moving_Average'] = hist['Close'].rolling(window=200).mean()
    
    return {
        'Closing_Price': round(closing_price, 2),
        'All_Time_High': round(all_time_high, 2),
        'Percent_From_All_Time_High': round(((closing_price - all_time_high) / all_time_high) * 100, 2),
        'Percent_Difference_200_Day_Moving_Average': round(((closing_price - hist['200_Day_Moving_Average'].iloc[-1]) / hist['200_Day_Moving_Average'].iloc[-1]) * 100, 2),
        '24_Hour_Percent_Change': round(hist['Close'].pct_change(periods=1).iloc[-1] * 100, 2),
        '7_Day_Percent_Change': round(hist['Close'].pct_change(periods=7).iloc[-1] * 100, 2),
        '30_Day_Percent_Change': round(hist['Close'].pct_change(periods=30).iloc[-1] * 100, 2)
    }


def _calculate_returns(hist: pd.DataFrame, current_year: int) -> Dict:
    """Calculate return metrics."""
    returns_data = {}
    
    # Annualized return
    total_years = len(hist['Year'].unique())
    if total_years > 1:
        annualized_return = ((hist['Close'].iloc[-1] / hist['Close'].iloc[0]) ** (1 / total_years) - 1) * 100
        returns_data['Annualized_Return'] = round(annualized_return, 2)
    
    # Yearly returns
    for year in sorted(hist['Year'].unique(), reverse=True):
        year_data = hist[hist['Year'] == year]
        if len(year_data) > 1:
            year_return = ((year_data['Close'].iloc[-1] - year_data['Close'].iloc[0]) / year_data['Close'].iloc[0]) * 100
            
            if year == current_year:
                returns_data['YTD_Return'] = round(year_return, 2)
            else:
                returns_data[f'{year}_Return'] = round(year_return, 2)
    
    return returns_data


def _calculate_risk_metrics(hist: pd.DataFrame, benchmark_data: Optional[pd.DataFrame] = None) -> Dict:
    """Calculate risk-related metrics."""
    # Daily returns
    hist['Daily_Return'] = hist['Close'].pct_change().dropna()
    
    # Volatility (annualized)
    volatility = hist['Daily_Return'].std() * np.sqrt(252)
    
    # Sharpe ratio (assuming 1% risk-free rate)
    risk_free_rate = 0.01
    if len(hist) > 1:
        total_return = (hist['Close'].iloc[-1] / hist['Close'].iloc[0]) - 1
        annualized_return = (1 + total_return) ** (252 / len(hist)) - 1
        excess_return = annualized_return - risk_free_rate
        sharpe_ratio = excess_return / volatility if volatility > 0 else 0
    else:
        sharpe_ratio = 0
    
    # Beta calculation (use pre-loaded benchmark data if available)
    beta = _calculate_beta(hist, benchmark_data)
    
    return {
        'Annualized_Volatility': round(volatility * 100, 2),
        'Sharpe_Ratio': round(sharpe_ratio, 2),
        'Beta': round(beta, 2) if not np.isnan(beta) else None
    }


def _calculate_beta(hist: pd.DataFrame, benchmark_data: Optional[pd.DataFrame] = None) -> float:
    """Calculate beta against S&P 500 using pre-loaded benchmark data."""
    try:
        if benchmark_data is None:
            return np.nan
            
        # Merge on date
        merged = hist.merge(
            benchmark_data[['Date', 'Daily_Return']], 
            on='Date', 
            suffixes=('', '_Benchmark'),
            how='inner'
        )
        
        if len(merged) < 30:  # Need sufficient data points
            return np.nan
        
        # Calculate beta using numpy for speed
        stock_returns = merged['Daily_Return'].dropna()
        benchmark_returns = merged['Daily_Return_Benchmark'].dropna()
        
        if len(stock_returns) == len(benchmark_returns) and len(stock_returns) > 0:
            covariance = np.cov(stock_returns, benchmark_returns)[0, 1]
            benchmark_variance = np.var(benchmark_returns)
            return covariance / benchmark_variance if benchmark_variance != 0 else np.nan
        
        return np.nan
        
    except Exception:
        return np.nan


def _load_benchmark_data(period: str = "5y") -> pd.DataFrame:
    """Load S&P 500 benchmark data once for all calculations."""
    try:
        print("Loading S&P 500 benchmark data...")
        benchmark = yf.Ticker('^GSPC')
        benchmark_hist = benchmark.history(period=period).reset_index()
        benchmark_hist['Date'] = pd.to_datetime(benchmark_hist['Date'])
        benchmark_hist['Daily_Return'] = benchmark_hist['Close'].pct_change()
        return benchmark_hist[['Date', 'Daily_Return']].dropna()
    except Exception as e:
        print(f"Failed to load benchmark data: {e}")
        return pd.DataFrame()


def _get_market_data(info: Dict) -> Dict:
    """Extract market data from ticker info."""
    return {
        'Company_Name': info.get('shortName', '').replace('\n', ' '),
        'Market_Cap': info.get('marketCap'),
        'Sector': info.get('sector'),
        'Industry': info.get('industry'),
        'Country': info.get('country'),
        'Business_Summary': info.get('longBusinessSummary', '').replace('\n', ' '),
        'Dividend_Yield': info.get('dividendYield'),
        'Trailing_PE': info.get('trailingPE'),
        'Forward_PE': info.get('forwardPE'),
        'Average_Volume': info.get('averageVolume'),
        'Average_Volume_10days': info.get('averageVolume10days'),
        '52_Week_Change': info.get('52WeekChange')
    }


def get_multiple_stocks_data(tickers: list, period: str = "5y") -> pd.DataFrame:
    """
    Get stock data for multiple tickers and return as DataFrame.
    
    Args:
        tickers (list): List of ticker symbols
        period (str): Historical data period
        
    Returns:
        pd.DataFrame: DataFrame with stock data for all tickers
    """
    # Pre-load benchmark data once for all stocks
    benchmark_data = _load_benchmark_data(period)
    
    all_data = []

    for ticker in (tickers):
        try:
            stock_data = get_stock_data(ticker)  # your existing function
            if stock_data is not None:  # Only append if data was successfully retrieved
                all_data.append(stock_data)
            else:
                tqdm.write(f"Warning: No data retrieved for {ticker}")
        except Exception as e:
            tqdm.write(f"Error processing {ticker}: {str(e)}")
            continue
            
    df = pd.DataFrame(all_data)

    # Clean up Missing Values
    df['Dividend_Yield'] = df['Dividend_Yield'].fillna(0)
    df['Sector'] = df['Sector'].fillna('Unknown')
    df['Industry'] = df['Industry'].fillna('Unknown') 
    df['Country'] = df['Country'].fillna('Unknown')
    df['Business_Summary'] = df['Business_Summary'].fillna('No description available')
    df = df.fillna(0)

    return df.sort_values('Market_Cap', ascending=False, na_position='last')

# Get List of Tickers

In [4]:
stock_tickers = get_tickers('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
len(stock_tickers)

Fetched 503 rows from https://en.wikipedia.org/wiki/List_of_S%26P_500_companies


503

# Create Dataset with Features

In [5]:
df_enriched_stock_data = get_multiple_stocks_data(stock_tickers)
df_enriched_stock_data

Loading S&P 500 benchmark data...


HTTP Error 401: 
$BRK.B: possibly delisted; no price data found  (period=5y) (Yahoo error = "No data found, symbol may be delisted")


Error processing BRK.B: No data found for BRK.B


$BF.B: possibly delisted; no price data found  (period=5y)


Error processing BF.B: No data found for BF.B


HTTP Error 401: 
HTTP Error 401: 
HTTP Error 401: 
HTTP Error 401: 
HTTP Error 401: 
HTTP Error 401: 
HTTP Error 401: 
HTTP Error 401: 
HTTP Error 401: 
HTTP Error 401: 
HTTP Error 401: 
HTTP Error 401: 
HTTP Error 401: 
HTTP Error 401: 
HTTP Error 401: 
HTTP Error 401: 
HTTP Error 401: 
HTTP Error 401: 
HTTP Error 401: 
HTTP Error 401: 
HTTP Error 401: 
HTTP Error 401: 
HTTP Error 401: 
HTTP Error 401: 
HTTP Error 401: 
HTTP Error 401: 


Unnamed: 0,Ticker,Closing_Price,All_Time_High,Percent_From_All_Time_High,Percent_Difference_200_Day_Moving_Average,24_Hour_Percent_Change,7_Day_Percent_Change,30_Day_Percent_Change,Annualized_Return,YTD_Return,...,Sector,Industry,Country,Business_Summary,Dividend_Yield,Trailing_PE,Forward_PE,Average_Volume,Average_Volume_10days,52_Week_Change
315,MSFT,487.27,487.27,0.00,16.05,2.07,3.10,11.41,16.65,16.86,...,Technology,Software - Infrastructure,United States,Microsoft Corporation develops and supports so...,0.70,37.656105,32.593310,22828664.0,18533110.0,0.066411
345,NVDA,144.28,149.41,-3.43,12.52,0.30,1.02,22.94,57.59,4.33,...,Technology,Semiconductors,United States,"NVIDIA Corporation, a computing infrastructure...",0.03,46.392320,35.019444,251213522.0,173200140.0,0.217932
39,AAPL,201.95,258.40,-21.85,-9.56,0.47,1.59,2.39,14.62,-16.98,...,Technology,Consumer Electronics,United States,"Apple Inc. designs, manufactures, and markets ...",0.52,31.495320,24.294222,61159585.0,55367100.0,-0.034304
22,AMZN,209.62,242.06,-13.40,2.70,-0.03,-1.68,9.13,7.19,-4.81,...,Consumer Cyclical,Internet Retail,United States,"Amazon.com, Inc. engages in the retail sale of...",0.00,34.122150,34.066666,48931716.0,39057090.0,0.129978
19,GOOGL,163.90,205.89,-20.39,-4.40,-1.64,-7.58,6.37,14.49,-13.27,...,Communication Services,Internet Content & Information,United States,Alphabet Inc. offers various products and plat...,0.50,18.278538,18.298939,40432375.0,36300310.0,-0.070193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13,ALB,56.83,313.32,-81.86,-29.27,0.30,-12.11,-0.42,-3.84,-32.51,...,Basic Materials,Specialty Chemicals,United States,Albemarle Corporation provides energy storage ...,2.86,,30.885870,3520533.0,3122920.0,-0.412241
254,IVZ,14.88,24.58,-39.45,-7.66,0.48,-1.26,2.26,8.32,-13.58,...,Financial Services,Asset Management,United States,Invesco Ltd. is a publicly owned investment ma...,5.67,11.904560,7.873386,5743362.0,5694980.0,-0.032658
318,MHK,102.00,229.74,-55.60,-18.31,2.48,-2.59,-2.96,-0.30,-12.01,...,Consumer Cyclical,"Furnishings, Fixtures & Appliances",United States,"Mohawk Industries, Inc. designs, manufactures,...",0.00,13.333333,9.147983,800620.0,908320.0,-0.117407
78,CZR,28.26,119.49,-76.35,-16.54,0.39,1.29,1.55,-7.11,-13.29,...,Consumer Cyclical,Resorts & Casinos,United States,"Caesars Entertainment, Inc. operates as a gami...",0.00,,21.089552,5669772.0,5448770.0,-0.278020


# Save to CSV

In [10]:
df_enriched_stock_data.isna().sum()

Ticker                                       0
Closing_Price                                0
All_Time_High                                0
Percent_From_All_Time_High                   0
Percent_Difference_200_Day_Moving_Average    0
24_Hour_Percent_Change                       0
7_Day_Percent_Change                         0
30_Day_Percent_Change                        0
Annualized_Return                            0
YTD_Return                                   0
2024_Return                                  0
2023_Return                                  0
2022_Return                                  0
2021_Return                                  0
2020_Return                                  0
Annualized_Volatility                        0
Sharpe_Ratio                                 0
Beta                                         0
Company_Name                                 0
Market_Cap                                   0
Sector                                       0
Industry     

In [11]:
assert df_enriched_stock_data.isna().sum().sum() == 0, "DataFrame contains NaN values"

In [12]:
df_enriched_stock_data.to_csv('/Users/ani/Projects/6_stock_portfolio_recommendation/data/enriched_stock_data.csv', index=False)
print("Enriched stock data saved to 'data/enriched_stock_data.csv'")

Enriched stock data saved to 'data/enriched_stock_data.csv'
