# Import Libraries

In [2]:

import yfinance as yf
import pandas as pd
import numpy as np
from typing import Dict, Optional
from tqdm import tqdm

# Define Functions

In [3]:
def get_tickers(url):
    """Fetches base data from a given URL and returns it as a DataFrame.
    Args:
        url (str): The URL to fetch the data from.
    Returns:
        pd.DataFrame: A DataFrame containing the fetched data.
    """
    
    df = pd.read_html(url)[0]
    df = df.rename(columns={'Symbol': 'Ticker', 'Security': 'Company_Name', 'GICS Sector': 'Sector', 'GICS Sub-Industry': 'Industry', 'Founded': 'Founded_Year'}).drop(['Date added', 'CIK'], axis=1)
    df.columns = df.columns.str.replace(' ', '_').str.replace('/', '_').str.replace('-', '_')
    df['Ticker'] = df['Ticker'].str.upper()
    
    print(f"Fetched {len(df)} rows from {url}")
    return df['Ticker'].unique().tolist()

In [4]:
def get_stock_data(ticker_symbol: str, period: str = "5y", benchmark_data: Optional[pd.DataFrame] = None) -> Dict:
    """
    Calculate comprehensive stock metrics for a given ticker.
    
    Args:
        ticker_symbol (str): Stock ticker symbol (e.g., 'AAPL')
        period (str): Historical data period (default: '5y')
        benchmark_data (pd.DataFrame): Pre-loaded S&P 500 data for beta calculation
        
    Returns:
        Dict: Dictionary containing all calculated metrics for the ticker
    """
    try:
        ticker = yf.Ticker(ticker_symbol)
        
        # Get historical data
        hist = ticker.history(period=period)
        if hist.empty:
            raise ValueError(f"No data found for {ticker_symbol}")
        
        hist = hist.reset_index()
        hist['Date'] = pd.to_datetime(hist['Date'])
        hist['Year'] = hist['Date'].dt.year
        
        # Get ticker info
        info = ticker.info
        current_year = hist['Year'].max()
        
        # Initialize results dictionary
        stock_data = {'Ticker': ticker_symbol}
        
        # Price-based metrics
        stock_data.update(_calculate_price_metrics(hist))
        
        # Return calculations
        stock_data.update(_calculate_returns(hist, current_year))
        
        # Risk metrics (pass benchmark_data to avoid reloading)
        stock_data.update(_calculate_risk_metrics(hist, benchmark_data))
        
        # Market data
        stock_data.update(_get_market_data(info))
        
        return stock_data
        
    except Exception as e:
        print(f"Error processing {ticker_symbol}: {e}")

def _calculate_price_metrics(hist: pd.DataFrame) -> Dict:
    """Calculate price-related metrics."""
    closing_price = hist['Close'].iloc[-1]
    all_time_high = hist['Close'].max()
    
    # Calculate moving average
    hist['200_Day_Moving_Average'] = hist['Close'].rolling(window=200).mean()
    
    return {
        'Closing_Price': round(closing_price, 2),
        'All_Time_High': round(all_time_high, 2),
        'Percent_From_All_Time_High': round(((closing_price - all_time_high) / all_time_high) * 100, 2),
        'Percent_Difference_200_Day_Moving_Average': round(((closing_price - hist['200_Day_Moving_Average'].iloc[-1]) / hist['200_Day_Moving_Average'].iloc[-1]) * 100, 2),
        '24_Hour_Percent_Change': round(hist['Close'].pct_change(periods=1).iloc[-1] * 100, 2),
        '7_Day_Percent_Change': round(hist['Close'].pct_change(periods=7).iloc[-1] * 100, 2),
        '30_Day_Percent_Change': round(hist['Close'].pct_change(periods=30).iloc[-1] * 100, 2)
    }


def _calculate_returns(hist: pd.DataFrame, current_year: int) -> Dict:
    """Calculate return metrics."""
    returns_data = {}
    
    # Annualized return
    total_years = len(hist['Year'].unique())
    if total_years > 1:
        annualized_return = ((hist['Close'].iloc[-1] / hist['Close'].iloc[0]) ** (1 / total_years) - 1) * 100
        returns_data['Annualized_Return'] = round(annualized_return, 2)
    
    # Yearly returns
    for year in sorted(hist['Year'].unique(), reverse=True):
        year_data = hist[hist['Year'] == year]
        if len(year_data) > 1:
            year_return = ((year_data['Close'].iloc[-1] - year_data['Close'].iloc[0]) / year_data['Close'].iloc[0]) * 100
            
            if year == current_year:
                returns_data['YTD_Return'] = round(year_return, 2)
            else:
                returns_data[f'{year}_Return'] = round(year_return, 2)
    
    return returns_data


def _calculate_risk_metrics(hist: pd.DataFrame, benchmark_data: Optional[pd.DataFrame] = None) -> Dict:
    """Calculate risk-related metrics."""
    # Daily returns
    hist['Daily_Return'] = hist['Close'].pct_change().dropna()
    
    # Volatility (annualized)
    volatility = hist['Daily_Return'].std() * np.sqrt(252)
    
    # Sharpe ratio (assuming 1% risk-free rate)
    risk_free_rate = 0.01
    if len(hist) > 1:
        total_return = (hist['Close'].iloc[-1] / hist['Close'].iloc[0]) - 1
        annualized_return = (1 + total_return) ** (252 / len(hist)) - 1
        excess_return = annualized_return - risk_free_rate
        sharpe_ratio = excess_return / volatility if volatility > 0 else 0
    else:
        sharpe_ratio = 0
    
    # Beta calculation (use pre-loaded benchmark data if available)
    beta = _calculate_beta(hist, benchmark_data)
    
    return {
        'Annualized_Volatility': round(volatility * 100, 2),
        'Sharpe_Ratio': round(sharpe_ratio, 2),
        'Beta': round(beta, 2) if not np.isnan(beta) else None
    }


def _calculate_beta(hist: pd.DataFrame, benchmark_data: Optional[pd.DataFrame] = None) -> float:
    """Calculate beta against S&P 500 using pre-loaded benchmark data."""
    try:
        if benchmark_data is None:
            return np.nan
            
        # Merge on date
        merged = hist.merge(
            benchmark_data[['Date', 'Daily_Return']], 
            on='Date', 
            suffixes=('', '_Benchmark'),
            how='inner'
        )
        
        if len(merged) < 30:  # Need sufficient data points
            return np.nan
        
        # Calculate beta using numpy for speed
        stock_returns = merged['Daily_Return'].dropna()
        benchmark_returns = merged['Daily_Return_Benchmark'].dropna()
        
        if len(stock_returns) == len(benchmark_returns) and len(stock_returns) > 0:
            covariance = np.cov(stock_returns, benchmark_returns)[0, 1]
            benchmark_variance = np.var(benchmark_returns)
            return covariance / benchmark_variance if benchmark_variance != 0 else np.nan
        
        return np.nan
        
    except Exception:
        return np.nan


def _load_benchmark_data(period: str = "5y") -> pd.DataFrame:
    """Load S&P 500 benchmark data once for all calculations."""
    try:
        print("Loading S&P 500 benchmark data...")
        benchmark = yf.Ticker('^GSPC')
        benchmark_hist = benchmark.history(period=period).reset_index()
        benchmark_hist['Date'] = pd.to_datetime(benchmark_hist['Date'])
        benchmark_hist['Daily_Return'] = benchmark_hist['Close'].pct_change()
        return benchmark_hist[['Date', 'Daily_Return']].dropna()
    except Exception as e:
        print(f"Failed to load benchmark data: {e}")
        return pd.DataFrame()


def _get_market_data(info: Dict) -> Dict:
    """Extract market data from ticker info."""
    return {
        'Company_Name': info.get('shortName', '').replace('\n', ' '),
        'Market_Cap': info.get('marketCap'),
        'Sector': info.get('sector'),
        'Industry': info.get('industry'),
        'Country': info.get('country'),
        'Business_Summary': info.get('longBusinessSummary', '').replace('\n', ' '),
        'Dividend_Yield': info.get('dividendYield'),
        'Trailing_PE': info.get('trailingPE'),
        'Forward_PE': info.get('forwardPE'),
        'Average_Volume': info.get('averageVolume'),
        'Average_Volume_10days': info.get('averageVolume10days'),
        '52_Week_Change': info.get('52WeekChange')
    }


def get_multiple_stocks_data(tickers: list, period: str = "5y") -> pd.DataFrame:
    """
    Get stock data for multiple tickers and return as DataFrame.
    
    Args:
        tickers (list): List of ticker symbols
        period (str): Historical data period
        
    Returns:
        pd.DataFrame: DataFrame with stock data for all tickers
    """
    # Pre-load benchmark data once for all stocks
    benchmark_data = _load_benchmark_data(period)
    
    all_data = []

    for ticker in (tickers):
        try:
            stock_data = get_stock_data(ticker)  # your existing function
            if stock_data is not None:  # Only append if data was successfully retrieved
                tqdm.write(f'Processing {ticker}')
                all_data.append(stock_data)
            else:
                tqdm.write(f"Warning: No data retrieved for {ticker}")
        except Exception as e:
            tqdm.write(f"Error processing {ticker}: {str(e)}")
            continue
            
    df = pd.DataFrame(all_data)

    # Clean up Missing Values
    df['Dividend_Yield'] = df['Dividend_Yield'].fillna(0)
    df['Sector'] = df['Sector'].fillna('Unknown')
    df['Industry'] = df['Industry'].fillna('Unknown') 
    df['Country'] = df['Country'].fillna('Unknown')
    df['Business_Summary'] = df['Business_Summary'].fillna('No description available')
    df = df.fillna(0)

    return df.sort_values('Market_Cap', ascending=False, na_position='last')

# Get List of Tickers

In [5]:
stock_tickers = get_tickers('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
len(stock_tickers)

Fetched 503 rows from https://en.wikipedia.org/wiki/List_of_S%26P_500_companies


503

# Create Dataset with Features

In [6]:
df_enriched_stock_data = get_multiple_stocks_data(stock_tickers)
df_enriched_stock_data

Loading S&P 500 benchmark data...
Processing MMM
Processing AOS
Processing ABT
Processing ABBV
Processing ACN
Processing ADBE
Processing AMD
Processing AES
Processing AFL
Processing A


  df = df.fillna(0)


Unnamed: 0,Ticker,Closing_Price,All_Time_High,Percent_From_All_Time_High,Percent_Difference_200_Day_Moving_Average,24_Hour_Percent_Change,7_Day_Percent_Change,30_Day_Percent_Change,Annualized_Return,YTD_Return,...,Sector,Industry,Country,Business_Summary,Dividend_Yield,Trailing_PE,Forward_PE,Average_Volume,Average_Volume_10days,52_Week_Change
3,ABBV,186.79,214.68,-12.99,0.3,0.76,-2.13,-0.64,15.52,6.05,...,Healthcare,Drug Manufacturers - General,United States,"AbbVie Inc., a research-based biopharmaceutica...",3.54,79.82478,15.39901,7459630,5302650,0.097047
6,AMD,143.68,211.38,-32.03,15.8,0.2,13.68,27.76,19.12,19.11,...,Technology,Semiconductors,United States,"Advanced Micro Devices, Inc. operates as a sem...",0.0,104.11594,28.172548,46025614,65195480,-0.100771
2,ABT,133.67,139.57,-4.23,8.55,-2.71,-0.25,3.58,8.6,19.0,...,Healthcare,Medical Devices,United States,"Abbott Laboratories, together with its subsidi...",1.72,17.382315,25.905039,6768517,5978950,0.312948
4,ACN,296.08,396.28,-25.29,-12.41,0.5,-5.81,-8.2,6.98,-14.34,...,Technology,Information Technology Services,Ireland,Accenture plc provides strategy and consulting...,2.01,23.53577,21.043354,3400690,4247030,-0.028332
5,ADBE,384.95,688.37,-44.08,-13.59,-0.67,-4.18,-3.13,-1.61,-12.71,...,Technology,Software - Application,United States,"Adobe Inc., together with its subsidiaries, op...",0.0,24.644688,18.73236,3795824,5928380,-0.291188
0,MMM,150.76,154.36,-2.33,8.9,2.04,3.92,0.5,5.84,17.38,...,Industrials,Conglomerates,United States,3M Company provides diversified technology ser...,1.98,18.798004,19.083544,3586287,2702470,0.432102
8,AFL,103.91,113.48,-8.43,-1.97,1.4,1.16,-0.52,22.05,2.65,...,Financial Services,Insurance - Life,United States,"Aflac Incorporated, through its subsidiaries, ...",2.26,16.160187,14.49233,2165920,2534880,0.149394
9,A,119.42,174.78,-31.68,-6.97,0.67,0.34,3.47,5.92,-10.31,...,Healthcare,Diagnostics & Research,United States,"Agilent Technologies, Inc. provides applicatio...",0.84,29.413794,21.136282,2265864,1843370,-0.093119
1,AOS,64.67,90.3,-28.39,-7.95,-0.57,0.54,-7.77,7.26,-2.97,...,Industrials,Specialty Industrial Machinery,United States,A. O. Smith Corporation manufactures and marke...,2.09,18.064245,15.889434,1717848,1568580,-0.207023
7,AES,10.81,26.33,-58.94,-14.36,2.95,-5.67,-13.86,-1.78,-14.34,...,Utilities,Utilities - Diversified,United States,"The AES Corporation, together with its subsidi...",6.64,5.875,5.325123,16131209,16375620,-0.449974


In [7]:
df_enriched_stock_data.columns

Index(['Ticker', 'Closing_Price', 'All_Time_High',
       'Percent_From_All_Time_High',
       'Percent_Difference_200_Day_Moving_Average', '24_Hour_Percent_Change',
       '7_Day_Percent_Change', '30_Day_Percent_Change', 'Annualized_Return',
       'YTD_Return', '2024_Return', '2023_Return', '2022_Return',
       '2021_Return', '2020_Return', 'Annualized_Volatility', 'Sharpe_Ratio',
       'Beta', 'Company_Name', 'Market_Cap', 'Sector', 'Industry', 'Country',
       'Business_Summary', 'Dividend_Yield', 'Trailing_PE', 'Forward_PE',
       'Average_Volume', 'Average_Volume_10days', '52_Week_Change'],
      dtype='object')

In [20]:
def create_summary_chart(df):
    df_result = df.copy()
    
    # Create the 'Risk' column based on the specified conditions
    def determine_Risk_signal(row):
        all_time_high_condition = row['Percent_From_All_Time_High'] < -10
        moving_avg_condition = row['Percent_Difference_200_Day_Moving_Average'] < 0
        
        # If Percent_From_All_Time_High is greater than -10%, then 'High'
        if row['Percent_From_All_Time_High'] >= -10:
            return 'High'
        
        # If both conditions are met (< -10% and negative moving avg), then 'Low'
        if all_time_high_condition and moving_avg_condition:
            return 'Low'
        
        # If only one condition is met, then 'Medium'
        if all_time_high_condition or moving_avg_condition:
            return 'Medium'
        
        # Default case (shouldn't reach here based on logic, but safety)
        return 'High'
    
    # Apply the Risk signal logic
    df_result['Risk'] = df_result.apply(determine_Risk_signal, axis=1)
    
    # Get top 25 by Market Cap first
    df_top_25 = df_result.sort_values('Market_Cap', ascending=False).head(25)
    
    # Create categorical ordering for Risk column
    Risk_order = ['Low', 'Medium', 'High']
    df_top_25['Risk'] = pd.Categorical(df_top_25['Risk'], categories=Risk_order, ordered=True)
    
    # Sort by Risk column (Low, Medium, High), then by Market Cap descending
    df_sorted = df_top_25.sort_values(['Risk', 'Market_Cap'], ascending=[True, False])
    
    # Return the sorted dataframe with selected columns
    return df_sorted[
        ['Ticker', 'Company_Name', 'Market_Cap', 'Closing_Price', 'All_Time_High', 'Risk',
         'Percent_From_All_Time_High', 'Percent_Difference_200_Day_Moving_Average', 
         '24_Hour_Percent_Change', '7_Day_Percent_Change', '30_Day_Percent_Change']
    ].reset_index(drop=True)

In [21]:
create_summary_chart(df_enriched_stock_data)

Unnamed: 0,Ticker,Company_Name,Market_Cap,Closing_Price,All_Time_High,Risk,Percent_From_All_Time_High,Percent_Difference_200_Day_Moving_Average,24_Hour_Percent_Change,7_Day_Percent_Change,30_Day_Percent_Change
0,ACN,Accenture plc,184414306304,296.08,396.28,Low,-25.29,-12.41,0.5,-5.81,-8.2
1,ADBE,Adobe Inc.,164065689600,384.95,688.37,Low,-44.08,-13.59,-0.67,-4.18,-3.13
2,A,"Agilent Technologies, Inc.",33923041280,119.42,174.78,Low,-31.68,-6.97,0.67,0.34,3.47
3,AOS,A.O. Smith Corporation,9190188032,64.67,90.3,Low,-28.39,-7.95,-0.57,0.54,-7.77
4,AES,The AES Corporation,7695887872,10.81,26.33,Low,-58.94,-14.36,2.95,-5.67,-13.86
5,ABBV,AbbVie Inc.,329945841664,186.79,214.68,Medium,-12.99,0.3,0.76,-2.13,-0.64
6,AMD,"Advanced Micro Devices, Inc.",232962752512,143.68,211.38,Medium,-32.03,15.8,0.2,13.68,27.76
7,ABT,Abbott Laboratories,232564408320,133.67,139.57,High,-4.23,8.55,-2.71,-0.25,3.58
8,MMM,3M Company,81136164864,150.76,154.36,High,-2.33,8.9,2.04,3.92,0.5
9,AFL,AFLAC Incorporated,56178421760,103.91,113.48,High,-8.43,-1.97,1.4,1.16,-0.52


# Save to CSV

In [None]:
df_enriched_stock_data.isna().sum()

In [None]:
assert df_enriched_stock_data.isna().sum().sum() == 0, "DataFrame contains NaN values"

In [None]:
df_enriched_stock_data.to_csv('/Users/ani/Projects/6_stock_portfolio_recommendation/data/enriched_stock_data.csv', index=False)
print("Enriched stock data saved to 'data/enriched_stock_data.csv'")