# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from sklearn.preprocessing import StandardScaler

from tqdm import tqdm
import yfinance as yf
import lxml    


# Define Function

In [2]:
def calculate_annualized_returns(ticker_symbol, df_returns, period="5y"):
    """Calculate annualized returns for a given ticker over a specified period.
    Args:
        ticker (yfinance.Ticker): The ticker object for the stock.
        period (str): The period over which to calculate returns (default is "5y").
    Returns:
        pd.Series: A series of annualized returns.
    """

    try:
        ticker = yf.Ticker(ticker_symbol)

        # Get historical market data
        hist = ticker.history(period=period).reset_index()
        hist['Date'] = pd.to_datetime(hist['Date'])
        hist['Year'] = hist['Date'].dt.year
        current_year = hist['Year'].max()    

        
        # average annualized return
        annualized_return = ((hist['Close'].iloc[-1] / hist['Close'].iloc[0]) ** (1 / (current_year - hist['Year'].min())) - 1) * 100
        df_returns.loc[df_returns['Ticker'] == ticker_symbol, 'Annualized_Return'] = np.round(annualized_return, 2)

        # Calculate Percent Returns
        for year in hist['Year'].sort_values(ascending=False).unique():
            if year == current_year:
                ytd_return = ((hist[hist['Year'] == year]['Close'].iloc[-1] - hist[hist['Year'] == year]['Close'].iloc[0]) / hist[hist['Year'] == year]['Close'].iloc[0]) * 100
                df_returns.loc[df_returns['Ticker'] == ticker_symbol, 'YTD_Pct_Return'] = np.round(ytd_return, 2)
            elif year < current_year:
                annual_return = ((hist[hist['Year'] == year]['Close'].iloc[-1] - hist[hist['Year'] == year]['Close'].iloc[0]) / hist[hist['Year'] == year]['Close'].iloc[0]) * 100
                df_returns.loc[df_returns['Ticker'] == ticker_symbol, f'{year}_Pct_Return'] = np.round(annual_return, 2)
        
        # Get market cap
        df_returns.loc[df_returns['Ticker'] == ticker_symbol, 'Market_Cap'] = ticker.info.get('marketCap', np.nan)
        
        # Calculate 200 Day Moving Average & Pct Difference from it
        hist['200_MA'] = hist['Close'].rolling(window=200).mean()
        hist['Pct_Diff_200_MA'] = ((hist['Close'] - hist['200_MA']) / hist['200_MA']) * 100
        df_returns.loc[df_returns['Ticker'] == ticker_symbol, 'Pct_Diff_200_MA'] = np.round(hist['Pct_Diff_200_MA'].iloc[-1], 2)
        
        # Calculate Volatility
        hist['Daily_Return'] = hist['Close'].pct_change()
        mean_daily_return = hist['Daily_Return'].mean()
        volatility = (((hist['Daily_Return'] - mean_daily_return) ** 2).mean() ** 0.5) * np.sqrt(252)  # Annualize the volatility
        df_returns.loc[df_returns['Ticker'] == ticker_symbol, 'Annualized_Volatility'] = np.round(volatility, 2)

        # Calculate Sharpe Ratio
        risk_free_rate = 0.01  # Assuming a risk-free rate of 1%
        sharpe_ratio = (annualized_return - risk_free_rate) / volatility
        df_returns.loc[df_returns['Ticker'] == ticker_symbol, 'Sharpe_Ratio'] = np.round(sharpe_ratio, 2)

        # Calculate Beta
        benchmark_ticker = '^GSPC'  # S&P 500 as benchmark
        benchmark = yf.Ticker(benchmark_ticker)
        benchmark_hist = benchmark.history(period=period).reset_index()
        benchmark_hist['Date'] = pd.to_datetime(benchmark_hist['Date'])
        benchmark_hist['Daily_Return'] = benchmark_hist['Close'].pct_change()
        hist = hist.merge(benchmark_hist[['Date', 'Daily_Return']], on='Date', suffixes=('', '_Benchmark'))
        covariance = hist['Daily_Return'].cov(hist['Daily_Return_Benchmark'])
        benchmark_variance = hist['Daily_Return_Benchmark'].var()
        beta = covariance / benchmark_variance if benchmark_variance != 0 else np.nan
        df_returns.loc[df_returns['Ticker'] == ticker_symbol, 'Beta'] = np.round(beta, 2)

        # Years since founded
        df_returns.loc[df_returns['Ticker'] == ticker_symbol, 'Years_Since_Founded'] = current_year - int(df_returns[df_returns['Ticker'] == ticker_symbol]['Founded'].max()[:4])

        return df_returns
    
    except Exception as e:
        print(f"Error processing {ticker_symbol}: {e}")
        return df_returns

# Get List of Tickers

In [3]:
df_sp500 = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0].rename(columns={'Symbol': 'Ticker'})
df_sp500

Unnamed: 0,Ticker,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded
0,MMM,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902
1,AOS,A. O. Smith,Industrials,Building Products,"Milwaukee, Wisconsin",2017-07-26,91142,1916
2,ABT,Abbott Laboratories,Health Care,Health Care Equipment,"North Chicago, Illinois",1957-03-04,1800,1888
3,ABBV,AbbVie,Health Care,Biotechnology,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
4,ACN,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989
...,...,...,...,...,...,...,...,...
498,XYL,Xylem Inc.,Industrials,Industrial Machinery & Supplies & Components,"White Plains, New York",2011-11-01,1524472,2011
499,YUM,Yum! Brands,Consumer Discretionary,Restaurants,"Louisville, Kentucky",1997-10-06,1041061,1997
500,ZBRA,Zebra Technologies,Information Technology,Electronic Equipment & Instruments,"Lincolnshire, Illinois",2019-12-23,877212,1969
501,ZBH,Zimmer Biomet,Health Care,Health Care Equipment,"Warsaw, Indiana",2001-08-07,1136869,1927


# Create Dataset with Features

In [4]:
for tickers in tqdm(df_sp500['Ticker'].unique().tolist()):
    df_sp500 = calculate_annualized_returns(tickers, df_sp500, period="5y")

df_sp500 = df_sp500.sort_values('Market_Cap', ascending=False).reset_index(drop=True)
df_sp500

 12%|█▏        | 60/503 [00:28<03:50,  1.93it/s]$BRK.B: possibly delisted; no price data found  (period=5y) (Yahoo error = "No data found, symbol may be delisted")
 12%|█▏        | 61/503 [00:28<04:16,  1.72it/s]

Error processing BRK.B: single positional indexer is out-of-bounds


 15%|█▍        | 74/503 [00:35<03:42,  1.93it/s]$BF.B: possibly delisted; no price data found  (period=5y)


Error processing BF.B: single positional indexer is out-of-bounds


100%|██████████| 503/503 [04:15<00:00,  1.97it/s]


Unnamed: 0,Ticker,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded,Annualized_Return,YTD_Pct_Return,...,2023_Pct_Return,2022_Pct_Return,2021_Pct_Return,2020_Pct_Return,Market_Cap,Pct_Diff_200_MA,Annualized_Volatility,Sharpe_Ratio,Beta,Years_Since_Founded
0,MSFT,Microsoft,Information Technology,Systems Software,"Redmond, Washington",1994-06-01,789019,1975,21.32,10.40,...,58.35,-27.69,55.79,22.27,3.421644e+12,10.81,0.27,78.51,1.18,50.0
1,NVDA,Nvidia,Information Technology,Semiconductors,"Santa Clara, California",2001-11-30,1045810,1993,72.79,-2.29,...,246.10,-51.44,124.48,48.40,3.295497e+12,6.61,0.53,137.30,2.07,32.0
2,AAPL,Apple Inc.,Information Technology,"Technology Hardware, Storage & Peripherals","Cupertino, California",1982-11-30,320193,1977,20.76,-17.44,...,54.80,-28.20,38.06,65.49,2.999856e+12,-10.79,0.30,69.06,1.27,48.0
3,AMZN,Amazon,Consumer Discretionary,Broadline Retail,"Seattle, Washington",2005-11-18,1018724,1994,10.66,-6.91,...,77.04,-50.71,4.64,31.80,2.176468e+12,1.80,0.36,29.80,1.43,31.0
4,GOOG,Alphabet Inc. (Class C),Communication Services,Interactive Media & Services,"Mountain View, California",2006-04-03,1652044,1998,19.39,-9.22,...,57.11,-38.84,67.43,22.35,2.090085e+12,0.16,0.31,62.44,1.24,27.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
498,APA,APA Corporation,Energy,Oil & Gas Exploration & Production,"Houston, Texas",1997-07-28,1841666,1954,10.16,-25.26,...,-15.71,69.08,83.27,21.51,6.137973e+09,-20.98,0.55,18.30,1.39,71.0
499,CZR,Caesars Entertainment,Consumer Discretionary,Casinos & Gaming,"Reno, Nevada",2021-03-22,1590895,1973,-5.72,-17.52,...,10.93,-55.49,30.63,105.79,5.590180e+09,-22.33,0.56,-10.32,1.97,52.0
500,ENPH,Enphase Energy,Information Technology,Semiconductor Materials & Equipment,"Fremont, California",2021-01-07,1463101,2006,-7.06,-42.00,...,-47.83,43.65,6.21,194.02,5.430658e+09,-44.18,0.68,-10.33,1.48,19.0
501,BRK.B,Berkshire Hathaway,Financials,Multi-Sector Holdings,"Omaha, Nebraska",2010-02-16,1067983,1839,,,...,,,,,,,,,,


In [113]:
# save datagrame to csv
df_sp500.to_csv('/Users/ani/Projects/6_stock_portfolio_recommendation/data/sp500_data_sample.csv', index=False)

# Content-Based Recommendation System

In [111]:
# calculate cosine similarity and recommend similar stocks
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

def recommend_similar_stocks(df, min_return, ticker, founded_years, top_n=5):
    """Recommend similar stocks based on cosine similarity of financial metrics.
    
    Args:
        df (pd.DataFrame): DataFrame containing stock data.
        ticker (str): The ticker symbol of the stock to find recommendations for.
        top_n (int): Number of similar stocks to recommend.
        
    Returns:
        pd.DataFrame: DataFrame containing recommended stocks.
    """
    # Select relevant columns for similarity calculation
    features = ['Annualized_Return', 'YTD_Pct_Return', 'Pct_Diff_200_MA', 
                'Annualized_Volatility', 'Sharpe_Ratio', 'Beta', 'Market_Cap']
    
    df_selected_stock = df.copy()
    df_selected_stock['Scores'] = np.nan  # Initialize Scores column for the selected stock
    df_selected_stock = df_selected_stock[df_selected_stock['Ticker'] == ticker][['Ticker', 'Security', 'Scores', 'Annualized_Return', 'YTD_Pct_Return', 
                                                'Pct_Diff_200_MA', 'Annualized_Volatility', 
                                                'Sharpe_Ratio', 'Beta', 'Market_Cap', 'Years_Since_Founded']]

    # Filter out rows with NaN values in the features
    df_filtered = df.dropna(subset=features)
    
    # Scale the features
    scaler = MinMaxScaler()
    scaled_features = scaler.fit_transform(df_filtered[features])
    
    # Calculate cosine similarity
    similarity_matrix = cosine_similarity(scaled_features)
    
    # Get index of the input ticker
    try:
        ticker_index = df_filtered[df_filtered['Ticker'] == ticker].index[0]
        
        # Get similarity scores for the input ticker
        similarity_scores = list(enumerate(similarity_matrix[ticker_index]))
        
        # Sort by similarity score and get top N recommendations
        sorted_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
        
        # Get recommended tickers
        recommended_indices = [i[0] for i in sorted_scores]
        
        # add scores to the DataFrame
        df_filtered = df_filtered.reset_index(drop=True)
        df_filtered['Ticker'] = df_filtered['Ticker'].astype(str)
        df_filtered['Scores'] = np.nan  # Initialize Scores column
        df_filtered.loc[recommended_indices, 'Scores'] = [i[1] for i in sorted_scores]
        df_filtered['Scores'] = df_filtered['Scores'].astype(float)
        
        # Return the recommended stocks with relevant columns and the selected ticker and greater than min_return      
        df_filtered = df_filtered.iloc[recommended_indices][['Ticker', 'Security', 'Scores', 'Annualized_Return', 'YTD_Pct_Return', 
                                                    'Pct_Diff_200_MA', 'Annualized_Volatility', 
                                                    'Sharpe_Ratio', 'Beta', 'Market_Cap', 'Years_Since_Founded']]
        df_recommended_stocks = df_filtered[(df_filtered['Annualized_Return'] >= min_return) & (df_filtered['Ticker'] != ticker) & (df_filtered['Years_Since_Founded'] >= founded_years)].sort_values(by='Scores', ascending=False).head(top_n)
        
        return pd.concat([df_selected_stock, df_recommended_stocks], ignore_index=True)

    except IndexError:
        print(f"Ticker {ticker} not found in the DataFrame.")

In [112]:
# Example usage
ticker_to_recommend = 'NFLX'  # Example ticker
recommended_stocks = recommend_similar_stocks(df_sp500, 50, ticker_to_recommend, 1, top_n=5)
recommended_stocks

Unnamed: 0,Ticker,Security,Scores,Annualized_Return,YTD_Pct_Return,Pct_Diff_200_MA,Annualized_Volatility,Sharpe_Ratio,Beta,Market_Cap,Years_Since_Founded
0,NFLX,Netflix,,23.17,36.14,35.35,0.45,51.96,1.27,513762100000.0,28.0
1,PLTR,Palantir Technologies,0.990461,69.21,75.26,79.29,0.73,94.57,1.97,310988900000.0,22.0
2,AXON,Axon Enterprise,0.978803,52.91,25.8,37.46,0.47,112.89,1.17,58416120000.0,32.0
3,HWM,Howmet Aerospace,0.961744,66.58,53.56,42.08,0.37,181.4,1.24,68580170000.0,137.0
4,VST,Vistra Corp.,0.957373,54.06,7.48,21.56,0.47,115.87,1.16,54486540000.0,9.0
5,AVGO,Broadcom,0.936601,56.82,4.67,26.54,0.41,139.66,1.54,1138201000000.0,64.0
