In [31]:
#These are the libraries you can use.  You may add any libraries directly related to threading if this is a direction
#you wish to go (this is not from the course, so it's entirely on you if you wish to use threading).  Any
#further libraries you wish to use you must email me, james@uwaterloo.ca, for permission.

from IPython.display import display, Math, Latex

import pandas as pd
import numpy as np
import numpy_financial as npf
import yfinance as yf
import matplotlib.pyplot as plt
import random
from datetime import datetime
from scipy.optimize import minimize

## Group Assignment
### Team Number: 11
### Team Member Names: Akram, Annie, Jester
### Team Strategy Chosen: Market Beat

Disclose any use of AI for this assignment below (detail where and how you used it).  Please see the course outline for acceptable uses of AI.


## **General Strategy for the Project**:

#### Initialization
0) Define and initialize necessary global variables

#### Part #1: Data Filtering and Cleaning
1) Filter out all valid US and CAD Stocks from the provided CSV file.
2) Download closing price, volume and options data with yfinance, storing the closing prices in a dictionary; US stock prices are converted to CAD using the latest available exchange rate.
3) Filter out tickers within date range based on given minimum monthly average volume and minimum trading days in a month.

#### Part #2: Portfolio Construction
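4) Rank stocks by the standard deviation of their daily percentage returns, in descending order.
5) Rank stocks by their put-call ratio (PCR) computed from options volume, in descending order. The code divides call volume by put volume (the reverse of the usual formula) so that a higher value ranks first.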
6) Score the stocks based on the two ranks, and create a new ranking based on the scoring.
7) We select stocks based on the ranking and calculate weights that would maximize the portfolio sharpe ratio while beta is within pre-defined constraints.
8) Run sharpe ratio calculation function on portfolio from size 12(min) to 24(max), so that the final portfolio (with weightings) is picked based on highest output sharpe ratio.

#### Part #3: Evaluation and Proof
9) Given the chosen portfolio of xx stocks, graph the change in portfolio standard deviation as other stocks are added.
10) Beta of portfolio compared to the S&P 500.
11) Calculate the Beta between our portfolio and an equally weighted portfolio including all valid stocks.
12) Beta between our portfolio with varied weight versus when the portfolio is equally weighted.
13) Sharpe ratio between our portfolio with varied weight versus when it is equally weighted.
14) Graphically compare sharpe ratios amongst our portfolio of xx stocks and the portfolios of varying 12-24 stocks.

#### Part #4: Final Output
15) Creating the final portfolio dataframe and CSV.

## Initializing Variables

In [32]:
def get_tickers(path='Tickers.csv'):
    """
    Read ticker symbols from the first column of a header-less CSV file.

    Parameters:
        path (str): Location of the CSV file. Defaults to 'Tickers.csv' in the
            working directory, which is what the notebook has always read.
    Returns:
        list[str]: The symbols in file order (duplicates are kept), with
            surrounding whitespace removed and blank entries dropped.
    """
    # header=None keeps the first symbol as data instead of turning it into a column label.
    # keep_default_na=False stops pandas from parsing symbols such as "NA" as missing values.
    raw = pd.read_csv(path, header=None, dtype=str, keep_default_na=False)
    symbols = raw.iloc[:, 0].dropna().str.strip()
    return [symbol for symbol in symbols if symbol]

In [33]:
# Important Constants: 
# Notebook-wide settings; later cells read these names as globals.
amount = 1_000_000 # Initial investment amount of $1,000,000
group = 11 # Team number; becomes part of the output CSV file name

# Define constants
min_avg_volume = 100000 # Volume threshold a ticker must reach in filter_stocks
min_trading_days = 18 # Minimum trading days per month (see the filtering criteria below)
start_date, end_date = '2022-09-30', '2024-09-30' # Window passed to the yfinance history calls
min_stocks, max_stocks = 12, 24 # Smallest and largest portfolio size considered by the strategy

# Reading in CSV file: 
# `tickers` is the frame exactly as pandas parses the file (the first row becomes the header);
# `ticker_lst` is the flat list of symbols that get_tickers() builds from the same file.
tickers = pd.read_csv('Tickers.csv')
ticker_lst = get_tickers()

# Initializing variable to store the tickers we will use in our portfolio
# (an empty table at this point; only the column layout is defined here)
columns = ['Ticker', 'Price', 'Currency', 'Shares', 'Value', 'Weight']
Portfolio_Final = pd.DataFrame(columns=columns)
# Latest price of Yahoo's 'CAD=X' pair, used as the USD -> CAD conversion rate for US stock prices.
exchange_rate = yf.Ticker('CAD=X').fast_info['last_price']
print(f'The current exchange rate for the latest available day:\nUSD -> CAD: ${np.round(exchange_rate, 4)}')

The current exchange rate for the latest available day:
USD -> CAD: $1.3968


#### We must filter the tickers csv as follows:
- Must be listed on yfinance
- The currency is listed as USD or CAD 
- Average monthly trading volume of at least 100,000 shares
- Only months with at least 18 trading days are counted
- Sufficient data

In [None]:
# Filtering valid stocks by inputting a list of strings for each ticker. 
def filter_stocks(ticker_lst):
    """
    Split tickers into those that pass the listing, currency, trading-day and volume filters
    and those that do not.

    A ticker is kept when yfinance returns price history for it, its currency is USD or CAD,
    and its average monthly volume (over months with enough trading days) is at least
    `min_avg_volume`. Reads the notebook globals `start_date`, `end_date`, `min_avg_volume`,
    `min_trading_days` and `exchange_rate`.

    Parameters:
        ticker_lst (list[str]): Ticker symbols to check.
    Returns:
        list: [valid_tickers, invalid_tickers].
            valid_tickers is a dictionary of Series where the key is the name of the ticker and
            the value holds its close prices in CAD (USD prices are multiplied by
            `exchange_rate`), indexed by 'YYYY-MM-DD' strings.
            invalid_tickers is a list of ticker strings which were removed in the filtering process.
    """
    def drop_short_trading_months(df):
        """
        Drops months with fewer than `min_trading_days` trading days from a yfinance history DataFrame.
        Parameters:
            df (pd.DataFrame): A yfinance DataFrame with a DatetimeIndex and stock data.
        Returns:
            pd.DataFrame: Copy of df, with a timezone-naive index, containing only rows that
                fall in months with enough trading days.
        """
        if not isinstance(df.index, pd.DatetimeIndex):
            raise ValueError("The DataFrame index must be a DatetimeIndex.")
        df = df.copy()  # Avoid modifying the original DataFrame
        # Remove timezone information so converting to monthly periods does not warn
        df.index = df.index.tz_localize(None)
        months = df.index.to_period('M')
        days_per_month = df.groupby(months).size()
        valid_months = days_per_month[days_per_month >= min_trading_days].index
        return df[months.isin(valid_months)]

    valid_tickers, invalid_tickers = {}, []
    for ticker in ticker_lst:
        try:
            stock = yf.Ticker(ticker)
            info = stock.fast_info  # Get basic stock info
            hist = stock.history(start=start_date, end=end_date)  # Already limited to the date range
            currency = info.get("currency")

            # Delisted tickers come back with no rows; other currencies are out of scope.
            if hist.empty or currency not in ("USD", "CAD"):
                invalid_tickers.append(ticker)
                continue

            hist = drop_short_trading_months(hist)
            if hist.empty:  # No month had enough trading days
                invalid_tickers.append(ticker)
                continue

            # The stated requirement is an average *monthly* volume, measured only over the months
            # that survived the trading-day filter: total the volume per month, then average.
            monthly_volume = hist['Volume'].groupby(hist.index.to_period('M')).sum()
            if not monthly_volume.mean() >= min_avg_volume:  # also rejects NaN
                invalid_tickers.append(ticker)
                continue

            close = hist['Close'].copy()
            close.index = hist.index.strftime('%Y-%m-%d')
            # Store everything in CAD so prices are comparable across the two markets
            valid_tickers[ticker] = close if currency == "CAD" else close * exchange_rate
        except Exception:
            # Best effort: any lookup or data problem for one ticker marks it invalid instead of
            # stopping the run. KeyboardInterrupt/SystemExit are deliberately not caught.
            invalid_tickers.append(ticker)
    return [valid_tickers, invalid_tickers]

In [36]:
def calculate_std(data):
    """
    Rank tickers by the volatility of their daily returns.

    Parameters:
        data (pd.DataFrame): Prices with one column per ticker and one row per trading day,
            in chronological order. The frame is not modified.
    Returns:
        pd.DataFrame: Indexed by ticker and sorted from most to least volatile, with columns
            'Standard Deviation' (of daily percentage returns), 'Rank' (0 = most volatile) and
            'Score' (standard deviation as a percentage of the highest one).
    """
    # Missing prices are carried forward explicitly. This is what pct_change used to do
    # implicitly; stating it avoids the pandas FutureWarning and keeps the result stable
    # across pandas versions. Rows that still contain NaN (e.g. the first one) are dropped.
    returns = data.ffill().pct_change(fill_method=None).dropna()

    std_sorted = (
        returns.std()
        .to_frame('Standard Deviation')
        .sort_values(by='Standard Deviation', ascending=False)
    )
    std_sorted['Rank'] = range(len(std_sorted))

    # Guard the empty case: there is no top value to normalise against.
    highest_std_value = std_sorted['Standard Deviation'].iloc[0] if len(std_sorted) else float('nan')
    std_sorted['Score'] = (std_sorted['Standard Deviation'] / highest_std_value) * 100

    return std_sorted

In [37]:
def calculate_return(data):
    """
    Rank tickers by their average daily return.

    Parameters:
        data (pd.DataFrame): Prices with one column per ticker and one row per trading day,
            in chronological order. The frame is not modified.
    Returns:
        pd.DataFrame: Indexed by ticker and sorted from highest to lowest mean return, with
            columns 'Return' (mean daily percentage return), 'Rank' (0 = highest) and 'Score'
            (return as a percentage of the highest one).
    """
    # Missing prices are carried forward explicitly. This is what pct_change used to do
    # implicitly; stating it avoids the pandas FutureWarning and keeps the result stable
    # across pandas versions. Rows that still contain NaN (e.g. the first one) are dropped.
    returns = data.ffill().pct_change(fill_method=None).dropna()

    ret_sorted = (
        returns.mean()
        .to_frame('Return')
        .sort_values(by='Return', ascending=False)
    )
    ret_sorted['Rank'] = range(len(ret_sorted))

    # Guard the empty case: there is no top value to normalise against.
    # NOTE(review): the score is only a meaningful 0-100 scale while the best mean return is
    # positive; if every ticker lost money the ordering of the scores flips.
    highest_ret_value = ret_sorted['Return'].iloc[0] if len(ret_sorted) else float('nan')
    ret_sorted['Score'] = (ret_sorted['Return'] / highest_ret_value) * 100

    return ret_sorted

In [38]:
# Loading data into variables
stock_filter = filter_stocks(ticker_lst) # [valid_tickers dict, invalid_tickers list]
ticker_data = stock_filter[0] # ticker -> Series of close prices
ticker_lst = list(ticker_data.keys()) # Reassign original ticker list: from here on it holds only the tickers that passed the filter
# Build one price column per valid ticker.
# NOTE(review): assigning a Series to a DataFrame column aligns it on the frame's index, so the
# first ticker's dates presumably define the rows and any date another ticker lacks becomes
# NaN - confirm this is the intended handling of differing US/Canadian holidays.
data = pd.DataFrame()
for ticker in ticker_data:
    data[ticker] = ticker_data[ticker]

# returns = data.pct_change()
# returns.drop(index=returns.index[0], inplace = True)

data.head()

$AGN: possibly delisted; no timezone found
$AGN: possibly delisted; no price data found  (period=5d) (Yahoo error = "No data found, symbol may be delisted")
$CELG: possibly delisted; no timezone found
$CELG: possibly delisted; no price data found  (period=5d) (Yahoo error = "No data found, symbol may be delisted")
$MON: possibly delisted; no timezone found
$MON: possibly delisted; no price data found  (period=5d) (Yahoo error = "No data found, symbol may be delisted")
$RTN: possibly delisted; no timezone found
$RTN: possibly delisted; no price data found  (period=5d) (Yahoo error = "No data found, symbol may be delisted")


Unnamed: 0_level_0,AAPL,ABBV,ABT,ACN,AIG,AMZN,AXP,BA,BAC,BB.TO,...,QCOM,RY.TO,SHOP.TO,T.TO,TD.TO,TXN,UNH,UNP,UPS,USB
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-10-03,196.57744,177.349742,132.989213,356.862738,65.658218,161.861185,189.724837,176.066649,41.05849,6.53,...,156.204183,114.575363,37.799999,25.033072,77.62326,208.81014,698.97314,266.829708,207.073326,52.522376
2022-10-04,201.614347,182.05531,137.746488,369.553315,69.03593,169.138512,197.095975,186.486766,42.762114,6.74,...,163.066931,117.543633,42.610001,25.316832,79.079201,215.746994,709.359197,273.651705,214.437473,53.861661
2022-10-05,202.028362,183.77339,138.147384,369.593811,68.488557,168.942961,195.619015,184.531254,42.154618,6.67,...,166.425008,117.041306,41.959999,25.104013,78.620857,219.208857,714.647242,267.30939,213.218577,53.31837
2022-10-06,200.689769,179.875581,136.904606,363.032841,67.767618,168.035049,192.922605,184.656961,41.547123,6.64,...,166.944694,113.762505,41.32,24.421213,75.751671,217.562796,703.881506,264.09828,210.260266,51.903283
2022-10-07,193.320698,177.913867,136.022654,349.88413,66.365809,160.017409,188.342786,181.290668,40.609474,6.29,...,161.121362,110.520241,37.349998,24.376871,74.671204,208.078576,684.519448,260.647308,202.058051,50.791433


In [39]:
# Function to get the total volume for a call or put of a given stock.
# ticker: yfinance Ticker class
# put: Boolean for if you want to calculate put volume. Else, put False for call volume. 
def get_options_vol(ticker, put):
    """
    Total traded volume of a stock's put or call options across all listed expirations.

    Parameters:
        ticker: yfinance Ticker (anything exposing `.options` and `.option_chain(exp)`).
        put (bool): True to total the put volume, False to total the call volume.
    Returns:
        float: Sum of the 'volume' column over every expiration; missing volumes are skipped.
    Raises:
        KeyError: With key 'volume' when the ticker lists no option expirations, or when an
            option chain has no 'volume' column.
    """
    side = 'puts' if put else 'calls'
    expirations = ticker.options  # Expiration dates of available options
    if not expirations:
        # Same exception callers have always received for tickers without options data.
        raise KeyError('volume')

    total = 0.0
    for exp in expirations:
        # Sum each chain directly instead of growing a DataFrame one expiration at a time.
        total += getattr(ticker.option_chain(exp), side)['volume'].sum()
    return total

# Function to calculate the PCR for each stock. 
def PCR_calc(tickers):
    """
    Build a table of options volume and call/put ratio for each ticker.

    Tickers whose options data cannot be read, or whose put volume is zero or missing, are
    reported on stdout and left out of the table.

    Parameters:
        tickers (iterable[str]): Ticker symbols to look up.
    Returns:
        pd.DataFrame: One row per ticker with usable data and columns
            'Ticker', 'Put Volume', 'Call Volume', 'PCR', where 'PCR' is
            call volume / put volume.
    """
    columns = ['Ticker', 'Put Volume', 'Call Volume', 'PCR']
    rows = []
    for ticker in tickers:
        try:
            stock = yf.Ticker(ticker)
            # Get the volume for Put and Call options:
            call_options = get_options_vol(stock, False)
            put_options = get_options_vol(stock, True)
            # A zero (or NaN) put volume would give an infinite/NaN ratio, which would then
            # wreck any score normalised by the highest ratio, so treat it as missing data.
            # The message is worded to read correctly inside the report line below.
            if not put_options > 0:
                raise ValueError("non-zero put volume")
            # Calculate PCR Ratio:
            pcr = call_options / put_options # Order reversed from the formula for sake of ranking
            rows.append([ticker, put_options, call_options, pcr])
        except Exception as e:
            print(f"Options Data Not Found {ticker}: {e} not found")  # Debugging (output error)
    # Built once from the collected rows so the numeric columns get numeric dtypes.
    return pd.DataFrame(rows, columns=columns)

In [40]:
# Volatility ranking of every valid ticker, most volatile first.
std = calculate_std(data)
std

  returns = data.pct_change().dropna()


Unnamed: 0,Standard Deviation,Rank,Score
SHOP.TO,0.036151,0,100.0
BB.TO,0.034589,1,95.681598
PYPL,0.023819,2,65.888938
QCOM,0.023439,3,64.837268
AMZN,0.021313,4,58.955341
USB,0.021239,5,58.752259
BA,0.020323,6,56.217049
LLY,0.018084,7,50.025324
CAT,0.017586,8,48.645457
TXN,0.016667,9,46.104356


In [41]:
# Mean-daily-return ranking of every valid ticker, highest return first.
ret = calculate_return(data)
ret

  returns = data.pct_change().dropna()


Unnamed: 0,Return,Rank,Score
SHOP.TO,0.002726,0,100.0
LLY,0.00221,1,81.071882
CAT,0.001889,2,69.286142
AXP,0.001511,3,55.441218
BK,0.001427,4,52.355034
BLK,0.001227,5,45.024356
AMZN,0.001196,6,43.853769
QCOM,0.001116,7,40.925522
AAPL,0.001086,8,39.832615
C,0.001055,9,38.682412


In [42]:
# Compute the call/put ratio for each valid stock and order the table from highest to lowest ratio
options_data = PCR_calc(ticker_lst).sort_values(by='PCR', ascending=False)

# Rank is the position in the sorted table (0 = highest ratio);
# Score expresses each ratio as a percentage of the highest one.
options_data['Rank'] = range(len(options_data))
highest_pcr = options_data['PCR'].iloc[0]
options_data['Score'] = options_data['PCR'] / highest_pcr * 100
options_data = options_data.set_index('Ticker')


# Display the table of rankings based off PCR.
# Tickers at the top have the most call volume relative to put volume, which this strategy
# reads as the strongest expectation that the price will rise.
pcr = options_data
pcr

Options Data Not Found BB.TO: 'volume' not found
Options Data Not Found RY.TO: 'volume' not found
Options Data Not Found SHOP.TO: 'volume' not found
Options Data Not Found T.TO: 'volume' not found
Options Data Not Found TD.TO: 'volume' not found


Unnamed: 0_level_0,Put Volume,Call Volume,PCR,Rank,Score
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
PEP,5763.0,35932.0,6.234947,0,100.0
KO,13171.0,50030.0,3.798497,1,60.922677
PM,3092.0,8275.0,2.676261,2,42.923561
PYPL,12469.0,31317.0,2.511589,3,40.282439
PG,7077.0,16438.0,2.322736,4,37.253494
BA,27272.0,57991.0,2.126393,5,34.104433
AMZN,163766.0,339482.0,2.07297,6,33.247595
LMT,4332.0,8372.0,1.932595,7,30.996168
MRK,9362.0,18016.0,1.924375,8,30.864338
LLY,30693.0,51657.0,1.683022,9,26.993368


In [43]:
def calculate_scoreboard(std, pcr, ret):
    """
    Combine the volatility, options-ratio and return scores into one ranking.

    The three frames are outer-joined on their index (ticker names). A missing volatility or
    options score is replaced by the ticker's return score; a missing return score is replaced
    by the mean of the other two. The result carries 'Score_std', 'Score_pcr', 'Score_ret' and
    their row-wise mean 'Average Score', sorted by 'Average Score' from highest to lowest.
    """
    # Give each Score column its final name up front so the joins need no suffixes
    std_scores = std[['Score']].rename(columns={'Score': 'Score_std'})
    pcr_scores = pcr[['Score']].rename(columns={'Score': 'Score_pcr'})
    ret_scores = ret[['Score']].rename(columns={'Score': 'Score_ret'})

    board = std_scores.merge(pcr_scores, left_index=True, right_index=True, how='outer')
    board = board.merge(ret_scores, left_index=True, right_index=True, how='outer')

    # Return score stands in for whichever of the other two is missing
    fallback = board['Score_ret']
    for column in ('Score_std', 'Score_pcr'):
        board[column] = board[column].fillna(fallback)
    # ...and the mean of the other two stands in for a missing return score
    board['Score_ret'] = fallback.fillna(board[['Score_std', 'Score_pcr']].mean(axis=1))

    board['Average Score'] = board[['Score_std', 'Score_pcr', 'Score_ret']].mean(axis=1)

    return board.sort_values(by='Average Score', ascending=False)



# Preview the scoreboard that averages the volatility, options-ratio and return scores.
calculate_scoreboard(std, pcr, ret)

Unnamed: 0,Score_std,Score_pcr,Score_ret,Average Score
SHOP.TO,100.0,100.0,100.0,100.0
LLY,50.025324,26.993368,81.071882,52.696858
PEP,28.192909,100.0,8.216863,45.469924
AMZN,58.955341,33.247595,43.853769,45.352235
CAT,48.645457,13.300584,69.286142,43.744061
QCOM,64.837268,23.446427,40.925522,43.069739
AXP,44.979889,17.848506,55.441218,39.423204
BK,40.104736,22.459591,52.355034,38.306454
BA,56.217049,34.104433,23.427179,37.91622
AAPL,43.508005,25.645929,39.832615,36.32885


In [44]:
def calculate_scoreboard_2(std, pcr, ret):
    """
    Rank tickers by the mean of their volatility score and options-ratio score.

    The three frames are outer-joined on their index (ticker names). The return score is used
    only as a stand-in where the options score is missing and is then discarded. The result
    carries 'Score_std', 'Score_pcr' and their row-wise mean 'Average Score' (NaN entries are
    skipped by the mean), sorted by 'Average Score' from highest to lowest.
    """
    # Give each Score column its final name up front so the joins need no suffixes
    std_scores = std[['Score']].rename(columns={'Score': 'Score_std'})
    pcr_scores = pcr[['Score']].rename(columns={'Score': 'Score_pcr'})
    ret_scores = ret[['Score']].rename(columns={'Score': 'Score_ret'})

    board = std_scores.merge(pcr_scores, left_index=True, right_index=True, how='outer')
    board = board.merge(ret_scores, left_index=True, right_index=True, how='outer')

    # Borrow the return score for tickers without an options score, then drop the helper column
    board['Score_pcr'] = board['Score_pcr'].fillna(board['Score_ret'])
    board = board.drop(columns=['Score_ret'])

    board['Average Score'] = board[['Score_std', 'Score_pcr']].mean(axis=1)

    return board.sort_values(by='Average Score', ascending=False)




# Preview the alternative scoreboard that averages only the volatility and options-ratio scores.
calculate_scoreboard_2(std, pcr, ret)

Unnamed: 0,Score_std,Score_pcr,Average Score
SHOP.TO,100.0,100.0,100.0
PEP,28.192909,100.0,64.096455
PYPL,65.888938,40.282439,53.085688
AMZN,58.955341,33.247595,46.101468
BA,56.217049,34.104433,45.160741
QCOM,64.837268,23.446427,44.141847
KO,24.000748,60.922677,42.461713
LLY,50.025324,26.993368,38.509346
PM,29.40638,42.923561,36.164971
UPS,45.400913,26.784566,36.092739


In [45]:
# Commented-out copy of get_options_vol (total call/put option volume); the live definition is in an earlier cell.
# def get_options_vol(ticker, put):
#     exps = ticker.options # Expiration dates of available options
#     data = pd.DataFrame() # Data storage
#     for exp in exps:
#         chain = pd.DataFrame()
#         if put: chain = ticker.option_chain(exp).puts['volume'] # Gets the desired columns
#         else: chain = ticker.option_chain(exp).calls['volume'] # If put options are desired then use this data.
#         data = pd.concat([data, chain]) # Add the calls/puts to the main dataframe. 
#     return data.sum()['volume'] # output total volue of put/call options

#cols = ['lastTradeDate','strike', 'bid', 'ask', 'volume', 'inTheMoney', 'currency']
# chain = chain.set_index('lastTradeDate') # Reset the index to the expiration dates
# chain.index = chain.index.strftime('%Y-%m-%d') # Remove excess data
# chain = chain.rename_axis('Expirations') # Rename the index 

In [None]:
# Load market data into a dataframe
# Daily closing levels of the two benchmark indices over the same window as the stock data.
# NOTE(review): '^GSPTSE' appears to be Yahoo's symbol for the S&P/TSX Composite rather than
# the TSX 60 - confirm which index is intended before relying on the 'TSX 60' labels.
s_p500 = yf.Ticker('^GSPC').history(start=start_date, end=end_date)['Close']
tsx60 = yf.Ticker('^GSPTSE').history(start=start_date, end=end_date)['Close']

# Day-over-day percentage change; dropna() removes the first row, which has no previous close.
SPreturns = s_p500.pct_change().dropna()
TSX60Returns = tsx60.pct_change().dropna()

# Simple average of the two index returns. The Series are aligned on their index when added,
# so a date present in only one of them gives NaN here.
avg_return = (SPreturns + TSX60Returns)/2

market_indices = pd.DataFrame({'S&P 500 PCT Returns': SPreturns, 
                               'TSX 60 PCT Returns': TSX60Returns, 
                               'Average Market Return': avg_return})
# Replace the timestamp index with plain 'YYYY-MM-DD' strings
market_indices.index = market_indices.index.strftime('%Y-%m-%d')

market_indices.head()

Unnamed: 0_level_0,S&P 500 PCT Returns,TSX 60 PCT Returns,Average Market Return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-10-03,0.025884,0.023693,0.024788
2022-10-04,0.030584,0.025941,0.028262
2022-10-05,-0.002018,-0.007016,-0.004517
2022-10-06,-0.010245,-0.013314,-0.01178
2022-10-07,-0.028004,-0.02086,-0.024432


In [47]:
# sharpe ratio optimization
def optimal_sharpe(tickers, risk_free_rate, investment, data):
    """
    Find the portfolio weights that maximise the annualised Sharpe ratio.

    Shares are bought at the prices in the first row of `data` (after trading fees) and held
    over the whole period; the Sharpe ratio is computed from the portfolio's daily returns.

    Parameters:
        tickers (list[str]): Symbols to include. They are upper-cased and used to select the
            matching columns of `data`, so `data` may hold more columns, in any order.
        risk_free_rate (float): Subtracted from the mean daily return, so it must be expressed
            per trading day.
        investment (float): Total amount to invest, in the currency of `data`.
        data (pd.DataFrame): Prices with one column per ticker, rows in chronological order.
            The frame is not modified.
    Returns:
        scipy.optimize.OptimizeResult: Result of the SLSQP run; `.x` holds the weights in the
            order of `tickers`. Each weight is bounded to [1/(2n), 0.4] and the weights are
            constrained to sum to 1.
    Raises:
        ValueError: If `tickers` is empty.
        KeyError: If a ticker has no column in `data`.
    """
    n_assets = len(tickers)
    if n_assets == 0:
        raise ValueError("tickers must contain at least one symbol")

    # Pick the price columns by name, in ticker order, so each weight multiplies its own stock
    # even when `data` holds extra columns or a different column order.
    columns = [ticker.upper() for ticker in tickers]
    prices = data[columns].to_numpy(dtype=float)
    entry_prices = prices[0]  # purchase price per share: first row of the data
    flat_fee = 3.95

    def neg_sharpe(weights):
        # investment allocated to each stock
        allocation = investment * np.asarray(weights, dtype=float)

        # Per-share fee of $0.001 per share when the fee is paid out of the allocation:
        # n shares satisfy n * (price + 0.001) = allocation, so the fee 0.001 * n equals
        # allocation / (1000 * price + 1), written here as in the original formula.
        per_share_fee = allocation / entry_prices / (1000 + 1 / entry_prices)

        # choose the smaller of the two fees
        trading_fee = np.minimum(flat_fee, per_share_fee)

        # number of shares bought after deducting the fee
        shares = (allocation - trading_fee) / entry_prices

        # value of the portfolio on each day (missing prices count as zero) and its daily returns
        total = np.nansum(prices * shares, axis=1)
        daily_return = total[1:] / total[:-1] - 1

        # calculating sharpe ratio
        er = np.nanmean(daily_return)
        std = np.nanstd(daily_return, ddof=1)
        sharpe_ratio = (er - risk_free_rate) / std

        sharpe_ratio = sharpe_ratio * (252 ** 0.5)  # annualizing sharpe ratio by trading days

        return -sharpe_ratio  # make sharpe ratio negative for minimize function

    # constraints
    def check_sum(weights):
        return np.sum(weights) - 1  # returns 0 if weights sum up to 1
    constraints = {'type': 'eq', 'fun': check_sum}

    # Each stock gets at least half of an equal share and at most 40% of the portfolio
    min_weight = 1 / (2 * n_assets)
    max_weight = 0.4
    bounds = [(min_weight, max_weight)] * n_assets

    # initial guess: equal weights
    init_guess = [1.0 / n_assets] * n_assets

    results = minimize(neg_sharpe, init_guess, method="SLSQP", bounds=bounds, constraints=constraints)

    return results

In [48]:
# need to edit ticker_lst to get the actual stocks we will be using
# For now the optimiser runs over every ticker that passed the filter, with a risk-free rate of 0.
optimal = optimal_sharpe(ticker_lst, 0, amount, data)
print(optimal.x) # the weights found by the optimiser, one per entry of ticker_lst

[0.01351351 0.01351351 0.01351351 0.01351351 0.01351351 0.01351351
 0.01351351 0.01351351 0.01351351 0.01351351 0.01351351 0.01351351
 0.01351351 0.01351351 0.01351351 0.20791637 0.01351351 0.01351351
 0.31911067 0.01351351 0.01351351 0.01351351 0.01351351 0.01351351
 0.01351351 0.01351351 0.01351351 0.01351351 0.01351351 0.01351351
 0.01351351 0.01351351 0.01351351 0.01351351 0.01351351 0.01351351
 0.01351351]


In [49]:
# Code to output final dataframe to a CSV file called Stocks_Group_XX.csv
# Only the Ticker and Shares columns are written, without the row index.
# NOTE(review): no visible cell adds rows to Portfolio_Final after it is created empty, so this
# currently writes a header-only file - TODO fill it from the optimised weights first.
Stocks_Final = Portfolio_Final[['Ticker', 'Shares']]
Stocks_Final.to_csv(f'Stocks_Group_{group}.csv', index=False)

In [50]:
# Display the final portfolio table.
Portfolio_Final

Unnamed: 0,Ticker,Price,Currency,Shares,Value,Weight


## Contribution Declaration

The following team members made a meaningful contribution to this assignment:

---
<p style="color: #004dd3">
Akram Jamil
</p>

<p style="color: #2C8CA9">
Jester Yang
</p>

<p style="color: #3cc19d;">
Annie Wong
</p>

---