In [1]:
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
#from scipy.stats import norm
#from scipy.stats import linregress
#import statsmodels.formula.api as smf
import csv
import scipy.optimize as opt
import pandas_datareader as pdr
from datetime import datetime


In [2]:
# helper: load the adjusted closing prices stored in a csv
def readPricesCSV(file):
    """Return the "Adj Close" column of a CSV as a pandas Series.

    Parameters
    ----------
    file : path or file-like object
        Passed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.Series
        The values of the "Adj Close" column, in file order.
    """
    return pd.read_csv(file)["Adj Close"]

In [3]:
# function to read in stock prices from yahoo finance
def readPricesYF(start_date, end_date, tickers):
    """Download adjusted closing prices and return them in tall (long) format.

    Parameters
    ----------
    start_date, end_date : datetime
        Passed to ``pdr.get_data_yahoo`` as ``start`` and ``end``.
    tickers : list of str
        Ticker symbols to download.

    Returns
    -------
    pandas.DataFrame
        One row per (firm, date) with columns ``['Firm', 'Date', 'Adj Close']``,
        sorted by firm and then by date.
    """
    raw = pdr.get_data_yahoo(tickers, start=start_date, end=end_date)   # read in the data
    adj_close = raw.filter(like='Adj Close')   # keep only the adjusted closing prices

    if isinstance(adj_close.columns, pd.MultiIndex):
        # Multi-ticker downloads come back with (attribute, symbol) column labels.
        # Take the symbol from the data itself instead of relabelling by position:
        # the downloaded column order is not guaranteed to match the order of
        # `tickers`, so positional relabelling could attach prices to the wrong firm.
        adj_close.columns = adj_close.columns.get_level_values(-1)
    else:
        # Flat columns carry no symbol information; fall back to the caller's labels.
        adj_close.columns = tickers

    # make the data tall: one row per (firm, date), ordered by firm then date
    tall = adj_close.stack().swaplevel().sort_index().reset_index()
    tall.columns = ['Firm', 'Date', 'Adj Close']     # rename the columns
    return tall


In [4]:
# the objective function
def obj(weights):
    """Negated mean-variance objective for a candidate set of portfolio weights.

    Reads the notebook-level globals ``means``, ``cov_matr`` and ``risk_level``,
    which must be defined before this function is called.

    Parameters
    ----------
    weights : array-like
        One weight per stock, in the same order as ``means`` and the
        rows/columns of ``cov_matr``.

    Returns
    -------
    float
        ``-(weights . means - risk_level * weights' cov_matr weights)``.
        The sign is flipped so that a minimizer maximizes the original quantity.
    """
    w = np.asarray(weights)

    # return measure given by the weights and the means
    sum_return = np.dot(w, means)

    # risk measure given by the weights: the quadratic form w' * cov_matr * w
    # (a 1-D vector needs no explicit transpose)
    risk_measure = w @ cov_matr @ w

    # return the objective function value
    return -(sum_return - (risk_level * risk_measure))


### To Change:
**The following block of code is the only one in this file that should be changed.**
- Add more stocks by including them in the list of tickers
- Change the starting and ending dates

**If there are any changes made, rerun the entire code after making the changes**

**Nothing else should be changed in this file**

In [5]:
# Stocks to include in the portfolio optimization (list the tickers).
# The list is kept in alphabetical order; later cells describe their arrays as alphabetical by ticker.
tickers = sorted(['MSFT', 'AAPL', 'AMZN', 'NFLX', 'DIS', 'TSLA'])

# Date window for the price history (a six-month span in this case).
start_date = datetime(year=2020, month=8, day=25)
end_date = datetime(year=2021, month=2, day=25)

# Download the prices from Yahoo Finance with the helper defined above.
stockPrices = readPricesYF(start_date, end_date, tickers)


In [6]:
# preview the downloaded prices in tall format (columns: Firm, Date, Adj Close)
stockPrices

Unnamed: 0,Firm,Date,Adj Close
0,AAPL,2020-08-25,124.424088
1,AAPL,2020-08-26,126.116135
2,AAPL,2020-08-27,124.608498
3,AAPL,2020-08-28,124.406647
4,AAPL,2020-08-31,128.625549
...,...,...,...
757,TSLA,2021-02-19,781.299988
758,TSLA,2021-02-22,714.500000
759,TSLA,2021-02-23,698.840027
760,TSLA,2021-02-24,742.020020


In [7]:
# reformat the data from tall to wide: one row per Date and one "Adj Close" column per Firm
stock_prices_format = stockPrices.set_index(['Date', 'Firm']).unstack()   # Date is the index here; the columns are (Adj Close, Firm) pairs
# display only: the reset_index() result (Date shown as a column) is not assigned,
# so stock_prices_format keeps Date as its index and holds price columns only
stock_prices_format.reset_index()

Unnamed: 0_level_0,Date,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close
Firm,Unnamed: 1_level_1,AAPL,AMZN,DIS,MSFT,NFLX,TSLA
0,2020-08-25,124.424088,3346.489990,129.789993,215.408630,490.579987,404.667999
1,2020-08-26,126.116135,3441.850098,132.179993,220.065674,547.530029,430.634003
2,2020-08-27,124.608498,3400.000000,133.729996,225.469070,526.270020,447.750000
3,2020-08-28,124.406647,3401.800049,135.539993,227.787643,523.890015,442.679993
4,2020-08-31,128.625549,3450.959961,131.869995,224.424194,529.559998,498.320007
...,...,...,...,...,...,...,...
122,2021-02-19,129.869995,3249.899902,183.649994,240.970001,540.219971,781.299988
123,2021-02-22,126.000000,3180.739990,191.759995,234.509995,533.780029,714.500000
124,2021-02-23,125.860001,3194.500000,197.089996,233.270004,546.150024,698.840027
125,2021-02-24,125.349998,3159.530029,197.509995,234.550003,553.409973,742.020020


In [8]:
# count the stocks and the price observations available per stock
num_stocks = len(tickers)   # one stock per ticker
prices_per_stock = len(stock_prices_format)   # rows of the wide table, i.e. one per date

In [9]:
# get the covariance matrix of the price series
# np.cov treats each row as one variable by default, so the (dates x stocks) table
# is transposed to (stocks x dates) before it is passed in
cov_matr = np.cov(
    stock_prices_format.iloc[:, list(range(num_stocks))].to_numpy().T
)
cov_matr

array([[   78.14430733,   558.11492864,   170.96505794,    84.26230921,
          180.56013459,  1312.71030178],
       [  558.11492864, 12168.77726018,   378.16645303,   810.26625573,
         2169.09772824,  3904.51569958],
       [  170.96505794,   378.16645303,   582.86500156,   198.7905976 ,
          352.71609083,  3815.26375622],
       [   84.26230921,   810.26625573,   198.7905976 ,   136.02137507,
          251.58777063,  1426.33876124],
       [  180.56013459,  2169.09772824,   352.71609083,   251.58777063,
          764.99314083,  2771.11535027],
       [ 1312.71030178,  3904.51569958,  3815.26375622,  1426.33876124,
         2771.11535027, 30260.84334216]])

In [10]:
# get the mean price for each stock

# column-wise means of the wide price table, as a plain numpy array
means = stock_prices_format.mean().to_numpy(copy=True)
means

array([ 122.99478972, 3207.85959223,  151.33307059,  218.25363448,
        514.73921264,  581.37560579])

In [11]:
# set the risk level: the multiplier obj() applies to the risk term before subtracting it from the return term
risk_level = 1   # risk loving < 0; risk neutral = 0; risk averse > 0

In [12]:
# set an initial value for the weights
weights = np.full(num_stocks, 1/num_stocks)   # even composition: every stock gets the same share

### At this point, we have the following information:
 - **"tickers"** is a list of the tickers, and it is in alphabetical order
 - **"num_stocks"** is the number of different stocks
 - **"prices_per_stock"** is the number of observations (dates with a price) obtained for each stock

For the Optimization:
 - **"cov_matr"** is the covariance matrix
 - **"means"** is a numpy array that consists of the means of each stock price during the time frame **(in alphabetical order of the tickers)**
 - **"risk_level"** is the risk level
 - **"weights"** is a numpy array that consists of the initial value for the weights, which is just an even composition and will be changed later **(in alphabetical order of the tickers)**

In [13]:
# perform the optimization!
# The weights must sum to exactly 1: a single linear constraint whose lower and upper bound are both 1.
# The coefficient matrix is written as an explicit 1 x num_stocks (2-D) array.
lin_constr = opt.LinearConstraint([[1]*num_stocks], [1], [1])
bounds = opt.Bounds([0]*num_stocks, [0.5]*num_stocks)    # each portfolio weight is between 0 and 0.5
# Start from the even composition stored in `weights`. It sums to 1 for any number of stocks,
# whereas a fixed 0.25 per stock only sums to 1 when there are exactly four stocks.
# (With the 0.5 cap on each weight, the problem needs at least two stocks to be feasible.)
result = opt.minimize(obj, x0=weights, method="trust-constr", constraints=lin_constr, bounds=bounds)    # actually perform the optimization

# pair each optimal weight with its ticker (both are in the alphabetical order of `tickers`)
optimal_weights = pd.DataFrame({'stock': tickers, 'weights': result.x.tolist()})
optimal_weights

Unnamed: 0,stock,weights
0,AAPL,0.4660627
1,AMZN,0.08554199
2,DIS,2.006387e-09
3,MSFT,0.4483953
4,NFLX,1.632122e-09
5,TSLA,4.065282e-11


Next steps:
- Tweak the model to use the predicted prices instead of historical prices
- Test to see how well the portfolio works
    - Active share
        - take proportion from an existing index and tweak weights
    - Compare to performance of S&P 500 or something like that
    - Can create a graph to visualize the data
    - Calculate the information ratio
- Figure out how to add a Conditional Value at Risk measure
- Explore Nested Clustered Optimization