In [1]:
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
#from scipy.stats import norm
#from scipy.stats import linregress
#import statsmodels.formula.api as smf
import csv
import scipy.optimize as opt
import pandas_datareader as pdr
from datetime import datetime


In [2]:
# function to read in data from a csv
def readPricesCSV(file):
    """Read a CSV of price data and return its adjusted closing prices.

    Parameters
    ----------
    file : path or file-like object
        Anything accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.Series
        The "Adj Close" column of the parsed file.
    """
    return pd.read_csv(file)["Adj Close"]

In [3]:
# function to read in stock prices from yahoo finance
def readPricesYF(start_date, end_date, tickers):
    """Download adjusted closing prices from Yahoo Finance in tall format.

    Parameters
    ----------
    start_date, end_date :
        First and last dates requested; passed straight to
        ``pdr.get_data_yahoo`` as ``start`` / ``end``.
    tickers : list
        Ticker symbols to download.

    Returns
    -------
    pandas.DataFrame
        One row per (firm, date) pair with columns ``'Firm'``, ``'Date'`` and
        ``'Adj Close'``, sorted by firm and then by date.
    """
    raw = pdr.get_data_yahoo(tickers, start=start_date, end=end_date)   # read in the data
    adj_close = raw.filter(like='Adj Close')   # keep only the adjusted closing prices, with the date as the index

    if isinstance(adj_close.columns, pd.MultiIndex):
        # Take each column's label from the data itself instead of overwriting
        # the labels positionally with `tickers`: a positional overwrite silently
        # attaches prices to the wrong firm whenever the returned column order
        # differs from the order of `tickers` (e.g. an unsorted ticker list).
        # NOTE(review): assumes the last column level holds the ticker symbols
        # (pandas_datareader names the levels Attributes / Symbols) -- confirm
        # against the installed version.
        adj_close.columns = adj_close.columns.get_level_values(-1)
    else:
        # flat columns carry no symbol information, so fall back to the caller's names
        adj_close.columns = tickers

    # make the data tall (one row per firm and date) so it is easier to analyse
    tall = adj_close.stack().swaplevel().sort_index().reset_index()
    tall.columns = ['Firm', 'Date', 'Adj Close']     # rename the columns
    return tall


In [4]:
# the objective function
def obj(weights, mu=None, sigma=None, risk_aversion=None):
    """Objective to minimize: the negative of the risk-adjusted portfolio return.

    Evaluates ``-(w . mu - risk_aversion * (w' sigma w))``.

    Parameters
    ----------
    weights : sequence of float
        Portfolio weights, one per stock.
    mu : array-like, optional
        Return measure per stock. Defaults to the notebook-level ``means``.
    sigma : array-like, optional
        Covariance matrix used for the risk measure. Defaults to the
        notebook-level ``cov_matr``.
    risk_aversion : number, optional
        Multiplier applied to the risk measure. Defaults to the notebook-level
        ``risk_level``.

    Returns
    -------
    scalar
        The negated objective, so that minimizing it maximizes the return
        measure minus the risk penalty.
    """
    # Fall back to the notebook globals so that `opt.minimize(obj, ...)`, which
    # calls this with the weights only, keeps working unchanged. The globals are
    # only looked up when the corresponding argument is omitted.
    mu = means if mu is None else mu
    sigma = cov_matr if sigma is None else sigma
    risk_aversion = risk_level if risk_aversion is None else risk_aversion

    w = np.array(weights)   # convert once; transposing a 1-D array is a no-op

    sum_return = np.dot(w, mu)     # the return measure given by the weights and means

    # the risk measure given by the weights: w' * sigma * w
    risk_measure = np.matmul(np.matmul(w, sigma), w)

    # return the objective function value
    return -(sum_return - (risk_aversion * risk_measure))


### To Change:
**The following block of code is the only one in this file that should be changed.**
- Add more stocks by including them in the list of tickers
- Change the starting and ending dates

**If there are any changes made, rerun the entire code after making the changes**

**Nothing else should be changed in this file**

In [5]:
# Stocks to include in the portfolio optimization (list the tickers here).
# The list is kept in alphabetical order.
tickers = sorted(['MSFT', 'AAPL', 'AMZN', 'NFLX', 'DIS', 'TSLA'])

# Date window for the price history: six months in this case.
start_date = datetime(year=2020, month=8, day=25)
end_date = datetime(year=2021, month=2, day=25)

# Download the prices from Yahoo Finance with the helper defined above.
stockPrices = readPricesYF(start_date, end_date, tickers)


In [6]:
# display the downloaded prices in tall form: one row per firm and date
stockPrices

Unnamed: 0,Firm,Date,Adj Close
0,AAPL,2020-08-25,124.424088
1,AAPL,2020-08-26,126.116135
2,AAPL,2020-08-27,124.608498
3,AAPL,2020-08-28,124.406647
4,AAPL,2020-08-31,128.625549
...,...,...,...
757,TSLA,2021-02-19,781.299988
758,TSLA,2021-02-22,714.500000
759,TSLA,2021-02-23,698.840027
760,TSLA,2021-02-24,742.020020


In [7]:
# Reshape the tall table to wide form: one row per date, one column per firm.
stock_prices_format = stockPrices.set_index(['Date', 'Firm']).unstack()

# Percentage change from one row (date) to the next, for every column.
# The first row is dropped: it has no previous row to compare against.
stock_prices_pct_change = stock_prices_format.pct_change().iloc[1:]
stock_prices_pct_change

Unnamed: 0_level_0,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close
Firm,AAPL,AMZN,DIS,MSFT,NFLX,TSLA
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2020-08-26,0.013599,0.028496,0.018414,0.021620,0.116087,0.064166
2020-08-27,-0.011954,-0.012159,0.011726,0.024554,-0.038829,0.039746
2020-08-28,-0.001620,0.000529,0.013535,0.010283,-0.004522,-0.011323
2020-08-31,0.033912,0.014451,-0.027077,-0.014766,0.010823,0.125689
2020-09-01,0.039833,0.013956,0.012740,0.007715,0.050967,-0.046697
...,...,...,...,...,...,...
2021-02-19,0.001233,-0.023535,0.003552,-0.011567,-0.014593,-0.007722
2021-02-22,-0.029799,-0.021281,0.044160,-0.026808,-0.011921,-0.085499
2021-02-23,-0.001111,0.004326,0.027795,-0.005288,0.023174,-0.021917
2021-02-24,-0.004052,-0.010947,0.002131,0.005487,0.013293,0.061788


In [8]:
# Problem dimensions: how many stocks there are and how many prices were obtained per stock.
num_stocks = len(tickers)   # one entry per ticker
prices_per_stock = len(stock_prices_format)   # number of rows in the wide price table

In [9]:
# Covariance matrix of the percentage changes.
# The selected columns are transposed so that each stock becomes one row,
# which is the layout np.cov expects for its variables.
returns_by_stock = stock_prices_pct_change.iloc[:, range(num_stocks)].T
cov_matr = np.cov(np.array(returns_by_stock))
cov_matr

array([[ 5.59896149e-04,  3.73559042e-04,  4.32429351e-05,
         3.05618489e-04,  3.38402153e-04,  5.54343209e-04],
       [ 3.73559042e-04,  4.41159509e-04,  5.04481211e-05,
         3.06976679e-04,  3.98375581e-04,  4.11422252e-04],
       [ 4.32429351e-05,  5.04481211e-05,  5.77167770e-04,
         8.61969935e-05,  1.64317959e-05, -3.94774403e-05],
       [ 3.05618489e-04,  3.06976679e-04,  8.61969935e-05,
         3.32178259e-04,  2.66808739e-04,  3.76651700e-04],
       [ 3.38402153e-04,  3.98375581e-04,  1.64317959e-05,
         2.66808739e-04,  9.12232700e-04,  4.15956589e-04],
       [ 5.54343209e-04,  4.11422252e-04, -3.94774403e-05,
         3.76651700e-04,  4.15956589e-04,  2.22980118e-03]])

In [10]:
# Mean percentage change (return) of each stock over the period.
mean_returns = stock_prices_pct_change.mean()   # one mean per column
means = np.array(mean_returns.to_list())        # list of means converted to a numpy array
means

array([ 5.71133647e-05, -4.98897312e-04,  3.34623847e-03,  6.51091710e-04,
        1.30126183e-03,  5.28170546e-03])

In [11]:
# set the risk level: the multiplier applied to the risk measure inside the objective function,
# which evaluates -(return measure - risk_level * risk measure)
risk_level = 1   # risk loving < 0; risk neutral = 0; risk averse > 0

In [12]:
# Initial value for the weights: an even composition, i.e. an equal share in every stock.
equal_share = 1/num_stocks
weights = np.array([equal_share for _ in range(num_stocks)])

### At this point, we have the following information:
 - **"tickers"** is a list of the tickers, and it is in alphabetical order
 - **"num_stocks"** is the number of different stocks
 - **"prices_per_stock"** is the number of observations, i.e. the number of prices obtained for each stock

For the Optimization:
 - **"cov_matr"** is the covariance matrix
 - **"means"** is a numpy array that consists of the mean daily return (percentage change in price) of each stock during the time frame **(in alphabetical order of the tickers)**
 - **"risk_level"** is the risk level
 - **"weights"** is a numpy array that consists of the initial value for the weights, which is just an even composition and will be changed later **(in alphabetical order of the tickers)**

In [13]:
# perform the optimization!

# the weights must sum to exactly 1: the lower and upper limit of the sum are both 1
lin_constr = opt.LinearConstraint([1]*num_stocks, [1], [1])

# each portfolio weight must lie between 0 and 0.5
bounds = opt.Bounds([0]*num_stocks, [0.5]*num_stocks)

# start from the even composition already stored in `weights` instead of rebuilding it here
result = opt.minimize(obj, x0=weights, method="trust-constr", constraints=lin_constr, bounds=bounds)

# do not silently present the weights of a failed run as if they were optimal
if not result.success:
    print("WARNING: the optimization did not converge:", result.message)

print(result.fun)   # objective value at the solution, i.e. -(return measure - risk_level * risk measure)

# pair each weight with its ticker; both are in alphabetical ticker order
optimal_weights = pd.DataFrame({'stock': tickers, 'weights': result.x.tolist()})
optimal_weights


-0.0035937398352968823


Unnamed: 0,stock,weights
0,AAPL,0.00265
1,AMZN,0.002229
2,DIS,0.491988
3,MSFT,0.003388
4,NFLX,0.005351
5,TSLA,0.494393


Next steps:
- Tweak the model to use the predicted prices instead of historical prices
- Test to see how well the portfolio works
    - Active share
        - take proportion from an existing index and tweak weights
    - Compare to the performance of a benchmark index such as the S&P 500
    - Can create a graph to visualize the data
    - Calculate the information ratio
- Figure out how to add a Conditional Value at Risk measure
- Explore Nested Clustered Optimization
- Could incorporate elements from stochastic calculus about estimating stock prices