# Data Collection and Pre-processing

In [None]:
!pip install yfinance



In [None]:
import yfinance as yf
import pandas as pd
import statistics
import math

In [None]:
def getStockData(stockName, startDate='2022-03-1', endDate='2022-03-31'):
  stockData = yf.download(stockName, startDate, endDate)
  stockData['Date'] = stockData.index
  stockData = stockData.reset_index(drop=True)
  columns = list(stockData.columns)
  columns = columns[-1:] + columns[:-1]
  stockData = stockData[columns]
  return stockData

In [None]:
stocks = ['AAPL', 'AMZN', 'FB', 'GOOG', 'NFLX']
stockIndex = {str(k):i for k,i in zip(stocks, range(len(stocks)))}

In [None]:
data = {stockName: getStockData(stockName) for stockName in stocks}

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [None]:
len(data)

5

In [None]:
data.keys()

dict_keys(['AAPL', 'AMZN', 'FB', 'GOOG', 'NFLX'])

In [None]:
data['AAPL'].dtypes

Date         datetime64[ns]
Open                float64
High                float64
Low                 float64
Close               float64
Adj Close           float64
Volume                int64
dtype: object

In [None]:
data['GOOG']

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2022-03-01,2689.600098,2722.219971,2667.570068,2683.360107,2683.360107,1232000
1,2022-03-02,2692.165039,2712.310059,2668.649902,2695.030029,2695.030029,1198300
2,2022-03-03,2719.570068,2734.275879,2668.620117,2686.159912,2686.159912,989000
3,2022-03-04,2667.649902,2683.97998,2608.169922,2642.439941,2642.439941,1222300
4,2022-03-07,2638.080078,2638.080078,2528.199951,2529.290039,2529.290039,1958900
5,2022-03-08,2525.01001,2624.929932,2517.215088,2545.570068,2545.570068,1762500
6,2022-03-09,2628.0,2683.969971,2601.76001,2677.320068,2677.320068,1612900
7,2022-03-10,2629.25,2670.77002,2628.02002,2653.639893,2653.639893,1213300
8,2022-03-11,2679.98999,2684.0,2605.929932,2609.51001,2609.51001,1330000
9,2022-03-14,2611.459961,2620.52002,2528.26001,2534.820068,2534.820068,1512700


**Calculating OHCL Trading Indicator for each of the stocks on the selected dates**

In [None]:
def calculateOHCLIndicators(stockData):
  ohclValues = [None]*len(stockData)
  for i in range(len(stockData)):
    ohclValues[i] = statistics.mean(list(stockData.iloc[i][1:-2]))
  stockData['OHCLIndicator'] = ohclValues

  return stockData

In [None]:
for stockName in list(data.keys()):
  data[stockName] = calculateOHCLIndicators(data[stockName])

In [None]:
data['GOOG']

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,OHCLIndicator
0,2022-03-01,2689.600098,2722.219971,2667.570068,2683.360107,2683.360107,1232000,2690.687561
1,2022-03-02,2692.165039,2712.310059,2668.649902,2695.030029,2695.030029,1198300,2692.038757
2,2022-03-03,2719.570068,2734.275879,2668.620117,2686.159912,2686.159912,989000,2702.156494
3,2022-03-04,2667.649902,2683.97998,2608.169922,2642.439941,2642.439941,1222300,2650.559937
4,2022-03-07,2638.080078,2638.080078,2528.199951,2529.290039,2529.290039,1958900,2583.412537
5,2022-03-08,2525.01001,2624.929932,2517.215088,2545.570068,2545.570068,1762500,2553.181274
6,2022-03-09,2628.0,2683.969971,2601.76001,2677.320068,2677.320068,1612900,2647.762512
7,2022-03-10,2629.25,2670.77002,2628.02002,2653.639893,2653.639893,1213300,2645.419983
8,2022-03-11,2679.98999,2684.0,2605.929932,2609.51001,2609.51001,1330000,2644.857483
9,2022-03-14,2611.459961,2620.52002,2528.26001,2534.820068,2534.820068,1512700,2573.765015


In [None]:
data['GOOG'].iloc[0]

Date             2022-03-01 00:00:00
Open                     2689.600098
High                     2722.219971
Low                      2667.570068
Close                    2683.360107
Adj Close                2683.360107
Volume                       1232000
OHCLIndicator            2690.687561
Name: 0, dtype: object

# Setting the environment for RL

In [None]:
from itertools import permutations, combinations, chain, islice
import numpy as np, numpy.random
import datetime

**STATES**

In [None]:
states = pd.DataFrame(columns = ['Date']+stocks)
states['Date'] = data['AAPL']['Date']
for stock in stocks:
  states[stock] = [data[stock].iloc[i]['OHCLIndicator'] for i in range(len(states))]

In [None]:
states

Unnamed: 0,Date,AAPL,AMZN,FB,GOOG,NFLX
0,2022-03-01,164.1175,3039.752502,206.8325,2690.687561,389.137497
1,2022-03-02,165.314999,3023.255005,206.014999,2692.038757,383.347496
2,2022-03-03,167.290001,3011.075012,205.622501,2702.156494,376.209999
3,2022-03-04,163.827503,2922.284973,201.777496,2650.559937,365.430008
4,2022-03-07,161.68,2832.037537,194.107498,2583.412537,355.717506
5,2022-03-08,158.735004,2734.849976,190.237499,2553.181274,347.772499
6,2022-03-09,161.8125,2779.285034,196.815002,2647.762512,357.782509
7,2022-03-10,158.772499,2925.775024,194.350002,2645.419983,358.322495
8,2022-03-11,156.859997,2950.714966,190.119999,2644.857483,351.217499
9,2022-03-14,151.572498,2880.840027,187.942501,2573.765015,335.217506


In [None]:
states.loc[states['Date'] == datetime.datetime(2022,4,20)]

Unnamed: 0,Date,AAPL,AMZN,FB,GOOG,NFLX


**ACTIONS**

In [None]:
# we have 5 stocks, let's say the optimal portfolio has 2 stocks 
chosenStocks = 2
stockCombinations = list(combinations(stocks, chosenStocks))
print(stockCombinations)
print(len(stockCombinations))

[('AAPL', 'AMZN'), ('AAPL', 'FB'), ('AAPL', 'GOOG'), ('AAPL', 'NFLX'), ('AMZN', 'FB'), ('AMZN', 'GOOG'), ('AMZN', 'NFLX'), ('FB', 'GOOG'), ('FB', 'NFLX'), ('GOOG', 'NFLX')]
10


In [None]:
# acions indicate the weights of the portfolio 
# let's assume that the weights will be only in multiples of 10
weights = [10, 20, 30, 40, 50, 50, 60, 70, 80, 90]
weightCombinations = [pair for pair in combinations(weights, 2) if sum(pair) == 100]
weightCombinations = [list(permutations(action)) for action in weightCombinations]
weightCombinations = list(set(chain.from_iterable(weightCombinations)))
weightCombinations

[(10, 90),
 (70, 30),
 (60, 40),
 (90, 10),
 (40, 60),
 (20, 80),
 (50, 50),
 (80, 20),
 (30, 70)]

In [None]:
actions = []
for stockCombination in stockCombinations:
  for weighCombination in weightCombinations:
    stockIndices = [stockIndex[stock] for stock in stockCombination]
    currentAction = [None]*len(stocks)
    for weight, _stockIndex in zip(weighCombination, stockIndices):
      currentAction[_stockIndex] = weight
    currentAction = [0 if currentAction[i] is None else currentAction[i] for i in range(len(currentAction))]
    actions.append(tuple(currentAction))

In [None]:
actions[0]

(10, 90, 0, 0, 0)

In [None]:
len(actions)

90

In [None]:
len(set(actions))

90

**POLICY**

The policy is defined as a random policy obtained as softmax probabilities from a simple softmax regression neural network

* X = (5 * 1)
* W = (90 * 5)
* Z = W.X (90 * 1)
* find softmax for Z -> softmax probabilities

In [None]:
from scipy.special import softmax

In [None]:
X = states.iloc[:,1:]

In [None]:
X 

Unnamed: 0,AAPL,AMZN,FB,GOOG,NFLX
0,164.1175,3039.752502,206.8325,2690.687561,389.137497
1,165.314999,3023.255005,206.014999,2692.038757,383.347496
2,167.290001,3011.075012,205.622501,2702.156494,376.209999
3,163.827503,2922.284973,201.777496,2650.559937,365.430008
4,161.68,2832.037537,194.107498,2583.412537,355.717506
5,158.735004,2734.849976,190.237499,2553.181274,347.772499
6,161.8125,2779.285034,196.815002,2647.762512,357.782509
7,158.772499,2925.775024,194.350002,2645.419983,358.322495
8,156.859997,2950.714966,190.119999,2644.857483,351.217499
9,151.572498,2880.840027,187.942501,2573.765015,335.217506


In [None]:
W = np.random.random((90, 5))
W = W.astype('float64')

In [None]:
W

array([[0.32040276, 0.56450683, 0.46320208, 0.47873622, 0.91353954],
       [0.57122499, 0.77004924, 0.78150294, 0.90963114, 0.21898696],
       [0.00360874, 0.82724772, 0.76847606, 0.17238541, 0.12068675],
       [0.15380075, 0.34863805, 0.34714828, 0.06245097, 0.28158094],
       [0.1796662 , 0.99624467, 0.08468278, 0.63944699, 0.95951802],
       [0.06611688, 0.02925671, 0.7245836 , 0.4859784 , 0.89489615],
       [0.78223319, 0.50013173, 0.27607361, 0.64692022, 0.8315185 ],
       [0.8649462 , 0.37839479, 0.74858601, 0.46174524, 0.9700434 ],
       [0.44367092, 0.67783982, 0.71437525, 0.16189568, 0.88293214],
       [0.3475844 , 0.66236905, 0.82660094, 0.55900158, 0.22086086],
       [0.7269196 , 0.68115967, 0.65158099, 0.95443748, 0.92460808],
       [0.97815977, 0.77546859, 0.23015189, 0.50947556, 0.56015715],
       [0.72741273, 0.99672089, 0.87530132, 0.61739595, 0.29609118],
       [0.22082748, 0.04388312, 0.75681298, 0.89003651, 0.75981679],
       [0.52963533, 0.94896031, 0.

In [None]:
statePolicy = {}
for i in range(len(X)):
  x = X.loc[i]
  Z = np.dot(W, x)
  # print(Z)
  currentState = tuple(states.loc[i].values.flatten().tolist()[1:])
  softmaxValues = softmax(Z)
  softmaxValue = np.round(softmaxValues, 4)
  # print(softmaxValues)
  statePolicy[currentState] = {action:prob for action,prob in zip(actions, softmaxValues)}

In [None]:
dict(islice(statePolicy.items(), 1))

{(164.11750030517578,
  3039.7525024414062,
  206.83250045776367,
  2690.6875610351562,
  389.1374969482422): {(0, 0, 0, 10, 90): 0.0,
  (0, 0, 0, 20, 80): 0.0,
  (0, 0, 0, 30, 70): 0.0,
  (0, 0, 0, 40, 60): 0.0,
  (0, 0, 0, 50, 50): 6.623681214994364e-212,
  (0, 0, 0, 60, 40): 0.0,
  (0, 0, 0, 70, 30): 0.0,
  (0, 0, 0, 80, 20): 0.0,
  (0, 0, 0, 90, 10): 0.0,
  (0, 0, 10, 0, 90): 0.0,
  (0, 0, 10, 90, 0): 0.0,
  (0, 0, 20, 0, 80): 0.0,
  (0, 0, 20, 80, 0): 0.0,
  (0, 0, 30, 0, 70): 0.0,
  (0, 0, 30, 70, 0): 0.0,
  (0, 0, 40, 0, 60): 0.0,
  (0, 0, 40, 60, 0): 0.0,
  (0, 0, 50, 0, 50): 0.0,
  (0, 0, 50, 50, 0): 0.0,
  (0, 0, 60, 0, 40): 0.0,
  (0, 0, 60, 40, 0): 0.0,
  (0, 0, 70, 0, 30): 0.0,
  (0, 0, 70, 30, 0): 0.0,
  (0, 0, 80, 0, 20): 0.0,
  (0, 0, 80, 20, 0): 0.0,
  (0, 0, 90, 0, 10): 0.0,
  (0, 0, 90, 10, 0): 0.0,
  (0, 10, 0, 0, 90): 0.0,
  (0, 10, 0, 90, 0): 0.0,
  (0, 10, 90, 0, 0): 0.0,
  (0, 20, 0, 0, 80): 0.0,
  (0, 20, 0, 80, 0): 0.0,
  (0, 20, 80, 0, 0): 0.0,
  (0, 30, 0, 0

# MC Policy Control
To find optimal set of weights to invest in a portfolio with 2 stocks

In [None]:
import random
from tqdm import tqdm

In [None]:
def calculateReward(state, action, nextState, date, prevDate):
  reward = 0.0
  for s, a, sNew in zip(state, action, nextState):
    if a != 0:
      stockReturn = 0
      investedMoney = (a/100)*investmentAmount
      chosenStock = list(stockIndex.keys())[list(stockIndex.values()).index(action.index(a))]
      chosenStockData = data[chosenStock]
      open = list(chosenStockData[chosenStockData['Date'] == prevDate]['Open'])[0]
      close = list(chosenStockData[chosenStockData['Date'] == date]['Close'])[0]
      shares = investedMoney/open
      transactionCost = (0.05)*open
      stockReturn = shares*(close-(open+transactionCost))
      reward += stockReturn

  return reward

In [None]:
def generateEpisode():
  # returns a list of t time steps each with 
  # (state, action, reward, next state)
  episode = []
  for i in range(len(states)-1):
    state = tuple(states.loc[i].values.flatten().tolist()[1:])
    prevDate = states.loc[i].values.flatten().tolist()[0]
    action = random.choice(actions)
    nextState = tuple(states.loc[i+1].values.flatten().tolist()[1:])
    date = states.loc[i+1].values.flatten().tolist()[0]
    reward = calculateReward(state, action, nextState, date, prevDate)
    currentTimeStep = [state, action, reward, nextState]
    episode.append(currentTimeStep)
  
  return episode

In [None]:
def calculateReturn(episode, stateIndex, gamma=0.4):
  portfolioReturn = 0
  for i in range(len(episode)):
    portfolioReturn += ((gamma**(i-stateIndex))*episode[i][2])
  return portfolioReturn

**Initialising values before learning**

In [None]:
investmentAmount = 10000
epochs = 100
finalCols = [str(action) for action in actions]
statesList = [tuple(states.loc[i].values.flatten().tolist()[1:]) for i in range(len(states)-1)]
alpha = 0.2

In [None]:
W = np.round(W, 3)

In [None]:
W

array([[0.32 , 0.565, 0.463, 0.479, 0.914],
       [0.571, 0.77 , 0.782, 0.91 , 0.219],
       [0.004, 0.827, 0.768, 0.172, 0.121],
       [0.154, 0.349, 0.347, 0.062, 0.282],
       [0.18 , 0.996, 0.085, 0.639, 0.96 ],
       [0.066, 0.029, 0.725, 0.486, 0.895],
       [0.782, 0.5  , 0.276, 0.647, 0.832],
       [0.865, 0.378, 0.749, 0.462, 0.97 ],
       [0.444, 0.678, 0.714, 0.162, 0.883],
       [0.348, 0.662, 0.827, 0.559, 0.221],
       [0.727, 0.681, 0.652, 0.954, 0.925],
       [0.978, 0.775, 0.23 , 0.509, 0.56 ],
       [0.727, 0.997, 0.875, 0.617, 0.296],
       [0.221, 0.044, 0.757, 0.89 , 0.76 ],
       [0.53 , 0.949, 0.437, 0.806, 0.943],
       [0.1  , 0.858, 0.85 , 0.142, 0.336],
       [0.617, 0.89 , 0.05 , 0.662, 0.731],
       [0.011, 0.404, 0.936, 0.104, 0.116],
       [0.58 , 0.432, 0.759, 0.749, 0.748],
       [0.133, 0.459, 0.102, 0.711, 0.491],
       [0.42 , 0.294, 0.195, 0.897, 0.543],
       [0.076, 0.236, 0.139, 0.595, 0.607],
       [0.81 , 0.109, 0.643, 0.3

In [None]:
for epoch in tqdm(range(epochs)):
  # genearting episode
  currentEpisode = generateEpisode()

  # calculating discounted return based on FVMC
  # and updating weights 
  for i in range(len(currentEpisode)):
    discountedPotfolioReturn = calculateReturn(currentEpisode[i:], i)
    discountedPotfolioReturn = round(discountedPotfolioReturn, 3)
    currentAction = currentEpisode[i][1]
    currentState = np.asarray(currentEpisode[i][0])
    currentState = np.round(currentState, 3)
    # updating weights
    actionIndex = actions.index(currentAction)
    weightForAction = W[actionIndex]
    deltaW = np.array(alpha*(discountedPotfolioReturn - np.dot(weightForAction, currentState))*currentState)
    deltaW = deltaW.astype('float64')
    deltaW = np.round(deltaW, 3)
    weightForAction += deltaW
    W[actionIndex] = weightForAction
    W = np.round(W, 3)

100%|██████████| 100/100 [00:08<00:00, 11.18it/s]


In [None]:
W = np.nan_to_num(W) 

In [None]:
W

array([[-1.57407232e+152, -2.70362855e+153, -1.91457394e+152,
        -2.57568687e+153, -3.48043605e+152],
       [ 6.12981676e+129,  1.20128339e+131,  7.99376008e+129,
         1.02149314e+131,  1.42005673e+130],
       [-6.40941164e+151, -1.15363853e+153, -7.87807064e+151,
        -1.03528185e+153, -1.44138011e+152],
       [-1.00120499e+202, -1.88674351e+203, -1.27440097e+202,
        -1.62385632e+203, -2.14861423e+202],
       [ 1.32914602e+183,  2.46181134e+184,  1.67508290e+183,
         2.17911334e+184,  3.15151232e+183],
       [ 1.69033371e+156,  3.28345457e+157,  2.16952927e+156,
         2.83752190e+157,  3.86713462e+156],
       [-1.45831934e+125, -2.82068292e+126, -1.83931378e+125,
        -2.38936608e+126, -3.24812060e+125],
       [ 1.64144835e+130,  3.02478305e+131,  2.00926790e+130,
         2.73494087e+131,  3.70447589e+130],
       [ 5.56302661e+185,  9.74437300e+186,  6.67876289e+185,
         8.88891340e+186,  1.22394156e+186],
       [-2.20848995e+116, -4.18789239

In [None]:
x = X.iloc[-1]
print(x)
Z = np.dot(W, x)

AAPL     178.157501
AMZN    3346.304993
FB       228.655003
GOOG    2855.815002
NFLX     385.587502
Name: 21, dtype: float64


In [None]:
softmaxValues = softmax(Z)
softmaxValues = np.round(softmaxValues, 4)

In [None]:
softmaxValues

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.])

In [None]:
sum(softmaxValues)

1.0

In [None]:
softmaxValues = list(softmaxValues)
softmaxValues.index(max(softmaxValues))

28

In [None]:
optimalPortfolioWeights = list(actions[softmaxValues.index(max(softmaxValues))])
optimalPortfolioWeights

[70, 0, 0, 0, 30]

In [None]:
optimalStocksIndex = [i for i in range(len(optimalPortfolioWeights)) if int(optimalPortfolioWeights[i])!=0 ]
optimalStocksIndex

[0, 4]

In [None]:
for index in optimalStocksIndex:
  stockName = list(stockIndex.keys())[list(stockIndex.values()).index(index)]
  print("Invest ", optimalPortfolioWeights[index], "% in ", stockName)

Invest  70 % in  AAPL
Invest  30 % in  NFLX
