In [None]:
import numpy as np
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import yfinance as yf
from copy import copy

In [None]:
# Ticker symbol and date window passed to yfinance for the price download.
nifty50_index_symbol = '^NSEI'

start_date = '2010-01-01'
end_date = '2019-06-30'

data = yf.download(nifty50_index_symbol, start=start_date, end=end_date)

# NOTE(review): yfinance can report a failed download (bad symbol, no network,
# rate limiting) by returning an empty frame instead of raising -- confirm for
# the installed version. Fail here with a clear message rather than deep inside
# the normalisation step further down.
if data.empty:
    raise RuntimeError(
        f"No price data returned for {nifty50_index_symbol} "
        f"between {start_date} and {end_date}"
    )

In [None]:
# Summary statistics of the downloaded frame, shown as the cell output.
data.describe()

In [None]:
def normalisation(col):
    """Min-max scale ``col`` and shift it by 0.01, giving values in [0.01, 1.01].

    The 0.01 offset keeps the smallest value strictly positive, so the scaled
    prices can safely be used as a divisor.

    Args:
        col: A pandas Series or NumPy array of numbers (anything that provides
            ``.min()`` / ``.max()`` and element-wise arithmetic).

    Returns:
        An object of the same type and shape as ``col`` holding the scaled
        values. A constant input (max == min) maps every element to 0.01
        instead of producing NaN from a 0/0 division.
    """
    # Use the container's own reductions: they are vectorised and evaluated
    # once each, unlike the Python builtins which iterate element by element.
    lowest = col.min()
    highest = col.max()
    span = highest - lowest

    if span == 0:
        # Degenerate range: every element equals `lowest`, so place it at the
        # lower bound of the output interval.
        return col - lowest + 0.01

    return (col - lowest) / span + 0.01  # for non zero stock prices

In [None]:
# Take an explicit copy of the closing-price column: `df` is overwritten
# column by column when it is normalised, and writing into a frame derived
# from `data` by selection can trigger pandas' SettingWithCopyWarning.
df = data[['Close']].copy()

In [None]:
# Rescale each column of `df` with `normalisation`, storing the result back
# under the same column label.
for col in df.columns:
    scaled_column = normalisation(df[col])
    df[col] = scaled_column

In [None]:
# Show the first 600 rows (positional slice) -- the window used for training.
df.iloc[:600]

In [None]:
# First 600 rows of the normalised frame as a NumPy array.
training_window = df.iloc[:600]
data_arr = training_window.to_numpy()

In [None]:
# Collapse the array to one dimension whose length is the number of rows.
data_arr = data_arr.reshape(len(data_arr))

In [None]:
# Display the array's shape (expected to be 1-D after the preceding reshape).
data_arr.shape

In [None]:
# data_arr = data_arr[:600]

In [None]:
# Line plot of the normalised price series against its position index.
plt.plot(data_arr)

In [None]:
class StockMarketEnvironment:
    """Single-asset trading simulator driven by a 1-D price series.

    The state is ``np.array([balance, stocks, price])`` where ``price`` is the
    entry of ``data`` at the current time step.

    Actions are continuous fractions in ``[-1, 1]``:
        * ``action < 0``: buy -- spend ``|action|`` of the cash balance at the
          current price.
        * ``action > 0``: sell -- sell ``action`` of the shares currently held.
        * ``action == 0``: hold.

    Fractional share counts are allowed. The current price is used as a
    divisor when buying, so prices must be non-zero.
    """

    def __init__(self, starting_balance, data):
        """Create the environment and put it in its initial state.

        Args:
            starting_balance: Cash available at the start of every episode.
            data: 1-D array of prices; must expose ``.shape`` and integer
                indexing.
        """
        self.data = data
        self.starting_balance = starting_balance
        self.reset()

    def reset(self):
        """Start a new episode: full cash, no shares, first price.

        Returns:
            The initial state array (also stored in ``self.state``).
        """
        self.balance = self.starting_balance
        self.stocks = 0
        self.portfolio = self.balance
        self.time_step = 0
        self.end = False
        self.state = np.array([self.balance, self.stocks, self.data[self.time_step]])
        return self.state

    def step(self, action):
        """Apply ``action`` at the current price and advance one time step.

        Args:
            action: Signed trade fraction; values outside ``[-1, 1]`` are
                clipped so the balance and share count cannot go negative.

        Returns:
            ``(state, reward, end)`` where ``state`` is the new state array,
            ``reward`` is the change in portfolio value caused by moving from
            the current price to the next one, and ``end`` is True once the
            last price has been reached.

        Raises:
            IndexError: If there is no further price to advance to (stepping
                after the episode ended, or ``data`` has a single entry). The
                account is left untouched in that case.
        """
        next_time_step = self.time_step + 1
        if next_time_step >= self.data.shape[0]:
            raise IndexError(
                "no price data left to step into; call reset() to start a new episode"
            )

        action = np.clip(action, -1.0, 1.0)
        price = self.state[2]

        if action < 0:
            # Buy |action| of the maximum number of shares the cash can afford.
            bought = (self.balance / price) * abs(action)
            self.stocks += bought
            self.balance -= bought * price
        elif action > 0:
            # Sell `action` of the shares currently held.
            sold = self.stocks * action
            self.stocks -= sold
            self.balance += sold * price

        # The episode ends when the step lands on the final price.
        if next_time_step >= self.data.shape[0] - 1:
            self.end = True
        next_state = np.array([self.balance, self.stocks, self.data[next_time_step]])

        # Trading at the current price does not change portfolio value, so the
        # reward is purely the effect of the price move on the shares held.
        current_portfolio = self.balance + self.stocks * price
        next_portfolio = self.balance + self.stocks * next_state[2]
        reward = next_portfolio - current_portfolio

        self.state = next_state
        self.portfolio = next_portfolio
        self.time_step = next_time_step

        return (self.state, reward, self.end)

    def is_end(self):
        """Return True once the episode has reached the last price."""
        return self.end

In [None]:
def get_actor():
    """Build the actor network.

    Maps a 3-feature state to a single tanh-squashed action through four
    ReLU hidden layers of widths 32, 64, 64 and 32.

    Returns:
        An uncompiled ``tf.keras.Model``.
    """
    layers = tf.keras.layers
    state_in = layers.Input(shape = (3, ))

    hidden = state_in
    for width in (32, 64, 64, 32):
        hidden = layers.Dense(width, activation = 'relu')(hidden)

    action_out = layers.Dense(1, activation = 'tanh')(hidden)
    return tf.keras.Model(state_in, action_out)

def get_critic():
    """Build the critic network.

    Takes ``[state, action]`` (3 features and 1 feature respectively), encodes
    each in its own branch, concatenates the two encodings and reduces them to
    a single unbounded output value.

    Returns:
        An uncompiled two-input ``tf.keras.Model``.
    """
    layers = tf.keras.layers

    # State branch: 3 -> 32 -> 64.
    state_input = layers.Input(shape = (3, ))
    state_branch = state_input
    for width in (32, 64):
        state_branch = layers.Dense(width, activation = 'relu')(state_branch)

    # Action branch: 1 -> 64.
    actor_input = layers.Input(shape = (1, ))
    action_branch = layers.Dense(64, activation = 'relu')(actor_input)

    # Joint head: two 256-unit ReLU layers, then a linear scalar output.
    joint = layers.Concatenate()([state_branch, action_branch])
    for width in (256, 256):
        joint = layers.Dense(width, activation = 'relu')(joint)
    q_value = layers.Dense(1)(joint)

    return tf.keras.Model([state_input, actor_input], q_value)

In [None]:
# Online networks: these are the ones updated by the optimizers.
actor_model = get_actor()
critic_model = get_critic()

# Target networks must own their weights independently of the online networks.
# Whether `copy.copy` on a Keras model yields independent weights depends on
# how the installed Keras version implements copying, so build them explicitly:
# `clone_model` creates a same-architecture model with freshly initialised
# weights, and `set_weights` then aligns them with the online networks.
target_actor = tf.keras.models.clone_model(actor_model)
target_actor.set_weights(actor_model.get_weights())
target_critic = tf.keras.models.clone_model(critic_model)
target_critic.set_weights(critic_model.get_weights())

actor_optimizer = tf.keras.optimizers.Adam(0.001)
critic_optimizer = tf.keras.optimizers.Adam(0.001)

In [None]:
# --- Training schedule ---
max_iters = 100        # base episode count; the training loop adds to this
gamma = 0.9            # discount factor applied to the next-state value

# --- Exploration ---
epsilon = 1            # probability of taking a uniformly random action
epsilon_decay = 0.95   # multiplicative decay applied to epsilon after each episode

# --- Replay memory ---
buffer_size = 400      # maximum number of stored transitions
buffer = []            # stored transitions, oldest first

In [None]:
# Length of the price array along its first axis.
data_arr.shape[0]

In [None]:
# Starting cash is set to the average price of the series, and the
# environment replays that same series.
balance = data_arr.mean()
env = StockMarketEnvironment(starting_balance=balance, data=data_arr)

In [None]:
# Shape of the environment's state array; the networks are built with a
# 3-feature state input.
np.array(env.state).shape

In [None]:
# Smoke test: run the target actor on the environment's current state.
# Read the state from `env` directly -- `curr_state` is first assigned inside
# the training loop further down, so using it here raises NameError on a
# fresh kernel.
action = target_actor.predict(env.state.reshape((-1, 3)), verbose = 0)
action[0][0]

In [None]:
# Actor-critic training loop with a replay buffer and target networks.
# Each iteration: (1) play one full episode with epsilon-greedy exploration,
# storing every transition; (2) sample up to 50 stored transitions and update
# the critic and the actor one transition at a time; (3) copy the online
# weights into the target networks and checkpoint them.
critic_loss_history = []   # mean critic loss per episode (plain floats)
actor_loss_history = []    # mean actor loss per episode (plain floats)
rewards_history = []       # total reward per episode
for i in range(max_iters + 400):
    # ---- 1. collect one episode ----
    action_count = []
    env.reset()
    rewards_sum = 0
    while not env.is_end():
        curr_state = env.state
        if np.random.rand() < epsilon:
            # Explore: uniformly random trade fraction.
            action = np.random.uniform(-1, 1)
        else:
            # Exploit: action proposed by the target actor.
            action = target_actor.predict(curr_state.reshape((-1, 3)), verbose = 0)
            action = action.item()
        action_count.append(action)
        next_state, reward, end = env.step(action)
        rewards_sum += reward

        # Transition layout: (state, action, reward, next_state, done).
        transition = (curr_state, action, reward, next_state, end)
        buffer.append(transition)
        if len(buffer) > buffer_size:
            buffer.pop(0)  # drop the oldest transition

    rewards_history.append(rewards_sum)

    # ---- 2. learn from a random sample of stored transitions ----
    sample_indices = np.random.choice(len(buffer), size=min(50, len(buffer)), replace=False)
    train_sample = [buffer[idx] for idx in sample_indices]
    epsilon = max(epsilon * epsilon_decay, 0.1)  # decay exploration, floor at 0.1

    total_critic_loss = 0.0
    total_actor_loss = 0.0
    for curr_state, action, reward, next_state, end in train_sample:
        # Convert once, with an explicit float32 dtype matching the networks.
        state_tensor = tf.convert_to_tensor(np.asarray(curr_state, dtype=np.float32).reshape((-1, 3)))
        next_state_tensor = tf.convert_to_tensor(np.asarray(next_state, dtype=np.float32).reshape((-1, 3)))
        action_tensor = tf.convert_to_tensor(np.asarray(action, dtype=np.float32).reshape((-1, 1)))

        # TD target, computed outside the tape so it is treated as a constant.
        # On a terminal transition there is no future value to bootstrap from,
        # so the target is the reward alone.
        next_state_action = target_actor(next_state_tensor, training = False)
        next_state_q_vals = target_critic([next_state_tensor, next_state_action], training = False)
        not_done = 0.0 if end else 1.0
        target_q_val = float(reward) + gamma * not_done * next_state_q_vals

        # critic training: squared TD error
        with tf.GradientTape() as tape:
            curr_q_val = critic_model([state_tensor, action_tensor], training = True)
            critic_loss = tf.reduce_mean(tf.square(target_q_val - curr_q_val))
        critic_grads = tape.gradient(critic_loss, critic_model.trainable_variables)
        critic_optimizer.apply_gradients(zip(critic_grads, critic_model.trainable_variables))
        total_critic_loss += float(critic_loss)

        # actor training: maximise the critic's value of the actor's own action
        with tf.GradientTape() as tape:
            actor_curr_act = actor_model(state_tensor, training = True)
            actor_curr_q_val = critic_model([state_tensor, actor_curr_act], training = True)
            actor_loss = -tf.reduce_mean(actor_curr_q_val)
        actor_grads = tape.gradient(actor_loss, actor_model.trainable_variables)
        actor_optimizer.apply_gradients(zip(actor_grads, actor_model.trainable_variables))
        total_actor_loss += float(actor_loss)

    total_critic_loss /= len(train_sample)
    critic_loss_history.append(total_critic_loss)
    total_actor_loss /= len(train_sample)
    actor_loss_history.append(total_actor_loss)

    # ---- 3. sync target networks with the online ones and checkpoint ----
    target_critic.set_weights(critic_model.get_weights())
    target_actor.set_weights(actor_model.get_weights())
    target_actor.save_weights("actor_trained.weights.h5")
    target_critic.save_weights("critic_trained.weights.h5")

    print(i, "-> CRITIC LOSS:", total_critic_loss, "-> ACTOR LOSS:", total_actor_loss, "-> REWARD:", rewards_sum, "\n-> Action count:", action_count)


In [None]:
# Write the target networks' weights to the "best_*" checkpoint files.
for network, weights_path in (
    (target_actor, "best_actor_trained.weights.h5"),
    (target_critic, "best_critic_trained.weights.h5"),
):
    network.save_weights(weights_path)

In [None]:
# Total reward collected in each training episode, in episode order.
plt.plot(rewards_history)

In [None]:
# Mean critic loss per training episode. Entries may be single-element
# tensors rather than plain numbers; a list of those stacks into a 3-D array,
# which Matplotlib refuses to plot, so reduce each entry to a Python scalar.
plt.plot([np.asarray(loss).item() for loss in critic_loss_history])

In [None]:
# Evaluation: replay one episode acting purely from the target actor (no
# exploration), recording the total reward and the share count after each step.
action_count = []
env.reset()
rewards_sum = 0
stock_history = []
while not env.is_end():
    curr_state = env.state
    # `Model.predict` already returns a NumPy array, so `.item()` alone yields
    # the scalar action; calling `.numpy()` on it raises AttributeError.
    action = target_actor.predict(curr_state.reshape((-1, 3)), verbose = 0).item()
    action_count.append(action)
    next_state, reward, end = env.step(action)
    rewards_sum += reward
    stock_history.append(env.stocks)
print(rewards_sum)
print(stock_history)

In [None]:
# Starting cash followed by the portfolio value held by the environment,
# one value per line.
print(env.starting_balance, env.portfolio, sep="\n")