<a href="https://colab.research.google.com/github/abhinavarorags/CoolStuff/blob/test/PolicyGradient.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from collections import deque
import random

# Hyperparameters
episodes = 10
learning_rate = 0.001
batch_size = 64
memory_size = 10000


def load_data(file_path, url='https://raw.githubusercontent.com/abhinavarorags/CoolStuff/refs/heads/test/sample_data.csv'):
    # Load dataset
    try:
        data = pd.read_csv(file_path)
    except FileNotFoundError:
        # If file not found, download from URL
        data = pd.read_csv(url)
    # Splitting dataset into training and testing
    train_data = data.iloc[:20000]
    test_data = data.iloc[20000:]
    return train_data, test_data


# Load dataset
file_path = '/mnt/data/sample_data.csv'
train_data, test_data = load_data(file_path)

# Placeholder dataset parameters
state_space = train_data.shape[1] - 1  # Number of features in state (assuming last column is the action/reward)
action_space = 3  # Number of possible actions (Buy Long, Sell Short, Hold)


# Neural Network for Policy Gradient Reinforcement Learning
class PolicyGradient(tf.keras.Model):
    def __init__(self, action_space):
        super(PolicyGradient, self).__init__()
        self.fc1 = tf.keras.layers.Dense(24, activation='relu')
        self.fc2 = tf.keras.layers.Dense(24, activation='relu')
        self.out = tf.keras.layers.Dense(action_space, activation='softmax')

    def call(self, x):
        x = self.fc1(x)
        x = self.fc2(x)
        return self.out(x)


# Experience Replay Memory
memory = deque(maxlen=memory_size)


def remember(state, action, reward):
    memory.append((state, action, reward))


def act(model, state):
    state = np.array([state])
    probs = model(state).numpy()[0]
    return np.random.choice(action_space, p=probs)


def train_model(model, optimizer):
    for e in range(episodes):
        sample = train_data.sample()  # Randomly sample a row from training data
        state = sample.select_dtypes(include=[np.number]).iloc[:, :state_space].values.flatten()  # Use only numeric features as state
        episode_memory = []
        total_reward = 0
        done = False
        while not done:
            action = act(model, state)
            next_sample = train_data.sample()  # Get next state from training data
            next_state = next_sample.select_dtypes(include=[np.number]).iloc[:, :state_space].values.flatten()
            if action == 0:  # Buy Long
                reward = float(next_sample.select_dtypes(include=[np.number]).iloc[0, -1]) - float(sample.select_dtypes(include=[np.number]).iloc[0, -1])  # Positive reward if price goes up
            elif action == 1:  # Sell Short
                reward = float(sample.select_dtypes(include=[np.number]).iloc[0, -1]) - float(next_sample.select_dtypes(include=[np.number]).iloc[0, -1])  # Positive reward if price goes down
            else:  # Hold
                reward = -0.1  # Small penalty to discourage holding if unnecessary
            episode_memory.append((state, action, reward))
            state = next_state
            total_reward += reward
            done = np.random.rand() > 0.95  # Randomly ending the episode

        # Update policy
        with tf.GradientTape() as tape:
            loss = 0
            for state, action, reward in episode_memory:
                state = np.array([state])
                probs = model(state)
                action_prob = probs[0, action]
                loss -= tf.math.log(action_prob) * reward  # Negative log likelihood weighted by reward
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        print(f"Episode {e+1}/{episodes}, Total Reward: {total_reward:.2f}")
    print("Training completed.")


def test_model(model, test_episodes=100):
    correct_predictions = {"Buy Long": 0, "Sell Short": 0, "Hold": 0}
    wrong_predictions = {"Buy Long": 0, "Sell Short": 0, "Hold": 0}

    def test_act(model, state):
        state = np.array([state])
        probs = model(state).numpy()[0]
        return np.argmax(probs)

    for e in range(test_episodes):
        sample = test_data.sample()  # Randomly sample a row from testing data
        state = sample.select_dtypes(include=[np.number]).iloc[:, :state_space].values.flatten()  # Use only numeric features as state
        done = False
        while not done:
            action = test_act(model, state)
            next_sample = test_data.sample()  # Get next state from testing data
            next_state = next_sample.select_dtypes(include=[np.number]).iloc[:, :state_space].values.flatten()
            if action == 0:  # Buy Long
                reward = float(next_sample.select_dtypes(include=[np.number]).iloc[0, -1]) - float(sample.select_dtypes(include=[np.number]).iloc[0, -1])  # Positive reward if price goes up
            elif action == 1:  # Sell Short
                reward = float(sample.select_dtypes(include=[np.number]).iloc[0, -1]) - float(next_sample.select_dtypes(include=[np.number]).iloc[0, -1])  # Positive reward if price goes down
            else:  # Hold
                reward = -0.1  # Small penalty to discourage holding if unnecessary
            done = np.random.rand() > 0.95  # Randomly ending the episode
            action_name = ["Buy Long", "Sell Short", "Hold"][action]
            if reward > 0.7:  # Assuming a reward above 0.7 is a correct prediction
                correct_predictions[action_name] += 1
            else:
                wrong_predictions[action_name] += 1
            state = next_state

    print("Test Data saved with recommendations")
    print("\t\tCorrect Predictions  Wrong Predictions")
    for action in ["Buy Long", "Sell Short", "Hold"]:
        print(f"{action}\t\t{correct_predictions[action]}\t\t\t{wrong_predictions[action]}")


# Load, Train and Test Model
model = PolicyGradient(action_space)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

train_model(model, optimizer)
test_model(model)


Episode 1/10, Total Reward: -1.66
Episode 2/10, Total Reward: -6.88
Episode 3/10, Total Reward: -3.62
Episode 4/10, Total Reward: 2.84
Episode 5/10, Total Reward: -1.02
Episode 6/10, Total Reward: -0.57
Episode 7/10, Total Reward: -4.18
Episode 8/10, Total Reward: 0.96
Episode 9/10, Total Reward: -0.26
Episode 10/10, Total Reward: -1.15
Training completed.
Test Data saved with recommendations
		Correct Predictions  Wrong Predictions
Buy Long		0			1
Sell Short		5			1848
Hold		0			265


In [5]:
import numpy as np
import pandas as pd
import tensorflow as tf
from collections import deque
import random

# Hyperparameters
episodes = 10
learning_rate = 0.001
batch_size = 64
memory_size = 10000


def load_data(file_path, url='https://raw.githubusercontent.com/abhinavarorags/CoolStuff/refs/heads/test/sample_data.csv'):
    # Load dataset
    try:
        data = pd.read_csv(file_path)
    except FileNotFoundError:
        # If file not found, download from URL
        data = pd.read_csv(url)
    # Splitting dataset into training and testing
    train_data = data.iloc[:20000]
    test_data = data.iloc[20000:]
    return train_data, test_data


# Load dataset
file_path = '/mnt/data/sample_data.csv'
train_data, test_data = load_data(file_path)

# Placeholder dataset parameters
state_space = train_data.shape[1] - 1  # Number of features in state (assuming last column is the action/reward)
action_space = 3  # Number of possible actions (Buy Long, Sell Short, Hold)


# Neural Network for RNN-based Reinforcement Learning
class RNNPolicyGradient(tf.keras.Model):
    def __init__(self, action_space):
        super(RNNPolicyGradient, self).__init__()
        self.lstm = tf.keras.layers.LSTM(24, return_sequences=False, return_state=False)
        self.fc1 = tf.keras.layers.Dense(24, activation='relu')
        self.out = tf.keras.layers.Dense(action_space, activation='softmax')

    def call(self, x):
        x = self.lstm(x)
        x = self.fc1(x)
        return self.out(x)


# Experience Replay Memory
memory = deque(maxlen=memory_size)


def remember(state, action, reward):
    memory.append((state, action, reward))


def act(model, state):
    state = np.array([state])
    state = state.reshape((1, state.shape[-1], 1))  # Reshape for LSTM input
    probs = model(state).numpy()[0]
    return np.random.choice(action_space, p=probs)


def train_model(model, optimizer):
    for e in range(episodes):
        sample = train_data.sample()  # Randomly sample a row from training data
        state = sample.select_dtypes(include=[np.number]).iloc[:, :state_space].values.flatten()  # Use only numeric features as state
        episode_memory = []
        total_reward = 0
        done = False
        while not done:
            action = act(model, state)
            next_sample = train_data.sample()  # Get next state from training data
            next_state = next_sample.select_dtypes(include=[np.number]).iloc[:, :state_space].values.flatten()
            if action == 0:  # Buy Long
                reward = float(next_sample.select_dtypes(include=[np.number]).iloc[0, -1]) - float(sample.select_dtypes(include=[np.number]).iloc[0, -1])  # Positive reward if price goes up
            elif action == 1:  # Sell Short
                reward = float(sample.select_dtypes(include=[np.number]).iloc[0, -1]) - float(next_sample.select_dtypes(include=[np.number]).iloc[0, -1])  # Positive reward if price goes down
            else:  # Hold
                reward = -0.1  # Small penalty to discourage holding if unnecessary
            episode_memory.append((state, action, reward))
            state = next_state
            total_reward += reward
            done = np.random.rand() > 0.95  # Randomly ending the episode

        # Update policy
        with tf.GradientTape() as tape:
            loss = 0
            for state, action, reward in episode_memory:
                state = np.array([state])
                state = state.reshape((1, state.shape[-1], 1))  # Reshape for LSTM input
                probs = model(state)
                action_prob = probs[0, action]
                loss -= tf.math.log(action_prob) * reward  # Negative log likelihood weighted by reward
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        print(f"Episode {e+1}/{episodes}, Total Reward: {total_reward:.2f}")
    print("Training completed.")


def test_model(model, test_episodes=100):
    correct_predictions = {"Buy Long": 0, "Sell Short": 0, "Hold": 0}
    wrong_predictions = {"Buy Long": 0, "Sell Short": 0, "Hold": 0}

    def test_act(model, state):
        state = np.array([state])
        state = state.reshape((1, state.shape[-1], 1))  # Reshape for LSTM input
        probs = model(state).numpy()[0]
        return np.argmax(probs)

    for e in range(test_episodes):
        sample = test_data.sample()  # Randomly sample a row from testing data
        state = sample.select_dtypes(include=[np.number]).iloc[:, :state_space].values.flatten()  # Use only numeric features as state
        done = False
        while not done:
            action = test_act(model, state)
            next_sample = test_data.sample()  # Get next state from testing data
            next_state = next_sample.select_dtypes(include=[np.number]).iloc[:, :state_space].values.flatten()
            if action == 0:  # Buy Long
                reward = float(next_sample.select_dtypes(include=[np.number]).iloc[0, -1]) - float(sample.select_dtypes(include=[np.number]).iloc[0, -1])  # Positive reward if price goes up
            elif action == 1:  # Sell Short
                reward = float(sample.select_dtypes(include=[np.number]).iloc[0, -1]) - float(next_sample.select_dtypes(include=[np.number]).iloc[0, -1])  # Positive reward if price goes down
            else:  # Hold
                reward = -0.1  # Small penalty to discourage holding if unnecessary
            done = np.random.rand() > 0.95  # Randomly ending the episode
            action_name = ["Buy Long", "Sell Short", "Hold"][action]
            if reward > 0.7:  # Assuming a reward above 0.7 is a correct prediction
                correct_predictions[action_name] += 1
            else:
                wrong_predictions[action_name] += 1
            state = next_state

    print("Test Data saved with recommendations")
    print("\t\tCorrect Predictions  Wrong Predictions")
    for action in ["Buy Long", "Sell Short", "Hold"]:
        print(f"{action}\t\t{correct_predictions[action]}\t\t\t{wrong_predictions[action]}")


# Load, Train and Test Model
model = RNNPolicyGradient(action_space)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

train_model(model, optimizer)
test_model(model)


Episode 1/10, Total Reward: 0.38
Episode 2/10, Total Reward: -2.31
Episode 3/10, Total Reward: -0.32
Episode 4/10, Total Reward: 0.16
Episode 5/10, Total Reward: -2.05
Episode 6/10, Total Reward: -0.20
Episode 7/10, Total Reward: -0.93
Episode 8/10, Total Reward: -0.44
Episode 9/10, Total Reward: -0.23
Episode 10/10, Total Reward: -0.43
Training completed.
Test Data saved with recommendations
		Correct Predictions  Wrong Predictions
Buy Long		0			1
Sell Short		0			1721
Hold		0			0
