In [1]:
from tensorflow.python.client import device_lib
from tensorflow.keras import layers
import gymnasium as gym
#from gym import spaces
import tensorflow as tf
import cellrank as cr
import scanpy as sc
import pandas as pd
import numpy as np
#import gym
# Training the agent with DQN
#from stable_baseline3 import DQN 

Global seed set to 0
  new_rank_zero_deprecation(
  return new_rank_zero_deprecation(*args, **kwargs)


In [5]:
# Defining the Reinforcement learning environment
class LatentSpaceEnv(gym.Env):
    
    def __init__(self, action_df, target_point):
        super(LatentSpaceEnv, self).__init__()

        # The action_df describes how each perturbation affects each latent variable
        self.action_df = action_df

        # Defining the total number of possible perturbations/actions
        self.action_space = gym.spaces.Discrete(len(action_df))

        
        # The observation space is a 6-dimensional continuous space representing latent variables
        # Need to create an automated way to fill in the low and high values based on absolute max of latent embeddings
        self.observation_space = gym.spaces.Box(low=-7, high=7, shape=(6,), dtype=np.float32)

        # Initial state of the system, randomly initialized within the space
        self.current_state = np.random.rand(6)

        # The target point, representing the desired position in latent space, e.g., a cluster center
        self.target_point = target_point

        # Optional: Define the reward range if necessary
        self.reward_range = (-np.inf, 0)  # Negative rewards, closer to 0 is better

    def step(self, action):
        
        # Apply the selected action to the current state
        action_effect = self.action_df.iloc[action].values
        self.current_state += action_effect
    
        # Calculate the Euclidean distance from the current state to the target point
        # try different distance metrics
        distance_to_target = np.linalg.norm(self.current_state - self.target_point)
    
        # Penalty for each step taken
        # Try different values for this
        step_penalty = 0.1  
        
        # Reward for reaching the target
        reward = -distance_to_target - step_penalty  # Adding step penalty
        
        # Stopping after a certain distance to taregt is reached
        done = distance_to_target < 0.4

        # 
        if done:
            if len(self.perturbations) <= 6:
                reward += 10  # Bonus reward for reaching within 6 steps
            else:
                reward += 1  # Smaller reward if it took more than 6 steps
    
        return self.current_state, reward, done, {}

    def reset(self):
        
        # Reset the environment to a new random state
        self.current_state = np.random.rand(6)
        return self.current_state


class SimpleDQN:
    
    def __init__(self, observation_space, action_space, action_df, alpha, gamma, epsilon):
        self.observation_space = observation_space
        self.action_space = action_space
        self.alpha = alpha  # learning rate
        self.gamma = gamma  # discount factor
        self.epsilon = epsilon  # exploration rate
        self.action_df = action_df  # store the action_df
        self.model = self.create_model()

    def create_model(self):
        model = tf.keras.Sequential()
        model.add(layers.Input(shape=(self.observation_space.shape[0],)))

        # Test how different architectures change run time
        model.add(layers.Dense(40, activation='relu'))
        model.add(layers.Dense(self.action_space.n, activation='linear'))
        model.compile(optimizer=tf.keras.optimizers.Adam(), loss='mse')
        return model

    def choose_action(self, state):
        
        if np.random.uniform() < self.epsilon:
            action = np.random.choice(self.action_space.n)
        
        else:
            state = np.reshape(state, [1, self.observation_space.shape[0]])
            action_values = self.model.predict(state)
            action = np.argmax(action_values[0])
            
        return action

    def learn(self, state, action, reward, next_state, done):
        
        state = np.reshape(state, [1, self.observation_space.shape[0]])
        
        next_state = np.reshape(next_state, [1, self.observation_space.shape[0]])

        # Doesn't make sense that reward = target here
        target = reward
        
        if not done:
            next_action_values = self.model.predict(next_state)
            target += self.gamma * np.max(next_action_values[0])

        target_f = self.model.predict(state)
        target_f[0][action] = target

        self.model.fit(state, target_f, epochs=1, verbose=0)

    def get_perturbations_to_target(self, initial_state, target_state, max_steps=50):
        current_state = initial_state
        perturbations = []
        for _ in range(max_steps):
            action = self.choose_action(current_state)
            next_state = self.apply_action(current_state, action)
            perturbations.append(action)
            
            if np.allclose(next_state, target_state, atol=1e-2):
                break

            current_state = next_state

        return perturbations

    def apply_action(self, state, action):
        # Retrieve the action effect based on the action_df
        action_effect = self.action_df.iloc[action].values
        
        # Apply the action effect to the current state to get the next state
        next_state = state + action_effect
        
        return next_state


In [None]:

# Reading in the action_df from svae+
action_df = pd.read_csv('/Users/brendamelano/Desktop/RL_for_reprogramming/action_df.csv', index_col=0)

# Creating the environment based on the class that I built for cluster 5
target_point = np.array([-0.9369193 , -0.447852  , -0.45398346,  1.777766  ,  0.32149878,
       -0.60258126])


# Creating the latent space environment
env = LatentSpaceEnv(action_df=action_df, target_point=target_point)


# Initialize DQN agent parameters
alpha = 0.2  # Learning rate.. how exactly does this work algorithmically?
gamma = 0.3   # Discount factor
epsilon = 0.1  # Exploration rate


# Create DQN agent
dqn_agent = SimpleDQN(observation_space=env.observation_space,
                      action_space=env.action_space,
                      action_df=action_df,  # Pass action_df to the agent
                      alpha=alpha,
                      gamma=gamma,
                      epsilon=epsilon)

# Training loop parameters
num_episodes = 20
max_steps_per_episode = 100


# Training loop
for episode in range(num_episodes):
    
    # Reset the environment at the start of each episode
    state = env.reset()
    total_reward = 0
    
    for step in range(max_steps_per_episode):
        
        # Choose an action
        action = dqn_agent.choose_action(state)
        
        # Take action in the environment
        next_state, reward, done, _ = env.step(action)
        
        # Learn from the experience
        dqn_agent.learn(state, action, reward, next_state, done)
        
        # Update the current state
        state = next_state
        
        # Accumulate the reward
        total_reward += reward
        
        # End the episode if done
        if done:
            break
    
    # Print episode statistics
    print(f"Episode {episode + 1}/{num_episodes}, Total Reward: {total_reward}, Steps: {step + 1}")

print("Training completed.")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m1

In [7]:
# Test the agent with initial state from cluster 2
initial_state = np.array([0.5738455 ,  0.6803657 ,  0.52349895, -0.96093357, -1.05216   ,
       -0.6411334])

target_state = env.target_point
perturbations = dqn_agent.get_perturbations_to_target(initial_state, target_state)

print("Perturbations to target:", perturbations)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1

In [8]:
number_list = [290, 144, 389, 487, 378, 651, 627, 651, 627, 318, 597, 51, 336, 518, 336, 93, 93, 93, 597, 93, 93, 29, 438, 438, 453, 93, 93, 93, 93, 93, 135, 513, 432, 184, 501, 501, 518, 518, 518, 317, 596, 317, 518, 317, 596, 317, 501, 178, 501, 178]

# Extract unique values from the list
unique_numbers = list(set(number_list))


# Assuming action_df is already read in from a CSV or created
# action_df = pd.read_csv('/path/to/action_df.csv')


# Select the rows from action_df based on the unique indices
selected_rows = action_df.iloc[unique_numbers]


# Print the selected rows
print(selected_rows)

         Latent_0  Latent_1  Latent_2  Latent_3  Latent_4  Latent_5
RPS15A  -0.692921 -0.357513 -0.164269  0.723772  0.072576 -0.145721
PDCD11  -1.268080 -0.599613 -0.773501  2.129370  0.124935 -0.592556
RPS23   -0.676829 -0.346698 -0.315413  0.684083  0.071434 -0.160900
EIF3G   -0.370982 -0.543809  0.074646  0.081183  0.067712 -0.865696
VPS28    0.792665  0.492349  0.524672  0.014304  0.117679 -0.236076
ENO1     0.011491  0.100250 -0.177479 -1.201127  0.224771  0.314871
BCAS2    0.086486  0.084760 -0.008747 -0.487085  0.418158 -0.526808
MED21    0.819884  0.587833 -0.093482  0.792575  0.492456  0.687769
PRPF19  -0.115619  0.085245 -0.003076 -0.230626  0.314355 -0.272154
GEMIN5   0.100848  1.087689  1.022816 -0.366746  0.707550 -0.194187
CCDC86  -0.427463 -0.264937 -0.044944  0.725017  0.375757 -1.027535
PSMD2    0.214064  0.412642  0.142055  0.433169  0.410455 -0.000469
GINS2    0.772059  0.034246  0.216064 -0.621370  0.690350  0.926288
MRPL16   0.388170  0.186877  0.272866 -1.147392 

In [10]:
selected_rows = action_df[action_df.index.isin(unique_numbers)]

# Print the selected rows
print(selected_rows)

NameError: name 'unique_numbers' is not defined