In [None]:
%pip install transformers torch

In [3]:


from transformers import AutoModelForCausalLM, AutoTokenizer

# Hugging Face checkpoint identifier shared by the model and the tokenizer below.
model_name_or_path = "fblgit/juanako-7b-UNA"
# Load the causal language model weights for that checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    low_cpu_mem_usage=True,
    # device_map="cuda:0"  # left disabled: the recorded CUDA check in this notebook reports no GPU
)
# Load the tokenizer from the same checkpoint name (not from the model object).
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

# print(llm("AI is going to"))


Loading checkpoint shards: 100%|██████████| 3/3 [00:29<00:00,  9.67s/it]


In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
import numpy as np

# Make float32 the default dtype for newly created floating-point tensors.
# torch.set_default_tensor_type() is deprecated in recent PyTorch releases;
# set_default_dtype() is the supported way to express the same default.
torch.set_default_dtype(torch.float32)

class DQNNetwork(nn.Module):
    """Two-layer fully connected Q-network.

    Maps a state vector of width ``state_size`` to ``action_size`` outputs
    through a single ReLU hidden layer of width ``hidden_size``.
    """

    def __init__(self, state_size, action_size, hidden_size):
        super().__init__()
        # Input projection, non-linearity, output projection.
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, action_size)

    def forward(self, x):
        """Return ``fc2(relu(fc1(x)))`` for an input whose last dimension is ``state_size``."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

class DQNAgent:
    """Epsilon-greedy Q-learning agent backed by a ``DQNNetwork``.

    States are tensors of token ids of arbitrary length. Before they reach the
    network they are flattened and padded/truncated to ``state_size`` by
    ``pad_or_truncate`` and cast to float.

    :param state_size: Width of the network input.
    :param action_size: Number of discrete actions (network outputs).
    :param hidden_size: Width of the hidden layer.
    :param learning_rate: Adam learning rate.
    :param gamma: Discount factor applied to the bootstrapped next-state value.
    """

    def __init__(self, state_size, action_size, hidden_size, learning_rate, gamma):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.epsilon = 1.0  # Exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.model = DQNNetwork(state_size, action_size, hidden_size)
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)

    def _prepare_state(self, state):
        """Return ``state`` as a float tensor of shape ``[1, state_size]``."""
        # pad_or_truncate already adds the batch dimension, so no extra
        # unsqueeze is needed here.
        return pad_or_truncate(state, max_length=self.state_size).float()

    def select_action(self, state):
        """Pick an action for ``state`` with an epsilon-greedy policy.

        :param state: Tensor of token ids (any shape; it is flattened).
        :return: The chosen action index as a plain ``int``.
        """
        # Strict `<` so that epsilon == 0 is fully greedy; random.random()
        # never returns 1.0, so epsilon == 1 still always explores.
        if random.random() < self.epsilon:
            return random.randrange(self.action_size)

        state_tensor = self._prepare_state(state)
        with torch.no_grad():
            q_values = self.model(state_tensor)
        return int(torch.argmax(q_values, dim=-1).item())

    def update(self, state, action, reward, next_state, done):
        """Run one Q-learning step on a single transition and decay epsilon.

        :param state: Token-id tensor observed before the action.
        :param action: Index of the action that was taken.
        :param reward: Scalar reward received for the transition.
        :param next_state: Token-id tensor observed after the action.
        :param done: Truthy when the transition ended the episode.
        :return: The loss of this step as a ``float``.
        """
        state = self._prepare_state(state)
        next_state = self._prepare_state(next_state)

        # The TD target must be a constant for the optimizer: compute it without
        # tracking gradients so the loss only trains Q(state, action) and does
        # not also push on Q(next_state, .).
        with torch.no_grad():
            next_q_max = self.model(next_state).max()
            q_update = float(reward) + self.gamma * next_q_max * (1.0 - float(done))

        q_values = self.model(state)
        # Target equals the current prediction everywhere except at `action`,
        # so only that entry contributes to the loss.
        target = q_values.detach().clone()
        target[0, action] = q_update

        loss = F.mse_loss(q_values, target)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

        return loss.item()


def environment_step(model, input_ids, action):
    """Apply one action (token) to the sequence and score the result.

    :param model: Callable language model; it is called as ``model(input_ids=...)``
        and must return an object with a ``logits`` attribute.
    :param input_ids: Tensor of token ids with shape ``[1, seq_len]``.
    :param action: Token id to append to the sequence.
    :return: ``(input_ids, reward, done)`` where ``input_ids`` is the extended
        sequence, ``reward`` comes from ``compute_reward`` and ``done`` is True
        when the model's most likely next token is the tokenizer's EOS token.
    """
    # Build the appended token with the dtype/device of the sequence so the
    # concatenation neither promotes the dtype nor fails across devices.
    action_token = torch.tensor(
        [[int(action)]], dtype=input_ids.dtype, device=input_ids.device
    )
    input_ids = torch.cat((input_ids, action_token), dim=1)

    # Generate output from the model
    with torch.no_grad():
        outputs = model(input_ids=input_ids)
    logits = outputs.logits

    # Greedy prediction for the token that would follow the appended action.
    next_token_id = torch.argmax(logits[:, -1, :], dim=-1).unsqueeze(0)

    # Define a reward mechanism (this is highly task-specific)
    reward = compute_reward(input_ids, next_token_id)

    # Check if the end of sequence token is generated
    done = next_token_id.item() == tokenizer.eos_token_id

    return input_ids, reward, done

def compute_perplexity(sequence):
    """Return the language model's perplexity on ``sequence``.

    The text is tokenized with the notebook-level ``tokenizer`` and scored by the
    notebook-level ``model`` using the token ids as their own labels.

    :param sequence: The generated text sequence.
    :return: ``exp(loss)`` as a Python float.
    """
    encoded = tokenizer(sequence, return_tensors="pt")
    target_ids = encoded["input_ids"]

    # Scoring only; no gradients are needed.
    with torch.no_grad():
        scored = model(**encoded, labels=target_ids)

    return torch.exp(scored.loss).item()

def compute_reward(input_ids, next_token_id, target_context=None):
    """ Compute the reward for the generated sequence.

    The reward is the sum of a fluency term, an optional relevance term, a
    diversity term and a negated perplexity term.

    :param input_ids: Tensor of input token IDs; only row 0 is scored.
    :param next_token_id: Tensor of the next token ID generated (currently unused).
    :param target_context: Optional context or topic that the generated text should align with.
    :return: A numerical reward value.
    """
    import math

    # Convert token IDs to text
    sequence = tokenizer.decode(input_ids[0])

    # Fluency reward: the negated language-model loss of the sequence.
    fluency_reward = compute_fluency(sequence)

    # Relevance reward (if target_context is provided)
    relevance_reward = 0
    if target_context:
        relevance_reward = compute_relevance(sequence, target_context)

    # Diversity reward
    diversity_reward = diversity(sequence)

    # Perplexity reward, negated because lower perplexity is better.
    # compute_perplexity returns exp(loss) and compute_fluency returns -loss for
    # the very same forward pass, so derive it from the fluency value instead of
    # running the language model a second time on identical input.
    try:
        perplexity_reward = -math.exp(-fluency_reward)
    except OverflowError:
        perplexity_reward = float("-inf")
    # NOTE(review): the recorded training output shows rewards around -2e5, i.e.
    # this term dwarfs the others - consider log-scaling or clipping it.

    # Aggregate reward
    total_reward = fluency_reward + relevance_reward + diversity_reward + perplexity_reward

    return total_reward

def compute_fluency(sequence):
    """Score how fluent ``sequence`` is according to the language model.

    :param sequence: The generated text sequence.
    :return: The negated language-model loss as a Python float (higher, i.e.
        closer to zero, means more fluent).
    """
    batch = tokenizer(sequence, return_tensors="pt")

    # The token ids double as labels, so the model reports its own loss on the text.
    with torch.no_grad():
        lm_loss = model(labels=batch["input_ids"], **batch).loss

    return -lm_loss.item()

def diversity(text):
    """Return the ratio of distinct to total whitespace-separated words.

    :param text: The text to score.
    :return: A float in ``[0, 1]``; ``0.0`` when ``text`` contains no words, so
        an empty or whitespace-only string no longer raises ZeroDivisionError.
    """
    words = text.split()
    if not words:
        return 0.0
    return len(set(words)) / len(words)

def compute_embeddings(text):
    """
    Compute one pooled vector per input text using the pre-trained model.

    The per-token model outputs are averaged over the sequence dimension. When
    the tokenizer supplies an ``attention_mask``, padded positions are excluded
    from that average so that batching texts of different lengths does not
    change any text's vector.

    :param text: A string or a list of strings to encode.
    :return: A tensor with one row per input text.
    """
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # If outputs is a tuple, the embeddings might be the first element
    if isinstance(outputs, tuple):
        embeddings = outputs[0]
    else:
        embeddings = outputs.logits  # or outputs.hidden_states depending on the model

    attention_mask = inputs["attention_mask"] if "attention_mask" in inputs else None
    if attention_mask is None:
        return embeddings.mean(dim=1)

    # Masked mean: zero out padded positions and divide by the real token count.
    mask = attention_mask.unsqueeze(-1).to(embeddings.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)  # avoid 0/0 for an all-padding row
    return (embeddings * mask).sum(dim=1) / token_counts


def compute_relevance(sequence, target_context):
    """
    Score how close ``sequence`` is to ``target_context``.

    Both texts are embedded with ``compute_embeddings`` and compared with cosine
    similarity.

    :param sequence: The generated text sequence.
    :param target_context: The target context or topic for comparison.
    :return: The cosine similarity as a Python float.
    """
    cosine = torch.nn.functional.cosine_similarity
    # Embed the sequence first, then the context.
    seq_vec = compute_embeddings(sequence)
    ctx_vec = compute_embeddings(target_context)
    # .item() requires a single similarity value, i.e. one text on each side.
    return cosine(seq_vec, ctx_vec).item()

# # Example usage
# sequence = "Advancements in AI technology have revolutionized many industries."
# target_context = "technology and innovation"
# relevance_score = compute_relevance(sequence, target_context)
# print("Relevance Score:", relevance_score)

class OrcaDQN:
    """Couples a language model with a DQN agent that chooses the next token.

    :param model: Language model handed to ``environment_step``.
    :param dqn_agent: Agent exposing ``select_action(state)`` and
        ``update(state, action, reward, next_state, done)``.
    :param tokenizer: Tokenizer exposing ``encode``, ``decode`` and ``eos_token_id``.
    """

    def __init__(self, model, dqn_agent, tokenizer):
        self.model = model
        self.dqn_agent = dqn_agent
        self.tokenizer = tokenizer

    def generate_sequence(self, prompt):
        """Roll out the agent from ``prompt`` and return the decoded generated tokens.

        The agent is also updated on every transition of the rollout.

        :param prompt: Text used to start the sequence.
        :return: The decoded text of the tokens chosen by the agent (the prompt
            itself is not included).
        """
        input_ids = self.tokenizer.encode(prompt, return_tensors='pt')
        generated_sequence = []
        done = False
        while not done and not self.end_condition_met(input_ids):
            action = int(self.dqn_agent.select_action(input_ids))
            # environment_step returns (next ids, reward, done); keep the
            # pre-step ids so the agent sees a real (state, next_state) pair.
            next_input_ids, reward, done = environment_step(self.model, input_ids, action)
            self.dqn_agent.update(input_ids, action, reward, next_input_ids, done)
            input_ids = next_input_ids
            generated_sequence.append(action)
        return self.tokenizer.decode(generated_sequence)

    def end_condition_met(self, input_ids, max_length=50):
        """Return True when the sequence ends in EOS or exceeds ``max_length`` tokens.

        :param input_ids: Tensor of token ids with shape ``[1, seq_len]``.
        :param max_length: Maximum allowed sequence length (prompt included).
        :return: A plain ``bool``.
        """
        sequence_length = input_ids.size(1)
        if sequence_length == 0:
            # Nothing to inspect yet; an empty sequence cannot have ended.
            return False
        # Use this instance's tokenizer rather than a notebook-level global.
        last_token = input_ids[0, -1].item()
        return last_token == self.tokenizer.eos_token_id or sequence_length > max_length

from tqdm.auto import tqdm

def train(orca_dqn, dqn_agent, num_episodes, target_context):
    """Train ``dqn_agent`` for ``num_episodes`` episodes that all start from ``target_context``.

    Each episode repeatedly lets the agent pick a token, applies it with
    ``environment_step`` and updates the agent on the resulting transition. An
    episode ends when the environment reports ``done`` or when
    ``orca_dqn.end_condition_met`` says the sequence is finished, so an episode
    can no longer run forever when EOS is never predicted.

    :param orca_dqn: Object providing ``model``, ``tokenizer`` and ``end_condition_met``.
    :param dqn_agent: Agent providing ``select_action`` and ``update``.
    :param num_episodes: Number of episodes to run.
    :param target_context: Prompt text every episode starts from.
    :return: List with the total reward of each episode.
    """
    episode_rewards = []
    for episode in tqdm(range(num_episodes), desc="Training Episodes"):
        # Use the tokenizer that belongs to orca_dqn rather than a global one.
        input_ids = orca_dqn.tokenizer.encode(target_context, return_tensors='pt')

        chosen_tokens = []
        total_reward = 0
        done = False
        while not done and not orca_dqn.end_condition_met(input_ids):
            action = dqn_agent.select_action(input_ids)
            next_input_ids, reward, done = environment_step(orca_dqn.model, input_ids, action)
            dqn_agent.update(input_ids, action, reward, next_input_ids, done)

            input_ids = next_input_ids
            total_reward += reward
            chosen_tokens.append(int(action))

            print(f"Action: {action}, Reward: {reward}, Total Reward: {total_reward}")

        # Report the tokens chosen during this episode. Decoding them directly
        # avoids running (and training on) an extra rollout just for logging.
        print(f"Episode {episode + 1} Complete. Total Reward: {total_reward}")
        print(f"Generated Sequence: {orca_dqn.tokenizer.decode(chosen_tokens)}\n")
        episode_rewards.append(total_reward)

    return episode_rewards

def pad_or_truncate(sequence, max_length=512, pad_token_id=0):
    """Flatten ``sequence`` and force it to exactly ``max_length`` elements.

    :param sequence: Tensor of any shape; it is flattened to 1-D first.
    :param max_length: Target number of elements.
    :param pad_token_id: Value appended when the sequence is too short.
    :return: Tensor of shape ``[1, max_length]`` with the dtype and device of
        ``sequence``; longer inputs keep their first ``max_length`` elements.
    """
    # reshape (unlike view) also accepts non-contiguous tensors such as transposes.
    flat = sequence.reshape(-1)[:max_length]
    missing = max_length - flat.size(0)
    if missing > 0:
        # Create the padding on the same device as the data so the concat also
        # works for tensors that do not live on the default device.
        padding = torch.full(
            (missing,), pad_token_id, dtype=flat.dtype, device=flat.device
        )
        flat = torch.cat((flat, padding), dim=0)
    return flat.unsqueeze(0)


# Seed the random number generators so a re-run explores and initialises the
# same way: select_action draws from `random`, and the DQN weights are created
# by torch.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Initialize the DQN Agent
state_size = 512  # Length token-id sequences are padded/truncated to before entering the DQN
action_size = tokenizer.vocab_size  # Total number of possible actions (tokens)
hidden_size = 128  # This can be adjusted
learning_rate = 0.001
gamma = 0.99
# Prompt every training episode starts from.
target_context = "Your target context here"


dqn_agent = DQNAgent(state_size, action_size, hidden_size, learning_rate, gamma)

# Create the OrcaDQN instance
orca_dqn = OrcaDQN(model, dqn_agent, tokenizer)

# Run training
train(orca_dqn, dqn_agent, num_episodes=100, target_context=target_context)



Training Episodes:   0%|          | 0/100 [00:00<?, ?it/s]

Action: 3094, Reward: -200556.53692054749, Total Reward: -200556.53692054749
Action: 22733, Reward: -340540.7070055008, Total Reward: -541097.2439260483
Action: 12931, Reward: -314730.5969352722, Total Reward: -855827.8408613205


In [1]:
import torch

# Report whether PyTorch can see a CUDA device and, if so, list every device.
if not torch.cuda.is_available():
    print("CUDA (GPU support) is not available in PyTorch.")
else:
    print("CUDA (GPU support) is available in PyTorch!")
    gpu_count = torch.cuda.device_count()
    print("Number of GPU devices available:", gpu_count)
    for gpu_index in range(gpu_count):
        print(f"GPU {gpu_index}: {torch.cuda.get_device_name(gpu_index)}")


CUDA (GPU support) is not available in PyTorch.


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np

class DQNNetwork(nn.Module):
    """Q-network made of Linear -> ReLU -> Linear.

    Takes inputs whose last dimension is ``state_size`` and produces
    ``action_size`` values. Note that an earlier cell of this notebook defines a
    class with the same name; running this cell rebinds it.
    """

    def __init__(self, state_size, action_size, hidden_size):
        super().__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, action_size)

    def forward(self, x):
        """Push ``x`` through the three stages in order and return the result."""
        for stage in (self.fc1, self.relu, self.fc2):
            x = stage(x)
        return x

class DQNAgent:
    """Epsilon-greedy Q-learning agent backed by a ``DQNNetwork``.

    States may be lists, arrays or tensors of any length; they are flattened,
    cast to float32 and zero-padded/truncated to ``state_size``.

    :param state_size: Width of the network input.
    :param action_size: Number of discrete actions (network outputs).
    :param hidden_size: Width of the hidden layer.
    :param learning_rate: Adam learning rate.
    :param gamma: Discount factor applied to the bootstrapped next-state value.
    """

    def __init__(self, state_size, action_size, hidden_size, learning_rate, gamma):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.epsilon = 1.0  # Exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.model = DQNNetwork(state_size, action_size, hidden_size)
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)

    def _to_state_tensor(self, state):
        """Return ``state`` as a float32 tensor of shape ``[1, state_size]``."""
        # torch.as_tensor converts integer token ids as well as floats, which
        # the torch.FloatTensor(...) constructor refuses for integer tensors.
        flat = torch.as_tensor(state, dtype=torch.float32).reshape(-1)[: self.state_size]
        missing = self.state_size - flat.numel()
        if missing > 0:
            flat = nn.functional.pad(flat, (0, missing))  # right-pad with zeros
        return flat.unsqueeze(0)

    def select_action(self, state):
        """Pick an action for ``state`` with an epsilon-greedy policy.

        :return: The chosen action index as a plain ``int``.
        """
        # Strict `<` so that epsilon == 0 is fully greedy.
        if random.random() < self.epsilon:
            return random.randrange(self.action_size)
        with torch.no_grad():
            q_values = self.model(self._to_state_tensor(state))
        return int(torch.argmax(q_values, dim=-1).item())

    def update(self, state, action, reward, next_state, done):
        """Run one Q-learning step on a single transition and decay epsilon.

        :param state: State observed before the action.
        :param action: Index of the action that was taken.
        :param reward: Scalar reward received for the transition.
        :param next_state: State observed after the action.
        :param done: Truthy when the transition ended the episode.
        :return: The loss of this step as a ``float``.
        """
        state = self._to_state_tensor(state)
        next_state = self._to_state_tensor(next_state)

        # Build the TD target without tracking gradients so that only
        # Q(state, action) is trained. `done` is converted with float() because
        # subtracting a boolean tensor from a number raises in PyTorch.
        with torch.no_grad():
            next_q_max = self.model(next_state).max()
            q_update = float(reward) + self.gamma * next_q_max * (1.0 - float(done))

        q_values = self.model(state)
        # Target equals the prediction everywhere except at `action`.
        target = q_values.detach().clone()
        target[0, action] = q_update

        # nn.functional is used directly: this cell does not import it as `F`.
        loss = nn.functional.mse_loss(q_values, target)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

        return loss.item()

def environment_step(model, input_ids, action):
    """Append ``action`` to the sequence, query the model and score the result.

    :param model: Callable language model; it is called as ``model(input_ids=...)``
        and must return an object with a ``logits`` attribute.
    :param input_ids: Tensor of token ids with shape ``[1, seq_len]``.
    :param action: Token id to append to the sequence.
    :return: ``(input_ids, reward, done)``: the extended sequence, the value of
        ``compute_reward`` for it, and whether the model's most likely next
        token is the tokenizer's EOS token.
    """
    # Append the action (token) to the input sequence, matching the dtype and
    # device of the sequence so the concat neither promotes nor fails.
    appended = torch.tensor(
        [[int(action)]], dtype=input_ids.dtype, device=input_ids.device
    )
    input_ids = torch.cat((input_ids, appended), dim=1)

    # Generate output from the model
    with torch.no_grad():
        outputs = model(input_ids=input_ids)
    logits = outputs.logits

    # Get the next token (you might want a different approach here)
    next_token_id = torch.argmax(logits[:, -1, :], dim=-1).unsqueeze(0)

    # Define a reward mechanism (this is highly task-specific)
    reward = compute_reward(input_ids, next_token_id)

    # Check if the end of sequence token is generated
    done = next_token_id.item() == tokenizer.eos_token_id

    return input_ids, reward, done

def compute_reward(input_ids, next_token_id):
    """Placeholder for the task-specific reward; not implemented in this cell.

    Running this cell rebinds ``compute_reward``, replacing the implementation
    defined in an earlier cell of this notebook. Failing loudly here is
    deliberate: silently returning ``None`` would only surface later as an
    unrelated error when the reward is turned into a tensor.

    :param input_ids: Tensor of token ids of the sequence so far.
    :param next_token_id: Tensor holding the predicted next token id.
    :raises NotImplementedError: Always.
    """
    raise NotImplementedError("compute_reward has not been implemented yet")



In [None]:
# Load the Orca-2 tokenizer and model from the same checkpoint. Running this
# cell rebinds the notebook-level `tokenizer` and `model` set by an earlier cell.
ORCA_CHECKPOINT = "microsoft/Orca-2-13b"
tokenizer = AutoTokenizer.from_pretrained(ORCA_CHECKPOINT)
model = AutoModelForCausalLM.from_pretrained(ORCA_CHECKPOINT)


In [None]:
class OrcaDQN:
    """Couples a language model with a DQN agent that chooses the next token.

    :param model: Language model handed to ``environment_step``.
    :param dqn_agent: Agent exposing ``select_action(state)`` and
        ``update(state, action, reward, next_state, done)``.
    :param tokenizer: Tokenizer exposing ``encode``, ``decode`` and ``eos_token_id``.
    """

    def __init__(self, model, dqn_agent, tokenizer):
        self.model = model
        self.dqn_agent = dqn_agent
        self.tokenizer = tokenizer

    def generate_sequence(self, prompt):
        """Roll out the agent from ``prompt`` and return the decoded generated tokens.

        The agent is updated on every transition of the rollout.

        :param prompt: Text used to start the sequence.
        :return: Decoded text of the tokens chosen by the agent (prompt excluded).
        """
        input_ids = self.tokenizer.encode(prompt, return_tensors='pt')
        generated_sequence = []
        done = False
        while not done and not self.end_condition_met(input_ids):
            action = int(self.dqn_agent.select_action(input_ids))
            # environment_step returns three values; the pre-step ids are kept
            # so the agent is trained on a real (state, next_state) pair.
            next_input_ids, reward, done = environment_step(self.model, input_ids, action)
            self.dqn_agent.update(input_ids, action, reward, next_input_ids, done)
            input_ids = next_input_ids
            generated_sequence.append(action)
        return self.tokenizer.decode(generated_sequence)

    def end_condition_met(self, input_ids, max_length=50):
        """Return True when the sequence ends in EOS or exceeds ``max_length`` tokens.

        :param input_ids: Tensor of token ids with shape ``[1, seq_len]``.
        :param max_length: Maximum allowed sequence length (prompt included).
        :return: A plain ``bool``.
        """
        sequence_length = input_ids.size(1)
        if sequence_length == 0:
            return False
        last_token = input_ids[0, -1].item()
        return last_token == self.tokenizer.eos_token_id or sequence_length > max_length


In [None]:
def train(model, agent, num_episodes):
    """Generate one sequence per episode from a fixed prompt.

    :param model: Object exposing ``generate_sequence(prompt)``.
    :param agent: Accepted for the caller's convenience; not used in this body.
    :param num_episodes: How many sequences to generate.
    :return: None. The generated sequences are not evaluated or stored yet.
    """
    # Every episode starts from the same initial prompt.
    prompt = "The quick brown fox "
    for _ in range(num_episodes):
        model.generate_sequence(prompt)
        # TODO: evaluate the sequence and update the model (highly task-dependent).

# Initialize the DQN Agent. DQNAgent requires all five arguments; the values
# mirror the configuration used by the earlier training cell of this notebook.
dqn_agent = DQNAgent(
    state_size=512,
    action_size=tokenizer.vocab_size,  # one action per vocabulary token
    hidden_size=128,
    learning_rate=0.001,
    gamma=0.99,
)

# Create the OrcaDQN instance
orca_dqn = OrcaDQN(model, dqn_agent, tokenizer)

# Train the model
train(orca_dqn, dqn_agent, num_episodes=100)
