In [None]:
%pip install transformers torch

In [13]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name_or_path = "fblgit/juanako-7b-UNA"
# Load the model
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    low_cpu_mem_usage=True,
    # device_map="cuda:0"
)
# Create the tokenizer from the model object
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

# print(llm("AI is going to"))
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download('punkt')

from transformers import BertTokenizer, BertModel

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

Loading checkpoint shards: 100%|██████████| 3/3 [00:39<00:00, 13.02s/it]
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\wesle\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
tokenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 56.0kB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 3.07MB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 4.38MB/s]
config.json: 100%|██████████| 570/570 [00:00<?, ?B/s] 
model.safetensors: 100%|██████████| 440M/440M [00:15<00:00, 28.9MB/s] 


In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
import numpy as np

torch.set_default_tensor_type('torch.FloatTensor')

class DQNNetwork(nn.Module):
    def __init__(self, state_size, action_size, hidden_size):
        super(DQNNetwork, self).__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, action_size)

    def forward(self, x):
        # print("Network forward: Input shape:", x.shape)  # Should be [1, 512]
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

class DQNAgent:
    def __init__(self, state_size, action_size, hidden_size, learning_rate, gamma):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.epsilon = 1.0  # Exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.model = DQNNetwork(state_size, action_size, hidden_size)
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)

    def select_action(self, state):
        # print("select_action: Original shape of state:", state.shape)
        state = state.float()
        state_tensor = pad_or_truncate(state, max_length=512)
        # # If state is a 2D tensor of shape [1, 512], no need to average across the last dimension
        # if state.ndim == 2 and state.shape[1] == 512:
        #     state_tensor = state
        # elif state.ndim == 2:
        #     # If state is a 2D tensor but not of the correct size, pad or truncate
        #     state_tensor = pad_or_truncate(state, max_length=512)
        # else:
        #     # If state is not a 2D tensor, compute the embeddings
        #     state_tensor = pad_or_truncate(state, max_length=512)
        state_tensor = state_tensor.unsqueeze(0)  # Ensure it has a batch dimension
        # print("select_action: Shape of state_tensor before passing to the network:", state_tensor.shape)

        if random.random() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            with torch.no_grad():
                q_values = self.model(state_tensor)
            return np.argmax(q_values.cpu().detach().numpy())


    def update(self, state, action, reward, next_state, done):
        # Ensure state and next_state are float tensors
        state = pad_or_truncate(state, max_length=512)
        state = state.float()
        next_state = next_state.float()

        # Reshape next_state if not in correct shape
        if next_state.ndim == 2 and next_state.shape[1] != 512:
            # Handle reshaping here, maybe pad or truncate
            next_state = pad_or_truncate(next_state, max_length=512)

        # Ensure the tensors are of the correct shape
        # print("Update: Shape of next_state_tensor:", next_state.shape)
        
        reward = torch.tensor(reward, dtype=torch.float)
        done = torch.tensor(done, dtype=torch.float)

        q_update = reward + (self.gamma * torch.max(self.model(next_state)) * (1 - done))
        q_values = self.model(state)
        q_values[0][action] = q_update

        loss = F.mse_loss(q_values, self.model(state))

        self.optimizer.zero_grad()
        loss.backward()
            # Gradient Clipping
        for param in self.model.parameters():
            param.grad.data.clamp_(-1, 1)

        self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay


def environment_step(model, input_ids, action):
    # Append the action (token) to the input sequence
    input_ids = torch.cat((input_ids, torch.tensor([[action]])), dim=1)

    # Generate output from the model
    with torch.no_grad():
        outputs = model(input_ids=input_ids)
    logits = outputs.logits

    # Get the next token (you might want a different approach here)
    next_token_id = torch.argmax(logits[:, -1, :], dim=-1).unsqueeze(0)

    # Define a reward mechanism (this is highly task-specific)
    reward = compute_reward(input_ids, next_token_id)

    # Check if the end of sequence token is generated
    done = next_token_id.item() == tokenizer.eos_token_id

    return input_ids, reward, done

def compute_perplexity(sequence):
    """ Compute the perplexity of the generated sequence.
    :param sequence: The generated text sequence.
    :return: A numerical value representing the perplexity.
    """
    # Tokenize the sequence
    inputs = tokenizer(sequence, return_tensors="pt")

    # Generate output from the model
    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])

    # Perplexity is e^loss
    perplexity = torch.exp(outputs.loss)
    # Normalize perplexity to a smaller range, e.g., [0, 1]
    return 1 / (1 + np.log(perplexity.item() + 1))

def compute_reward(input_ids, next_token_id, target_context=None):
    """ Compute the reward for the generated sequence.
    :param input_ids: Tensor of input token IDs.
    :param next_token_id: Tensor of the next token ID generated.
    :param target_context: Optional context or topic that the generated text should align with.
    :return: A numerical reward value. """

    # Convert token IDs to text
    input_ids = input_ids.flatten()
    # Check if token_id is a tensor and if so, convert it to a Python integer
    next_token_id = next_token_id.item() if isinstance(next_token_id, torch.Tensor) else next_token_id
    sequence = tokenizer.decode(input_ids.tolist() + [next_token_id])

    # Fluency reward
    fluency_reward = compute_fluency(sequence)

    # Relevance reward (if target_context is provided)
    relevance_reward = 0
    if target_context:
        relevance_reward = compute_relevance(sequence, target_context)

    # Diversity reward
    diversity_reward = diversity(sequence)

    # Perplexity reward
    perplexity_reward = -compute_perplexity(sequence)  # We negate it because lower perplexity is better

    # Aggregate reward
    # Normalize rewards
    fluency_reward = normalize_reward(compute_fluency(sequence))
    relevance_reward = normalize_reward(compute_relevance(sequence, target_context)) if target_context else 0
    diversity_reward = normalize_reward(diversity(sequence))
    perplexity_reward = normalize_reward(-compute_perplexity(sequence))

    # Weighted sum of rewards
    weights = {'fluency': 0.4, 'relevance': 0.3, 'diversity': 0.2, 'perplexity': 0.1}
    total_reward = (weights['fluency'] * fluency_reward + 
                    weights['relevance'] * relevance_reward +
                    weights['diversity'] * diversity_reward + 
                    weights['perplexity'] * perplexity_reward)

    return total_reward

def normalize_reward(reward, min_reward=-1, max_reward=1):
    # Normalize reward to be within [min_reward, max_reward]
    return (reward - min_reward) / (max_reward - min_reward)



def compute_fluency(sequence):
    """
    Compute a basic fluency score based on sentence length and structure.
    """
    sentences = sent_tokenize(sequence)
    total_words = sum(len(word_tokenize(sentence)) for sentence in sentences)
    num_sentences = len(sentences)
    
    # Average words per sentence - expecting a range around typical English sentence length (15-20 words)
    avg_sentence_length = total_words / num_sentences if num_sentences > 0 else 0
    
    # Basic fluency score calculation
    if 15 <= avg_sentence_length <= 20:
        fluency_score = 1  # Ideal range
    elif 10 <= avg_sentence_length < 15 or 20 < avg_sentence_length <= 25:
        fluency_score = 0.75  # Acceptable range
    else:
        fluency_score = 0.5  # Less fluent

    return fluency_score


def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def compute_repetition_penalty(sequence):
    """
    Compute a penalty for repetitive sequences.
    """
    sentences = sent_tokenize(sequence)
    sentence_set = set(sentences)
    num_repeated_sentences = len(sentences) - len(sentence_set)
    
    # Penalty based on the proportion of repeated sentences
    return num_repeated_sentences / len(sentences) if len(sentences) > 0 else 0


def lexical_diversity(text):
    """
    Compute lexical diversity as the ratio of unique words to total words.
    """
    words = word_tokenize(text)
    unique_words = set(words)
    return len(unique_words) / len(words) if len(words) > 0 else 0

def diversity(sequence):
    """
    Compute the diversity of the generated sequence.
    """
    return lexical_diversity(sequence)


def compute_embeddings(text):
    """
    Compute BERT embeddings for the given text.
    """
    inputs = bert_tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    outputs = bert_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1)  # Average pooling

def compute_relevance(sequence, target_context):
    """
    Compute semantic similarity (relevance) between sequence and target context.
    """
    sequence_embedding = compute_embeddings(sequence)
    context_embedding = compute_embeddings(target_context)
    cosine_similarity = torch.nn.functional.cosine_similarity(sequence_embedding, context_embedding)

    return cosine_similarity.item()

# # Example usage
# sequence = "Advancements in AI technology have revolutionized many industries."
# target_context = "technology and innovation"
# relevance_score = compute_relevance(sequence, target_context)
# print("Relevance Score:", relevance_score)

class OrcaDQN:
    def __init__(self, model, dqn_agent, tokenizer):
        self.model = model
        self.dqn_agent = dqn_agent
        self.tokenizer = tokenizer

    def generate_sequence(self, prompt):
        input_ids = self.tokenizer.encode(prompt, return_tensors='pt')
        generated_sequence = []
        while not self.end_condition_met(input_ids):
            action = self.dqn_agent.select_action(input_ids)
            input_ids, reward = environment_step(self.model, input_ids, action)
            self.dqn_agent.update(input_ids, action, reward, input_ids)
            generated_sequence.append(action)
        return self.tokenizer.decode(generated_sequence)

    def end_condition_met(self, input_ids, max_length=50):
        # Stop if the EOS token is generated or max length is reached
        return (input_ids[0][-1] == tokenizer.eos_token_id) or (input_ids.size(1) > max_length)

from tqdm.auto import tqdm

def train(orca_dqn, dqn_agent, num_episodes, target_context):
    for episode in tqdm(range(num_episodes), desc="Training Episodes"):
        input_ids = tokenizer.encode(target_context, return_tensors='pt')
        # input_ids = input_ids.view(512, 128)

        total_reward = 0
        done = False
        while not done:

            action = dqn_agent.select_action(input_ids)
            next_input_ids, reward, done = environment_step(orca_dqn.model, input_ids, action)
            dqn_agent.update(input_ids, action, reward, next_input_ids, done)

            input_ids = next_input_ids
            total_reward += reward

            print(f"Action: {action}, Reward: {reward}, Total Reward: {total_reward}")

            if done:
                print(f"Episode {episode + 1} Complete. Total Reward: {total_reward}")
                print(f"Generated Sequence: {orca_dqn.generate_sequence(target_context)}\n")

def pad_or_truncate(sequence, max_length=512, pad_token_id=0):
    # Ensure the sequence is 1D and pad or truncate to max_length
    sequence = sequence.view(-1)
    sequence_length = sequence.size(0)
    if sequence_length > max_length:
        return sequence[:max_length].unsqueeze(0)
    elif sequence_length < max_length:
        padding = torch.full((max_length - sequence_length,), pad_token_id, dtype=sequence.dtype)
        return torch.cat((sequence, padding), dim=0).unsqueeze(0)
    else:
        return sequence.unsqueeze(0)

# Initialize the DQN Agent
state_size = 512  # This should match your model's input size
action_size = tokenizer.vocab_size  # Total number of possible actions (tokens)
hidden_size = 128  # This can be adjusted
learning_rate = 0.001
gamma = 0.99
target_context = "Your target context here" 

dqn_agent = DQNAgent(state_size, action_size, hidden_size, learning_rate, gamma)

# Create the OrcaDQN instance
orca_dqn = OrcaDQN(model, dqn_agent, tokenizer)

 # Define this if you are using it in compute_reward
train(orca_dqn, dqn_agent, num_episodes=100, target_context=target_context)



Training Episodes:   0%|          | 0/100 [00:00<?, ?it/s]

Action: 9368, Reward: 0.546033063525356, Total Reward: 0.546033063525356
Action: 18911, Reward: 0.5460111141347814, Total Reward: 1.0920441776601373
Action: 17062, Reward: 0.5461471952474567, Total Reward: 1.638191372907594
Action: 28018, Reward: 0.5461201148335372, Total Reward: 2.184311487741131
Action: 1723, Reward: 0.5461813703082143, Total Reward: 2.7304928580493453
Action: 19920, Reward: 0.5961829883210834, Total Reward: 3.3266758463704287
Action: 23681, Reward: 0.5962133556105194, Total Reward: 3.922889201980948
Action: 31967, Reward: 0.5963372673356294, Total Reward: 4.519226469316577
Action: 14225, Reward: 0.5963166082484389, Total Reward: 5.115543077565016
Action: 27586, Reward: 0.5963520673443402, Total Reward: 5.711895144909357
Action: 8327, Reward: 0.5963219674565747, Total Reward: 6.308217112365932
Action: 10419, Reward: 0.5963333948154226, Total Reward: 6.904550507181354
Action: 21148, Reward: 0.6463273319387088, Total Reward: 7.550877839120063
Action: 10023, Reward: 0.6

In [None]:
import torch

# Check if CUDA (GPU support) is available
if torch.cuda.is_available():
    print("CUDA (GPU support) is available in PyTorch!")
    print("Number of GPU devices available:", torch.cuda.device_count())
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
else:
    print("CUDA (GPU support) is not available in PyTorch.")


CUDA (GPU support) is not available in PyTorch.
