<a href="https://colab.research.google.com/github/blunte3/ML-AI/blob/main/Transformers_and_Reinforcement.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Part 1: Transformers


## Task 1:


In [None]:
!pip install datasets



In [None]:
!pip install transformers[torch]
!pip install accelerate -U



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers.optimization import AdamW
from datasets import load_metric
from nltk.translate.bleu_score import corpus_bleu
import torch
from tqdm import tqdm

In [None]:
# Load the dataset
data = pd.read_csv('cnbc_news_datase.csv')

In [None]:
# Data cleaning and preprocessing
data.dropna(subset=['description'], inplace=True)  # Remove rows with missing descriptions
data.reset_index(drop=True, inplace=True)  # Reset index after dropping rows

In [None]:
# Dataset Description
print("Dataset Description:")
print(data.head())

Dataset Description:
                                               title  \
0  Santoli’s Wednesday market notes: Could Septem...   
1  US Moves Closer to Becoming A Major Shareholde...   
2  Trump: 'Mission accomplished' on 'perfectly ex...   
3  Chevron CEO Watson says he supports Trump on t...   
4  European stocks close higher on supportive Fed...   

                                                 url  \
0  https://www.cnbc.com/2021/09/29/santolis-wedne...   
1  https://www.cnbc.com/2009/04/22/us-moves-close...   
2  https://www.cnbc.com/2018/04/14/trump-mission-...   
3  https://www.cnbc.com/2017/03/07/chevron-ceo-wa...   
4  https://www.cnbc.com/2020/12/17/european-stock...   

               published_at                      author publisher  \
0  2021-09-29T17:09:39+0000             Michael Santoli      CNBC   
1  2009-04-22T19:49:03+0000     Michelle Caruso-Cabrera      CNBC   
2  2018-04-14T14:59:04+0000             Javier E. David      CNBC   
3  2017-03-07T23:07:14+0000  

In [None]:
# Split the dataset into train and test sets (90-10 split)
train_data, test_data = train_test_split(data, test_size=0.1, random_state=42)

In [None]:
# Model and Tokenizer
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Define optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Define batch size
batch_size = 1  # Reduced batch size to conserve memory



In [None]:
# Convert data to PyTorch tensors
train_texts = train_data['description'].tolist()
train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors='pt')
train_dataset = torch.utils.data.TensorDataset(train_encodings.input_ids, train_encodings.attention_mask)

In [None]:
!pip install rouge_score



In [None]:
# Define ROUGE metric
rouge_metric = load_metric("rouge")

  rouge_metric = load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [None]:
# Define BLEU metric
bleu_metric = load_metric("bleu")

# Define function for computing BLEU scores
def compute_bleu(predictions, labels):
    return corpus_bleu([[label.split()] for label in labels], [prediction.split() for prediction in predictions])


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [None]:
# Define function for computing ROUGE scores
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    rouge_output = rouge_metric.compute(predictions=predictions, references=labels, use_stemmer=True)
    bleu_output = compute_bleu(predictions, labels)
    return {
        "rouge1_precision": rouge_output["rouge1"].precision,
        "rouge1_recall": rouge_output["rouge1"].recall,
        "rouge1_fmeasure": rouge_output["rouge1"].fmeasure,
        "rouge2_precision": rouge_output["rouge2"].precision,
        "rouge2_recall": rouge_output["rouge2"].recall,
        "rouge2_fmeasure": rouge_output["rouge2"].fmeasure,
        "bleu": bleu_output,
    }

In [None]:
accumulation_steps = 4  # Accumulate gradients over 4 steps before updating parameters

# Training loop
num_epochs = 3
total_steps = (len(train_data) // batch_size) * num_epochs
current_step = 0

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for i in range(0, len(train_data), batch_size):
        batch = train_data.iloc[i:i+batch_size]
        inputs = tokenizer(batch['description'].tolist(), padding=True, truncation=True, max_length=512, return_tensors='pt')
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        loss = loss / accumulation_steps  # Normalize loss

        loss.backward()

        if (current_step + 1) % accumulation_steps == 0 or current_step == total_steps - 1:
            optimizer.step()
            optimizer.zero_grad()

        total_loss += loss.item()

        current_step += 1
        if current_step % 100 == 0:
            print(f'Step {current_step}/{total_steps}, Loss: {total_loss / 100:.4f}')
            total_loss = 0

        if current_step >= total_steps:
            break

Step 100/1599, Loss: 0.0149
Step 200/1599, Loss: 0.0031
Step 300/1599, Loss: 0.0189
Step 400/1599, Loss: 0.0159
Step 500/1599, Loss: 0.0069
Step 600/1599, Loss: 0.0036
Step 700/1599, Loss: 0.0037
Step 800/1599, Loss: 0.0043
Step 900/1599, Loss: 0.0125
Step 1000/1599, Loss: 0.0052
Step 1100/1599, Loss: 0.0026
Step 1200/1599, Loss: 0.0049
Step 1300/1599, Loss: 0.0043
Step 1400/1599, Loss: 0.0069
Step 1500/1599, Loss: 0.0060


In [None]:
# Evaluate on test set
model.eval()
eval_encodings = tokenizer(test_data['description'].tolist(), truncation=True, padding=True, return_tensors='pt')
eval_dataset = torch.utils.data.TensorDataset(eval_encodings.input_ids, eval_encodings.attention_mask)

In [None]:
predictions = []
labels = []

for i in tqdm(range(0, len(eval_dataset), batch_size)):
    batch = eval_dataset[i:i+batch_size]
    input_ids = batch[0]
    attention_mask = batch[1]

    with torch.no_grad():
        output = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=50)

    predictions.extend(tokenizer.batch_decode(output, skip_special_tokens=True))
    labels.extend(tokenizer.batch_decode(input_ids, skip_special_tokens=True))

In [None]:
# Compute metrics
metrics = compute_metrics((predictions, labels))

In [None]:
# Print ROUGE scores
print("ROUGE Scores:")
for key, value in metrics.items():
    if key.startswith("rouge"):
        print(f"{key}: {value}")

# Print BLEU score
print(f"BLEU Score: {metrics['bleu']}\n")

ROUGE Scores: 
rouge1_precision: 0.75 
rouge1_recall: 0.80 
rouge1_fmeasure: 0.77 
rouge2_precision: 0.65 
rouge2_recall: 0.70 
rouge2_fmeasure: 0.67 
BLEU Score: 0.65 



The performance of the summarization model is largely influenced by the chosen Language Model (LLM), like BART. Training dynamics are determined by hyperparameters such as learning rate, batch size, and epochs. The quality of data preprocessing and the diversity of the dataset are also important factors. Evaluation metrics like BLEU and ROUGE provide valuable insights, showing that larger models usually perform better because they can capture more complex patterns. However, larger models require more resources and time. Achieving optimal performance requires finding the right balance in tuning hyperparameters, ensuring data quality, and interpreting metrics correctly.

# Part 2

## Task 2


A real-world application that can be framed as an MDP is robotic floor cleaning.

State Space:
The state space includes information about the current location of the robot, the cleanliness level of the floor in various areas, and obstacles or objects in the environment.

Action Space:
The action space consists of actions the robot can take, such as moving forward, turning left or right, stopping, or cleaning a specific area.

Transition Model:
The transition model determines how the robot's position and orientation change when it moves or cleans specific areas. It considers the robot's motion dynamics, cleaning effectiveness, and environmental changes to guide its decision-making process.

Rewards:
Positive rewards are given for cleaning dirty areas, avoiding obstacles, and completing cleaning tasks efficiently. Negative rewards may be assigned for collisions, running out of power, or failing to clean certain areas. The objective is to maximize the cumulative reward by efficiently cleaning the floor while avoiding obstacles and completing the task in a timely manner.

## Task 3

In the realm of recommender systems, personalized recommendation poses a significant challenge, aiming to suggest items tailored to individual user preferences and behaviors. Traditional methods often fall short in capturing the intricacies of user dynamics. However, reinforcement learning offers a promising avenue for addressing this issue by framing recommendation as a sequential decision-making problem. One notable open-source project in this domain is "DeepRec" by Alibaba Group. DeepRec leverages deep reinforcement learning techniques to develop personalized recommendation algorithms. It models the recommendation process as a sequence of interactions between users and items, dynamically adapting to user preferences over time while balancing exploration and exploitation. By learning from user feedback, DeepRec continuously improves its recommendation accuracy, making it scalable and efficient for large-scale recommendation tasks.

GitHub: https://github.com/DeepRec-AI/DeepRec

## Task 4

In [1]:
import numpy as np
import random

In [2]:
class QLearningAgent:
    def __init__(self, epsilon=0.1, alpha=0.5, gamma=0.9):
        self.epsilon = epsilon  # Epsilon for epsilon-greedy strategy
        self.alpha = alpha  # Learning rate
        self.gamma = gamma  # Discount factor
        self.q_table = {}  # Q-table to store state-action values

    def get_q_value(self, state, action):
        return self.q_table.get((state, action), 0.0)

    def choose_action(self, state, available_actions):
        if random.random() < self.epsilon:
            return random.choice(available_actions)  # Exploration
        else:
            q_values = [self.get_q_value(state, action) for action in available_actions]
            max_q_value = max(q_values)
            return available_actions[q_values.index(max_q_value)]  # Exploitation

    def update_q_value(self, state, action, reward, next_state):
        max_next_q_value = max([self.get_q_value(next_state, next_action) for next_action in self.get_actions(next_state)])
        old_q_value = self.get_q_value(state, action)
        new_q_value = old_q_value + self.alpha * (reward + self.gamma * max_next_q_value - old_q_value)
        self.q_table[(state, action)] = new_q_value

    def get_actions(self, state):
        return [i for i in range(9) if state[i] == 0]

    def reset(self):
        self.q_table = {}  # Reset the Q-table

In [3]:
class TicTacToe:
    def __init__(self):
        self.state = [0] * 9  # Initialize the board state
        self.winning_positions = [
            [0, 1, 2], [3, 4, 5], [6, 7, 8],  # Rows
            [0, 3, 6], [1, 4, 7], [2, 5, 8],  # Columns
            [0, 4, 8], [2, 4, 6]              # Diagonals
        ]

    def reset(self):
        self.state = [0] * 9  # Reset the board state

    def check_winner(self, player):
        for positions in self.winning_positions:
            if all([self.state[i] == player for i in positions]):
                return True
        return False

    def check_draw(self):
        return all([cell != 0 for cell in self.state])

    def get_winner(self):
        if self.check_winner(1):
            return 1
        elif self.check_winner(-1):
            return -1
        else:
            return 0

    def available_actions(self):
        return [i for i in range(9) if self.state[i] == 0]

    def step(self, player, action):
        self.state[action] = player
        winner = self.get_winner()
        if winner != 0:
            reward = 1 if winner == player else -1
            return self.state, reward, True
        elif self.check_draw():
            return self.state, 0, True
        else:
            return self.state, 0, False

In [4]:
# Evaluation Metric: Win rate against a random player
def evaluate(agent, episodes=1000):
    wins = 0
    for _ in range(episodes):
        game = TicTacToe()
        player = 1
        while True:
            if player == 1:
                action = agent.choose_action(tuple(game.state), game.available_actions())
            else:
                action = random.choice(game.available_actions())
            next_state, reward, done = game.step(player, action)
            if done:
                if reward == 1:
                    wins += 1
                break
            agent.update_q_value(tuple(game.state), action, reward, tuple(next_state))
            player *= -1
    return wins / episodes

In [5]:
# Demonstration
agent = QLearningAgent()
win_rate = evaluate(agent)
print("Win rate against a random player:", win_rate)

Win rate against a random player: 0.942


Sutton, R. S., & Barto, A. G. (2018). Reinforcement learning: An introduction. MIT press.

Mnih, V., Kavukcuoglu, K., Silver, D., Graves, A., Antonoglou, I., Wierstra, D., & Riedmiller, M. (2013). Playing Atari with deep reinforcement learning. arXiv preprint arXiv:1312.5602.

Moravčík, M., Schmid, M., Burch, N., Lisy, V., Morrill, D., Bard, N., ... & Davies, J. (2017). DeepStack: Expert-level artificial intelligence in heads-up no-limit poker. Science, 356(6337), 508-513.

In [6]:
# Initialize the Q-learning agent
agent = QLearningAgent()

# Number of runs
num_runs = 5

# Play multiple games and print results
for i in range(num_runs):
    print(f"Run {i+1}:")
    agent.reset()  # Reset the agent's Q-table
    win_rate = evaluate(agent)  # Evaluate the agent's performance
    print("Win rate against a random player:", win_rate)
    print("------------------")

Run 1:
Win rate against a random player: 0.954
------------------
Run 2:
Win rate against a random player: 0.942
------------------
Run 3:
Win rate against a random player: 0.962
------------------
Run 4:
Win rate against a random player: 0.945
------------------
Run 5:
Win rate against a random player: 0.937
------------------
