# Training

## Load data

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [57]:
import json

root_path = "/content/drive/MyDrive/CanadianTireChatbot/data"
train_path = f"{root_path}/processed_train_fixed.json"
#train_path = f"processed_train_fixed.json"
test_path = f"{root_path}/processed_test.json"

# Load the JSON file
with open(train_path, 'r') as f:
    dataset = json.load(f)

In [58]:
# load test JSON file
with open(test_path, 'r') as f:
    test_dataset = json.load(f)

In [6]:
# Inspect the data
print(dataset[:2])  # Print the first two entries

[{'context': 'Consolidated revenue was $4,192.9 million, down 1.4% compared to the same period last year. Revenue excluding Petroleum was $3,639.8 million, a decrease of 0.4%.', 'question': 'What was the consolidated revenue for Canadian Tire in Q3 2024?', 'answers': [{'text': '$4,192.9 million', 'answer_start': 25, 'answer_end': 41}]}, {'context': 'Consolidated revenue was $4,192.9 million, down 1.4% compared to the same period last year. Revenue excluding Petroleum was $3,639.8 million, a decrease of 0.4%.', 'question': 'How much was the revenue excluding Petroleum in Q3 2024?', 'answers': [{'text': '$3,639.8 million', 'answer_start': 124, 'answer_end': 140}]}]


## Set retriever
As the user only makes a question we need to implement a retrieval to get the most appropiate context from our database:

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Combine all contexts from the dataset
contexts = [entry['context'] for entry in dataset]

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english')
context_matrix = vectorizer.fit_transform(contexts)

def retrieve_context(question, vectorizer, context_matrix, contexts, top_n=1):
    # Transform the question into the same vector space
    question_vec = vectorizer.transform([question])

    # Compute cosine similarity between the question and all contexts
    similarities = (context_matrix @ question_vec.T).toarray().flatten()

    # Get the most relevant context(s)
    best_indices = np.argsort(similarities)[-top_n:][::-1]
    return [contexts[i] for i in best_indices]


In [None]:
len(dataset)

42

# Tokenizer function

In [8]:
def preprocess_qa_data(dataset, tokenizer, max_length=512):
    tokenized_data = []

    for entry in dataset:
        context = entry['context']
        question = entry['question']
        answer = entry['answers'][0]['text']
        answer_start = entry['answers'][0]['answer_start']
        answer_end = entry['answers'][0]['answer_end']

        # Verify that the answer is within the context
        if context[answer_start:answer_end] != answer:
            print(f"Skipping sample: Answer '{answer}' not found in the provided context")
            continue

        # Tokenize the context and question
        inputs = tokenizer(
            question,
            context,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Tokenize the context separately to map char to token index
        tokenized_context = tokenizer(context, return_offsets_mapping=True, return_tensors='pt')
        offsets = tokenized_context['offset_mapping'].squeeze(0)

        # Map char positions to token positions
        start_token_idx, end_token_idx = None, None
        for idx, (start, end) in enumerate(offsets):
            if start <= answer_start < end:
                start_token_idx = idx
            if start < answer_end <= end:
                end_token_idx = idx

        if start_token_idx is None or end_token_idx is None:
            print(f"Skipping sample due to token alignment issues: {answer}")
            continue

        # Prepare inputs and labels
        tokenized_data.append({
            "input_ids": inputs['input_ids'].squeeze(0),
            "attention_mask": inputs['attention_mask'].squeeze(0),
            "start_positions": torch.tensor(start_token_idx, dtype=torch.long),
            "end_positions": torch.tensor(end_token_idx, dtype=torch.long)
        })

    return tokenized_data


## Create dataset and dataloader

In [9]:
from torch.utils.data import Dataset, DataLoader

class QADataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        return {
            "input_ids": item["input_ids"],
            "attention_mask": item["attention_mask"],
            "start_positions": item["start_positions"],
            "end_positions": item["end_positions"]
        }



# Evaluation
Set evaluation Functions

###ROUGE (Recall-Oriented Understudy for Gisting Evaluation)
ROUGE measures the overlap between the predicted and reference answers in terms of n-grams, word sequences, and longest common subsequence.

In [10]:
pip install rouge-score nltk bert-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=8a9944215115577b75c7da10ef2afbfaffda28253778fd8825a913081d9ee780
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score, bert-score
Successfully installed bert-score-0.3.13 rouge-score-0.1.2


In [11]:
from rouge_score import rouge_scorer

def compute_rouge(prediction, reference):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference, prediction)
    return scores


### METEOR (Metric for Evaluation of Translation with Explicit ORdering)
METEOR is based on harmonic mean precision and recall, with an emphasis on word order and synonyms.

In [79]:
from nltk.translate.meteor_score import meteor_score

from nltk.tokenize import word_tokenize

def compute_meteor(prediction, reference):
    # Tokenize prediction and reference
    prediction_tokens = word_tokenize(prediction)
    reference_tokens = word_tokenize(reference)

    return meteor_score([reference_tokens], prediction_tokens) # Pass tokenized inputs


### BERTScore
BERTScore computes similarity using contextual embeddings from pre-trained transformers like BERT. It aligns tokens in the predicted and reference answers and calculates precision, recall, and F1.

In [13]:
from bert_score import score

def compute_bertscore(prediction, reference, model_type="bert-base-uncased"):
    P, R, F1 = score([prediction], [reference], model_type=model_type, verbose=True)
    return {"Precision": P.item(), "Recall": R.item(), "F1": F1.item()}


### Evaluate the Metrics Across a Dataset

Create a function to compute all metrics for multiple predictions and references.

In [14]:
def evaluate_metrics(predictions, references):
    results = {
        "ROUGE-1": [],
        "ROUGE-2": [],
        "ROUGE-L": [],
        "METEOR": [],
        "BERT-Precision": [],
        "BERT-Recall": [],
        "BERT-F1": []
    }

    for pred, ref in zip(predictions, references):
        # Compute ROUGE
        rouge_scores = compute_rouge(pred, ref)
        results["ROUGE-1"].append(rouge_scores["rouge1"].fmeasure)
        results["ROUGE-2"].append(rouge_scores["rouge2"].fmeasure)
        results["ROUGE-L"].append(rouge_scores["rougeL"].fmeasure)

        # Compute METEOR
        results["METEOR"].append(compute_meteor(pred, ref))

        # Compute BERTScore
        bert_scores = compute_bertscore(pred, ref)
        results["BERT-Precision"].append(bert_scores["Precision"])
        results["BERT-Recall"].append(bert_scores["Recall"])
        results["BERT-F1"].append(bert_scores["F1"])

    # Aggregate results
    aggregated_results = {metric: sum(scores) / len(scores) for metric, scores in results.items()}
    return aggregated_results


# BERT SQUAD

## Use tokenizer

Tokenize dataset

In [115]:
from transformers import BertTokenizerFast
import torch

# Load the fast tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

In [101]:
# Preprocess the dataset
tokenized_data = preprocess_qa_data(dataset, tokenizer)
print(f"Processed {len(tokenized_data)} samples.")

Processed 42 samples.


In [102]:
# Create Dataset and DataLoader
qa_dataset = QADataset(tokenized_data)
train_dataloader = DataLoader(qa_dataset, batch_size=16, shuffle=True)

## Fine-tune the model

In [104]:
from transformers import BertForQuestionAnswering, AdamW

# Load pre-trained model
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

# Set up optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

epochs = 100
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for step, batch in enumerate(train_dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        # Forward pass
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            start_positions=start_positions,
            end_positions=end_positions
        )

        # Compute loss
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_dataloader):.4f}")

model.safetensors:  35%|###5      | 472M/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1, Loss: 10.4707
Epoch 2, Loss: 5.5491
Epoch 3, Loss: 4.2147
Epoch 4, Loss: 3.9101
Epoch 5, Loss: 3.8696
Epoch 6, Loss: 3.7496
Epoch 7, Loss: 3.6599
Epoch 8, Loss: 3.5497
Epoch 9, Loss: 3.4541
Epoch 10, Loss: 3.2578
Epoch 11, Loss: 3.0865
Epoch 12, Loss: 2.8915
Epoch 13, Loss: 2.5433
Epoch 14, Loss: 2.2872
Epoch 15, Loss: 2.1168
Epoch 16, Loss: 1.8471
Epoch 17, Loss: 1.4920
Epoch 18, Loss: 1.1297
Epoch 19, Loss: 0.8954
Epoch 20, Loss: 0.6250
Epoch 21, Loss: 0.5569
Epoch 22, Loss: 0.4227
Epoch 23, Loss: 0.3885
Epoch 24, Loss: 0.3605
Epoch 25, Loss: 0.1999
Epoch 26, Loss: 0.2163
Epoch 27, Loss: 0.2354
Epoch 28, Loss: 0.1066
Epoch 29, Loss: 0.1048
Epoch 30, Loss: 0.1036
Epoch 31, Loss: 0.0823
Epoch 32, Loss: 0.0582
Epoch 33, Loss: 0.0446
Epoch 34, Loss: 0.0488
Epoch 35, Loss: 0.0366
Epoch 36, Loss: 0.0302
Epoch 37, Loss: 0.0314
Epoch 38, Loss: 0.0287
Epoch 39, Loss: 0.0207
Epoch 40, Loss: 0.0186
Epoch 41, Loss: 0.0127
Epoch 42, Loss: 0.0182
Epoch 43, Loss: 0.0113
Epoch 44, Loss: 0.0

## Save the model

In [106]:
# Specify a directory to save the model
import datetime
date = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
path = "/content/drive/MyDrive/CanadianTireChatbot/models"
#path = "."
save_directory = path+"/fine_tuned_BERTsquad_"+date

# Save the model and tokenizer
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

('/content/drive/MyDrive/CanadianTireChatbot/models/fine_tuned_BERTsquad_2024-12-09_06-24-00/tokenizer_config.json',
 '/content/drive/MyDrive/CanadianTireChatbot/models/fine_tuned_BERTsquad_2024-12-09_06-24-00/special_tokens_map.json',
 '/content/drive/MyDrive/CanadianTireChatbot/models/fine_tuned_BERTsquad_2024-12-09_06-24-00/vocab.txt',
 '/content/drive/MyDrive/CanadianTireChatbot/models/fine_tuned_BERTsquad_2024-12-09_06-24-00/added_tokens.json',
 '/content/drive/MyDrive/CanadianTireChatbot/models/fine_tuned_BERTsquad_2024-12-09_06-24-00/tokenizer.json')

## Chatbot

In [107]:
def chatbot(question):
    # Step 1: Retrieve the most relevant context
    retrieved_contexts = retrieve_context(question, vectorizer, context_matrix, contexts)
    context = retrieved_contexts[0]  # Use the top context
    print("Retrieved context:",context)

    # Step 2: Generate the answer
    answer = generate_answer(question, context, tokenizer, model)

    # Step 3: Return the answer
    return answer

## Test the model

In [None]:
import re
from transformers import BertTokenizer, BertForQuestionAnswering, BertTokenizerFast
import torch

save_directory = "/content/drive/MyDrive/CanadianTireChatbot/models/fine_tuned_BERTsquad_2024-12-09_06-24-00"
# Load the fine-tuned model and tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
model = BertForQuestionAnswering.from_pretrained(save_directory)

In [116]:
def generate_answer(question, context, tokenizer, model):
    # Tokenize question and context
    inputs = tokenizer.encode_plus(question, context, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device) # Move input_ids to the device
    attention_mask = inputs["attention_mask"].to(device) # Move attention_mask to the device

    # Get the model's predictions
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    start_scores = outputs.start_logits
    end_scores = outputs.end_logits

    # Identify the answer span
    start_idx = torch.argmax(start_scores)
    end_idx = torch.argmax(end_scores) + 1

    # Decode the answer
    answer = tokenizer.decode(input_ids[0][start_idx:end_idx], skip_special_tokens=True)

    return answer

### User input

In [120]:
# Simulate user input
user_question = "What was the income before income taxes for the Financial Services segment in Q3 2024?"

# Get the chatbot's response
response = chatbot(user_question)
print(f"Chatbot: {response}")

Retrieved context: Financial Services segment Income before income taxes was $110.3 million in the quarter, a $15.4 million decrease from the prior year, as higher net write-offs and operating expenses were only partially offset by higher revenues, all while cardholder engagement remained strong.
Chatbot: $ 110. 3 million


### Random question from the dataset

In [122]:
import random

for i in range(5):
  entry = random.choice(dataset)
  context = entry['context']
  question = entry['question']
  answer = entry['answers'][0]['text']

  print(f"Question: {question}")
  response = chatbot(question)
  print(f"Expected Context: {context}")
  print(f"Chatbot: {response}")
  print(f"Expected answer: {answer}")
  print("\n")

Question: What was the consolidated revenue for Canadian Tire in Q3 2024?
Retrieved context: Canadian Tire’s retail business is led by Canadian Tire, founded in 1922.
Expected Context: revenue 41929 million 14 compared 42505 million period last year revenue excluding petroleum1 36398 million decrease 04 compared prior year consolidated income income tax 2993 million 2300 million compared prior year normalized basis consolidated income income tax 330 million diluted eps 359 compared 119 296 normalized basis prior year refer company q3 2024 mda section 41 information normalizing item additional detail event impacted
Chatbot: retail business is led by canadian tire, founded in 1922.
Expected answer: 41929 million


Question: What is the expected range for full-year operating capital expenditures in 2024?
Retrieved context: Total capital expenditures were $195.1 million in the quarter, compared to $176.4 million in Q3 2023. Full-year operating capital expenditures are expected to range bet

### Evaluate

In [128]:
predictions = []
references = []
for entry in test_dataset:
  context = entry['context']
  question = entry['question']
  answer = entry['answers'][0]['text']

  predictions.append(chatbot(question))
  references.append(answer)

print(predictions)
print(references)


Retrieved context: Canadian Tire’s retail business is led by Canadian Tire, founded in 1922.
Retrieved context: Helly Hansen, a leading technical outdoor brand, is owned and operated by Canadian Tire.
Retrieved context: Canadian Tire’s retail business is led by Canadian Tire, founded in 1922.
['canadian tire', 'helly hansen, a leading technical outdoor brand', 'retail business']
['1648 million', 'Canadians are seeking value and finding it through Triangle Rewards', 'is a group of companies that includes a Retail segment, a Financial Services division']


In [129]:
metrics = evaluate_metrics(predictions, references)
print("Evaluation Metrics:", metrics)

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.04 seconds, 22.72 sentences/sec
calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.04 seconds, 22.87 sentences/sec
calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.04 seconds, 22.53 sentences/sec
Evaluation Metrics: {'ROUGE-1': 0.041666666666666664, 'ROUGE-2': 0.0, 'ROUGE-L': 0.041666666666666664, 'METEOR': 0.0121654501216545, 'BERT-Precision': 0.4899088541666667, 'BERT-Recall': 0.4032546083132426, 'BERT-F1': 0.4361618061860402}


# Roberta

## Set tokenizer

Tokenize dataset

In [33]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments, AdamW
import torch

# Load the pre-trained model and tokenizer
model_name = "deepset/roberta-base-squad2"
RobertaTokenizer = AutoTokenizer.from_pretrained(model_name)
RobertaModel = AutoModelForQuestionAnswering.from_pretrained(model_name)


In [16]:
# Preprocess the dataset
tokenized_data = preprocess_qa_data(dataset, RobertaTokenizer)
print(f"Processed {len(tokenized_data)} samples.")

Processed 42 samples.


## Create dataset and dataloader

In [17]:
# Create Dataset and DataLoader
qa_dataset = QADataset(tokenized_data)
train_dataloader = DataLoader(qa_dataset, batch_size=16, shuffle=True)

## Train Roberta

In [18]:
# Set up optimizer
optimizer = AdamW(RobertaModel.parameters(), lr=1e-5)

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RobertaModel.to(device)

epochs = 150
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for step, batch in enumerate(train_dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        # Forward pass
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            start_positions=start_positions,
            end_positions=end_positions
        )

        # Compute loss
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_dataloader):.4f}")



Epoch 1, Loss: 11.2220
Epoch 2, Loss: 7.8477
Epoch 3, Loss: 5.8152
Epoch 4, Loss: 4.8763
Epoch 5, Loss: 4.4227
Epoch 6, Loss: 4.2426
Epoch 7, Loss: 4.0368
Epoch 8, Loss: 3.9513
Epoch 9, Loss: 3.8195
Epoch 10, Loss: 3.7455
Epoch 11, Loss: 3.6603
Epoch 12, Loss: 3.6581
Epoch 13, Loss: 3.5436
Epoch 14, Loss: 3.5066
Epoch 15, Loss: 3.4181
Epoch 16, Loss: 3.3625
Epoch 17, Loss: 3.2984
Epoch 18, Loss: 3.2405
Epoch 19, Loss: 3.1130
Epoch 20, Loss: 3.0835
Epoch 21, Loss: 2.9082
Epoch 22, Loss: 2.9076
Epoch 23, Loss: 2.7168
Epoch 24, Loss: 2.7261
Epoch 25, Loss: 2.5889
Epoch 26, Loss: 2.3844
Epoch 27, Loss: 2.3199
Epoch 28, Loss: 2.1843
Epoch 29, Loss: 2.0192
Epoch 30, Loss: 1.8082
Epoch 31, Loss: 1.7146
Epoch 32, Loss: 1.4386
Epoch 33, Loss: 1.4308
Epoch 34, Loss: 1.1709
Epoch 35, Loss: 1.0704
Epoch 36, Loss: 0.9808
Epoch 37, Loss: 0.8976
Epoch 38, Loss: 0.8316
Epoch 39, Loss: 0.8002
Epoch 40, Loss: 0.7407
Epoch 41, Loss: 0.7683
Epoch 42, Loss: 0.6213
Epoch 43, Loss: 0.6019
Epoch 44, Loss: 0.4

### Save the model

In [20]:
# Specify a directory to save the model
import datetime
date = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
path = "/content/drive/MyDrive/CanadianTireChatbot/models"
#path = "."
save_directory = path+"/fine_tuned_Roberta_"+date

# Save the model and tokenizer
model.save_pretrained(save_directory)
RobertaTokenizer.save_pretrained(save_directory)

('/content/drive/MyDrive/CanadianTireChatbot/models/fine_tuned_Roberta_2024-12-09_05-41-08/tokenizer_config.json',
 '/content/drive/MyDrive/CanadianTireChatbot/models/fine_tuned_Roberta_2024-12-09_05-41-08/special_tokens_map.json',
 '/content/drive/MyDrive/CanadianTireChatbot/models/fine_tuned_Roberta_2024-12-09_05-41-08/vocab.json',
 '/content/drive/MyDrive/CanadianTireChatbot/models/fine_tuned_Roberta_2024-12-09_05-41-08/merges.txt',
 '/content/drive/MyDrive/CanadianTireChatbot/models/fine_tuned_Roberta_2024-12-09_05-41-08/added_tokens.json',
 '/content/drive/MyDrive/CanadianTireChatbot/models/fine_tuned_Roberta_2024-12-09_05-41-08/tokenizer.json')

### Test the model

In [23]:
save_directory = "/content/drive/MyDrive/CanadianTireChatbot/models/fine_tuned_Roberta_2024-12-09_05-41-08/"

In [24]:
# Load the fine-tuned model and tokenizer
RobertaTokenizer = RobertaTokenizer.from_pretrained(save_directory)
RobertaModel = AutoModelForQuestionAnswering.from_pretrained(save_directory)

## Chatbot

In [34]:
def chatbot(question):
    # Step 1: Retrieve the most relevant context
    retrieved_contexts = retrieve_context(question, vectorizer, context_matrix, contexts)
    context = retrieved_contexts[0]  # Use the top context
    print("Retrieved context:",context)

    # Step 2: Generate the answer
    answer = generate_answer(question, context, RobertaTokenizer, RobertaModel)

    # Step 3: Return the answer
    return answer

#### User input

In [69]:
# Simulate user input
user_question = "What was the income before income taxes for the Financial Services segment in Q3 2024?"

# Get the chatbot's response
response = chatbot(user_question)
print(f"Chatbot: {response}")

Retrieved context: Financial Services segment Income before income taxes was $110.3 million in the quarter, a $15.4 million decrease from the prior year, as higher net write-offs and operating expenses were only partially offset by higher revenues, all while cardholder engagement remained strong.
Chatbot: 


#### Random questions from the dataset

In [76]:
import random

for i in range(5):
  entry = random.choice(dataset)
  context = entry['context']
  question = entry['question']
  answer = entry['answers'][0]['text']

  print(f"Question: {question}")
  response = chatbot(question)
  print(f"Expected Context: {context}")
  print(f"Chatbot: {response}")
  print(f"Expected answer: {answer}")
  print("\n")

Question: By how much did active loyalty members increase?
Retrieved context: Active loyalty members increased by 4%. Triangle Rewards saw higher engagement with 1:1 offers and mass promotions. In-store Net Promoter Scores (NPS) improved across banners.
Expected Context: Active loyalty members increased by 4%. Triangle Rewards saw higher engagement with 1:1 offers and mass promotions. In-store Net Promoter Scores (NPS) improved across banners.
Chatbot:  4%.
Expected answer: 4%


Question: What was the Retail ROIC at the end of Q3 2024?
Retrieved context: Retail Return on Invested Capital (ROIC) was 8.8% at the end of the third quarter of 2024, compared to 11.1% at the end of the third quarter of 2023.
Expected Context: Retail Return on Invested Capital (ROIC) was 8.8% at the end of the third quarter of 2024, compared to 11.1% at the end of the third quarter of 2023.
Chatbot:  8.8%
Expected answer: 8.8%


Question: How much did AFFO per unit increase in Q3 2024?
Retrieved context: Adjus

## Evaluate

In [77]:
predictions = []
references = []
for entry in test_dataset:
  context = entry['context']
  question = entry['question']
  answer = entry['answers'][0]['text']

  predictions.append(chatbot(question))
  references.append(answer)

print(predictions)
print(references)


Retrieved context: Canadian Tire’s retail business is led by Canadian Tire, founded in 1922.
Retrieved context: Helly Hansen, a leading technical outdoor brand, is owned and operated by Canadian Tire.
Retrieved context: Canadian Tire’s retail business is led by Canadian Tire, founded in 1922.
['', '', ' retail business']
['1648 million', 'Canadians are seeking value and finding it through Triangle Rewards', 'is a group of companies that includes a Retail segment, a Financial Services division']


In [88]:
 metrics = evaluate_metrics(predictions, references)
print("Evaluation Metrics:", metrics)

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.05 seconds, 21.29 sentences/sec




calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.06 seconds, 18.17 sentences/sec




calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.04 seconds, 23.77 sentences/sec
Evaluation Metrics: {'ROUGE-1': 0.041666666666666664, 'ROUGE-2': 0.0, 'ROUGE-L': 0.041666666666666664, 'METEOR': 0.0121654501216545, 'BERT-Precision': 0.23449834187825522, 'BERT-Recall': 0.14554868141810098, 'BERT-F1': 0.1796142260233561}


# GPT

## Import Q-A dataset

In [61]:
import pandas as pd

# Load the dataset
df = pd.read_csv("/content/drive/MyDrive/CanadianTireChatbot/data/QA_df.csv")  # Replace with your actual file path

# Combine question and answer into a single format
df["input_text"] = df["Question"] + " Answer:"
df["output_text"] = df["Answer"]

# Save the processed dataset
df[["input_text", "output_text"]].to_csv("/content/drive/MyDrive/CanadianTireChatbot/data/fine_tuning_data.csv", index=False)
df.head()

Unnamed: 0,Question,Answer,input_text,output_text
0,What was Canadian Tire's diluted EPS in Q3 2024?,$3.59,What was Canadian Tire's diluted EPS in Q3 202...,$3.59
1,What was Canadian Tire's normalized EPS in Q3 ...,$2.96,What was Canadian Tire's normalized EPS in Q3 ...,$2.96
2,What was the percentage decrease in consolidat...,1.5%,What was the percentage decrease in consolidat...,1.5%
3,What was Canadian Tire's annual dividend in 2024?,$7.10 per share,What was Canadian Tire's annual dividend in 20...,$7.10 per share
4,What was the increase in loyalty engagement in...,4%,What was the increase in loyalty engagement in...,4%


In [None]:
df.shape

(61, 4)

## Tokenize

In [None]:
from transformers import GPT2Tokenizer

# Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Ensure the tokenizer uses the correct padding token
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(row):
    # Tokenize the input (Question + Answer)
    input_encodings = tokenizer(
        row["input_text"],
        max_length=512,  # Max length for both input and labels
        padding="max_length",  # Pad to max length
        truncation=True
    )

    # Tokenize the output (Answer)
    target_encodings = tokenizer(
        row["output_text"],
        max_length=512,
        padding="max_length",
        truncation=True
    )

    # Create attention masks manually
    attention_mask = [1 if token != tokenizer.pad_token_id else 0 for token in input_encodings["input_ids"]]

    # Handle the labels by replacing padding tokens with -100 (so they are ignored in loss calculation)
    labels = target_encodings["input_ids"]
    labels = [-100 if token == tokenizer.pad_token_id else token for token in labels]

    return {
        "input_ids": input_encodings["input_ids"],
        "attention_mask": attention_mask,
        "labels": labels
    }



# Apply tokenization
tokenized_data = df.apply(tokenize_function, axis=1)


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class QADataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        return {
            "input_ids": torch.tensor(item["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(item["attention_mask"], dtype=torch.long),
            "labels": torch.tensor(item["labels"], dtype=torch.long),
        }

# Convert to PyTorch Dataset
qa_dataset = QADataset(tokenized_data.to_list())
train_dataloader = DataLoader(qa_dataset, batch_size=4, shuffle=True)


### Train GPT2

In [None]:
from transformers import GPT2LMHeadModel, AdamW

# Load GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))  # Adjust for added special tokens


Embedding(50257, 768)

In [None]:
from transformers import AdamW

# Fine-tune GPT-2
optimizer = AdamW(model.parameters(), lr=5e-5)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

epochs = 50
model.train()

for epoch in range(epochs):
    total_loss = 0
    for batch in train_dataloader:
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_dataloader):.4f}")




Epoch 1, Loss: nan
Epoch 2, Loss: 6.9262
Epoch 3, Loss: 6.0481
Epoch 4, Loss: 5.3148
Epoch 5, Loss: 4.1284
Epoch 6, Loss: 3.3457
Epoch 7, Loss: 2.9115
Epoch 8, Loss: 2.3147
Epoch 9, Loss: 2.2102
Epoch 10, Loss: 2.1298
Epoch 11, Loss: 2.0039
Epoch 12, Loss: 1.8622
Epoch 13, Loss: 1.7672
Epoch 14, Loss: 1.8656
Epoch 15, Loss: 1.6995
Epoch 16, Loss: 1.7391
Epoch 17, Loss: 1.6249
Epoch 18, Loss: 1.6641
Epoch 19, Loss: 1.6465
Epoch 20, Loss: 1.6175
Epoch 21, Loss: 1.4680
Epoch 22, Loss: 1.5304
Epoch 23, Loss: 1.6790
Epoch 24, Loss: 1.6963
Epoch 25, Loss: 1.3747
Epoch 26, Loss: 1.4377
Epoch 27, Loss: 1.8053
Epoch 28, Loss: 1.5435
Epoch 29, Loss: 1.4270
Epoch 30, Loss: 1.3294
Epoch 31, Loss: 1.3849
Epoch 32, Loss: 1.4532
Epoch 33, Loss: 1.4678
Epoch 34, Loss: 1.5186
Epoch 35, Loss: 1.4335
Epoch 36, Loss: 1.3240
Epoch 37, Loss: 1.3121
Epoch 38, Loss: 1.5298
Epoch 39, Loss: 1.3827
Epoch 40, Loss: 1.2611
Epoch 41, Loss: 1.2321
Epoch 42, Loss: 1.3062
Epoch 43, Loss: 1.2773
Epoch 44, Loss: 1.4140


## Save model

In [None]:
# Save the model and tokenizer
model.save_pretrained("/content/drive/MyDrive/CanadianTireChatbot/models/fine_tuned_gpt2")
tokenizer.save_pretrained("/content/drive/MyDrive/CanadianTireChatbot/models/fine_tuned_gpt2")


('/content/drive/MyDrive/CanadianTireChatbot/models/fine_tuned_gpt2/tokenizer_config.json',
 '/content/drive/MyDrive/CanadianTireChatbot/models/fine_tuned_gpt2/special_tokens_map.json',
 '/content/drive/MyDrive/CanadianTireChatbot/models/fine_tuned_gpt2/vocab.json',
 '/content/drive/MyDrive/CanadianTireChatbot/models/fine_tuned_gpt2/merges.txt',
 '/content/drive/MyDrive/CanadianTireChatbot/models/fine_tuned_gpt2/added_tokens.json')

## Load model

In [91]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [93]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained("/content/drive/MyDrive/CanadianTireChatbot/models/fine_tuned_gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("/content/drive/MyDrive/CanadianTireChatbot/models/fine_tuned_gpt2")

model.resize_token_embeddings(len(tokenizer))  # Adjust the model for new tokens

# Move the model to the device
model = model.to(device)

## Test model

In [96]:
def generate_answer(question):
    input_text = f"Question: {question} Answer:"
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

    # Manually create attention mask for the input sequence
    attention_mask = [1 if token != tokenizer.pad_token_id else 0 for token in input_ids[0]]
    attention_mask = torch.tensor([attention_mask]).to(device)

    # Generate output with attention mask
    output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,  # Pass the attention mask
        max_length=150,
        num_beams=5,
        early_stopping=True,
        pad_token_id=tokenizer.pad_token_id
    )

    # Decode the generated output
    output_text = tokenizer.decode(output[0], skip_special_tokens=True)
    answer = output_text.split("Answer:")[-1].strip()
    return answer



# Example usage
question = "What was Canadian Tire's diluted EPS in Q3 2024?"
answer = generate_answer(question)
print(f"Answer: {answer}")


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Answer: Canadian Tire


##### Random question from the dataset

In [97]:
import random

for i in range(5):
  entry = random.choice(dataset)
  context = entry['context']
  question = entry['question']
  r_answer = entry['answers'][0]['text']

  print(f"Question: {question}")
  print(f"Context: {context}")
  answer = generate_answer(question)
  print(f"Answer: {answer}")
  print(f"Expected answer: {r_answer}")
  print("\n")

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Question: What is the expected range for full-year operating capital expenditures in 2024?
Context: Total capital expenditures were $195.1 million in the quarter, compared to $176.4 million in Q3 2023. Full-year operating capital expenditures are expected to range between $475 million and $525 million.


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Answer: The expected range for full-year operating capital expenditures in 2030 is $1.2 billion to $1.5 billion.

Question: What is the expected range for full-year operating
Expected answer: $475 million and $525 million


Question: How many consecutive years has Canadian Tire increased its annual dividend?
Context: The Company increased its annual dividend for the 15th consecutive year, to $7.10 per Common Voting and Class A Non-Voting Share.


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Answer: Canadian Tire has increased its dividend every year since its inception.
Expected answer: 15th


Question: What is the annual dividend per share for Canadian Tire in 2024?
Context: company increased annual dividend 15th consecutive year 710 per common voting class nonvoting share share increase approximately 14 last year november 6 2024 company board director declared dividend 1775 per share payable march 1 2025 shareholder record january 31 2025 dividend considered eligible dividend


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Answer: The dividend per share for Canadian Tire in 2035 will be $0.50.

Question: What is the annual dividend per
Expected answer: 710


Question: By what percentage did GAAR increase in Q3 2024?
Context: Gross Average Accounts Receivable (GAAR) was up 3.0%, mainly as a result of higher average account balances.


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Answer: In Q
Expected answer: 3.0%


Question: How many retail and gasoline outlets does Canadian Tire operate?
Context: The Company has close to 1,700 retail and gasoline outlets supported by tens of thousands of employees.
Answer: Canadian Tire operates more than 1,
Expected answer: close to 1,700




## Evaluate

In [99]:
predictions = []
references = []
for entry in test_dataset:
  context = entry['context']
  question = entry['question']
  answer = entry['answers'][0]['text']

  predictions.append(generate_answer(question))
  references.append(answer)

print(predictions)
print(references)


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['', 'We are seeking a government that will stand up for the middle class. We are seeking a government that will stand up for the middle class. We are seeking a government that will stand up for the middle class. We are seeking a government that will stand up for the middle class. We are seeking a government that will stand up for the middle class. We are seeking a government that will stand up for the middle class. We are seeking a government that will stand up for the middle class. We are seeking a government that will stand up for the middle class. We are seeking a government that will stand up for the middle class. We are seeking a government that will stand up for the middle class. We', 'Canadian Tire Corporation is a subsidiary of Canadian Imperial Bank of Commerce. Canadian Tire Corporation is a subsidiary of Canadian Imperial Bank of Commerce. Canadian Tire Corporation is a subsidiary of Canadian Imperial Bank of Commerce. Canadian Tire Corporation is a subsidiary of Canadian I

In [None]:
 metrics = evaluate_metrics(predictions, references)
print("Evaluation Metrics:", metrics)

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.05 seconds, 21.29 sentences/sec




calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.06 seconds, 18.17 sentences/sec




calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.04 seconds, 23.77 sentences/sec
Evaluation Metrics: {'ROUGE-1': 0.041666666666666664, 'ROUGE-2': 0.0, 'ROUGE-L': 0.041666666666666664, 'METEOR': 0.0121654501216545, 'BERT-Precision': 0.23449834187825522, 'BERT-Recall': 0.14554868141810098, 'BERT-F1': 0.1796142260233561}
