### Library Imports

In [15]:
import numpy
import torch
import transformers
import json
import ast

In [16]:
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel

### Load Dataset

In [66]:
dataset = load_dataset("berquetR/dlab_project_optimal_links")

In [79]:
test_dataset = dataset['test']

In [80]:
test_dataset

Dataset({
    features: ['source', 'target', 'current_page', 'current_page_links', 'next_page', '__index_level_0__'],
    num_rows: 26193
})

In [81]:
test_no_prompt = dataset['test']

In [82]:
train_dataset = dataset['train']

In [83]:
train_no_prompt = dataset['train']

### Build Prompt

In [87]:
# Function to format a row according to your fine-tuning requirements
def format_row(row):
    input_data = {
        "Source": row['current_page'], 
        "Candidates": row['current_page_links'], 
        "Target": row['target']
    }
    
    prompt = f"""You are a knowledge discovery expert familiar with the Wikipedia link structure and your objective is to play the game of Wikispeedia: https://dlab.epfl.ch/wikispeedia/play/.
##Goal 
Given two Wikipedia articles, a source and a target, your goal is to reach the target article starting from the source article in as few clicks as possible. For the articles you are given this is always possible.

##Constraint 
You should exclusively follow the links present in the articles that you encounter along the way.

##Fine-grained instructions 
1. While the overall goal is to find a path from a source to a target article, you will proceed step by step.
2. Given outgoing links from the source article as candidates, you should select the candidate that takes you closer to the target article. Use your knowledge of the "expected" Wikipedia link structure and relatedness between articles to identify the candidate that takes you closer to the target.
3. Choose **only** from the provided candidates.
4. Do not provide an algorithm, code to solve the task, or explanation just provide the link the choose among candidates.
6. Even though the proposed links are not related to the target you should **always** choose a link.

##Input 
{json.dumps(input_data, indent=4)}

##Output
You should only respond in the JSON format as described below
Output format:
"thought": "<your short thought on what the user should do next, in a single line>", "next_article": "<chosen article for user to click on>"

This is an example : 
[USER]: "source": "Animal", "target": "China", "links": "Dog;Biology;Eagle;Amazon_Forest"
[ASSISTANT]: "thought": "China has a very strong connection to the United States, and the mascot for the United Statesis the Eagle. There is a link to Eagle on this article, so we should click on it.", "next_article": "Eagle"

"""
    return prompt

In [88]:
# Apply the formatting function to each row
test_dataset = test_dataset.map(lambda x: {"text": format_row(x)})

# You might want to remove the old columns and keep only 'text'
test_dataset = test_dataset.remove_columns(['source', 'target', 'current_page', 'current_page_links', 'next_page', '__index_level_0__'])

Map: 100%|███████████████████████| 26193/26193 [00:03<00:00, 7934.52 examples/s]


In [89]:
train_dataset = train_dataset.map(lambda x: {"text": format_row(x)})

# You might want to remove the old columns and keep only 'text'
train_dataset = train_dataset.remove_columns(['source', 'target', 'current_page', 'current_page_links', 'next_page', '__index_level_0__'])

Map: 100%|███████████████████████| 78088/78088 [00:09<00:00, 8095.76 examples/s]


### Import model

In [50]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [27]:
model_id = "mistralai/Mistral-7B-Instruct-v0.2"
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")

Loading checkpoint shards: 100%|██████████████████| 3/3 [00:45<00:00, 15.19s/it]


In [30]:
tokenizer = AutoTokenizer.from_pretrained(model_id, add_bos_token=True, trust_remote_code=True)

### Test finetuned model

In [94]:
index = 3000

In [95]:
input = test_dataset[index]
input_tok = tokenizer([input['text']],return_tensors = "pt")

In [96]:
model.eval()
with torch.no_grad():
    out = (tokenizer.decode(model.generate(**input_tok, max_new_tokens=100)[0], skip_special_tokens=True))
    print(out)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


You are a knowledge discovery expert familiar with the Wikipedia link structure and your objective is to play the game of Wikispeedia: https://dlab.epfl.ch/wikispeedia/play/.
##Goal 
Given two Wikipedia articles, a source and a target, your goal is to reach the target article starting from the source article in as few clicks as possible. For the articles you are given this is always possible.

##Constraint 
You should exclusively follow the links present in the articles that you encounter along the way.

##Fine-grained instructions 
1. While the overall goal is to find a path from a source to a target article, you will proceed step by step.
2. Given outgoing links from the source article as candidates, you should select the candidate that takes you closer to the target article. Use your knowledge of the "expected" Wikipedia link structure and relatedness between articles to identify the candidate that takes you closer to the target.
3. Choose **only** from the provided candidates.
4. D

In [54]:
test_no_prompt[index]['current_page_links']

"['Tantalum', 'Rhenium', 'Molybdenum', 'Seaborgium', 'List of elements by name', 'Color', 'Xenon', 'Electron', 'Phase (matter)', 'Magnetism', 'Mohs scale of mineral hardness', 'Hafnium', 'Day', 'Tantalum', 'Day', 'Rhenium', 'Chemical element', 'Carbon', 'Steel', 'Carbon', 'Mining', 'Petroleum', 'Gas tungsten arc welding', 'Lead', 'Calcium', 'Magnesium', 'Spain', 'Portugal', 'Molybdenum', 'Cancer', 'Mineral', 'Iron', 'Manganese', 'Oxygen', 'Calcium', 'Bolivia', 'California', 'China', 'Portugal', 'Russia', 'Vietnam', 'South Korea', 'Carbon', 'Oxygen', 'Hafnium']"

In [55]:
is_word_in_links(out_extracted , test_no_prompt[index]['current_page_links'])

NameError: name 'is_word_in_links' is not defined

In [56]:
test_no_prompt[index]['next_page']

'China'

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_id, add_bos_token=True, trust_remote_code=True)