### Library imports

In [1]:
import numpy
import torch
import transformers

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import json

In [3]:
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel

In [4]:
from trl import SFTTrainer

In [5]:
torch.cuda.get_device_name(0)

'NVIDIA A100-SXM4-40GB'

### Data imports

In [6]:
dataset = load_dataset("berquetR/dlab_project_optimal_links")


In [7]:
train_dataset = dataset['train']
validation_dataset = dataset['validation']

In [8]:
train_dataset

Dataset({
    features: ['source', 'target', 'current_page', 'current_page_links', 'next_page', '__index_level_0__'],
    num_rows: 78088
})

In [9]:
validation_dataset

Dataset({
    features: ['source', 'target', 'current_page', 'current_page_links', 'next_page', '__index_level_0__'],
    num_rows: 26178
})

### Build Prompt Version


In [10]:
# Function to format a row according to your fine-tuning requirements
def format_row(row):
    input_data = {
        "Source": row['current_page'], 
        "Candidates": row['current_page_links'], 
        "Target": row['target']
    }
    output_data = {
        "Output": row['next_page']
    }
    
    prompt = f"""You are a knowledge discovery expert familiar with the Wikipedia link structure and your objective is to play the game of Wikispeedia: https://dlab.epfl.ch/wikispeedia/play/.
##Goal 
Given two Wikipedia articles, a source and a target, your goal is to reach the target article starting from the source article in as few clicks as possible. For the articles you are given this is always possible.

##Constraint 
You should exclusively follow the links present in the articles that you encounter along the way.

##Fine-grained instructions 
1. While the overall goal is to find a path from a source to a target article, you will proceed step by step.
2. Given outgoing links from the source article as candidates, you should select the candidate that takes you closer to the target article. Use your knowledge of the "expected" Wikipedia link structure and relatedness between articles to identify the candidate that takes you closer to the target.
3. Choose **only** from the provided candidates.
4. Do not provide an algorithm or code to solve the task, instead give only the solution.

##Input 
{json.dumps(input_data, indent=4)}

##Output 
{json.dumps(output_data, indent=4)}

"""
    return prompt


In [11]:
# Apply the formatting function to each row
train_dataset = train_dataset.map(lambda x: {"text": format_row(x)})

# You might want to remove the old columns and keep only 'text'
train_dataset = train_dataset.remove_columns(['source', 'target', 'current_page', 'current_page_links', 'next_page', '__index_level_0__'])

In [12]:
# Apply the formatting function to each row
validation_dataset = validation_dataset.map(lambda x: {"text": format_row(x)})

# You might want to remove the old columns and keep only 'text'
validation_dataset = validation_dataset.remove_columns(['source', 'target', 'current_page', 'current_page_links', 'next_page', '__index_level_0__'])

### Accelerator

In [13]:
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig

fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)

accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

### Model import 

In [14]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [15]:
model_id = "microsoft/phi-1_5"
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")

In [16]:
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path = "microsoft/phi-1_5",
    add_eos_token=True,
    add_bos_token=True,
    use_fast=False
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

### Lora

In [17]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [18]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=32,
    lora_alpha=32,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)

In [19]:
model = accelerator.prepare_model(model)

### Training

In [20]:
output_dir = './phi_1_5_first_run'

In [21]:
training_arguments = TrainingArguments(
        output_dir = output_dir, 
        per_device_train_batch_size = 16,
        gradient_accumulation_steps = 1,
        
        warmup_steps = 5,
        learning_rate = 2e-5,
    
        #Keep precision at 32bit for training
        bf16 = True,
        fp16 = False, 
        #Perform eval every save_steps
        do_eval = True,
        #Saving based on number of steps
        save_strategy = 'steps',
        evaluation_strategy = 'steps',
        #Number of steps model checkpoints will be saved and evaluated
        save_steps = 1000,
        eval_steps = 1000,
        #Start reporting loss
        logging_steps = 1,
        #Use paged optimizer for memory efficiency
        optim = "paged_adamw_8bit", 
        #2 epochs
        num_train_epochs = 2, 
    )

In [27]:
training_arguments_test = TrainingArguments(
        output_dir = output_dir, 
        per_device_train_batch_size = 16,
        gradient_accumulation_steps = 1,
        
        warmup_steps = 5,
        learning_rate = 2e-5,
    
        #Keep precision at 32bit for training
        bf16 = True,
        fp16 = False, 
        #Perform eval every save_steps
        do_eval = True,
        #Saving based on number of steps
        save_strategy = 'steps',
        evaluation_strategy = 'steps',
        #Number of steps model checkpoints will be saved and evaluated
        save_steps = 10,
        eval_steps = 5,
        max_steps = 30,
        #Start reporting loss
        logging_steps = 1,
        #Use paged optimizer for memory efficiency
        optim = "paged_adamw_8bit"
        #2 epochs
    )

In [22]:
val_dataset = validation_dataset.shuffle(seed=42).select(range(10000))

In [23]:
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset= val_dataset,
    dataset_text_field = "text",
    max_seq_length = 1024,
    args = training_arguments,
    peft_config = config,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:34<00:00, 286.05 examples/s]
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss
1000,0.5289,0.646747


