In [1]:
from google.colab import drive
drive.mount('/content/drive')
import sys
sys.path.append('/content/drive/MyDrive/projet')
import os
os.chdir('/content/drive/MyDrive/projet')

Mounted at /content/drive


In [2]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━[0m [32m1.0/1.2 MB[0m [31m32.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.8.0


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from gpt_download import download_and_load_gpt2
from previous_labs import (
    create_dataloader_v1,
    calc_loss_loader,
    generate,
    GPTModel,
    load_weights_into_gpt,
    text_to_token_ids,
    train_model_simple,
    token_ids_to_text,
)
import tiktoken
import torch

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.cuda.empty_cache()
GPT_124M = {
        "vocab_size": 50257,     # Vocabulary size
        "context_length": 1024,  # Context length
        "drop_rate": 0.0,        # Dropout rate
        "qkv_bias": True,         # Query-key-value bias
        "emb_dim": 768,
        "n_layers": 12,
        "n_heads": 12
    }

model_size = '124M'
settings, params = download_and_load_gpt2(model_size=model_size, models_dir="gpt2")
model = GPTModel(GPT_124M)
load_weights_into_gpt(model, params)
model.eval()
model.to(device)
tokenizer = tiktoken.get_encoding("gpt2")

File already exists and is up-to-date: gpt2/124M/checkpoint
File already exists and is up-to-date: gpt2/124M/encoder.json
File already exists and is up-to-date: gpt2/124M/hparams.json
File already exists and is up-to-date: gpt2/124M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2/124M/model.ckpt.index
File already exists and is up-to-date: gpt2/124M/model.ckpt.meta
File already exists and is up-to-date: gpt2/124M/vocab.bpe


In [5]:
!cd /content/drive/MyDrive/projet
df = pd.read_json("final_dataset_clean.json")
df.shape
df = df.sample(frac=0.5, random_state=42)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22170 entries, 1162 to 31430
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   instruction  22170 non-null  object
 1   response     22170 non-null  object
dtypes: object(2)
memory usage: 519.6+ KB


I choosed a finance dataset, who can help me in everyday questions on what is safe to do on financial subject.
Dataset found on Hugging face, https://huggingface.co/datasets/gbharti/wealth-alpaca_lora

In [7]:
#Refine into <instruction, response>, the dataset is already using Alpaca-style
df = df.rename(columns={'output': 'response'})

In [8]:
df.head()

Unnamed: 0,instruction,response
1162,When should I walk away from my mortgage?,"This is a very personal situation of course, b..."
2279,What makes a Company's Stock prices go up or d...,I always liked the answer that in the short te...
42511,Describe the key features of a REST API.,A REST API is a type of API architecture that ...
40674,Generate a response that conveys an appropriat...,I apologize for the misunderstanding. Is there...
26379,Name three sciences related to computing.,Three sciences related to computing are comput...


In [9]:
import json
from torch.utils.data import Dataset,DataLoader

In [10]:
df.head()

Unnamed: 0,instruction,response
1162,When should I walk away from my mortgage?,"This is a very personal situation of course, b..."
2279,What makes a Company's Stock prices go up or d...,I always liked the answer that in the short te...
42511,Describe the key features of a REST API.,A REST API is a type of API architecture that ...
40674,Generate a response that conveys an appropriat...,I apologize for the misunderstanding. Is there...
26379,Name three sciences related to computing.,Three sciences related to computing are comput...


In [11]:
def format_input(entry):
    instruction_text = f"### Instruction:\n{entry['instruction']}\n\n### Input:\n"
    return instruction_text

In [12]:
def collate_fn(
    batch,
    pad_token_id=50256,
    ignore_index=-100,
    allowed_max_length=1024,
    device = 'cpu'
):
    # Find the longest sequence in the batch
    batch_max_length = max(len(item)+1 for item in batch)

    # Pad and prepare inputs and targets
    inputs_lst, targets_lst = [], []

    for item in batch:
        new_item = item.copy()
        # Add an <|endoftext|> token
        new_item += [pad_token_id]
        # Pad sequences to max_length
        padded = new_item + [pad_token_id] * (batch_max_length - len(new_item))
        inputs = torch.tensor(padded[:-1])  # Truncate the last token for inputs
        targets = torch.tensor(padded[1:])  # Shift +1 to the right for targets

        # New: Replace all but the first padding tokens in targets by ignore_index
        mask = targets == pad_token_id
        indices = torch.nonzero(mask).squeeze()
        if indices.numel() > 1:
            targets[indices[1:]] = ignore_index

        # New: Optionally truncate to maximum sequence length
        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]

        inputs_lst.append(inputs)
        targets_lst.append(targets)

    # Convert list of inputs and targets to tensors and transfer to target device
    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)

    return inputs_tensor, targets_tensor

In [13]:
import random
#df.to_json("final_dataset_clean2.json")
with open("final_dataset_clean.json", "r", encoding="utf-8") as file:
    data = json.load(file)

#data = random.sample(data, k=len(data) // 2)
train_size = int(0.8 * len(data))
train_df = data[:train_size]
valid_df = data[train_size:]

class InstructionDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data

        # Pre-tokenize texts
        self.encoded_texts = []
        for entry in data:
            instruction_plus_input = format_input(entry)
            response_text = f"\n\n### Response:\n{entry['response']}"
            full_text = instruction_plus_input + response_text
            self.encoded_texts.append(
                tokenizer.encode(full_text)
            )

    def __getitem__(self, index):
        return self.encoded_texts[index]

    def __len__(self):
        return len(self.data)

CustomDataset = InstructionDataset
train_dataset = CustomDataset(train_df, tokenizer)
valid_dataset = CustomDataset(valid_df, tokenizer)

In [14]:
torch.manual_seed(123)
from torch.nn.utils.rnn import pad_sequence
train_loader = DataLoader(
    train_dataset,
    batch_size=2,
    drop_last=True,
    shuffle=True,
    num_workers=0,
    collate_fn=lambda batch: collate_fn(batch,device = device)
)

val_loader = DataLoader(
    valid_dataset,
    batch_size=2,
    drop_last=False,
    shuffle=False,
    num_workers=0,
    collate_fn=lambda batch: collate_fn(batch,device = device)
)

In [15]:
train_loss = calc_loss_loader(train_loader, model, device, num_batches=5)
val_loss = calc_loss_loader(val_loader, model, device, num_batches=5)

print("train_loss : ",train_loss)
print("val_loss : ", val_loss)

train_loss :  3.5319506168365478
val_loss :  3.096492338180542


In [16]:
optimizer = torch.optim.AdamW(model.parameters(), lr=0.00005, weight_decay=0.1)

In [17]:
train_losses, val_losses, tokens_seen = train_model_simple(
        model, train_loader, val_loader, optimizer, device,
        num_epochs=2, eval_freq=50, eval_iter=50,
        start_context=format_input(valid_df[0]), tokenizer=tokenizer
    )

Ep 1 (Step 000000): Train loss 3.485, Val loss 3.176
Ep 1 (Step 000050): Train loss 2.886, Val loss 2.163
Ep 1 (Step 000100): Train loss 2.700, Val loss 2.120
Ep 1 (Step 000150): Train loss 2.716, Val loss 2.107
Ep 1 (Step 000200): Train loss 2.780, Val loss 2.080
Ep 1 (Step 000250): Train loss 2.789, Val loss 2.077
Ep 1 (Step 000300): Train loss 2.761, Val loss 2.074
Ep 1 (Step 000350): Train loss 2.806, Val loss 2.074
Ep 1 (Step 000400): Train loss 2.666, Val loss 2.051
Ep 1 (Step 000450): Train loss 2.724, Val loss 2.039
Ep 1 (Step 000500): Train loss 2.650, Val loss 2.035
Ep 1 (Step 000550): Train loss 2.798, Val loss 2.036
Ep 1 (Step 000600): Train loss 2.763, Val loss 2.026
Ep 1 (Step 000650): Train loss 2.730, Val loss 2.026
Ep 1 (Step 000700): Train loss 2.687, Val loss 2.023
Ep 1 (Step 000750): Train loss 2.643, Val loss 2.008
Ep 1 (Step 000800): Train loss 2.654, Val loss 2.015
Ep 1 (Step 000850): Train loss 2.619, Val loss 2.018
Ep 1 (Step 000900): Train loss 2.709, Val loss

In [18]:
print(f"Taille du dataset d'entraînement : {len(train_dataset)}")
print(f"Batch size utilisé : {train_loader.batch_size}")


Taille du dataset d'entraînement : 35472
Batch size utilisé : 2


In [19]:
train_loss = calc_loss_loader(train_loader, model, device, num_batches=5)
val_loss = calc_loss_loader(val_loader, model, device, num_batches=5)

print("train_loss after finetuning : ",train_loss)
print("val_loss after finetuning : ", val_loss)

train_loss after finetuning :  2.4190662145614623
val_loss after finetuning :  1.8173839092254638
