# Dataset preprocessing

In [35]:
import pandas as pd
from itertools import chain
from llama_4bit_wrapper import import_llama, lora_model_zeros_and_scales_to_half
from tqdm import tqdm
import numpy as np
import os

In [2]:
# Register tqdm's pandas integration so that `Series.progress_apply` is available
# (used for the long-running tokenization pass later in this notebook).
tqdm.pandas()

In [3]:
# Seed passed to the sampling helpers so that the GPT4All subsample is reproducible.
RANDOM_STATE = 42

### Loading pre-trained model

In [4]:
# The result of `import_llama` is unpacked into nine values; only the 4-bit loader
# (3rd) and `apply_gradient_checkpointing` (7th) are bound, the rest are discarded.
# NOTE(review): `apply_gradient_checkpointing` is not used in any of the visible cells.
_, _, load_llama_model_4bit_low_ram, _, _, _, apply_gradient_checkpointing, _, _ = import_llama(
    use_flash_attention=False,
    use_xformers=False,
    autograd_4bit_cuda=False,
    autograd_4bit_triton=True,  # the cell output confirms the Triton implementation is selected
)

Using Triton implementation.


In [5]:
# Only the tokenizer is needed for preprocessing; the loaded model object is discarded.
_, tokenizer = load_llama_model_4bit_low_ram(
    config_path="../vicuna-13b-GPTQ-4bit-128g/",
    model_path="../vicuna-13b-GPTQ-4bit-128g/vicuna-13b-4bit-128g.safetensors",
    groupsize=128,
    is_v1_model=False,
)
# Use token id 0 for padding.
# NOTE(review): presumably id 0 is `<unk>` in the LLaMA vocabulary — confirm.
tokenizer.pad_token_id = 0

Loading Model ...


The safetensors archive passed at ../vicuna-13b-GPTQ-4bit-128g/vicuna-13b-4bit-128g.safetensors does not contain metadata. Make sure to save your model with the `save_pretrained` method. Defaulting to 'pt' metadata.


Loaded the model in 3.61 seconds.


### Loading data

In [6]:
# Message table: one row per message with its `text` and `role`.
# The first CSV column becomes the index; the `indices` lists of the session
# tables refer to these index labels.
df_texts = pd.read_csv("long-vicuna-set/texts.gz", compression="gzip", index_col=0)
df_texts.head()

Unnamed: 0,text,role
0,Can you write a short introduction about the r...,prompter
1,"""Monopsony"" refers to a market structure where...",assistant
2,Now explain it to a dog,prompter
3,Monopsony is a market structure in which there...,assistant
4,How can one fight back when a monospony had be...,prompter


In [7]:
# Training sessions: each row holds a list of `df_texts` index labels (`indices`)
# and the name of the dataset it came from (`source`).
# NOTE: unpickling can execute arbitrary code — only load files from a trusted origin.
df_indices_train = pd.read_pickle("long-vicuna-set/indices-train.pkl")
df_indices_train.head()

Unnamed: 0,indices,source
0,"[0, 1, 2]",openassistant
1,"[0, 3, 4]",openassistant
2,"[0, 5, 6, 7]",openassistant
3,"[0, 5, 6, 8]",openassistant
4,"[0, 5, 6, 9]",openassistant


In [8]:
# Validation sessions, same layout as the training table (`indices`, `source`).
# NOTE: unpickling can execute arbitrary code — only load files from a trusted origin.
df_indices_validation = pd.read_pickle("long-vicuna-set/indices-validation.pkl")
df_indices_validation.head()

Unnamed: 0,indices,source
0,"[82483, 82484]",openassistant
1,"[82483, 82485]",openassistant
2,"[82483, 82486]",openassistant
3,"[82487, 82488]",openassistant
4,"[82487, 82489, 82490, 82491]",openassistant


In [9]:
df_indices_train["source"].value_counts()

source
gpt4all          364864
alpaca            49401
openassistant     42711
govreport         18490
booksum           11031
qasper             6229
Name: count, dtype: int64

In [10]:
df_indices_train["source"].value_counts() / df_indices_train.shape[0]

source
gpt4all          0.740501
alpaca           0.100261
openassistant    0.086683
govreport        0.037526
booksum          0.022388
qasper           0.012642
Name: count, dtype: float64

In [11]:
df_indices_validation["source"].value_counts()

source
gpt4all          19204
alpaca            2601
openassistant     2166
qasper            1764
booksum           1484
govreport          973
Name: count, dtype: int64

In [12]:
df_indices_validation["source"].value_counts() / df_indices_validation.shape[0]

source
gpt4all          0.681186
alpaca           0.092260
openassistant    0.076830
qasper           0.062571
booksum          0.052639
govreport        0.034513
Name: count, dtype: float64

### Reducing gpt4all usage

Since GPT4All is a synthetic dataset, let's reduce its usage. It's good for "let's follow this instruction format" training, but we can't be sure about its quality.

In [13]:
def _get_dataset_fraction(df, source, fraction, random_state):
    df_filtered = df.loc[df["source"] == source]
    df_filtered = df_filtered.sample(int(df_filtered.shape[0] * fraction), random_state=random_state)
    return df_filtered

In [14]:
def _limit_dataset_fraction(df, source, fraction, random_state):
    """Downsample one ``source`` of ``df`` while keeping every other source intact.

    Args:
        df: DataFrame with a ``source`` column.
        source: Value of the ``source`` column whose rows are subsampled.
        fraction: Share of the ``source`` rows to keep (see
            ``_get_dataset_fraction`` for the rounding rule).
        random_state: Seed forwarded to the sampling helper.

    Returns:
        A DataFrame with all rows of the other sources plus the sampled rows
        of ``source``, ordered by index label.
    """
    other_sources = df.loc[df["source"] != source]
    sampled = _get_dataset_fraction(df, source, fraction, random_state)
    combined = pd.concat([other_sources, sampled])
    # `sort_index` restores index order. Unlike `combined.loc[sorted(combined.index)]`
    # it does not multiply rows when the index contains duplicate labels, and it
    # avoids a label lookup per row. A stable sort keeps ties in concat order.
    return combined.sort_index(kind="stable")

In [15]:
# Keep only 10% of the GPT4All sessions in both splits, using the same seed.
# Note: the frames are rebound to their own subsample, so re-running this cell
# shrinks the GPT4All share again.
df_indices_train, df_indices_validation = (
    _limit_dataset_fraction(split, "gpt4all", 0.1, RANDOM_STATE)
    for split in (df_indices_train, df_indices_validation)
)

In [16]:
df_indices_train["source"].value_counts()

source
alpaca           49401
openassistant    42711
gpt4all          36486
govreport        18490
booksum          11031
qasper            6229
Name: count, dtype: int64

In [17]:
df_indices_validation["source"].value_counts()

source
alpaca           2601
openassistant    2166
gpt4all          1920
qasper           1764
booksum          1484
govreport         973
Name: count, dtype: int64

### Removing unnecessary texts

In [18]:
# Sorted, de-duplicated ids of every message referenced by at least one
# training or validation session.
text_indices_to_process = sorted({
    text_index
    for session_indices in chain(df_indices_train["indices"], df_indices_validation["indices"])
    for text_index in session_indices
})
len(df_texts), len(text_indices_to_process)

(1040439, 347160)

In [19]:
# Keep only the messages that are still referenced by a session. `.loc` selects
# by index label and returns the rows in the (sorted) order of the given list.
df_texts = df_texts.loc[text_indices_to_process]
df_texts.head()

Unnamed: 0,text,role
0,Can you write a short introduction about the r...,prompter
1,"""Monopsony"" refers to a market structure where...",assistant
2,Now explain it to a dog,prompter
3,Monopsony is a market structure in which there...,assistant
4,How can one fight back when a monospony had be...,prompter


### Tokenization

In [20]:
# Sanity check: the tokenizer puts a leading `<s>` token in front of the input.
# That token is stripped with `[1:]` when the messages are tokenized later on.
tokenizer.convert_ids_to_tokens(
    tokenizer("TEST")["input_ids"]
)

['<s>', '▁TE', 'ST']

In [21]:
tokenizer.vocab_size

32000

In [22]:
# Number of distinct values a uint16 can hold: larger than the vocabulary size
# shown above, so token ids can be stored as `np.uint16`.
2 ** 16

65536

In [23]:
# Prefix every message with its role tag (e.g. "<msg_prompter> ") and keep only
# the resulting column; missing texts are treated as empty strings.
df_texts = df_texts.assign(
    processed_text=lambda frame: (
        "<msg_" + frame["role"] + "> " + frame["text"].fillna("").str.strip()
    )
)[["processed_text"]]

In [24]:
# Token ids are stored as `np.uint16`; fail loudly instead of risking overflow
# if the vocabulary ever exceeds what that dtype can represent.
# NOTE(review): `vocab_size` may not count tokens added after training — confirm.
assert tokenizer.vocab_size <= np.iinfo(np.uint16).max + 1, "token ids do not fit into uint16"

# Tokenize every message (rows without a processed text become empty sequences).
df_texts["input_ids"] = df_texts["processed_text"].fillna("").progress_apply(
    # `[1:]` drops the leading `<s>` token the tokenizer adds to every input.
    lambda text: np.array(tokenizer(text)["input_ids"][1:], dtype=np.uint16),
)

100%|██████████| 347160/347160 [23:52<00:00, 242.29it/s] 


In [25]:
df_texts.head()

Unnamed: 0,processed_text,input_ids
0,<msg_prompter> Can you write a short introduct...,"[529, 7645, 29918, 14032, 29886, 357, 29958, 1..."
1,"<msg_assistant> ""Monopsony"" refers to a market...","[529, 7645, 29918, 465, 22137, 29958, 376, 718..."
2,<msg_prompter> Now explain it to a dog,"[529, 7645, 29918, 14032, 29886, 357, 29958, 2..."
3,<msg_assistant> Monopsony is a market structur...,"[529, 7645, 29918, 465, 22137, 29958, 2598, 45..."
4,<msg_prompter> How can one fight back when a m...,"[529, 7645, 29918, 14032, 29886, 357, 29958, 1..."


### Adding lengths

In [26]:
# Number of tokens in each message.
df_texts["length"] = df_texts["input_ids"].map(len)
df_texts.head()

Unnamed: 0,processed_text,input_ids,length
0,<msg_prompter> Can you write a short introduct...,"[529, 7645, 29918, 14032, 29886, 357, 29958, 1...",50
1,"<msg_assistant> ""Monopsony"" refers to a market...","[529, 7645, 29918, 465, 22137, 29958, 376, 718...",351
2,<msg_prompter> Now explain it to a dog,"[529, 7645, 29918, 14032, 29886, 357, 29958, 2...",13
3,<msg_assistant> Monopsony is a market structur...,"[529, 7645, 29918, 465, 22137, 29958, 2598, 45...",238
4,<msg_prompter> How can one fight back when a m...,"[529, 7645, 29918, 14032, 29886, 357, 29958, 1...",22


In [33]:
# Mean, over all messages, of the characters-per-token ratio of the processed text.
(df_texts["processed_text"].str.len() / df_texts["length"]).mean()

3.7351822430642714

In [28]:
def _session_length(indices):
    """Return the summed token count of the messages listed in ``indices``."""
    return df_texts["length"][indices].sum()


# Total number of tokens per session, for both splits.
df_indices_train["session_length"] = df_indices_train["indices"].map(_session_length)
df_indices_validation["session_length"] = df_indices_validation["indices"].map(_session_length)

In [29]:
df_indices_train.head()

Unnamed: 0,indices,source,session_length
0,"[0, 1, 2]",openassistant,414
1,"[0, 3, 4]",openassistant,310
2,"[0, 5, 6, 7]",openassistant,426
3,"[0, 5, 6, 8]",openassistant,595
4,"[0, 5, 6, 9]",openassistant,334


In [30]:
df_indices_validation.head()

Unnamed: 0,indices,source,session_length
0,"[82483, 82484]",openassistant,302
1,"[82483, 82485]",openassistant,218
2,"[82483, 82486]",openassistant,79
3,"[82487, 82488]",openassistant,561
4,"[82487, 82489, 82490, 82491]",openassistant,546


In [31]:
df_indices_train.groupby("source")["session_length"].mean()

source
alpaca             141.629704
booksum           6992.120479
govreport        12217.592212
gpt4all            481.274407
openassistant      508.353656
qasper            5660.732541
Name: session_length, dtype: float64

In [32]:
df_indices_train.groupby("source")["session_length"].quantile(0.9)

source
alpaca             217.0
booksum          13749.0
govreport        21187.4
gpt4all           1064.0
openassistant     1039.0
qasper            8598.0
Name: session_length, dtype: float64

### Saving the processed set

In [36]:
# Write the tokenized messages and both session tables into one output folder.
OUTPUT_DIR = "long-vicuna-set-lessgpt4all-vicuna13b-processed"
os.makedirs(OUTPUT_DIR, exist_ok=True)
for file_name, frame in {
    "texts.pkl": df_texts,
    "indices-train.pkl": df_indices_train,
    "indices-validation.pkl": df_indices_validation,
}.items():
    frame.to_pickle(f"{OUTPUT_DIR}/{file_name}")