In [14]:
import numpy as np
import pandas as pd

from datasets import Dataset, load_dataset

df = pd.read_csv("train_MK_books.csv")

train = Dataset.from_pandas(df)

In [4]:
train

Dataset({
    features: ['text'],
    num_rows: 4212
})

In [50]:
np.random.seed(42)
rand_ixs = np.random.randint(0, len(train), 400)
sample_test = train.select(rand_ixs)
sample_test, rand_ixs

(Dataset({
     features: ['text'],
     num_rows: 400
 }),
 array([ 860, 3772, 3092,  466, 3444, 3171, 2919,  130, 1685,  769, 2391,
        2433, 1184, 3385, 4117, 2904,  474, 1082, 2558, 2047, 2747,  975,
        1806,  189, 2734, 3005, 1899, 1267, 1528, 3202, 3556, 3890,  646,
        2888, 2435,  600, 2363, 2061,  241, 2041, 2824, 2612, 1363, 1478,
        2556,  775, 4014,   34, 3152, 1955, 1585, 3943, 3073, 1021, 3461,
        2613, 3843, 1500,  161, 1981,  995, 3342, 3798, 1275, 1016,  337,
         878, 1076, 3993,  379,  492, 2062, 3884,   64, 2568, 2027, 2695,
        1495,  391, 3561, 2278, 3099,  200, 3104, 2454, 3645,  804, 2731,
        2773, 1570, 2690, 3840, 1028,  502,  870,  206, 1484,  863, 2790,
         563, 4191, 1757, 1678, 3242, 1059, 1722, 3314, 3157, 2625, 2729,
        1597, 3060, 2693, 3627, 1363, 1981, 1663, 1529, 2038, 3302, 2237,
        1306, 4029, 2675, 1282,  709, 3748,  663, 1998, 3445, 3743, 1495,
        3304, 3763, 1853, 1291, 3581, 3457, 1636, 36

In [51]:
test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")


Downloading readme:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/733k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [52]:
test

Dataset({
    features: ['text'],
    num_rows: 4358
})

In [54]:
# encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt")


In [61]:
"\n\n".join(test["text"][:20])

'\n\n = Robert Boulter = \n\n\n\n\n Robert Boulter is an English film , television and theatre actor . He had a guest @-@ starring role on the television series The Bill in 2000 . This was followed by a starring role in the play Herons written by Simon Stephens , which was performed in 2001 at the Royal Court Theatre . He had a guest role in the television series Judge John Deed in 2002 . In 2004 Boulter landed a role as " Craig " in the episode " Teddy \'s Story " of the television series The Long Firm ; he starred alongside actors Mark Strong and Derek Jacobi . He was cast in the 2005 theatre productions of the Philip Ridley play Mercury Fur , which was performed at the Drum Theatre in Plymouth and the Menier Chocolate Factory in London . He was directed by John Tiffany and starred alongside Ben Whishaw , Shane Zaza , Harry Kent , Fraser Ayres , Sophie Stanton and Dominic Hall . \n\n\n In 2006 , Boulter starred alongside Whishaw in the play Citizenship written by Mark Ravenhill . He 

In [64]:
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from random import randrange
from peft import LoraConfig, get_peft_model, AutoPeftModelForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments

from tqdm import tqdm


df = pd.read_csv("train_MK_books.csv")

train = Dataset.from_pandas(df)

model_id = "ilufy/meta-llama-2-7b-mk-physics-domain-tuned-2500"

# Get the type
compute_dtype = getattr(torch, "float16")

# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype
)


# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"


# Load the pretrained model
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             quantization_config=bnb_config,
                                             device_map="auto")


# Select random indices for metric evaluation
np.random.seed(42)
rand_ixs = np.random.randint(0, len(train), 400)
sample_test = train.select(rand_ixs)

print("Test data: ", sample_test, rand_ixs)

encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt")


max_length = model.config.max_length
stride = 512
seq_len = encodings.input_ids.size(1)
device = "cuda"

nlls = []
prev_end_loc = 0
for begin_loc in tqdm(range(0, seq_len, stride)):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)

        # loss is calculated using CrossEntropyLoss which averages over valid labels
        # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
        # to the left by 1.
        neg_log_likelihood = outputs.loss

    nlls.append(neg_log_likelihood)

    prev_end_loc = end_loc
    if end_loc == seq_len:
        break

ppl = torch.exp(torch.stack(nlls).mean())

print("[Preplexity]: ppl = ", ppl)





tokenizer_config.json:   0%|          | 0.00/1.71k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/677 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/3.59G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



generation_config.json:   0%|          | 0.00/183 [00:00<?, ?B/s]

Test data:  Dataset({
    features: ['text'],
    num_rows: 400
}) [ 860 3772 3092  466 3444 3171 2919  130 1685  769 2391 2433 1184 3385
 4117 2904  474 1082 2558 2047 2747  975 1806  189 2734 3005 1899 1267
 1528 3202 3556 3890  646 2888 2435  600 2363 2061  241 2041 2824 2612
 1363 1478 2556  775 4014   34 3152 1955 1585 3943 3073 1021 3461 2613
 3843 1500  161 1981  995 3342 3798 1275 1016  337  878 1076 3993  379
  492 2062 3884   64 2568 2027 2695 1495  391 3561 2278 3099  200 3104
 2454 3645  804 2731 2773 1570 2690 3840 1028  502  870  206 1484  863
 2790  563 4191 1757 1678 3242 1059 1722 3314 3157 2625 2729 1597 3060
 2693 3627 1363 1981 1663 1529 2038 3302 2237 1306 4029 2675 1282  709
 3748  663 1998 3445 3743 1495 3304 3763 1853 1291 3581 3457 1636 3696
 2999 3152  698 2160 4097  854 3474 1707 2777 1733 3510  202 3255  766
 2327 2931  197 1930 3582  608 3272 1147 3397 2511 1794  659 2811 1369
 1986  146 3219 2911 1734 1843  488 2976 1959 2385 2919 1802 4061 3369
  262  623

In [1]:
from datasets import load_dataset

test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")


In [68]:
model.config.max_length

20

In [3]:
import datasets
datasets.__version__


'2.19.1'

In [5]:
help(datasets.load_dataset)

Help on function load_dataset in module datasets.load:

load_dataset(path: str, name: Optional[str] = None, data_dir: Optional[str] = None, data_files: Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]], NoneType] = None, split: Union[str, datasets.splits.Split, NoneType] = None, cache_dir: Optional[str] = None, features: Optional[datasets.features.features.Features] = None, download_config: Optional[datasets.download.download_config.DownloadConfig] = None, download_mode: Union[datasets.download.download_manager.DownloadMode, str, NoneType] = None, verification_mode: Union[datasets.utils.info_utils.VerificationMode, str, NoneType] = None, ignore_verifications='deprecated', keep_in_memory: Optional[bool] = None, save_infos: bool = False, revision: Union[str, datasets.utils.version.Version, NoneType] = None, token: Union[bool, str, NoneType] = None, use_auth_token='deprecated', task='deprecated', streaming: bool = False, num_proc: Optional[int] = None, storage_options: Opt

In [66]:
encodings

{'input_ids': tensor([[    1, 29871,    13,  ...,    13,    13,    13]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]])}