# Data preparation

In [1]:
import pandas as pd
import datasets

from pprint import pprint
from transformers import AutoTokenizer

### Tokenizing text

In [2]:
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
text = "Hi, how are you?"

In [6]:
encoded_text = tokenizer(text)["input_ids"]

In [7]:
encoded_text

[12764, 13, 849, 403, 368, 32]

In [8]:
decoded_text = tokenizer.decode(encoded_text)
print("Decoded tokens back into text: ", decoded_text)


Decoded tokens back into text:  Hi, how are you?


### Tokenize multiple texts at once

In [9]:
list_texts = ["Hi, how are you?", "I'm good", "Yes"]
encoded_texts = tokenizer(list_texts)
print("Encoded several texts: ", encoded_texts["input_ids"])

Encoded several texts:  [[12764, 13, 849, 403, 368, 32], [42, 1353, 1175], [4374]]


In [10]:
list_texts = ["Hi, how are you?", "I'm good", "Yes"]
tokenizer.pad_token = tokenizer.eos_token
encoded_texts = tokenizer(list_texts,       
                          return_tensors="pt",
                          truncation=True,
                          padding=True,
                          max_length=20)
print("Encoded several texts: ", encoded_texts["input_ids"])

Encoded several texts:  tensor([[12764,    13,   849,   403,   368,    32],
        [   42,  1353,  1175,     0,     0,     0],
        [ 4374,     0,     0,     0,     0,     0]])


In [11]:
decoded_texts = tokenizer.batch_decode(encoded_texts["input_ids"], skip_special_tokens=True)
print("Decoded several texts: ", decoded_texts)

Decoded several texts:  ['Hi, how are you?', "I'm good", 'Yes']


### Padding and truncation

In [12]:
tokenizer.pad_token = tokenizer.eos_token 
encoded_texts_longest = tokenizer(list_texts, padding=True)
print("Using padding: ", encoded_texts_longest["input_ids"])

Using padding:  [[12764, 13, 849, 403, 368, 32], [42, 1353, 1175, 0, 0, 0], [4374, 0, 0, 0, 0, 0]]


In [13]:
encoded_texts_truncation = tokenizer(list_texts, max_length=3, truncation=True)
print("Using truncation: ", encoded_texts_truncation["input_ids"])

Using truncation:  [[12764, 13, 849], [42, 1353, 1175], [4374]]


In [14]:
tokenizer.truncation_side = "left"
encoded_texts_truncation_left = tokenizer(list_texts, max_length=3, truncation=True)
print("Using left-side truncation: ", encoded_texts_truncation_left["input_ids"])

Using left-side truncation:  [[403, 368, 32], [42, 1353, 1175], [4374]]


In [15]:
encoded_texts_both = tokenizer(list_texts, max_length=3, truncation=True, padding=True)
print("Using both padding and truncation: ", encoded_texts_both["input_ids"])

Using both padding and truncation:  [[403, 368, 32], [42, 1353, 1175], [4374, 0, 0]]


### Prepare instruction dataset

### Tokenize a single example

In [18]:
help(load_dataset)

Help on function load_dataset in module datasets.load:

load_dataset(path: str, name: Optional[str] = None, data_dir: Optional[str] = None, data_files: Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]], NoneType] = None, split: Union[str, datasets.splits.Split, NoneType] = None, cache_dir: Optional[str] = None, features: Optional[datasets.features.features.Features] = None, download_config: Optional[datasets.download.download_config.DownloadConfig] = None, download_mode: Union[datasets.download.download_manager.DownloadMode, str, NoneType] = None, verification_mode: Union[datasets.utils.info_utils.VerificationMode, str, NoneType] = None, ignore_verifications='deprecated', keep_in_memory: Optional[bool] = None, save_infos: bool = False, revision: Union[str, datasets.utils.version.Version, NoneType] = None, use_auth_token: Union[bool, str, NoneType] = None, task: Union[str, datasets.tasks.base.TaskTemplate, NoneType] = None, streaming: bool = False, num_proc: Optional[int

In [2]:
import pandas as pd

In [6]:
import pyarrow.parquet as pq

In [9]:
from datasets import load_dataset

In [15]:
dataset = load_dataset("E:\AI and Data Science\Coursera\GenAI\FineTunning LLMs\yahoo_answers_qa\default-b8a95242bac33932\1.0.0\62f63c2dc317317049c5a213c97370fe2989ead076488347df250a4b35da10d7")

FileNotFoundError: Couldn't find a dataset script at E:\AI and Data Science\Coursera\GenAI\FineTunning LLMs\yahoo_answers_qa\default-b8a95242bac33932.0.02f63c2dc317317049c5a213c97370fe2989ead076488347df250a4b35da10d7\default-b8a95242bac33932.0.02f63c2dc317317049c5a213c97370fe2989ead076488347df250a4b35da10d7.py or any data file in the same directory.

In [7]:
pq.read_table(

ArrowInvalid: Could not open Parquet input source 'yahoo_answers_qa/default-b8a95242bac33932/1.0.0/62f63c2dc317317049c5a213c97370fe2989ead076488347df250a4b35da10d7/yahoo_answers_qa-train.arrow': Parquet magic bytes not found in footer. Either the file is corrupted or this is not a parquet file.

In [None]:
import pandas as pd

filename = "lamini_docs.jsonl"
instruction_dataset_df = pd.read_json(filename, lines=True)
examples = instruction_dataset_df.to_dict()

if "question" in examples and "answer" in examples:
  text = examples["question"][0] + examples["answer"][0]
elif "instruction" in examples and "response" in examples:
  text = examples["instruction"][0] + examples["response"][0]
elif "input" in examples and "output" in examples:
  text = examples["input"][0] + examples["output"][0]
else:
  text = examples["text"][0]

prompt_template = """### Question:
{question}

### Answer:"""

num_examples = len(examples["question"])
finetuning_dataset = []
for i in range(num_examples):
  question = examples["question"][i]
  answer = examples["answer"][i]
  text_with_prompt_template = prompt_template.format(question=question)
  finetuning_dataset.append({"question": text_with_prompt_template, "answer": answer})

from pprint import pprint
print("One datapoint in the finetuning dataset:")
pprint(finetuning_dataset[0])

In [None]:
text = finetuning_dataset[0]["question"] + finetuning_dataset[0]["answer"]
tokenized_inputs = tokenizer(
    text,
    return_tensors="np",
    padding=True
)
print(tokenized_inputs["input_ids"])

1)
print(finetuning_dataset_loaded)

Dataset({
    features: ['question', 'answer'],
    num_rows: 1400

2)
finetuning_dataset_loaded[0]

{'question': "What are the different types of documents available in the repository (e.g., installation guide, API documentation, developer's guide)?",
 'answer': 'Lamini has documentation on Getting Started, Authentication, Question Answer Model, Python Library, Batching, Error Handling, Advanced topics, and class documentation on LLM Engine available at https://lamini-ai.github.io/.'

3)
print(tokenized_dataset)

Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask'],
    nu:1400python
})}
})

In [None]:
max_length = 2048
max_length = min(
    tokenized_inputs["input_ids"].shape[1],
    max_length,
)

In [None]:
tokenized_inputs = tokenizer(
    text,
    return_tensors="np",
    truncation=True,
    max_length=max_length
)

In [None]:
tokenized_inputs["input_ids"]

### Tokenize the instruction dataset

In [None]:
def tokenize_function(examples):
    if "question" in examples and "answer" in examples:
      text = examples["question"][0] + examples["answer"][0]
    elif "input" in examples and "output" in examples:
      text = examples["input"][0] + examples["output"][0]
    else:
      text = examples["text"][0]

    tokenizer.pad_token = tokenizer.eos_token
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        padding=True,
    )

    max_length = min(
        tokenized_inputs["input_ids"].shape[1],
        2048
    )
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=max_length
    )

    return tokenized_inputs

In [None]:
finetuning_dataset_loaded = datasets.load_dataset("json", data_files=filename, split="train")

tokenized_dataset = finetuning_dataset_loaded.map(
    tokenize_function,
    batched=True,
    batch_size=1,
    drop_last_batch=True
)

print(tokenized_dataset)

In [None]:
tokenized_dataset = tokenized_dataset.add_column("labels", tokenized_dataset["input_ids"])

### Prepare test/train splits

In [None]:
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, shuffle=True, seed=123)
print(split_dataset)

### Some datasets for you to try

In [None]:
finetuning_dataset_path = "lamini/lamini_docs"
finetuning_dataset = datasets.load_dataset(finetuning_dataset_path)
print(finetuning_dataset)

In [None]:
taylor_swift_dataset = "lamini/taylor_swift"
bts_dataset = "lamini/bts"
open_llms = "lamini/open_llms"

In [None]:
dataset_swiftie = datasets.load_dataset(taylor_swift_dataset)
print(dataset_swiftie["train"][1])

In [None]:
# This is how to push your own dataset to your Huggingface hub
# !pip install huggingface_hub
# !huggingface-cli login
# split_dataset.push_to_hub(dataset_path_hf)