In [4]:
import itertools
from pathlib import Path
from pprint import pprint

import jsonlines
import pandas as pd
from datasets import Dataset, load_dataset

In [2]:
# Open the Alpaca training split in streaming mode so records are pulled
# lazily as the dataset is iterated.
instruction_tuned_dataset = load_dataset(
    path="tatsu-lab/alpaca",
    split="train",
    streaming=True,
)

Downloading readme:   0%|          | 0.00/7.47k [00:00<?, ?B/s]

In [5]:
m = 10  # number of leading records to take from the dataset
print("Instruction-tuned dataset:")
# islice stops after m records, so only the head of the iterable is consumed.
top_m = list(itertools.islice(instruction_tuned_dataset, m))
# Show just the first record as a sample. Index it directly instead of using a
# loop-and-break; the guard keeps an empty result from raising IndexError.
if top_m:
    pprint(top_m[0])

Instruction-tuned dataset:
{'input': '',
 'instruction': 'Give three tips for staying healthy.',
 'output': '1.Eat a balanced diet and make sure to include plenty of fruits '
           'and vegetables. \n'
           '2. Exercise regularly to keep your body active and strong. \n'
           '3. Get enough sleep and maintain a consistent sleep schedule.',
 'text': 'Below is an instruction that describes a task. Write a response that '
         'appropriately completes the request.\n'
         '\n'
         '### Instruction:\n'
         'Give three tips for staying healthy.\n'
         '\n'
         '### Response:\n'
         '1.Eat a balanced diet and make sure to include plenty of fruits and '
         'vegetables. \n'
         '2. Exercise regularly to keep your body active and strong. \n'
         '3. Get enough sleep and maintain a consistent sleep schedule.'}


Prompt Templates

In [6]:
# Prompt used for records whose "input" field is non-empty. The {instruction}
# and {input} placeholders are filled with str.format in process_data, and the
# text stops at "### Response:" with nothing after it.
# NOTE: the line break inside the first sentence (and the space before it) is
# part of the literal and therefore part of the generated prompt.
prompt_template_with_input = """Below is an instruction that describes a task, paired with an input 
that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:"""

# Prompt used for records whose "input" field is empty. Only {instruction} is
# substituted. As above, the trailing space and line break after "task." are
# part of the literal and appear verbatim in the formatted prompt.
prompt_template_without_input = """Below is an instruction that describes a task. 
Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:"""

In [13]:
def process_data(data_dict: dict) -> dict:
    """Turn one raw record into a prompt/response pair.

    Args:
        data_dict: Mapping with "instruction", "input" and "output" keys.

    Returns:
        A dict with two keys: "input" holds the formatted prompt (built from
        ``prompt_template_with_input`` when the record's "input" is truthy,
        otherwise from ``prompt_template_without_input``), and "output" holds
        the record's "output" value unchanged.

    Raises:
        KeyError: If any of the three expected keys is missing.
    """
    extra_context = data_dict["input"]
    fields = {"instruction": data_dict["instruction"]}

    if extra_context:
        template = prompt_template_with_input
        fields["input"] = extra_context
    else:
        template = prompt_template_without_input

    return {"input": template.format(**fields), "output": data_dict["output"]}

In [16]:
# Format every sampled record into a prompt/response pair.
processed_data = [process_data(record) for record in top_m]

# Inspect the first formatted example.
pprint(processed_data[0])

{'input': 'Below is an instruction that describes a task. \n'
          'Write a response that appropriately completes the request.\n'
          '\n'
          '### Instruction:\n'
          'Give three tips for staying healthy.\n'
          '\n'
          '### Response:',
 'output': '1.Eat a balanced diet and make sure to include plenty of fruits '
           'and vegetables. \n'
           '2. Exercise regularly to keep your body active and strong. \n'
           '3. Get enough sleep and maintain a consistent sleep schedule.'}


Save data to JSON Lines

In [17]:
# Write the processed records to a JSON Lines file.
output_path = Path("json") / "alpaca_processed.jsonl"
# Create the target directory first so the cell also works on a fresh checkout
# where "json/" does not exist yet.
output_path.parent.mkdir(parents=True, exist_ok=True)
with jsonlines.open(output_path, mode="w") as writer:
    writer.write_all(processed_data)

In [18]:
# Build a Dataset from the processed records (via a pandas DataFrame) and
# upload it to the Hub repository "alliwene/test-alpaca".
# `pd` and `Dataset` come from the notebook's top import cell.
# NOTE(review): push_to_hub presumably needs Hub write credentials for this
# namespace — confirm before re-running.
finetuning_dataset = Dataset.from_pandas(pd.DataFrame(data=processed_data))
finetuning_dataset.push_to_hub("alliwene/test-alpaca")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

In [26]:
# Load the uploaded dataset back (non-streaming) to check the round trip.
test = load_dataset(
    path="alliwene/test-alpaca",
    split="train",
)
test

Dataset({
    features: ['input', 'output'],
    num_rows: 10
})

In [23]:
# Spot-check one reloaded record; index 7 is an arbitrary pick.
sample_row = test[7]
pprint(sample_row)

{'input': 'Below is an instruction that describes a task. \n'
          'Write a response that appropriately completes the request.\n'
          '\n'
          '### Instruction:\n'
          'Write a short story in third person narration about a protagonist '
          'who has to make an important career decision.\n'
          '\n'
          '### Response:',
 'output': 'John was at a crossroads in his life. He had just graduated '
           'college and was now facing the big decision of what career to '
           'pursue. After much deliberation, he decided that he wanted to be '
           'an accountant and help the financially disadvantaged. He had '
           'always been good with numbers and enjoyed seeing the tangible '
           'results of his work. \n'
           '\n'
           'John enrolled in accounting courses and initially found it quite '
           'challenging. He had to learn multiple systems and regulations '
           'quickly, but he worked hard and eventu