In [6]:
import json
import re

def process_conversation(text):
    # Remove example header (e.g., "# Example 1 (normal)")
    text = re.sub(r'# Example \d+ \((normal|multi-turn)\)\n', '', text)
    # Split into lines
    lines = text.strip().split('\n')
    formatted = []
    for line in lines:
        if line.startswith("User: "):
            formatted.append(f"<|endoftext|>USER:{line[6:]}")
        elif line.startswith("Bot: "):
            formatted.append(f"<|endoftext|>BOT:{line[5:]}")
    # Join with no extra spaces and add end token
    return ''.join(formatted) + "<|endoftext|>"

def convert_txt_to_jsonl(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as f:
        content = f.read()
        examples = re.split(r'\n(?=# Example)', content)

    with open(output_file, 'w', encoding='utf-8') as f_out:
        for example in examples:
            if example.strip():
                formatted_text = process_conversation(example)
                f_out.write(json.dumps({"text": formatted_text}) + '\n')


input_file = "/content/drive/MyDrive/ztrios-project/shop_chat.txt"
output_file = "/content/drive/MyDrive/ztrios-project/shop_chat.jsonl"
convert_txt_to_jsonl(input_file, output_file)

In [2]:
!pip install transformers



In [4]:
!pip install datasets torch

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [1]:
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer


tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


dataset = load_dataset(
    "json",
    data_files="/content/shop_chat.jsonl",
    split="train",
    cache_dir=None
)


train_size = int(0.8 * len(dataset))  # 8 examples
val_size = len(dataset) - train_size  # 2 examples
train_dataset = dataset.select(range(train_size))
val_dataset = dataset.select(range(train_size, len(dataset)))


dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset
})


def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        max_length=256,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )


tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)

tokenized_datasets = tokenized_datasets.remove_columns(["text"])

def add_labels(examples):
    examples["labels"] = examples["input_ids"].copy()
    return examples


tokenized_datasets = tokenized_datasets.map(add_labels, batched=True)


tokenized_datasets.save_to_disk("/content/drive/MyDrive/ztrios-project")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/8 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2 [00:00<?, ? examples/s]

In [11]:
!rm -rf ~/.cache/huggingface/datasets
!rm -rf /content/drive/MyDrive/ztrios-project/cache

In [2]:
!cp "/content/drive/MyDrive/ztrios-project/shop_chat.jsonl" "/content/shop_chat.jsonl"

In [4]:
import os
print(os.path.exists("/content/shop_chat.jsonl"))

True


In [7]:
!pip install --upgrade datasets transformers

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency r

In [2]:
from datasets import load_from_disk


dataset_dict = load_from_disk("/content/drive/MyDrive/ztrios-project")

print(dataset_dict)

print("Train example:", dataset_dict["train"][0])
print("Validation example:", dataset_dict["validation"][0])

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 8
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2
    })
})
Train example: {'input_ids': [50256, 29904, 25, 5211, 345, 3677, 19450, 3056, 30, 50256, 33, 2394, 25, 5297, 11, 356, 423, 262, 867, 3858, 286, 19450, 3056, 287, 674, 4283, 13, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50

In [3]:
print("Train dataset features:", dataset_dict["train"].features)
print("Train dataset size:", len(dataset_dict["train"]))
print("Validation dataset size:", len(dataset_dict["validation"]))

Train dataset features: {'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}
Train dataset size: 8
Validation dataset size: 2


In [4]:
print("Pad token:", tokenizer.pad_token)
print("Pad token ID:", tokenizer.pad_token_id)

Pad token: <|endoftext|>
Pad token ID: 50256


In [5]:
for split in ["train", "validation"]:
    for i, example in enumerate(dataset_dict[split]):
        if not example["input_ids"] or len(example["input_ids"]) == 0:
            print(f"Empty input_ids in {split} at index {i}")

In [6]:
for split in ["train", "validation"]:
    lengths = [len(example["input_ids"]) for example in dataset_dict[split]]
    print(f"{split} input_ids lengths: {set(lengths)}")

train input_ids lengths: {256}
validation input_ids lengths: {256}
