## Check GPU

In [1]:
import os

# Expose only GPU index 1 to CUDA. This is set before torch is imported so
# the setting is already in place when torch looks for devices.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1"})

import torch

# Use the GPU when torch reports a usable CUDA device, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

## Hugging Face Login

In [2]:
from huggingface_hub import login

# Read the access token from the environment instead of writing it into the
# notebook: assigning HF_TOKEN_INDIE here would overwrite a real token that
# was exported before the kernel started, and would store a secret in the file.
# An unset or empty variable becomes None, in which case login() asks for the
# token interactively rather than attempting to authenticate with "".
hf_token = os.environ.get("HF_TOKEN_INDIE") or None
login(token=hf_token)

## Load Teacher Model

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

TEACHER = 'mistralai/Mistral-7B-Instruct-v0.3'

# 4-bit NF4 quantization with double quantization; compute dtype is float16.
quant_settings = {
    "load_in_4bit": True,
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.float16,
}
bnb_config = BitsAndBytesConfig(**quant_settings)

# Load the teacher checkpoint with the quantization config above;
# device_map="auto" leaves device placement to the library.
teacher_model = AutoModelForCausalLM.from_pretrained(
    TEACHER,
    quantization_config=bnb_config,
    device_map="auto",
)

# The tokenizer comes from the same checkpoint as the model.
teacher_tokenizer = AutoTokenizer.from_pretrained(TEACHER)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

## Load Dataset

In [4]:
from datasets import load_dataset

# Half-open row window [LOW, HIGH) of the CSV that this run processes.
LOW, HIGH = 23000, 24000

raw_dataset = load_dataset('csv', data_files='sample/dialogueText.csv')

# Keep only the rows whose index falls inside the window.
train_split = raw_dataset['train']
raw_data = train_split.select(range(LOW, HIGH))

def format_prompt(row):
    """Build the teacher prompt for one dataset row.

    Args:
        row: Mapping with a 'text' entry holding the user's utterance.

    Returns:
        The string "User: <text>" followed by a newline and "Assistant:".
    """
    # Double quotes on the outside: reusing the same quote character inside an
    # f-string expression is a SyntaxError on Python versions before 3.12.
    return f"User: {row['text']}\nAssistant:"

## Generate Teacher Responses

In [5]:
def generate_teacher_response(prompt, max_new_tokens=128):
    """Generate the teacher model's continuation of ``prompt``.

    Args:
        prompt: Prompt text passed to the teacher tokenizer.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded text of the newly generated tokens only, with special
        tokens skipped and surrounding whitespace stripped.
    """
    # Put the inputs on the model's own device rather than a global device string.
    inputs = teacher_tokenizer(prompt, return_tensors="pt").to(teacher_model.device)
    prompt_len = inputs["input_ids"].shape[1]
    with torch.no_grad():
        outputs = teacher_model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            pad_token_id=teacher_tokenizer.eos_token_id,
        )
    # The returned sequence starts with the prompt tokens. Slicing them off by
    # length isolates the reply; splitting the decoded text on "Assistant:"
    # would drop the start of the reply whenever the model emits that marker.
    new_tokens = outputs[0][prompt_len:]
    return teacher_tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

## Distill Dataset

In [6]:
from datasets import Dataset
from tqdm import tqdm


# Query the teacher once per selected row, in dataset order, and keep each
# prompt together with the reply it produced.
examples = []
progress = tqdm(raw_data, desc="Distilling")
for row in progress:
    prompt = format_prompt(row)
    response = generate_teacher_response(prompt)
    examples.append(dict(prompt=prompt, response=response))

Distilling: 100%|██████████| 1000/1000 [2:14:01<00:00,  8.04s/it] 


## Save distilled dataset to disk

In [7]:
import pandas as pd

# The file name records the inclusive row range LOW..HIGH-1 that was distilled.
output_csv = f"sample/distilled_dataset_{LOW}_{HIGH-1}.csv"

# One row per example; the DataFrame index is not written to the file.
distilled_df = pd.DataFrame(examples)
distilled_df.to_csv(output_csv, index=False)