In [1]:
import pandas as pd

df = pd.read_csv('sample/ubuntu_context_1000.csv')


In [2]:
df

Unnamed: 0.1,Unnamed: 0,dialogueID,date,from,to,text
0,100015,12077.tsv,2005-01-01T13:08:00.000Z,anir,,hope we will get something better linux in the...
1,100016,12077.tsv,2005-01-01T13:09:00.000Z,anir,,now Window is coming with LongHorn... its great..
2,100017,12077.tsv,2005-01-01T13:10:00.000Z,anir,,i have just tried transformation package in XP...
3,100018,12077.tsv,2005-01-01T13:10:00.000Z,anir,,now the ball is in your court LINUX..
4,100019,12077.tsv,2005-01-01T13:11:00.000Z,anir,,what happened guys.. say something
...,...,...,...,...,...,...
982,8944250,12317.tsv,2008-12-17T02:58:00.000Z,Aberation,djbushdio,nothing happens when I directly open cdrom0
983,8944251,12317.tsv,2008-12-17T02:59:00.000Z,Aberation,djbushdio,WAIT! I got something when opening cdrom...but...
984,8944252,12317.tsv,2008-12-17T03:01:00.000Z,Aberation,djbushdio,"logos are fine, but the menu has many colored ..."
985,8944253,12317.tsv,2008-12-17T03:02:00.000Z,Aberation,djbushdio,same thing when I try to read the movie...


## Preprocess the Data

In [3]:
def build_prompt_response_pairs(df):
    grouped = df.groupby('dialogueID')
    conversations = []

    for _, group in grouped:
        sorted_group = group.sort_values('date')
        turns = sorted_group['text'].tolist()

        for i in range(len(turns) - 1):
            prompt = turns[i]
            response = turns[i+1]
            conversations.append({
                'prompt': prompt,
                'response': response
            })

    return pd.DataFrame(conversations)

In [4]:
processed_df = build_prompt_response_pairs(df)
processed_df

Unnamed: 0,prompt,response
0,Anyone here that got experience with Linux (Ub...,- I have payed with the ati drivers...
1,- I have payed with the ati drivers...,"- my card is a radeon 9550, for reference.."
2,"- my card is a radeon 9550, for reference..",: what kind of clues??
3,: what kind of clues??,- first off fglrx the open source drivers jus...
4,- first off fglrx the open source drivers jus...,does anybody use gdesklets widgets? im trying ...
...,...,...
958,not implementing a database :/,Hello...
959,Hello...,A small question: What would cause both 32bit ...
960,A small question: What would cause both 32bit ...,"Funny thing is, Win7 and XP installs fine"
961,"Funny thing is, Win7 and XP installs fine","Thought that myself, xfact, until I swapped in..."


In [5]:
processed_df["text"] = processed_df.apply(
    lambda row: f"### Prompt:\n{row['prompt']}\n### Response:\n{row['response']}",
    axis=1
)
processed_df

Unnamed: 0,prompt,response,text
0,Anyone here that got experience with Linux (Ub...,- I have payed with the ati drivers...,### Prompt:\nAnyone here that got experience w...
1,- I have payed with the ati drivers...,"- my card is a radeon 9550, for reference..",### Prompt:\n- I have payed with the ati drive...
2,"- my card is a radeon 9550, for reference..",: what kind of clues??,"### Prompt:\n- my card is a radeon 9550, for r..."
3,: what kind of clues??,- first off fglrx the open source drivers jus...,### Prompt:\n: what kind of clues??\n### Respo...
4,- first off fglrx the open source drivers jus...,does anybody use gdesklets widgets? im trying ...,### Prompt:\n- first off fglrx the open sourc...
...,...,...,...
958,not implementing a database :/,Hello...,### Prompt:\nnot implementing a database :/\n#...
959,Hello...,A small question: What would cause both 32bit ...,### Prompt:\nHello...\n### Response:\nA small ...
960,A small question: What would cause both 32bit ...,"Funny thing is, Win7 and XP installs fine",### Prompt:\nA small question: What would caus...
961,"Funny thing is, Win7 and XP installs fine","Thought that myself, xfact, until I swapped in...","### Prompt:\nFunny thing is, Win7 and XP insta..."


In [6]:
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [7]:
from datasets import Dataset
from transformers import AutoTokenizer

# MODEL_NAME = 'mistralai/Mistral-7B-v0.3'
MODEL_NAME = 'EleutherAI/gpt-neo-1.3B'
MAX_LENGTH = 128

dataset = Dataset.from_pandas(processed_df[['text']])
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

def tokenize(example):
    tokens = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=512
    )
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

In [8]:
tokenized_dataset = dataset.map(tokenize, batched=True)
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/963 [00:00<?, ? examples/s]

### Load quantized model

In [9]:
from transformers import BitsAndBytesConfig, AutoModelForCausalLM

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float16
)


In [10]:
# for name, _ in model.named_modules():
#     if "attention" in name:
#         print(name)

### Prepare for LoRA

In [11]:
from peft import prepare_model_for_kbit_training, get_peft_model, LoraConfig

# model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# config for mistral and neo-gpt
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# config for EleutherAI/pythia-1.4b
# lora_config = LoraConfig(
#     r=8,
#     lora_alpha=32,
#     target_modules=["attn.c_attn", "attn.c_proj"],
#     lora_dropout=0.05,
#     bias="none",
#     task_type="CAUSAL_LM"
# )

model = get_peft_model(model, lora_config)

### Training args and Trainer

In [12]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    # output_dir="./models/mistral-lora-ubuntu",
    output_dir="./models/gpt-neo",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    fp16=True,
    logging_dir="./logs",
    save_strategy="epoch",
    gradient_accumulation_steps=1,
    logging_steps=100
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [13]:
print(model.is_gradient_checkpointing)


True


In [14]:
model.gradient_checkpointing_disable() 
print(model.is_gradient_checkpointing) 

False


In [15]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mst124974[0m ([33mbinit-ait[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
100,1.6399
200,0.2528
300,0.2199


TrainOutput(global_step=363, training_loss=0.6207414658601619, metrics={'train_runtime': 460.0887, 'train_samples_per_second': 6.279, 'train_steps_per_second': 0.789, 'total_flos': 1.0739008464224256e+16, 'train_loss': 0.6207414658601619, 'epoch': 3.0})

In [16]:
model.save_pretrained("gpt-neo-checkpoint")
tokenizer.save_pretrained("gpt-neo-checkpoint")

('mistral-lora-checkpoint/tokenizer_config.json',
 'mistral-lora-checkpoint/special_tokens_map.json',
 'mistral-lora-checkpoint/vocab.json',
 'mistral-lora-checkpoint/merges.txt',
 'mistral-lora-checkpoint/added_tokens.json',
 'mistral-lora-checkpoint/tokenizer.json')