In [1]:
import pandas as pd

# Load the Ubuntu dialogue CSV from a path relative to the notebook.
# NOTE(review): the rendered frame shows an "Unnamed: 0" column, which looks
# like an index that was written into the CSV on a previous save — confirm
# whether it should be dropped or read with `index_col=0`.
df = pd.read_csv('sample/ubuntu_context_1M.csv')


In [2]:
# Display the loaded frame to inspect its columns and a few rows.
df

Unnamed: 0.1,Unnamed: 0,dialogueID,date,from,to,text
0,961,148.tsv,2009-03-06T11:17:00.000Z,foul_owl,,"possibly, but i have reinstalled the 180 drive..."
1,962,148.tsv,2009-03-06T11:18:00.000Z,foul_owl,,"not the drivers directly from nvidia, but the ..."
2,963,148.tsv,2009-03-06T11:18:00.000Z,foul_owl,,thanks for your help btw
3,964,148.tsv,2009-03-06T11:19:00.000Z,foul_owl,,is there any way to see WHY the kernel module ...
4,965,148.tsv,2009-03-06T11:19:00.000Z,foul_owl,,"i am not using the hardware drivers manager, i..."
...,...,...,...,...,...,...
999902,9212869,3676.tsv,2012-07-07T20:01:00.000Z,legolas,,:D
999903,9212870,3676.tsv,2012-07-07T20:03:00.000Z,legolas,,is there any irc for opensource softwares?
999904,9212871,3676.tsv,2012-07-07T20:04:00.000Z,MonkeyDust,legolas,try #ubuntu
999905,9212872,3676.tsv,2012-07-07T20:17:00.000Z,MonkeyDust,legolas,= arian


## Preprocess the Data

In [3]:
from collections import defaultdict

# relevant = ['dialogueID','from', 'to', 'text']

def build_prompt_response_pairs(df):
    """Turn a flat message log into (prompt, response) pairs.

    Within each dialogue the messages are ordered by ``date`` and every
    message is paired with the message that directly follows it.

    Args:
        df: DataFrame with at least the columns ``dialogueID``, ``date`` and
            ``text`` (one row per message).

    Returns:
        DataFrame with the columns ``prompt`` and ``response`` and a fresh
        0..n-1 index. Dialogues appear in ascending ``dialogueID`` order.
        Both columns are always present, even when no pair could be built
        (e.g. every dialogue has a single message).
    """
    # A message without text cannot be a prompt or a response; if kept, the
    # missing value would end up as the literal text "nan" once it is
    # formatted into a training string.
    messages = df.dropna(subset=['text'])

    # Stable ordering: messages sharing a timestamp keep the relative order
    # they had in `df`, so same-minute turns are not shuffled.
    # NOTE(review): assumes the row order of `df` is the true message order
    # for tied timestamps — confirm against the data source.
    ordered = messages.sort_values(['dialogueID', 'date'], kind='stable')

    # The response to a message is the next message of the same dialogue.
    # The last message of each dialogue has no successor and receives NaN.
    next_text = ordered.groupby('dialogueID', sort=False)['text'].shift(-1)

    # NOTE(review): pairing looks only at `text`; the `from`/`to` columns are
    # ignored, so two consecutive messages by the same speaker still form a
    # prompt/response pair — confirm this is intended.
    pairs = pd.DataFrame({'prompt': ordered['text'], 'response': next_text})

    # Drop the trailing message of every dialogue (no response available).
    return pairs.dropna(subset=['response']).reset_index(drop=True)

In [4]:
# Build one (prompt, response) row for every pair of consecutive messages
# within a dialogue, then display the result.
processed_df = build_prompt_response_pairs(df)
processed_df

Unnamed: 0,prompt,response
0,"hey, im pretty new to ubuntu, is there anyway ...",mono
1,mono,"mono is a .net framework, that is. As for whe..."
2,"mono is a .net framework, that is. As for whe...",i going to check it out...:)
3,i going to check it out...:),Check out wine. You'll have to research the v...
4,Check out wine. You'll have to research the v...,"is there some way to make faster open office, ..."
...,...,...
962604,isn't it good practice not to use sudo in a sc...,and what's with the spam today?
962605,and what's with the spam today?,depends on the script i imagine. :)
962606,depends on the script i imagine. :),hack elliotbeken.dyndns.org
962607,hack elliotbeken.dyndns.org,not here please


In [5]:
import torch

# Use the GPU when PyTorch reports one as available, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [6]:
import os
from huggingface_hub import login

# Authenticate against the Hugging Face Hub with the token stored in the
# HUGGINGFACE environment variable, so no secret is written into the notebook.
# os.getenv returns None when the variable is unset.
# NOTE(review): with token=None, login() appears to fall back to an
# interactive prompt, which would block a Restart & Run All — confirm.
login(token=os.getenv('HUGGINGFACE'))

: 

## Tokenizing

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM


# Wrap the prompt/response frame in a Hugging Face `Dataset`.
dataset = Dataset.from_pandas(processed_df)


# Load the tokenizer and the causal-LM weights for the chosen checkpoint.
# torch_dtype='auto' and device_map='auto' leave the weight dtype and the
# device placement to the library.
# NOTE(review): confirm the dtype picked by 'auto' is compatible with the
# fp16=True setting used for training further down.
model_name = 'mistralai/Mistral-7B-v0.3'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype='auto', device_map='auto')

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
def tokenize(example):
    """Format one prompt/response pair and tokenize it for causal-LM training.

    Args:
        example: Mapping with the fields ``prompt`` and ``response``.

    Returns:
        The tokenizer's encoding of the formatted text, truncated or padded
        to exactly 512 tokens, plus a ``labels`` entry: a copy of
        ``input_ids`` in which every padding position is replaced by -100.
    """
    # Padding to a fixed length requires a pad token; reuse EOS for it.
    tokenizer.pad_token = tokenizer.eos_token
    input_text = f"### Prompt:\n{example['prompt']}\n### Response:\n{example['response']}"
    encoded = tokenizer(input_text, truncation=True, padding="max_length", max_length=512)

    # Without `labels` the model returns no loss and training cannot run.
    # The labels are the input ids themselves (the model shifts them
    # internally); padding is set to -100 so the loss ignores it. The mask is
    # taken from `attention_mask` rather than the pad id, because pad and EOS
    # share the same id and a real EOS must not be masked out.
    encoded['labels'] = [
        token_id if is_real else -100
        for token_id, is_real in zip(encoded['input_ids'], encoded['attention_mask'])
    ]
    return encoded

In [None]:
# Apply `tokenize` to every example of the dataset (one example per call,
# since no batching option is passed).
tokenized_dataset = dataset.map(tokenize)

In [10]:
# tokenized_dataset.save_to_disk('sample/tokenized_ubuntu_1M')

In [None]:
from transformers import TrainingArguments, Trainer

# Settings for the fine-tuning run: where to write, how long to train,
# batch sizes, and the memory/speed switches.
args = TrainingArguments(
    # Output locations
    output_dir='models/',
    logging_dir='logs/',
    save_strategy='epoch',
    # Schedule and batch sizes
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Precision, memory and speed
    fp16=True,
    gradient_checkpointing=True,  # enabled because the model is large
    optim="paged_adamw_8bit",  # relies on bitsandbytes
    torch_compile=True,  # optional, for speed
)

# Only a training set is supplied; no evaluation dataset is configured.
trainer = Trainer(model=model, args=args, train_dataset=tokenized_dataset)

trainer.train()