In [None]:
!pip install -q transformers datasets peft accelerate bitsandbytes

In [None]:
from datasets import load_dataset

# NOTE: replace the placeholder path below with the CSV file to train on.
raw_dataset = load_dataset("csv", data_files="Your file here")

# Hold out 10% of the rows for evaluation. The fixed seed keeps the split
# identical across kernel restarts, so runs can be compared with each other.
dataset = raw_dataset["train"].train_test_split(test_size=0.1, seed=42)

In [None]:
from transformers import AutoTokenizer
from huggingface_hub import login

import os

# Take the Hugging Face access token from the HF_TOKEN environment variable instead
# of embedding a secret in the notebook. When the variable is unset, token is None
# and login() asks for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

model_name = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Pad with <unk> rather than </s>. The language-modeling collator used for training
# ignores every label equal to the pad token id, so reusing EOS as the pad token
# would also remove the real end-of-sequence token from the loss and the model
# would never be trained to stop after emitting the label.
tokenizer.pad_token = tokenizer.unk_token

def format_example(example):
    """Render one dataset row as a single instruction-plus-answer training string.

    Args:
        example: Mapping that provides the ``whisper_voice_to_text`` (statement)
            and ``manual_leaning`` (label) fields.

    Returns:
        A dict with one key, ``"text"``, holding ``<s>``, the ``[INST] ... [/INST]``
        prompt that quotes the statement, a space, the label, and ``</s>``.

    Raises:
        KeyError: If either field is missing from ``example``.
    """
    statement = example['whisper_voice_to_text']
    label = example['manual_leaning']
    instruction = f"[INST] Classify the following political statement as 'Left-Leaning', 'Right-Leaning', or 'Neutral':\n\"{statement}\" [/INST]"
    completion = f" {label}</s>"
    return {"text": "".join(("<s>", instruction, completion))}

# Add the prompt/answer "text" column to every row of both splits.
formatted_dataset = dataset.map(function=format_example)


In [None]:
def tokenize(example):
    """Tokenize the ``"text"`` field into fixed-length model inputs.

    Args:
        example: A row (or a batch of rows when mapped with ``batched=True``)
            containing the formatted ``"text"`` field.

    Returns:
        The tokenizer output for ``example["text"]``, truncated and padded to
        512 tokens.
    """
    # NOTE(review): truncation presumably trims from the end of the sequence, which
    # would drop the label and </s> for very long statements -- confirm input lengths.
    return tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
        # The formatted text already spells out <s> and </s> explicitly; letting the
        # tokenizer add its own special tokens would put a second BOS at the start.
        add_special_tokens=False,
    )

# Tokenize both splits, handing the rows to `tokenize` in batches.
tokenized_dataset = formatted_dataset.map(function=tokenize, batched=True)


In [None]:
from transformers import AutoModelForCausalLM
from peft import get_peft_model, LoraConfig, TaskType
import torch

from transformers import BitsAndBytesConfig

# Request 8-bit weights through an explicit quantization config: passing
# `load_in_8bit` straight to from_pretrained is deprecated in recent
# transformers releases in favour of `quantization_config`.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    torch_dtype=torch.float16,
    device_map="auto",
)

from peft import prepare_model_for_kbit_training

# LoRA adapter settings for causal-LM fine-tuning: rank 8, alpha 16,
# 10% dropout on the adapter path, and no bias parameters trained.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# The base model is loaded with quantized weights, so run PEFT's k-bit preparation
# step before attaching the adapters; it readies the frozen base model for
# training (see the PEFT quantization guide).
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)


In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

import inspect

# An eval_dataset is handed to the Trainer, but it is only used when an evaluation
# schedule is set. The option was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases, so use whichever keyword the
# installed version accepts.
_eval_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./mistral-leaning-model",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,  # effective train batch of 8 per device
    num_train_epochs=3,
    save_strategy="epoch",
    logging_steps=10,
    learning_rate=2e-5,
    fp16=True,
    report_to="none",
    **{_eval_kwarg: "epoch"},  # evaluate on the held-out split after every epoch
)

# Collator for causal language modeling (mlm=False, i.e. no masked-LM objective).
lm_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Bundle the model, training arguments, both tokenized splits, the tokenizer
# and the collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=lm_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
)

In [None]:
# Run fine-tuning with the Trainer built above; the call's return value is
# left as the cell's output.
trainer.train()

In [None]:
# Write the PEFT-wrapped model and the tokenizer side by side into one directory
# so they can be reloaded together.
output_dir = "./mistral-leaning-model"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)