In [1]:
# Load model directly
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
import torch 

tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-llm-7b-base")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",  # More aggressive quantization
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True  
)

model = AutoModelForCausalLM.from_pretrained("deepseek-ai/deepseek-llm-7b-base",
                                             quantization_config=bnb_config, 
                                             device_map="auto",
                                             offload_folder="offload_folder",)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 2/2 [00:19<00:00,  9.55s/it]


In [2]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [3]:
from peft import LoraConfig, TaskType

lora_config = LoraConfig(
    r=16,
    target_modules=["q_proj", "v_proj"],
    task_type=TaskType.CAUSAL_LM,
    lora_alpha=32,
    lora_dropout=0.05
)


In [4]:
from peft import get_peft_model

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 7,864,320 || all params: 6,918,230,016 || trainable%: 0.1137


In [5]:
from datasets import load_dataset

# Load dataset
dataset = load_dataset("imdb")

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [7]:
dataset["train"][0]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [8]:
def tokenize_function(examples):
    inputs = tokenizer(
        examples["text"], 
        truncation=True, 
        padding="max_length", 
        max_length=256
    )
    inputs["labels"] = inputs["input_ids"].copy()
    return inputs

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████████████████████████████████████████████████████████| 25000/25000 [00:17<00:00, 1409.33 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████| 25000/25000 [00:16<00:00, 1474.74 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████| 50000/50000 [00:33<00:00, 1509.80 examples/s]


In [9]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 50000
    })
})

In [10]:
# Subset the dataset for faster experimentation
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(100))
small_test_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(20))

In [11]:
small_train_dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 100
})

In [13]:
# Print a sample tokenized entry
#print("Tokenized Sample:")
#print(small_train_dataset[0])

In [15]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, precision_recall_fscore_support



training_args = TrainingArguments(
    output_dir="/deepseek_finetuned",
    num_train_epochs=50,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=32,
    fp16=True,
    logging_steps=10,
    save_steps=100,
    eval_strategy="steps", 
    eval_steps=10,
    learning_rate=3e-5,
    logging_dir="/logs",
    report_to="wandb",  
    run_name="DeepSeek_FineTuning_Experiment",
    load_best_model_at_end=True,  # This is needed for early stopping
    metric_for_best_model="eval_loss",  # Use whatever metric you want to optimize
    greater_is_better=False,  # Set to True if higher is better (like accuracy)
)

# Add early stopping callback
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=5,  # Number of evaluation steps with no improvement after which training will be stopped
    early_stopping_threshold=0.0001  # Minimum change to qualify as improvement
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_test_dataset,
    callbacks=[early_stopping_callback]  # Add the early stopping callback
)


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [16]:
torch.cuda.empty_cache()
print("Cleared CUDA Cache")

Cleared CUDA Cache


In [17]:
trainer.train()

Step,Training Loss,Validation Loss
10,5.0977,5.316971
20,4.5,4.857128
30,3.8359,3.866419
40,3.2259,3.014825
50,2.7949,2.694044
60,2.5389,2.436818
70,2.3198,2.318872
80,2.3563,2.285219
90,2.336,2.27366
100,2.2614,2.266988


TrainOutput(global_step=150, training_loss=2.8471548716227213, metrics={'train_runtime': 1932.4744, 'train_samples_per_second': 2.587, 'train_steps_per_second': 0.078, 'total_flos': 3.757283597510246e+16, 'train_loss': 2.8471548716227213, 'epoch': 37.64})

In [18]:
trainer.save_model("./fine_tuned_deepseek")
tokenizer.save_pretrained("./fine_tuned_deepseek")
print("Fine-Tuned Model Saved Successfully!")

✅ Fine-Tuned Model Saved Successfully!
