In [None]:
!pip install unsloth datasets peft trl transformers accelerate bitsandbytes

In [None]:
from datasets import load_dataset

In [None]:
# Inspect the first 5 lines of the raw JSONL file to see what the records look
# like before any cleaning is attempted.
!head -n 5 /kaggle/input/deepseek-fine-tune-v2/deepseek_finetune_dataset.jsonl

In [None]:
import ast
import json

input_file = "/kaggle/input/deepseek-fine-tune-v2/deepseek_finetune_dataset.jsonl"
output_file = "/kaggle/working/deepseek_finetune_dataset_cleaned.jsonl"

# Re-serialise every record of the raw file as strict JSON, one object per line.
# Lines that cannot be parsed are reported and skipped.
with open(input_file, "r", encoding="utf-8") as infile, \
        open(output_file, "w", encoding="utf-8") as outfile:
    for i, line in enumerate(infile):
        stripped = line.strip()
        try:
            # Well-formed JSON lines are accepted as they are.
            parsed = json.loads(stripped)
        except json.JSONDecodeError:
            try:
                # Single-quoted (Python-literal style) records are parsed
                # safely with literal_eval. Blindly swapping ' for " would
                # corrupt any text that contains an apostrophe.
                parsed = ast.literal_eval(stripped)
            except (ValueError, SyntaxError) as e:
                print(f"Skipping line {i} due to: {e}")
                continue
        json.dump(parsed, outfile)
        outfile.write("\n")

In [None]:
import json
import os
import ast

input_file = "/kaggle/input/deepseek-fine-tune-v2/deepseek_finetune_dataset.jsonl"
output_file = "/kaggle/working/deepseek_finetune_dataset_cleaned.jsonl"

# Number of records that were successfully written to the cleaned file.
processed_lines_count = 0

# Read the raw file line by line, parse each record (strict JSON first, Python
# literal syntax as a fallback), coerce the text fields to strings and write
# the result as one JSON object per line. Any line that fails is reported and
# skipped so a single bad record does not abort the whole run.
with open(input_file, "r", encoding="utf-8") as infile, open(output_file, "w", encoding="utf-8") as outfile:
    for i, line in enumerate(infile):
        try:
            clean_line = line.strip()
            if not clean_line:
                print(f"Skipping empty line {i}")
                continue

            try:
                parsed = json.loads(clean_line)
            except json.JSONDecodeError:
                # Try to parse with ast.literal_eval (handles single-quoted dicts)
                parsed = ast.literal_eval(clean_line)

            # Both parsers can legally return lists, numbers or strings; only
            # key/value records are usable downstream, so reject anything else
            # through the common "skip" path below.
            if not isinstance(parsed, dict):
                raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")

            # Ensure string fields
            for key in ["output", "input", "instruction", "prompt", "completion"]:
                if key in parsed and not isinstance(parsed[key], str):
                    parsed[key] = str(parsed[key])

            json.dump(parsed, outfile, ensure_ascii=False)
            outfile.write("\n")
            processed_lines_count += 1

        except Exception as e:
            # Deliberately broad: this is a best-effort cleaning pass.
            print(f"Skipping line {i} due to: {e} - Content: {line.strip()[:100]}...")

print(f"Finished cleaning. Successfully processed {processed_lines_count} lines.")


In [None]:
from datasets import load_dataset

# Read the cleaned JSONL file produced by the cleaning step as the "train" split.
dataset = load_dataset(
    "json",
    data_files="/kaggle/working/deepseek_finetune_dataset_cleaned.jsonl",
    split="train",
)

# Quick sanity check: which columns exist, and what one record looks like.
print(dataset.column_names)
print(dataset[0])


In [None]:
def format_prompt(example):
    """Render one record as a single instruction/response training string.

    Args:
        example: Mapping that provides the ``prompt`` and ``completion`` fields.

    Returns:
        A dict with one key, ``"text"``, holding the prompt under an
        ``### Instruction:`` header followed by the completion under a
        ``### Response:`` header.

    Raises:
        KeyError: If ``prompt`` or ``completion`` is missing from ``example``.
    """
    instruction = example['prompt']
    response = example['completion']
    rendered = f"### Instruction:\n{instruction}\n\n### Response:\n{response}"
    return {"text": rendered}

# Apply format_prompt to every record; the returned "text" field is what the
# training cells read later.
dataset = dataset.map(format_prompt)


In [None]:
# Sanity check: show the formatted training text of the first record.
print(dataset[0]["text"])


In [None]:
# Install the required libraries
!pip install unsloth huggingface_hub

# Import the necessary function and login
from huggingface_hub import login
login(token="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")

# Import Unsloth library
from unsloth import FastLanguageModel

# Load a model that's actually available in Unsloth format
# Unsloth has optimized versions of models like Llama, Mistral, etc.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-2-7b-bnb-4bit",  # This is a valid Unsloth model
    max_seq_length=2048,
    load_in_4bit=True,
    use_gradient_checkpointing=True,
)

# Get the PEFT model
model = FastLanguageModel.get_peft_model(
    model,
    r=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=32,
    random_state=42,
)

# Now the model is ready for fine-tuning or inference
print("Model loaded successfully!")

# If you specifically need DeepSeek Coder, you can try using the original model:
# from transformers import AutoModelForCausalLM, AutoTokenizer
# model_name = "deepseek-ai/deepseek-coder-6.7b-base"  # or another DeepSeek model variant
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name, load_in_4bit=True)

In [None]:
# The model-loading cell already attaches LoRA adapters with these exact
# settings. Only wrap the model here if it does not carry an adapter yet, so
# that running this cell (or re-running it) never stacks a second adapter on
# top of the first.
if not hasattr(model, "peft_config"):
    model = FastLanguageModel.get_peft_model(
        model,
        r=32,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_alpha=32,
        random_state=42,
    )

In [None]:
from transformers import default_data_collator

# Alias the stock Transformers collator under a notebook-level name.
# NOTE(review): `data_collator` is not passed to the SFTTrainer constructed in
# the training cell, so as written this assignment has no effect. Confirm
# whether it was meant to be supplied via `data_collator=`.
data_collator = default_data_collator


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np
from torch.utils.data import Dataset
import torch

# Before building the training pipeline, dump what the dataset object is,
# which fields it declares, and the type and content of each field of the
# first record.
print("Dataset type:", type(dataset))
print("Dataset keys:", dataset.features.keys())
print("First example structure:")
first_example = dataset[0]
for key in first_example:
    value = first_example[key]
    print(f"{key}: {type(value)} - {value}")

# Define a custom dataset class to ensure proper formatting
class CustomDataset(Dataset):
    """Tokenise the ``"text"`` field of each record for causal-LM fine-tuning.

    Every item is padded/truncated to ``max_length`` and returned as a dict of
    1-D tensors: ``input_ids``, ``attention_mask`` and ``labels``.

    Args:
        dataset: Indexable collection whose items expose a ``"text"`` entry.
        tokenizer: Callable tokenizer that accepts ``truncation``,
            ``max_length``, ``padding`` and ``return_tensors`` and returns a
            mapping with ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_length: Fixed sequence length of every returned example.
    """

    def __init__(self, dataset, tokenizer, max_length=2048):
        self.tokenizer = tokenizer
        self.dataset = dataset
        self.max_length = max_length

    def __len__(self):
        """Return the number of records in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Tokenise record ``idx`` and build its training tensors."""
        # Get the raw text
        text = self.dataset[idx]["text"]

        # Handle tokenization manually to ensure proper structure
        encodings = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt"  # Return PyTorch tensors
        )

        # Remove batch dimension that tokenizer adds by default when return_tensors="pt"
        input_ids = encodings["input_ids"].squeeze(0)
        attention_mask = encodings["attention_mask"].squeeze(0)

        # Labels are an unshifted copy of input_ids (the causal-LM loss shifts
        # them internally). Padding positions are set to -100, the index the
        # loss ignores, so padding never counts as a supervised target. The
        # mask comes from attention_mask rather than the pad token id, so a
        # genuine EOS token is still learned even when pad_token == eos_token.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

# Create compute metrics function
def compute_metrics(eval_preds):
    """Compute token-level next-token metrics for a causal language model.

    Args:
        eval_preds: Pair ``(predictions, labels)``. ``predictions`` holds the
            logits with the vocabulary on the last axis (or a tuple whose
            first element is the logits). ``labels`` holds the target token
            ids, with ``-100`` marking positions to ignore.

    Returns:
        Dict with ``accuracy``, ``precision``, ``recall`` and ``f1``
        (macro-averaged over token classes). All zeros when no label is left
        to score.
    """
    predictions, labels = eval_preds
    # Some models return extra outputs next to the logits; keep the logits only.
    if isinstance(predictions, (tuple, list)):
        predictions = predictions[0]
    predictions = np.argmax(np.asarray(predictions), axis=-1)
    labels = np.asarray(labels)

    # In a causal LM the logits at position t predict the token at t + 1, so
    # align predictions[..., :-1] with labels[..., 1:] before comparing.
    predictions = predictions[..., :-1]
    labels = labels[..., 1:]

    # Flatten for token-level tasks
    predictions = predictions.flatten()
    labels = labels.flatten()

    # Remove ignored index (-100)
    mask = labels != -100
    predictions = predictions[mask]
    labels = labels[mask]

    # Handle empty cases
    if len(labels) == 0:
        return {"accuracy": 0.0, "precision": 0.0, "recall": 0.0, "f1": 0.0}

    return {
        "accuracy": accuracy_score(labels, predictions),
        "precision": precision_score(labels, predictions, average='macro', zero_division=0),
        "recall": recall_score(labels, predictions, average='macro', zero_division=0),
        "f1": f1_score(labels, predictions, average='macro', zero_division=0),
    }

# Ensure tokenizer is properly configured: reuse EOS as the padding token and
# pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Split dataset (90% train / 10% eval). The seed is fixed so that the same
# records end up in each split on every run, which keeps evaluation numbers
# comparable across runs.
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_raw = split_dataset["train"]
eval_raw = split_dataset["test"]

# Create custom datasets
train_dataset = CustomDataset(train_raw, tokenizer)
eval_dataset = CustomDataset(eval_raw, tokenizer)

# Print an example of processed data to verify structure
print("\nProcessed example structure:")
processed_example = train_dataset[0]
for key, value in processed_example.items():
    print(f"{key}: {type(value)} - Shape: {value.shape}")

# Training arguments
# NOTE(review): `do_eval=True` on its own may not schedule evaluation during
# training; confirm an evaluation strategy is configured if per-epoch metrics
# from compute_metrics are expected.
training_args = TrainingArguments(
    output_dir="deepseek_v2_finetuned",
    num_train_epochs=5,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # effective batch size per device: 2 * 4 = 8
    logging_steps=10,
    save_strategy="epoch",
    learning_rate=2e-5,
    fp16=True,
    bf16=False,
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    push_to_hub=False,
    report_to="none",
    remove_unused_columns=False,  # datasets are already tokenised; keep their keys
    logging_dir="./logs",
    do_eval=True,
)

# Initialize trainer with our custom dataset
print("\nInitializing trainer...")
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    # No need for dataset_text_field since we're providing processed data
    args=training_args,
    packing=False,
)

print("\nStarting training...")
# Train the model
trainer.train()