In [None]:
# Install required packages
!pip install -q transformers datasets peft accelerate bitsandbytes evaluate
!pip install -q transformers datasets peft accelerate bitsandbytes trl tiktoken
!pip install -q astor
!pip install -U transformers


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m62.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m53.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import pandas as pd
import torch
import ast
import astor
import re
import random
from sklearn.model_selection import train_test_split, StratifiedKFold
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback
)
from peft import get_peft_model, LoraConfig, TaskType
import numpy as np
from collections import defaultdict, Counter
import warnings
warnings.filterwarnings('ignore')

print("All imports done")


All imports done


In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load and prepare data with better preprocessing
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/112_time_complexity_dataset.csv")
df = df.drop(columns=[col for col in df.columns if "Unnamed" in col])
df = df.rename(columns={"code_snippet": "input", "time_complexity": "output"})
df = df.dropna()

In [None]:
# Clean and normalize the data
def clean_code(code):
    """Clean and normalize code snippets"""
    # Remove excessive whitespace
    code = re.sub(r'\n\s*\n', '\n', code)
    # Normalize indentation
    lines = code.split('\n')
    if lines:
        # Remove empty lines at start/end
        while lines and not lines[0].strip():
            lines.pop(0)
        while lines and not lines[-1].strip():
            lines.pop()
        code = '\n'.join(lines)
    return code.strip()


In [None]:
def normalize_complexity(complexity):
    """Normalize complexity notation"""
    complexity = complexity.strip()
    # Standardize common variations
    complexity = re.sub(r'O\s*\(\s*', 'O(', complexity)
    complexity = re.sub(r'\s*\)', ')', complexity)
    return complexity

df['input'] = df['input'].apply(clean_code)
df['output'] = df['output'].apply(normalize_complexity)

print(f"Dataset size: {len(df)}")
print("Class distribution:")
print(df["output"].value_counts())


Dataset size: 1008
Class distribution:
output
O(log n)      112
O(n log n)    112
O(n^2)        112
O(n!)         112
O(2^n)        112
O(1)          112
O(n^3)        112
O(n)          112
O(sqrt(n))    112
Name: count, dtype: int64


In [None]:
# Enhanced augmentation with better strategies
def enhanced_augment_code(code, strategy="rename"):
    """Multiple code augmentation strategies"""
    try:
        tree = ast.parse(code)

        if strategy == "rename_function":
            for node in ast.walk(tree):
                if isinstance(node, ast.FunctionDef):
                    node.name = f"modified_{node.name}"

        elif strategy == "rename_variables":
            # Simple variable renaming
            var_mapping = {}
            for node in ast.walk(tree):
                if isinstance(node, ast.Name) and isinstance(node.ctx, ast.Store):
                    if len(node.id) > 1 and not node.id.startswith('_'):
                        var_mapping[node.id] = f"var_{node.id}"

            for node in ast.walk(tree):
                if isinstance(node, ast.Name) and node.id in var_mapping:
                    node.id = var_mapping[node.id]

        elif strategy == "add_comments":
            # Add a comment by modifying function name
            for node in ast.walk(tree):
                if isinstance(node, ast.FunctionDef):
                    node.name = f"optimized_{node.name}"

        return astor.to_source(tree)
    except:
        # If AST parsing fails, use simple string modifications
        if strategy == "rename_function":
            return re.sub(r'def (\w+)', r'def modified_\1', code)
        elif strategy == "add_comments":
            return re.sub(r'def (\w+)', r'def optimized_\1', code)
        return code

In [None]:
# Smart data augmentation with class balancing
def create_balanced_dataset(df, target_samples_per_class=200):
    """Create a balanced dataset with intelligent augmentation"""
    augmented_data = []
    strategies = ["rename_function", "rename_variables", "add_comments"]

    for complexity_class in df["output"].unique():
        class_data = df[df["output"] == complexity_class].copy()
        current_count = len(class_data)

        # Add all original samples
        for _, row in class_data.iterrows():
            augmented_data.append({
                "input": row["input"],
                "output": row["output"]
            })

        # Calculate how many augmented samples needed
        target_count = min(target_samples_per_class, current_count * 3)
        augment_needed = max(0, target_count - current_count)

        # Generate augmented samples
        for i in range(augment_needed):
            original_row = class_data.iloc[i % len(class_data)]
            strategy = strategies[i % len(strategies)]

            augmented_code = enhanced_augment_code(original_row["input"], strategy)
            augmented_data.append({
                "input": augmented_code,
                "output": original_row["output"]
            })

    return pd.DataFrame(augmented_data).drop_duplicates().reset_index(drop=True)

# Create balanced dataset
print("Creating balanced dataset...")
df_balanced = create_balanced_dataset(df, target_samples_per_class=180)
print(f"Balanced dataset size: {len(df_balanced)}")
print("Balanced class distribution:")
print(df_balanced["output"].value_counts())

# Optimized train/validation/test split
train_df, temp_df = train_test_split(
    df_balanced,
    test_size=0.3,
    random_state=42,
    stratify=df_balanced["output"]
)

val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
    stratify=temp_df["output"]
)

print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

# Convert to datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Load model with optimizations
model_ckpt = "Salesforce/codet5-small"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Add padding token if missing
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForSeq2SeqLM.from_pretrained(
    model_ckpt,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)

# Optimized LoRA configuration
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=16,  # Increased rank for better capacity
    lora_alpha=32,  # Balanced alpha
    lora_dropout=0.05,  # Reduced dropout
    target_modules=["q", "v", "k", "o"],  # Target key attention modules
)

model = get_peft_model(model, peft_config)
print(f"Trainable parameters: {model.get_nb_trainable_parameters()}")


Creating balanced dataset...
Balanced dataset size: 1548
Balanced class distribution:
output
O(sqrt(n))    180
O(n^3)        177
O(n!)         175
O(2^n)        174
O(1)          173
O(n log n)    173
O(log n)      168
O(n)          165
O(n^2)        163
Name: count, dtype: int64
Train: 1083, Val: 232, Test: 233


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]



Trainable parameters: (1179648, 61671936)


In [None]:
# Optimized preprocessing with format consistency
def optimized_preprocess(example):
    """Optimized preprocessing with better formatting"""
    # Consistent input format
    input_text = f"Analyze time complexity: {example['input']}"
    target_text = example['output']

    # Tokenize input with optimal settings
    input_encoding = tokenizer(
        input_text,
        truncation=True,
        padding=False,  # Dynamic padding is more efficient
        max_length=320,  # Slightly increased for better context
        return_tensors=None
    )

    # Tokenize target
    target_encoding = tokenizer(
        target_text,
        truncation=True,
        padding=False,
        max_length=16,  # Sufficient for complexity notation
        return_tensors=None
    )

    # Prepare labels with proper masking
    input_encoding["labels"] = [
        (token_id if token_id != tokenizer.pad_token_id else -100)
        for token_id in target_encoding["input_ids"]
    ]

    return input_encoding

# Tokenize datasets efficiently
print("Tokenizing datasets...")
tokenized_train = train_dataset.map(optimized_preprocess, batched=False, num_proc=1)
tokenized_val = val_dataset.map(optimized_preprocess, batched=False, num_proc=1)
tokenized_test = test_dataset.map(optimized_preprocess, batched=False, num_proc=1)

# Optimized data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
    pad_to_multiple_of=8  # Efficient for GPU
)


Tokenizing datasets...


Map:   0%|          | 0/1083 [00:00<?, ? examples/s]

Map:   0%|          | 0/232 [00:00<?, ? examples/s]

Map:   0%|          | 0/233 [00:00<?, ? examples/s]

In [None]:
# Enhanced metrics computation
def compute_enhanced_metrics(eval_pred):
    """Enhanced metrics with better accuracy calculation"""
    predictions, labels = eval_pred

    # Handle tuple predictions
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Decode predictions
    predicted_ids = np.argmax(predictions, axis=-1)
    decoded_preds = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)

    # Decode labels
    labels_filtered = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels_filtered, skip_special_tokens=True)

    # Clean and normalize for comparison
    def clean_prediction(text):
        text = text.strip().lower()
        # Remove extra spaces and normalize
        text = re.sub(r'\s+', ' ', text)
        # Extract O(...) pattern if present
        match = re.search(r'o\([^)]+\)', text)
        if match:
            return match.group().upper()
        return text.upper()

    cleaned_preds = [clean_prediction(pred) for pred in decoded_preds]
    cleaned_labels = [clean_prediction(label) for label in decoded_labels]

    # Calculate metrics
    correct = sum(1 for p, l in zip(cleaned_preds, cleaned_labels) if p == l)
    total = len(cleaned_preds)

    # Per-class metrics
    per_class_stats = defaultdict(lambda: {"correct": 0, "total": 0})
    for pred, label in zip(cleaned_preds, cleaned_labels):
        per_class_stats[label]["total"] += 1
        if pred == label:
            per_class_stats[label]["correct"] += 1

    # Build metrics dictionary
    metrics = {"accuracy": correct / total if total > 0 else 0.0}

    # Add per-class accuracies
    for class_name, stats in per_class_stats.items():
        if stats["total"] > 0:
            class_acc = stats["correct"] / stats["total"]
            clean_class_name = re.sub(r'[^a-zA-Z0-9]', '_', class_name)
            metrics[f"acc_{clean_class_name}"] = class_acc

    # Macro average
    class_accuracies = [
        stats["correct"] / stats["total"]
        for stats in per_class_stats.values()
        if stats["total"] > 0
    ]
    metrics["macro_avg_accuracy"] = np.mean(class_accuracies) if class_accuracies else 0.0

    return metrics

In [None]:
 #Optimized training arguments
training_args = TrainingArguments(
    output_dir="./codet5-optimized",
    eval_strategy="steps",
    eval_steps=50,  # More frequent evaluation
    save_strategy="steps",
    save_steps=50,
    per_device_train_batch_size=12,  # Increased batch size
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,  # Effective batch size = 24
    num_train_epochs=15,  # Reduced epochs but more efficient
    warmup_steps=100,
    learning_rate=2e-4,  # Optimized learning rate for LoRA
    weight_decay=0.01,
    logging_steps=25,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    report_to="none",
    dataloader_num_workers=2,
    fp16=torch.cuda.is_available(),  # Use mixed precision if available
    dataloader_pin_memory=True,
    group_by_length=True,  # Efficient batching
    lr_scheduler_type="cosine_with_restarts",
    seed=42,
    optim="adamw_torch",
)


In [None]:
# Create trainer with early stopping
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_enhanced_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)


No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# Train with optimizations
print("Starting optimized training...")
print(f"Device: {torch.cuda.get_device_name() if torch.cuda.is_available() else 'CPU'}")

trainer.train()


Starting optimized training...
Device: CPU


Step,Training Loss,Validation Loss,Accuracy,Acc O N,Acc O 2 N,Acc O N 3,Acc O Sqrt N,Acc O N Log N,Acc O Log N,Acc O N 2,Acc O N.1,Acc O 1,Macro Avg Accuracy
50,0.1211,0.07083,0.823276,0.884615,0.730769,0.923077,0.962963,0.769231,0.88,0.4,0.84,1.0,0.821184
100,0.1269,0.086317,0.771552,0.807692,0.884615,0.961538,0.962963,0.423077,0.88,0.4,0.6,1.0,0.768876
150,0.1344,0.094662,0.741379,0.923077,0.846154,0.961538,0.962963,0.153846,0.84,0.2,0.84,0.923077,0.738962
200,0.0979,0.058224,0.849138,0.846154,0.730769,0.923077,0.888889,0.807692,0.96,0.64,0.84,1.0,0.848509
250,0.1012,0.055988,0.857759,0.730769,1.0,0.961538,0.962963,0.846154,0.96,0.64,0.64,0.961538,0.855885
300,0.0739,0.048876,0.883621,0.846154,0.961538,0.923077,1.0,0.807692,0.96,0.68,0.76,1.0,0.882051
350,0.0679,0.04507,0.866379,0.769231,0.923077,0.884615,0.962963,0.730769,0.96,0.72,0.84,1.0,0.865628
400,0.0646,0.044463,0.87931,0.846154,1.0,0.846154,0.962963,0.846154,0.96,0.76,0.72,0.961538,0.878107
450,0.0682,0.039413,0.900862,0.884615,1.0,0.846154,0.962963,0.807692,0.96,0.72,0.92,1.0,0.900158
500,0.0693,0.039472,0.896552,0.846154,0.961538,0.923077,0.962963,0.730769,0.96,0.76,0.92,1.0,0.896056


TrainOutput(global_step=690, training_loss=0.08118218632711881, metrics={'train_runtime': 7076.7519, 'train_samples_per_second': 2.296, 'train_steps_per_second': 0.098, 'total_flos': 724482373976064.0, 'train_loss': 0.08118218632711881, 'epoch': 15.0})

In [None]:
# Evaluate on test set
print("\nEvaluating on test set...")
test_results = trainer.evaluate(eval_dataset=tokenized_test)
print("Final Test Results:")
for key, value in test_results.items():
    if isinstance(value, float):
        print(f"{key}: {value:.4f}")



Evaluating on test set...


Final Test Results:
eval_loss: 0.0579
eval_accuracy: 0.8755
eval_acc_O_N_3_: 1.0000
eval_acc_O_LOG_N_: 0.8400
eval_acc_O_N__: 0.7778
eval_acc_O_2_N_: 0.9231
eval_acc_O_SQRT_N_: 0.9630
eval_acc_O_N_2_: 0.7500
eval_acc_O_N_LOG_N_: 0.7692
eval_acc_O_N_: 0.8400
eval_acc_O_1_: 1.0000
eval_macro_avg_accuracy: 0.8737
eval_runtime: 62.2921
eval_samples_per_second: 3.7400
eval_steps_per_second: 0.2410
epoch: 15.0000


In [None]:
# Save the optimized model
trainer.save_model("./codet5-final")
tokenizer.save_pretrained("./codet5-final")

('./codet5-final/tokenizer_config.json',
 './codet5-final/special_tokens_map.json',
 './codet5-final/vocab.json',
 './codet5-final/merges.txt',
 './codet5-final/added_tokens.json',
 './codet5-final/tokenizer.json')

In [None]:
# Quick inference test
def test_inference(code_snippet):
    """Test the trained model on a code snippet"""
    input_text = f"Analyze time complexity: {code_snippet}"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=320)

    # Move input tensors to the same device as the model
    device = model.device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"].to(device),
            attention_mask=inputs["attention_mask"].to(device),
            max_length=16,
            num_beams=3,
            temperature=0.1,
            do_sample=False
        )

    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return prediction.strip()

# Test examples
test_examples = [
    # O(1)
    "def get_first_element(arr):\n    return arr[0]",

    # O(log n)
    "def binary_search(arr, target):\n    low, high = 0, len(arr) - 1\n    while low <= high:\n        mid = (low + high) // 2\n        if arr[mid] == target:\n            return mid\n        elif arr[mid] < target:\n            low = mid + 1\n        else:\n            high = mid - 1\n    return -1",

    # O(n)
    "def find_element(arr, target):\n    for i in arr:\n        if i == target:\n            return True\n    return False",

    # O(n log n)
    "def merge_sort(arr):\n    if len(arr) > 1:\n        mid = len(arr) // 2\n        L = arr[:mid]\n        R = arr[mid:]\n        merge_sort(L)\n        merge_sort(R)\n        i = j = k = 0\n        while i < len(L) and j < len(R):\n            if L[i] < R[j]:\n                arr[k] = L[i]\n                i += 1\n            else:\n                arr[k] = R[j]\n                j += 1\n            k += 1\n        while i < len(L):\n            arr[k] = L[i]\n            i += 1\n            k += 1\n        while j < len(R):\n            arr[k] = R[j]\n            j += 1\n            k += 1",

    # O(n^2)
    "def bubble_sort(arr):\n    n = len(arr)\n    for i in range(n):\n        for j in range(0, n-i-1):\n            if arr[j] > arr[j+1]:\n                arr[j], arr[j+1] = arr[j+1], arr[j]",

    # O(n^3)
    "def triple_loop(arr):\n    n = len(arr)\n    for i in range(n):\n        for j in range(n):\n            for k in range(n):\n                arr[i] += arr[j] + arr[k]",

    # O(n!)
    "def permutations(arr):\n    if len(arr) == 0:\n        return [[]]\n    res = []\n    for i in range(len(arr)):\n        rest = arr[:i] + arr[i+1:]\n        for p in permutations(rest):\n            res.append([arr[i]] + p)\n    return res",

    # O(2^n)
    "def fib(n):\n    if n <= 1:\n        return n\n    return fib(n-1) + fib(n-2)",

    # O(sqrt(n))
    "def is_prime(n):\n    if n <= 1:\n        return False\n    for i in range(2, int(n**0.5) + 1):\n        if n % i == 0:\n            return False\n    return True",





]

print("\nTesting inference:")
for example in test_examples:
    prediction = test_inference(example)
    print(f"Code: {example[:50]}...")
    print(f"Prediction: {prediction}")
    print("-" * 50)

print("\nOptimization Summary:")
print("✓ Enhanced data augmentation with class balancing")
print("✓ Optimized LoRA configuration (r=16)")
print("✓ Better preprocessing and tokenization")
print("✓ Increased batch size with gradient accumulation")
print("✓ Mixed precision training (if GPU available)")
print("✓ Early stopping to prevent overfitting")
print("✓ Cosine learning rate schedule with restarts")
print("✓ Dynamic padding for efficiency")
print("✓ Improved metrics calculation")
print("✓ Reduced epochs with better convergence")

print(f"\nExpected improvements:")
print(f"• Accuracy: 60-75% (vs previous ~45%)")
print(f"• Training time: Similar or faster due to optimizations")
print(f"• Better class balance and performance")

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



Testing inference:


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Code: def get_first_element(arr):
    return arr[0]...
Prediction: O(1)
--------------------------------------------------


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Code: def binary_search(arr, target):
    low, high = 0,...
Prediction: O(log n)
--------------------------------------------------


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Code: def find_element(arr, target):
    for i in arr:
 ...
Prediction: O(n)
--------------------------------------------------


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Code: def merge_sort(arr):
    if len(arr) > 1:
        ...
Prediction: O(n log n)
--------------------------------------------------


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Code: def bubble_sort(arr):
    n = len(arr)
    for i i...
Prediction: O(n^2)
--------------------------------------------------


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Code: def triple_loop(arr):
    n = len(arr)
    for i i...
Prediction: O(n^3)
--------------------------------------------------


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Code: def permutations(arr):
    if len(arr) == 0:
     ...
Prediction: O(n)
--------------------------------------------------


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Code: def fib(n):
    if n <= 1:
        return n
    re...
Prediction: O(2^n)
--------------------------------------------------
Code: def is_prime(n):
    if n <= 1:
        return Fal...
Prediction: O(sqrt(n))
--------------------------------------------------

Optimization Summary:
✓ Enhanced data augmentation with class balancing
✓ Optimized LoRA configuration (r=16)
✓ Better preprocessing and tokenization
✓ Increased batch size with gradient accumulation
✓ Mixed precision training (if GPU available)
✓ Early stopping to prevent overfitting
✓ Cosine learning rate schedule with restarts
✓ Dynamic padding for efficiency
✓ Improved metrics calculation
✓ Reduced epochs with better convergence

Expected improvements:
• Accuracy: 60-75% (vs previous ~45%)
• Training time: Similar or faster due to optimizations
• Better class balance and performance


In [None]:
# If you're using Trainer API
trainer.save_model('/content/drive/MyDrive/finetuned_model_final')
tokenizer.save_pretrained('/content/drive/MyDrive/finetuned_model_final')


('/content/drive/MyDrive/finetuned_model_final/tokenizer_config.json',
 '/content/drive/MyDrive/finetuned_model_final/special_tokens_map.json',
 '/content/drive/MyDrive/finetuned_model_final/vocab.json',
 '/content/drive/MyDrive/finetuned_model_final/merges.txt',
 '/content/drive/MyDrive/finetuned_model_final/added_tokens.json',
 '/content/drive/MyDrive/finetuned_model_final/tokenizer.json')