# 1. Environment Setup

In [None]:
!pip install torch==2.5.1
!pip install transformers datasets accelerate bitsandbytes
!pip install unsloth
!pip install peft

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.5.1)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.5.1)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.5.1)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.5.1)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch==2.5.1)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch==2.5.1)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torc



In [None]:
import torch
from datasets import Dataset
from unsloth import FastLanguageModel
import pandas as pd
from datetime import datetime
from transformers import TrainingArguments, Trainer

# Report whether a CUDA device is visible and, when one is, which device it is.
cuda_ready = torch.cuda.is_available()
print(f"CUDA Available: {cuda_ready}")
if cuda_ready:
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU Device: {gpu_name}")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
CUDA Available: True
GPU Device: Tesla T4


# 2. Prepare Malayalam Colloquial Dataset

In [None]:
import pandas as pd

# (English, colloquial Malayalam) sentence pairs.
# Each pair is kept together so the two columns cannot drift out of alignment
# when rows are added or removed.
sentence_pairs = [
    ("Drink more water to stay hydrated.", "Hydration nu koodi vellam kudikkuka."),
    ("Symptoms of this disease include fever and cough.", "Ee rogam symptoms fever um cough um aanu."),
    ("There is a direct flight to Dubai.", "Dubai ku direct flight undu."),
    ("Check-in for my flight starts at 6 PM.", "Ente flight ku check-in 6 PM nu thudangum."),
    ("Cloud computing allows data storage remotely.", "Cloud computing il remote aayi data store cheyyam."),
    ("Fixing internet connection issues requires troubleshooting.", "Internet connection problem solve cheyyan troubleshooting venam."),
    ("Checking bank balance is possible via mobile app.", "Bank balance mobile app il check cheyyam."),
    ("Credit cards offer cashback on purchases.", "Credit card il purchases nu cashback kittum."),
    ("Can we change the key in a dictionary?", "Dictionary key change cheyyamo?"),
    ("How do you learn AI?", "AI padikkan engane?"),
    ("Can you review my document?", "Ente document review cheyyamo?"),
    ("Do we have a feedback session today?", "Innu feedback session undo?"),
    ("Do you accept UPI payments?", "UPI accept cheyyunno?"),
    ("Where is the nearest metro station?", "Ividae metro station evideya?"),
    ("Take medicine on time.", "Medicine time ku edukkanam."),
    # The previous label for this row ("Ninte sadhanam theerumo?") did not
    # translate the English sentence, which would teach the model a wrong pair.
    ("Are you fine?", "Ninakku sugamano?"),
    ("What is your name?", "Ninte peru entha?"),
    ("Where do you live?", "Nee evideya thamasikkunnathu?"),
    ("What do you want?", "Ninte entha avashyam?"),
    ("Shall I help you?", "Njan ninne sahayikkatte?"),
]

# Column-oriented view of the pairs; the Language column is sized from the
# data instead of a hard-coded row count.
data = {
    "English Text": [english for english, _ in sentence_pairs],
    "Colloquial Text": [colloquial for _, colloquial in sentence_pairs],
    "Language": ["Malayalam"] * len(sentence_pairs),
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Save as CSV file (no index column so the file round-trips cleanly)
df.to_csv("mistral-malayalam-dataset.csv", index=False)

print("Dataset saved successfully!")


Dataset saved successfully!


In [None]:
# Read the CSV back from disk and preview it to confirm it round-trips.
df = pd.read_csv("mistral-malayalam-dataset.csv")

preview = df.head()  # first 5 rows
print(preview)

row_count = len(df)
print(f"Total Rows: {row_count}")  # the dataset defined above has 20 rows


                                        English Text  \
0                 Drink more water to stay hydrated.   
1  Symptoms of this disease include fever and cough.   
2                 There is a direct flight to Dubai.   
3             Check-in for my flight starts at 6 PM.   
4      Cloud computing allows data storage remotely.   

                                     Colloquial Text   Language  
0               Hydration nu koodi vellam kudikkuka.  Malayalam  
1          Ee rogam symptoms fever um cough um aanu.  Malayalam  
2                       Dubai ku direct flight undu.  Malayalam  
3         Ente flight ku check-in 6 PM nu thudangum.  Malayalam  
4  Cloud computing il remote aayi data store chey...  Malayalam  
Total Rows: 20


In [None]:
# Authenticate with the Hugging Face Hub; login() is called without a token
# argument, so no credential is stored in the notebook source.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Load dataset from CSV
dataset = load_dataset("csv", data_files="mistral-malayalam-dataset.csv")

# Print column names to verify
print("Dataset Columns:", dataset["train"].column_names)

# Load tokenizer
MODEL_NAME = "mistralai/Mistral-7b-v0.1"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Reuse EOS as the padding token so fixed-length padding is possible.
tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Turn a batch of sentence pairs into causal-LM training examples.

    Each example is rendered as an instruction prompt containing the English
    sentence, followed by the colloquial Malayalam target and an EOS marker.
    Previously only the English column was tokenized and no ``labels`` were
    produced, so the translation target never reached the model and no loss
    could be computed.

    Args:
        examples: Batch dict with "English Text" and "Colloquial Text" lists
            (as passed by ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output (``input_ids``, ``attention_mask``) padded or
        truncated to 256 tokens, plus ``labels`` equal to ``input_ids`` with
        padding positions set to -100 so they are ignored by the loss.
    """
    texts = [
        "### Human: Translate the following English text to Malayalam colloquial "
        f"language (spoken Malayalam): {english}\n"
        f"### Assistant: {colloquial}{tokenizer.eos_token}"
        for english, colloquial in zip(examples["English Text"], examples["Colloquial Text"])
    ]
    encoded = tokenizer(texts,
                        padding="max_length",
                        truncation=True,
                        max_length=256)
    # Mask padding through the attention mask rather than the token id:
    # pad == EOS here, and the genuine end-of-sequence token must stay in the
    # loss so the model learns where to stop.
    encoded["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
    ]
    return encoded

# Split into train and test (seeded so the split is reproducible)
dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)

# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Extract train and test datasets
train_dataset = tokenized_dataset["train"]
test_dataset = tokenized_dataset["test"]

print("Dataset Splitting and Tokenization Complete!")


Dataset Columns: ['English Text', 'Colloquial Text', 'Language']


Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Dataset Splitting and Tokenization Complete!


In [None]:
from datasets import Dataset

# Convert pandas DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Split into training & validation sets.
# A fixed seed keeps the split identical across kernel restarts.
# NOTE(review): this rebinds `dataset` to an untokenized 90/10 split, while the
# tokenization cell uses an 80/20 split - confirm which split is intended.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

# View dataset format
dataset

DatasetDict({
    train: Dataset({
        features: ['English Text', 'Colloquial Text', 'Language'],
        num_rows: 18
    })
    test: Dataset({
        features: ['English Text', 'Colloquial Text', 'Language'],
        num_rows: 2
    })
})

In [None]:
print(tokenized_dataset.column_names)

# Preview one training example without dumping every padded position:
# show the scalar fields as-is and only the token ids that are not padding.
sample = tokenized_dataset["train"][0]
print({name: value for name, value in sample.items() if not isinstance(value, list)})
unpadded_ids = [
    token_id
    for token_id, keep in zip(sample["input_ids"], sample["attention_mask"])
    if keep
]
print(f"input_ids without padding ({len(unpadded_ids)} of {len(sample['input_ids'])} tokens): {unpadded_ids}")

# View tokenized dataset
tokenized_dataset

{'train': ['English Text', 'Colloquial Text', 'Language', 'input_ids', 'attention_mask'], 'test': ['English Text', 'Colloquial Text', 'Language', 'input_ids', 'attention_mask']}
{'English Text': 'Do you accept UPI payments?', 'Colloquial Text': 'UPI accept cheyyunno?', 'Language': 'Malayalam', 'input_ids': [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2

DatasetDict({
    train: Dataset({
        features: ['English Text', 'Colloquial Text', 'Language', 'input_ids', 'attention_mask'],
        num_rows: 16
    })
    test: Dataset({
        features: ['English Text', 'Colloquial Text', 'Language', 'input_ids', 'attention_mask'],
        num_rows: 4
    })
})

# 3. Load Pretrained Mistral Model & Tokenizer

In [None]:
import torch
from unsloth import FastLanguageModel

MODEL_NAME = "mistralai/Mistral-7b-v0.1"

# Load the 4-bit quantized base model and its tokenizer through Unsloth.
# dtype=None lets Unsloth choose the compute dtype for the GPU; the recorded
# run on a Tesla T4 rejected the previously forced bfloat16 and fell back to
# float16. trust_remote_code is left off: the recorded run warned about remote
# code execution, and Mistral is a built-in architecture that does not need it.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=256,
    dtype=None,
    load_in_4bit=True,
)

# This call rebinds `tokenizer`; make sure the fresh instance can still pad.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

Are you certain you want to do remote code execution?
==((====))==  Unsloth 2025.2.15: Fast Mistral patching. Transformers: 4.48.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Device does not support bfloat16. Will change to float16.


model.safetensors:   0%|          | 0.00/4.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/155 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

# 4. Apply LoRA for Parameter-Efficient Fine-Tuning (PEFT)

In [None]:
from peft import LoraConfig, get_peft_model

# Attention projection layers that receive LoRA adapters.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]

# Adapter settings: rank 16, alpha 32, 5% dropout, no bias terms, causal-LM task.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=attention_projections,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the loaded model with the adapters.
model = get_peft_model(model, lora_config)

# Training-time switches on the wrapped model, then a summary of how many
# parameters are trainable.
model.enable_input_require_grads()
model.gradient_checkpointing_enable()
model.print_trainable_parameters()


In [None]:
# Print the tokenized dataset as a check before training.
print(tokenized_dataset)

# 5. Define Training Arguments & Trainer

In [None]:
from transformers import TrainingArguments, Trainer

# Hyper-parameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./mistral-finetuned",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    logging_dir="./logs",
    # With 16 training rows and batch size 2 the run has about 24 optimizer
    # steps, so the previous logging_steps=100 never logged a training loss.
    logging_steps=4,
    # `evaluation_strategy` is deprecated; `eval_strategy` is the current name.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-4,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,  # Mixed precision training
    push_to_hub=False,  # Set to True if uploading to Hugging Face
)



In [None]:
# Fine-tune with the Hugging Face Trainer on the tokenized train/test splits.
# NOTE(review): Trainer can only compute a loss if each batch carries `labels`
# (from the dataset or a data collator) - confirm train_dataset provides them.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `tokenizer=` is deprecated on Trainer; `processing_class` replaces it.
    processing_class=tokenizer,
)

trainer.train()


In [None]:
# Run evaluation with the trainer built above and print the metrics it returns.
eval_results = trainer.evaluate()
print(eval_results)

In [None]:
# Persist the model and tokenizer to the local "fine_tuned_mistral" directory.
# NOTE(review): `model` was wrapped by get_peft_model earlier, so this
# presumably writes only the LoRA adapter weights - confirm before sharing.
model.save_pretrained("fine_tuned_mistral")
tokenizer.save_pretrained("fine_tuned_mistral")

# 6. Load Fine-Tuned Model & Perform Inference

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load fine-tuned model from the directory written by save_pretrained().
# Loading in half precision with device_map="auto" places the weights as they
# are read, instead of building a default-precision copy and moving it with
# .to(); a 7B model does not fit on a 15 GB T4 that way, and .to() can be
# rejected for bitsandbytes-quantized weights.
model_name = "fine_tuned_mistral"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Kept for later cells that want a device string; the model itself has already
# been placed by device_map above.
device = "cuda" if torch.cuda.is_available() else "cpu"


In [None]:
# Authenticate with the Hugging Face Hub from inside the notebook.
from huggingface_hub import notebook_login
notebook_login()


In [None]:
import gc

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Use the checkpoint written by save_pretrained() after training. Loading
# "mistralai/Mistral-7b-v0.1" here replaced the fine-tuned `model` with the
# untuned base weights, so inference (and the later push_to_hub) ignored the
# fine-tuning entirely.
MODEL_NAME = "fine_tuned_mistral"

# Drop this notebook's reference to any previously loaded model first, so two
# 7B models are not resident at once (memory is only released if nothing else
# still references the old model).
model = None
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load the model with offloading handling
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,  # Use float16 for efficiency
    device_map="auto",          # Auto-assign device
    offload_folder="offload",   # Where weights that do not fit in memory are offloaded
)

# Get model's actual max sequence length (used to truncate prompts)
MAX_LENGTH = model.config.max_position_embeddings


In [None]:
def translate_to_malayalam_colloquial(text, max_new_tokens=64):
    """Translate an English sentence into colloquial Malayalam with the loaded model.

    Relies on the module-level ``tokenizer``, ``model`` and ``MAX_LENGTH``.
    Generation uses sampling, so repeated calls may return different text.

    Args:
        text: English sentence to translate.
        max_new_tokens: Upper bound on the number of generated tokens. This
            replaces ``max_length=MAX_LENGTH``, which allowed generation to run
            up to the model's full context window for a one-sentence answer.

    Returns:
        The first assistant turn produced by the model, stripped of whitespace.
    """
    prompt = f"""### Human: You are a Malayalam colloquial language translator. Translate the following English text to Malayalam colloquial language (spoken Malayalam).
Here are some examples:
"What is this?" -> "Ithu entha?"
"How are you?" -> "Ninte sugalle?"
"Where are you going?" -> "Evideya pokunnathu?"

Now translate this: {text}
### Assistant: """

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=MAX_LENGTH)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}  # Move to model's device
    prompt_length = inputs["input_ids"].shape[1]

    # A freshly loaded tokenizer may have no pad token; fall back to EOS
    # instead of handing generate() a None pad id.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.7,
        num_return_sequences=1,
        pad_token_id=pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        num_beams=1,
        top_p=0.95,
        repetition_penalty=1.2,
    )

    # Decode only the newly generated tokens, then keep the first turn. Taking
    # the text after the LAST "### Assistant:" returned the answer to a
    # hallucinated follow-up question whenever the model kept the dialogue going.
    completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    for turn_marker in ("### Human:", "### Assistant:"):
        completion = completion.split(turn_marker)[0]
    return completion.strip()


In [None]:
# Sample inputs for a quick check. They are defined in this cell because
# the loop below otherwise raises NameError on a fresh kernel.
test_sentences = [
    "What is your name?",
    "What is machine learning?",
    "How does this work?",
    "Can you explain this to me?",
    "Where can I find the documentation?",
    "What is the error in this code?"
]

print("\nTesting the model:\n")
for sentence in test_sentences:
    translation = translate_to_malayalam_colloquial(sentence)
    print(f"English: {sentence}")
    print(f"Malayalam Colloquial: {translation}")
    print("-" * 50)


In [None]:
import gc

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load model and tokenizer from the fine-tuned checkpoint saved after training.
# The base model id that was here ("mistralai/Mistral-7b-v0.1") was a
# placeholder, so the tests below were exercising the untuned model.
MODEL_NAME = "fine_tuned_mistral"

# Drop this notebook's reference to any previously loaded model before loading
# another copy, so two 7B models are not resident at once (memory is only
# released if nothing else still references the old model).
model = None
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16, device_map="auto")

MAX_LENGTH = 256  # Define max token length

def translate_to_malayalam_colloquial(text, max_new_tokens=64):
    """Translate an English sentence into colloquial Malayalam with the loaded model.

    Relies on the module-level ``tokenizer`` and ``model``. Generation uses
    sampling, so repeated calls may return different text.

    Args:
        text: English sentence to translate.
        max_new_tokens: Upper bound on the number of generated tokens. This
            replaces a ``max_length`` equal to the model's full context window.

    Returns:
        The first assistant turn produced by the model, stripped of whitespace.
    """
    prompt = f"""### Human: You are a Malayalam colloquial language translator. Translate the following English text to Malayalam colloquial language (spoken Malayalam).
Here are some examples:
"What is this?" -> "Ithu entha?"
"How are you?" -> "Ninte sugalle?"
"Where are you going?" -> "Evideya pokunnathu?"

Now translate this: {text}
### Assistant: """
    # Prompt truncation limit: the model's context window. It is held under
    # its own name so it no longer shadows the module-level MAX_LENGTH.
    context_limit = model.config.max_position_embeddings

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=context_limit)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    prompt_length = inputs["input_ids"].shape[1]

    # A freshly loaded tokenizer may have no pad token; fall back to EOS
    # instead of handing generate() a None pad id.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.7,
        num_return_sequences=1,
        pad_token_id=pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        num_beams=1,
        top_p=0.95,
        repetition_penalty=1.2,
    )

    # Decode only the newly generated tokens, then keep the first turn. Taking
    # the text after the LAST "### Assistant:" returned the answer to a
    # hallucinated follow-up question whenever the model kept the dialogue going.
    completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    for turn_marker in ("### Human:", "### Assistant:"):
        completion = completion.split(turn_marker)[0]
    return completion.strip()

# Sample English inputs for a quick look at translation quality.
test_sentences = [
    "What is your name?", "What is machine learning?",
    "How does this work?", "Can you explain this to me?",
    "Where can I find the documentation?", "What is the error in this code?",
]

separator = "-" * 50

print("\nTesting the model:\n")
for english in test_sentences:
    malayalam = translate_to_malayalam_colloquial(english)
    # One print call per sentence; it emits the same three lines as before.
    print(f"English: {english}\nMalayalam Colloquial: {malayalam}\n{separator}")


# 7. Push Model to Hugging Face Hub

In [None]:
from huggingface_hub import login

# Authenticate first (follow the on-screen instructions).
login()

# Model and tokenizer are uploaded to the same Hub repository.
hub_repo_id = "Ashimariam/mistral-malayalam-new"

model.push_to_hub(hub_repo_id)
tokenizer.push_to_hub(hub_repo_id)


In [None]:
!pip install huggingface_hub datasets

In [None]:
# Authenticate with the Hugging Face Hub before the dataset upload below.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
from datasets import Dataset
from huggingface_hub import HfApi

# Target dataset repository on the Hugging Face Hub.
dataset_repo = "Ashimariam/mistral-malayalam-dataset"

# Read the CSV into pandas, then wrap the frame as a Hugging Face Dataset.
csv_frame = pd.read_csv("mistral-malayalam-dataset.csv")
dataset = Dataset.from_pandas(csv_frame)

# Upload the dataset and print the URL of its Hub page.
dataset.push_to_hub(dataset_repo)

dataset_url = f"https://huggingface.co/datasets/{dataset_repo}"
print(f"Dataset uploaded successfully: {dataset_url}")
