In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
import torch
from google.colab import drive
import os

In [None]:
# Attach Google Drive to the Colab filesystem so artifacts written under it
# survive the end of the runtime.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

# Drive folder that will receive the fine-tuned model.
output_dir = os.path.join(DRIVE_MOUNT_POINT, "MyDrive", "sarvam-transliteration-sentence")

Mounted at /content/drive


In [None]:
# Read the training and validation JSON files into DataFrames, then wrap each
# one in a Hugging Face `Dataset` for the map/tokenize steps that follow.
df, df_val = (
    pd.read_json(json_path)
    for json_path in ("transliteration_dataset.json", "validation_data.json")
)
dataset, val_dataset = (Dataset.from_pandas(frame) for frame in (df, df_val))

In [None]:
def format_prompt(examples):
    """Build the prompt text for every example in a batch.

    Args:
        examples: A batch mapping with parallel lists under the keys
            'input', 'expected_output' and 'language'.

    Returns:
        A dict with two keys:
            "text": one string per example in the form
                "Transliterate this [<language>]: <input>\\nOutput: <expected_output>".
            "labels": the batch's 'expected_output' list, passed through unchanged.
    """
    prompts = []
    rows = zip(examples['input'], examples['expected_output'], examples['language'])
    for source_sentence, target_text, language in rows:
        prompts.append(
            f"Transliterate this [{language}]: {source_sentence}\nOutput: {target_text}"
        )
    return {"text": prompts, "labels": examples['expected_output']}

# Replace the raw columns with the prompt "text" / "labels" columns produced by
# `format_prompt`, for both the training and the validation split.
prompt_source_columns = ["input", "expected_output", "language"]
dataset = dataset.map(format_prompt, batched=True, remove_columns=prompt_source_columns)
val_dataset = val_dataset.map(format_prompt, batched=True, remove_columns=prompt_source_columns)

Map:   0%|          | 0/1097 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [None]:
# Hub identifier of the base model; the tokenizer and the model weights are both
# loaded from this name.
model_name = "sarvamai/sarvam-1"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Padding to a fixed length requires a pad token; fall back to the EOS token
# when the tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    """Tokenize a batch of prompt texts and attach causal-LM labels.

    Args:
        examples: A batch mapping whose "text" entry is a list of full prompt
            strings (instruction followed by the expected output).

    Returns:
        The tokenizer output (padded/truncated to 256 tokens) with an added
        "labels" entry: a copy of `input_ids` in which every padding position
        is replaced by -100.
    """
    inputs = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=256)
    # A causal LM shifts the labels internally, so they must line up with
    # `input_ids` position by position. Tokenizing the expected output on its
    # own yields a sequence that is not aligned with the prompt tokens.
    # -100 is the index the loss ignores, which keeps padding out of the loss.
    # The attention mask is used to find padding because the pad token may be
    # the same id as EOS.
    inputs["labels"] = [
        [token_id if is_real_token else -100 for token_id, is_real_token in zip(ids, mask)]
        for ids, mask in zip(inputs["input_ids"], inputs["attention_mask"])
    ]
    return inputs

# Tokenize both splits; the string columns are dropped so that only the
# tokenizer's outputs remain in the datasets.
string_columns = ["text", "labels"]
dataset = dataset.map(tokenize_function, batched=True, remove_columns=string_columns)
val_dataset = val_dataset.map(tokenize_function, batched=True, remove_columns=string_columns)

# Load the base model in half precision and let `device_map="auto"` place it on
# the available device(s).
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")

# LoRA adapter settings: rank-16 adapters on the attention query/value projections.
config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"],
    # Declare the task so PEFT wraps the model with its causal-LM wrapper
    # (which exposes `labels` / `generate`) instead of the generic PeftModel.
    task_type="CAUSAL_LM",
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/775k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.51M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Map:   0%|          | 0/1097 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/717 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.77G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/279M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/193 [00:00<?, ?B/s]

In [None]:
# Wrap the base model with the LoRA adapters described by `config`.
# NOTE(review): this rebinds `model` from itself, so re-running the cell would
# wrap an already-wrapped model; reload the base model before running it again.
model = get_peft_model(model, config)

In [None]:
# Training configuration: evaluate and checkpoint once per epoch, log every
# 5 steps, and use an effective batch of 2 x 8 = 16 sequences per device per
# optimizer step.
training_args = TrainingArguments(
    output_dir="./sarvam-transliteration-sentence",
    # NOTE: newer transformers releases name this argument `eval_strategy`.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=5,
    logging_dir="./logs",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=5,
    learning_rate=3e-5,
    fp16=True,
    save_total_limit=2,
    report_to="all",
    # The Trainer infers label columns from the model's forward signature, which
    # a PEFT wrapper can hide. Without this, evaluation computes no loss and
    # the "Validation Loss" column shows "No log".
    label_names=["labels"],
)

# The Trainer handles the train/eval loop for the LoRA-wrapped model.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=val_dataset,
)



In [None]:
# Run fine-tuning with the model, datasets and arguments configured on `trainer`.
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mabhavana0410[0m ([33mabhavana0410-ssn-coe[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss
0,6.0324,No log
1,0.3404,No log
2,0.2125,No log
3,0.1636,No log


In [None]:
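# [REDACTED] A Weights & Biases API key was pasted here in plain text. Never store credentials
# in a notebook: revoke the exposed key at https://wandb.ai/authorize and supply a new one via
# `wandb login` or the WANDB_API_KEY environment variable instead.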

In [None]:
# Persist the fine-tuned model to the Drive-backed `output_dir`.
trainer.save_model(output_dir)
# The tokenizer was not handed to the Trainer, so save it explicitly. This keeps
# the output directory self-contained for later reloading.
tokenizer.save_pretrained(output_dir)

# To reload in a later session (loading the adapter this way needs `peft` installed):
# tokenizer = AutoTokenizer.from_pretrained(output_dir)
# model = AutoModelForCausalLM.from_pretrained(output_dir)

In [None]:
# Switch the model to evaluation mode before inference (this turns off dropout,
# including the LoRA dropout layers).
model.eval()

PeftModel(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(68096, 2048)
        (layers): ModuleList(
          (0-27): 28 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear(in_features=2048,

In [None]:
# Make sure the tokenizer has a padding token before inference; fall back to
# the EOS token when none is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
import torch

def transliterate(sentence, lang_code="hin", max_new_tokens=100):
    """Generate a transliteration for one sentence with the fine-tuned model.

    Args:
        sentence: The input sentence to transliterate.
        lang_code: Language tag inserted into the prompt (e.g. "hin", "tam").
        max_new_tokens: Upper bound on the number of tokens generated after
            the prompt.

    Returns:
        The decoded text the model generated after the prompt, stripped of
        surrounding whitespace.
    """
    input_text = f"Transliterate this [{lang_code}]: {sentence}\nOutput:"
    # Place the inputs on the same device as the model instead of guessing one.
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            # Limit only the generated part. `max_length` also counts the
            # prompt, so long inputs would leave little or no room for output.
            max_new_tokens=max_new_tokens,
            # Pass the pad id explicitly so generate() does not fall back to
            # EOS with a warning on every call.
            pad_token_id=tokenizer.pad_token_id,
        )

    # The returned sequence starts with the prompt tokens. Slice them off rather
    # than searching the decoded text for the "Output:" marker.
    prompt_length = inputs["input_ids"].shape[1]
    return tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True).strip()

# Smoke-test inputs as (sentence, language code) pairs mixing dates, currency
# amounts and units.
test_sentences = [
    ("04-07-2023 तक, ₹ 3000 खर्च किए गए।", "hin"),
    ("அவன் 10km தூரம் ஓடினான்.", "tam"),
    ("05/06/2018 அன்று ₹7500 செலவழிக்கப்பட்டது.", "tam"),
]

# Print each input next to the model's transliteration.
for sentence, lang in test_sentences:
    transliteration = transliterate(sentence, lang)
    print(f"Input: {sentence} → Transliteration: {transliteration}")


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Input: 04-07-2023 तक, ₹ 3000 खर्च किए गए। → Transliteration: फोर सेवन टू थ्री हंड्रेड ट्वेंटी थ्री खर्च किए गए।


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Input: அவன் 10km தூரம் ஓடினான். → Transliteration: அவர் டென் கிலோமீட்டர் ஓடினார்.
Input: 05/06/2018 அன்று ₹7500 செலவழிக்கப்பட்டது. → Transliteration: ஃபைவ் சிக்ஸ் டூ ஹண்ட்ரெட் எய்ட் த்ரீ டூ ஹண்ட்ரெட் எய்ட் யூரோஸ் ஸ்பென்ட்.
