In [None]:
# !pip install torch transformers peft bitsandbytes accelerate datasets
# !pip install -U bitsandbytes
# !pip install --upgrade huggingface_hub
# !pip install -U bitsandbytes accelerate transformers

In [None]:
from huggingface_hub import login
# Authenticate this session with the Hugging Face Hub. No token is passed
# here, so the credential is not stored in the notebook source.
login()

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
import bitsandbytes as bnb

In [None]:
# Show which bitsandbytes version is installed in this kernel.
print(bnb.__version__)

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

base_model_name = "mistralai/Mistral-7B-Instruct-v0.3"

# 4-bit quantization config: NF4 weights, double quantization, bfloat16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the quantized base model; device_map="auto" delegates device placement.
# NOTE(review): trust_remote_code=True permits executing code shipped in the
# model repo -- confirm it is actually needed for this checkpoint.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,   # use quantization_config
    device_map="auto",
    trust_remote_code=True
)

# Load tokenizer
base_tokenizer = AutoTokenizer.from_pretrained(base_model_name, use_fast=True)

# The fine-tuning cells below operate on `model` and `tokenizer`, which were
# never assigned before their first use (NameError on a fresh kernel).
# Bind them here so the notebook survives Restart Kernel -> Run All.
# NOTE(review): `model` is the same object as `base_model`; once it is wrapped
# for PEFT training, `base_model` presumably no longer reflects the untouched
# base weights -- reload it if a pristine baseline is needed.
model = base_model
tokenizer = base_tokenizer

# # Test inference



In [None]:
# Use the end-of-sequence token as the padding token as well.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Turn on gradient checkpointing for the model, then call
# enable_input_require_grads().
# NOTE(review): presumably the second call is what lets gradients reach the
# adapters through the checkpointed, frozen base layers -- confirm.
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

In [None]:
# for name, _ in model.named_modules():
#   if "proj" in name:
#     print(name)

In [None]:
# LoRA adapter settings for causal-LM fine-tuning: rank 8, alpha 16,
# 5% dropout, no bias terms trained. Adapters are attached to the four
# attention projections and the three MLP projections.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",   # attention
        "gate_proj", "up_proj", "down_proj",      # MLP
    ],
)

# Wrap the model with the adapters and report how many parameters will train.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
# dataset = load_dataset("imdb", split="train[:100]")

In [None]:
from datasets import Dataset
import pandas as pd

In [None]:
# Read the training CSV (path is relative to the notebook's working directory).
data_frame = pd.read_csv("codemix-hate-small-train.csv")

In [None]:
# Display the loaded frame for a quick visual check.
data_frame

In [None]:
# Keep only the "tweet" column; it is the only column used from here on.
df = data_frame["tweet"]

In [None]:
# Wrap the selected column back into a DataFrame (needed by Dataset.from_pandas).
df = pd.DataFrame(df)

In [None]:
# Display the single-column frame.
df

In [None]:
# Convert the pandas frame into a Hugging Face `Dataset`.
dataset = Dataset.from_pandas(df)

In [None]:
# Display the dataset summary.
dataset

In [None]:
def tokenize_function(examples):
  """Tokenize a batch of tweets and attach causal-LM labels.

  The tokenizer itself pads every row to ``max_length`` (512) and truncates
  longer rows, so padding is NOT deferred to the data collator.

  Args:
    examples: mapping with a ``"tweet"`` entry holding the raw text(s).

  Returns:
    The tokenizer output with an extra ``"labels"`` entry that starts out as
    a copy of ``"input_ids"``.
  """
  encoding_options = {
      "padding": "max_length",
      "truncation": True,
      "max_length": 512,
  }
  encoded = tokenizer(examples["tweet"], **encoding_options)
  encoded["labels"] = encoded["input_ids"].copy()
  return encoded

# Tokenize the whole dataset in batches and drop the raw text column so only
# model inputs remain.
tokenized_dataset = dataset.map(tokenize_function, remove_columns=["tweet"], batched=True)

In [None]:
# Display the tokenized dataset summary.
tokenized_dataset

In [None]:
def mask_pad_labels(batch):
  """Replace padding positions in ``batch["labels"]`` with -100.

  -100 is the label value the loss ignores. Padding is identified through
  ``batch["attention_mask"]`` when it is present: the pad token is aliased to
  the EOS token in this notebook, so comparing token ids alone would also
  mask every genuine EOS token. When no usable attention mask is available
  the function falls back to comparing against ``tokenizer.pad_token_id``.

  NOTE(review): ``DataCollatorForLanguageModeling(mlm=False)`` rebuilds
  ``labels`` from ``input_ids`` on its own -- confirm whether the labels
  produced here survive collation.

  Args:
    batch: mapping with a ``"labels"`` list of token-id sequences and,
      optionally, a parallel ``"attention_mask"`` list.

  Returns:
    The same mapping with ``"labels"`` replaced by the masked sequences.
  """
  pad_id = tokenizer.pad_token_id
  label_rows = batch["labels"]
  mask_rows = batch["attention_mask"] if "attention_mask" in batch else None
  if mask_rows is None or len(mask_rows) != len(label_rows):
    mask_rows = [None] * len(label_rows)

  masked_rows = []
  for label_seq, mask_seq in zip(label_rows, mask_rows):
    if mask_seq is not None and len(mask_seq) == len(label_seq):
      # Attention mask is authoritative: 0 marks padding, 1 marks real tokens.
      masked_rows.append(
          [token if keep else -100 for token, keep in zip(label_seq, mask_seq)]
      )
    else:
      # No usable mask for this row: fall back to the token-id comparison.
      masked_rows.append(
          [token if token != pad_id else -100 for token in label_seq]
      )

  batch["labels"] = masked_rows
  return batch

# Apply the label masking to every batch of the tokenized dataset.
tokenized_dataset = tokenized_dataset.map(mask_pad_labels, batched=True)

In [None]:
# Display the dataset summary again after label masking.
tokenized_dataset

In [None]:
# tokenized_dataset["input_ids"]

In [None]:
from transformers import DataCollatorForLanguageModeling
# Batch collator for language modeling; mlm=False turns off masked-LM
# token masking, i.e. the causal-LM setting.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
training_args = TrainingArguments(
    # Output, logging and publishing.
    output_dir="./Marathi-Mistral-7b/results",
    report_to="tensorboard",
    logging_steps=10,
    push_to_hub=True,
    # Checkpointing: one checkpoint per epoch, keep at most two.
    save_strategy="epoch",
    save_total_limit=2,
    # Schedule: batch size 1 with 10 accumulation steps, single epoch.
    num_train_epochs=1,
    learning_rate=2e-4,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=10,
    # Memory / precision.
    gradient_checkpointing=True,
    fp16=True,
)

In [None]:
# Assemble the trainer from the adapter-wrapped model, the tokenized tweets,
# the collator and the training arguments defined above.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

In [None]:
# Run the fine-tuning loop.
trainer.train()

In [None]:
# Name used below both as the local save directory and as the Hub repo name.
new_model = "Marathi_Mistral_7-b"

In [None]:
# Save the trained model held by the trainer into the `new_model` directory.
trainer.model.save_pretrained(new_model)

In [None]:
from peft import PeftModel

# `model` was already wrapped by get_peft_model() and carries the trained
# adapter in memory. Passing it to PeftModel.from_pretrained() again would
# stack a second PEFT wrapper on top of the first, so reuse it directly.
# Load the saved adapter from disk only when `model` is a plain base model
# (e.g. the base was reloaded in a fresh kernel without re-training).
if isinstance(model, PeftModel):
    peft_model = model
else:
    peft_model = PeftModel.from_pretrained(model, new_model)

# Fold the adapter weights into the base layers and drop the PEFT wrapper.
# NOTE(review): the base model is loaded 4-bit quantized, so the merge is
# presumably subject to quantization rounding -- confirm output quality, or
# merge into a half-precision copy of the base model instead.
merged_model = peft_model.merge_and_unload()

In [None]:
# Write the merged model (safetensors serialization) and the tokenizer into
# the `new_model` directory.
merged_model.save_pretrained(new_model, safe_serialization=True)
tokenizer.save_pretrained(new_model)


In [None]:
# merged_model.push_to_hub(f"atharva77/{new_model}")
# Upload the tokenizer files to the Hub repo atharva77/<new_model>.
# NOTE(review): the model push above is commented out, so this cell uploads
# the tokenizer only -- confirm the merged weights already exist in that repo
# before loading the model from it.
tokenizer.push_to_hub(f"atharva77/{new_model}")

# import torch
# from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# from peft import PeftModel

# model_name = "mistralai/Mistral-7B-Instruct-v0.3"

# # 4-bit quantization config
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_compute_dtype=torch.bfloat16
# )

# # Load model
# base_model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     quantization_config=bnb_config,   # use quantization_config
#     device_map="auto",
#     trust_remote_code=True
# )
# model = PeftModel.from_pretrained(base_model, "atharva77/results")

# # Load tokenizer
# tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

# Your merged model repo on HF
model_name = "atharva77/Marathi_Mistral_7-b"  # change if different

# Load model in 4-bit for inference (optional for smaller GPU usage).
# Built as a real BitsAndBytesConfig -- the documented type for
# `quantization_config` and the same form used for the training load --
# instead of a plain dict.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Load model
# NOTE(review): trust_remote_code=True permits executing code shipped in the
# model repo -- confirm it is actually needed for this checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config
)

# Simple pipeline




In [None]:
# Display the loaded model's module structure.
model


In [None]:
# # Test inference
# Text-generation pipeline around the model and tokenizer loaded above.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
)

# Sentence to translate, embedded in a plain instruction prompt.
query = "Majha naav Atharva ahe!"
prompt = f"""
  You are an smart cahtbot that reads user query and translates in plain english.
  You are going to only translate the same sentence as it is.
  Do not include anything else just the query you get.

  Query: {query}
"""

# Sampled generation: up to 200 new tokens, temperature 0.8, nucleus p=0.9.
outputs = pipe(
    prompt,
    max_new_tokens=200,
    do_sample=True,
    temperature=0.8,
    top_p=0.9,
)

# base_pipe = pipeline("text-generation", model=base_model, tokenizer=base_tokenizer, device_map="auto")
# base_prompt = f"""
#   You are an smart cahtbot that reads user query and translates in plain english.
#   You are going to only translate the same sentence as it is.
#   Do not include anything else just the query you get.

#   Query: {query}
# """
# base_outputs = base_pipe(base_prompt, max_new_tokens=200, do_sample=True, temperature=0.8, top_p=0.9)

# print("Base Mistral Model"+base_outputs[0]["generated_text"])

# Print the generated text of the first returned sequence.
print(outputs[0]["generated_text"])
