In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/llama-3.1/transformers/8b-instruct/2/model.safetensors.index.json
/kaggle/input/llama-3.1/transformers/8b-instruct/2/model-00003-of-00004.safetensors
/kaggle/input/llama-3.1/transformers/8b-instruct/2/config.json
/kaggle/input/llama-3.1/transformers/8b-instruct/2/LICENSE
/kaggle/input/llama-3.1/transformers/8b-instruct/2/model-00001-of-00004.safetensors
/kaggle/input/llama-3.1/transformers/8b-instruct/2/README.md
/kaggle/input/llama-3.1/transformers/8b-instruct/2/USE_POLICY.md
/kaggle/input/llama-3.1/transformers/8b-instruct/2/tokenizer.json
/kaggle/input/llama-3.1/transformers/8b-instruct/2/tokenizer_config.json
/kaggle/input/llama-3.1/transformers/8b-instruct/2/model-00004-of-00004.safetensors
/kaggle/input/llama-3.1/transformers/8b-instruct/2/special_tokens_map.json
/kaggle/input/llama-3.1/transformers/8b-instruct/2/.gitattributes
/kaggle/input/llama-3.1/transformers/8b-instruct/2/model-00002-of-00004.safetensors
/kaggle/input/llama-3.1/transformers/8b-instruct/2/gener

# **Installing Required Libraries**

In [2]:
%%capture
!pip install bitsandbytes accelerate peft --upgrade transformers


# **Importing Libraries and Loading the Dataset**

In [3]:
import torch
from datasets import load_from_disk, load_dataset
from transformers import AutoModelForCausalLM,AutoModelForSequenceClassification, AutoTokenizer, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, PeftModel, get_peft_model, TaskType, LoraModel
from sklearn.metrics import f1_score
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
from transformers import TextClassificationPipeline



In [4]:
dataset = load_dataset("csv", data_files="/kaggle/input/multi-lingual-sentiment-analysis/train.csv")

Generating train split: 0 examples [00:00, ? examples/s]

# **Creating a Custom Prompt for Sentiment Analysis and Fixing Labels for Sentiment Classification**

In [5]:
#preprocessing to consider cultural context for better results 
def multilingual_prompt(example):
    custom_prompt = (
        f"Act as an expert cultural linguist and sentiment analyst. You are provided with a sentence in {example['language']}. "
        "Examine the sentence carefully, taking into account local idioms and cultural nuances. Based on your analysis, classify the sentiment as either 'Positive' or 'Negative'. "
        "Respond with only your verdict, without any further explanation.\n\n"
        f"Sentence: {example['sentence']}\n"
        "Sentiment:"
    )
    
    return {
        "conversations": [
            {"from": "human", "value": custom_prompt},
            {"from": "gpt", "value": example["label"]}
        ],
        "language": example["language"],
        "label": example["label"]
    }

dataset = dataset.map(multilingual_prompt)


def fix_labels(example):
    label_map = {"Negative": 0 , "Positive": 1} 
    example["label"] = label_map.get(example["label"], -1)  
    return example

dataset = dataset.map(fix_labels)


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

# **Setting Up the Required model and Loading Tokenizer **

In [6]:
model_name = "/kaggle/input/llama-3.1/transformers/8b-instruct/2"

tokenizer = AutoTokenizer.from_pretrained(model_name,model_max_length=1024)

tokenizer.pad_token=tokenizer.eos_token # setting pad token id




In [7]:
def tokenize(example):
    return tokenizer(example["sentence"], padding=True, truncation=True, max_length=512)


In [8]:
tokenized_dataset = dataset.map(tokenize, batched=True, num_proc=4, remove_columns=['sentence'])
print(tokenized_dataset)

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['ID', 'label', 'language', 'conversations', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})


# **Splitting Dataset for Training and Evaluation**

In [9]:
dataset_split = tokenized_dataset['train'].train_test_split(test_size=0.07,seed=42)

# **Creating Data Collator**

In [10]:
data_col = DataCollatorWithPadding(tokenizer,padding='max_length', max_length=512)

# **Configuring Bits and Bytes for 4-Bit Quantization and Loading Model for Sequence Classification**

In [11]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  #efficient computation
    llm_int8_enable_fp32_cpu_offload=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForSequenceClassification.from_pretrained(model_name,
                                                           num_labels=2,
                                                           pad_token_id=tokenizer.eos_token_id,
                                                           quantization_config=bnb_config,
                                                           device_map="auto")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/llama-3.1/transformers/8b-instruct/2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# **Configuring LoRA for Efficient Fine-Tuning**

In [12]:
lora_config = LoraConfig(
    r=32,  
    target_modules=["q_proj", "v_proj"],
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    lora_alpha=64,  
    lora_dropout=0.1  
)



In [13]:
lora_model = get_peft_model(model, lora_config)
lora_model.print_trainable_parameters()

trainable params: 13,631,488 || all params: 7,518,564,352 || trainable%: 0.1813


In [14]:
lora_model.peft_config

{'default': LoraConfig(task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='/kaggle/input/llama-3.1/transformers/8b-instruct/2', revision=None, inference_mode=False, r=32, target_modules={'q_proj', 'v_proj'}, exclude_modules=None, lora_alpha=64, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, eva_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)}

# **Setting Up the Training Arguments**

In [15]:
training_args = TrainingArguments(
    output_dir='lora_llama_multilingual',
    eval_strategy="steps",
    eval_steps=100,
    num_train_epochs=3,  
    per_device_train_batch_size=4,  
    per_device_eval_batch_size=4,
    bf16=False,
    fp16=True, #memory usage and speed up training.
    tf32=False,
    gradient_accumulation_steps=2,  
    adam_beta1=0.9,
    adam_beta2=0.98,  
    learning_rate=1e-4,  
    weight_decay=0.01,
    logging_dir='logs',
    logging_strategy="steps",
    logging_steps=100,
    save_steps=100,
    save_total_limit=3, 
    report_to='none',
)

# **Defining Evaluation Metric (F1 Score)**

In [16]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    f1 = f1_score(labels, predictions, average="weighted")  
    return {"f1": f1}

# **Training the Model**

In [17]:
trainer = Trainer(model=lora_model,
                  args = training_args,
                 train_dataset=dataset_split["train"],
                 eval_dataset=dataset_split["test"],
                 data_collator = data_col,
                  compute_metrics=compute_metrics 
                 )



No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [18]:
results = trainer.train()

Step,Training Loss,Validation Loss,F1
100,0.755,0.277595,0.914569
200,0.2715,0.261194,0.871985
300,0.0992,0.422598,0.871669


# **Loading Test Data and Making Predictions**

In [19]:
test_dataset = load_dataset("csv", data_files="/kaggle/input/multi-lingual-sentiment-analysis/test.csv")


Generating train split: 0 examples [00:00, ? examples/s]

In [20]:
classifier = TextClassificationPipeline(model=model,
                                       tokenizer=tokenizer,
                                       framework='pt',
                                       task="sentiment-analysis"
                                       )

Device set to use cuda:0


In [21]:
model.config.id2label = {0:"Negative",1:"Positive"}

In [22]:
testing = test_dataset['train'][0]['sentence']
prediction = classifier(testing)
print(prediction)

[{'label': 'Positive', 'score': 0.8625832796096802}]


# **Generating Submission File**

In [23]:
test_texts = test_dataset["train"]["sentence"]  
test_ids = test_dataset["train"]["ID"]  

In [24]:
predictions = classifier(test_texts, batch_size=32)
predicted_labels = [pred["label"] for pred in predictions]

In [25]:
output_dataset = pd.DataFrame({"ID": test_ids, "label": predicted_labels})
output_dataset.to_csv("submission.csv", index=False)