In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file under the read-only input tree.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
# Print one path per line, in the order os.walk yields them.
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/multi-lingual-sentiment-analysis/sample_submission.csv
/kaggle/input/multi-lingual-sentiment-analysis/train.csv
/kaggle/input/multi-lingual-sentiment-analysis/test.csv
/kaggle/input/llama-3.1/transformers/8b-instruct/2/model.safetensors.index.json
/kaggle/input/llama-3.1/transformers/8b-instruct/2/model-00003-of-00004.safetensors
/kaggle/input/llama-3.1/transformers/8b-instruct/2/config.json
/kaggle/input/llama-3.1/transformers/8b-instruct/2/LICENSE
/kaggle/input/llama-3.1/transformers/8b-instruct/2/model-00001-of-00004.safetensors
/kaggle/input/llama-3.1/transformers/8b-instruct/2/README.md
/kaggle/input/llama-3.1/transformers/8b-instruct/2/USE_POLICY.md
/kaggle/input/llama-3.1/transformers/8b-instruct/2/tokenizer.json
/kaggle/input/llama-3.1/transformers/8b-instruct/2/tokenizer_config.json
/kaggle/input/llama-3.1/transformers/8b-instruct/2/model-00004-of-00004.safetensors
/kaggle/input/llama-3.1/transformers/8b-instruct/2/special_tokens_map.json
/kaggle/input/llama-3.1/

In [None]:
%%capture
# Installing important libraries.
# NOTE(review): none of the installs below pin a version, so re-running this cell
# later may resolve different releases.
# Step 1: install the pip-autoremove helper.
!pip install pip3-autoremove
# Step 2: remove any existing torch / torchvision / torchaudio (-y answers the prompt).
!pip-autoremove torch torchvision torchaudio -y
# Step 3: reinstall them, plus xformers, from the PyTorch cu121 wheel index.
!pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu121
# Step 4: install Unsloth.
!pip install unsloth

In [None]:
# Load the base language model and its tokenizer with 4-bit quantized weights, which
# reduces the memory needed to hold a large model. The model is loaded with a maximum
# sequence length (`max_seq_len`) and with `load_in_4bit` enabled.

from unsloth import FastLanguageModel
import torch
# Upper bound on tokens per sequence; reused later for tokenization and by the trainer.
max_seq_len = 1024
# NOTE(review): None presumably lets Unsloth choose the dtype automatically — confirm
# against the Unsloth documentation.
dtype = None 
# Load the weights in 4-bit precision to cut memory use.
load_in_4bit = True 

# Names of 4-bit (bnb) checkpoints under the "unsloth/" namespace, kept for reference.
# This list is not referenced anywhere else in the visible notebook; the model below is
# loaded from a local path instead.
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/llama-2-13b-bnb-4bit",
    "unsloth/codellama-34b-bnb-4bit",
    "unsloth/tinyllama-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
] 

# Load the model and tokenizer from the Llama 3.1 8B Instruct files in the Kaggle
# input directory, using the settings defined above.
base_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "/kaggle/input/llama-3.1/transformers/8b-instruct/2", 
    max_seq_length = max_seq_len,
    dtype = dtype,
    load_in_4bit = load_in_4bit
)

In [None]:
# Attach LoRA adapters to the base model via PEFT (Parameter-Efficient Fine-Tuning).
# PEFT fine-tunes a large model by training only a small number of added parameters
# instead of every weight, which makes fine-tuning more memory-efficient. The adapters
# are applied to specific layers (the attention projections and the gate/up/down
# projections) rather than to the whole model. `lora_alpha` and `lora_dropout` control
# the scaling and dropout of the low-rank adaptation. Gradient checkpointing is enabled
# for memory efficiency, and `random_state` is fixed for reproducibility.

# Wrap the base model with LoRA adapters. `model` is the object that is trained and
# later used for inference.
model = FastLanguageModel.get_peft_model(
    base_model,
    r = 16,  # LoRA rank
    # Modules that receive adapters: attention projections (q/k/v/o) and the
    # gate/up/down projections.
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,  # LoRA scaling factor
    lora_dropout = 0,  # no dropout on the adapters
    bias = "none",  # NOTE(review): presumably no bias terms are trained — confirm
    use_gradient_checkpointing = "unsloth",  # gradient checkpointing, for memory efficiency
    random_state = 1024,  # fixed seed for reproducibility
    use_rslora = False,  # rank-stabilized LoRA turned off
    loftq_config = None,  # no LoftQ configuration supplied
)

In [None]:
from datasets import load_dataset

# Loading all the required CSV files.
# Training data: split='train' selects the single split built from the CSV file.
dataset_train = load_dataset('csv', data_files="/kaggle/input/multi-lingual-sentiment-analysis/train.csv",split='train')
# Test data, loaded without a split argument.
# NOTE(review): `dataset_test` is not used by any visible cell — the test CSV is read
# again with pandas before inference.
dataset_test = load_dataset('csv', data_files="/kaggle/input/multi-lingual-sentiment-analysis/test.csv")

In [None]:
# Build the training examples for the sentiment analysis task. `alpaca_prompt` is a
# template with two slots: the sentence and its sentiment label (positive or negative).
# `format_prompts` processes a batch of examples: it fills the template for every row,
# appends the EOS token, and tokenizes the resulting text with the tokenizer, truncating
# to `max_seq_len` (see the tokenizer call inside `format_prompts` for the padding
# setting). The formatted texts, input IDs and attention masks are returned as a
# dictionary. Finally, `map` applies the function to the training dataset
# (`dataset_train`) in batches and removes the original columns.


# Alpaca-style prompt template with two positional slots: the first `{}` is the text
# placed under "### Input:", the second `{}` is the text placed under "### Response:".
alpaca_prompt = """Below is an instruction outlining a task, followed by an input containing additional context. Provide a response that completes the task accordingly.

### Instruction:
Determine whether the sentiment of the following sentence is positive or negative.

### Input:
{}

### Response:
{}"""

# The tokenizer's end-of-sequence token string; format_prompts appends it to every
# training example.
EOS_TOKEN = tokenizer.eos_token

def format_prompts(examples):
    """Render a batch of training rows as prompts and tokenize them.

    Args:
        examples: Batch mapping, as passed by ``Dataset.map(..., batched=True)``,
            holding parallel ``"sentence"`` and ``"label"`` lists.

    Returns:
        dict of three parallel lists:
            ``"input_ids"``: token ids per example, truncated to ``max_seq_len``.
            ``"attention_mask"``: attention mask per example, same lengths.
            ``"texts"``: the formatted prompt strings, each ending in ``EOS_TOKEN``.
    """
    formatted_texts = [
        alpaca_prompt.format(sentence, label) + EOS_TOKEN
        for sentence, label in zip(examples["sentence"], examples["label"])
    ]

    # Truncate only; do not pad here. Padding every row to max_seq_len up front makes
    # each stored example (and each training batch) max_seq_len tokens long no matter
    # how short the sentence is. Keeping the natural length lets each batch be padded
    # only to its own longest member.
    # NOTE(review): this relies on the trainer's data collator padding each batch
    # dynamically — confirm for the installed TRL/transformers versions.
    tokenized_output = tokenizer(
        formatted_texts,
        padding=False,
        truncation=True,
        max_length=max_seq_len,
    )

    return {
        "input_ids": [list(ids) for ids in tokenized_output["input_ids"]],
        "attention_mask": [list(mask) for mask in tokenized_output["attention_mask"]],
        "texts": formatted_texts,
    }

# Apply format_prompts to the training set in batches and drop the original CSV
# columns, so only the columns returned by format_prompts remain.
dataset = dataset_train.map(format_prompts, batched=True, remove_columns=dataset_train.column_names)

In [None]:
# This code sets up and configures a SFTTrainer from the trl library to fine-tune a model using a training dataset. The SFTTrainer is initialized with the model, tokenizer, and training dataset, specifying the input field (texts) and various hyperparameters. The TrainingArguments define key training configurations, including batch size, gradient accumulation steps, learning rate, and warmup steps. It also sets the precision mode (using fp16 or bf16 based on GPU capabilities), optimizer type (AdamW with 8-bit precision), and the learning rate scheduler type (linear decay). The training process will run for 20 steps, and logging is enabled at every step. Additionally, the training is configured for reproducibility with a fixed random seed and no external reporting. This setup enables efficient model fine-tuning on the given dataset.

from trl import SFTTrainer
from transformers import TrainingArguments

# Probe the GPU once: train in bf16 where it is supported, otherwise fall back to fp16.
bf16_available = torch.cuda.is_bf16_supported()

# Optimisation settings: 2 sequences per device x 4 accumulation steps per optimizer
# update, 20 optimizer steps in total with 5 warmup steps and linear decay, 8-bit AdamW,
# a log line at every step, a fixed seed, and no external experiment reporting.
training_args = TrainingArguments(
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    max_steps = 20,
    learning_rate = 2e-4,
    fp16 = not bf16_available,
    bf16 = bf16_available,
    logging_steps = 1,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 1024,
    report_to = "none",
)

# Supervised fine-tuning trainer over the formatted dataset; "texts" is the column
# holding the prompt strings, and packing is disabled.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "texts",
    max_seq_length = max_seq_len,
    dataset_num_proc = 2,
    packing = False,
    args = training_args,
)


In [None]:
# Training the model.
# `result` keeps whatever trainer.train() returns; no later visible cell reads it.

result = trainer.train()

In [None]:
# Reading the test file for the final time.
# The inference cell iterates over this DataFrame's 'sentence' column, and the
# submission cell derives its IDs from this DataFrame's index.

test_df = pd.read_csv('/kaggle/input/multi-lingual-sentiment-analysis/test.csv')
# Preview the first five rows.
test_df.head(5)

In [None]:
# Run sentiment inference on the test set (test_df) with the fine-tuned model.
#
# Each sentence is rendered with `alpaca_prompt` exactly the way the training examples
# were built: the raw sentence goes in the "### Input:" slot and the "### Response:"
# slot is left empty. The model was tuned on that prompt shape, so the sentence is not
# wrapped in any extra instruction text here.
#
# The model generates up to 200 new tokens. Only the tokens produced after the prompt
# are decoded, so the prediction is the model's completion itself rather than a
# substring recovered by splitting the echoed prompt on "### Response:" (which breaks
# if the sentence or the completion contains that marker). The cleaned completion is
# appended to `predictions`, one entry per test row, in row order.

import pandas as pd
FastLanguageModel.for_inference(model)

predictions = []

for sentence in test_df['sentence']:
    # Same slot usage as the training examples: sentence as input, blank response.
    prompt = alpaca_prompt.format(sentence, "")

    inputs = tokenizer(
        [prompt],
        return_tensors="pt",
        truncation=True,
        padding=True
    ).to("cuda")

    # NOTE(review): 200 new tokens is generous for a one-word label; generation stops
    # earlier when the model emits its end-of-sequence token.
    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=200,
        use_cache=True,
    )

    # The returned sequence starts with the prompt tokens; keep only what follows them.
    prompt_length = inputs.input_ids.shape[1]
    completion_ids = outputs[:, prompt_length:]
    generated_output = tokenizer.batch_decode(completion_ids, skip_special_tokens=True)[0]

    # Defensive cleanup in case the end-of-turn marker survives decoding as plain text.
    sentiment = generated_output.replace("<|eot_id|>", "").strip()
    predictions.append(sentiment)

In [None]:
# Accumulating the final results into the dataframe.

# One row per test sentence: the ID is the 1-based row position in test_df and the
# label is the decoded prediction for that row.
# NOTE(review): IDs are derived from the DataFrame index rather than read from a column
# of test.csv — confirm this matches the IDs expected by sample_submission.csv.
submission_ids = test_df.index + 1
submission_data = pd.DataFrame({"ID": submission_ids, "label": predictions})

# Write the submission to the Kaggle working directory without the pandas index column.
submission_path = "/kaggle/working/submission.csv"
submission_data.to_csv(submission_path, index=False)

print("The submission file has been successfully saved as submission.csv")

In [None]:
# Preview the first ten rows of the submission as a sanity check.
submission_data.head(10)