In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/multi-lingual-sentiment-analysis/sample_submission.csv
/kaggle/input/multi-lingual-sentiment-analysis/train.csv
/kaggle/input/multi-lingual-sentiment-analysis/test.csv
/kaggle/input/llama-3.1/transformers/8b-instruct/2/model.safetensors.index.json
/kaggle/input/llama-3.1/transformers/8b-instruct/2/model-00003-of-00004.safetensors
/kaggle/input/llama-3.1/transformers/8b-instruct/2/config.json
/kaggle/input/llama-3.1/transformers/8b-instruct/2/LICENSE
/kaggle/input/llama-3.1/transformers/8b-instruct/2/model-00001-of-00004.safetensors
/kaggle/input/llama-3.1/transformers/8b-instruct/2/README.md
/kaggle/input/llama-3.1/transformers/8b-instruct/2/USE_POLICY.md
/kaggle/input/llama-3.1/transformers/8b-instruct/2/tokenizer.json
/kaggle/input/llama-3.1/transformers/8b-instruct/2/tokenizer_config.json
/kaggle/input/llama-3.1/transformers/8b-instruct/2/model-00004-of-00004.safetensors
/kaggle/input/llama-3.1/transformers/8b-instruct/2/special_tokens_map.json
/kaggle/input/llama-3.1/

In [2]:
%%capture
# Environment setup (cell output is suppressed by %%capture above):
#   1. install the pip-autoremove helper,
#   2. use it to remove any existing torch / torchvision / torchaudio,
#   3. reinstall them together with xformers from the PyTorch cu121 wheel index,
#   4. install unsloth.
# NOTE(review): none of these installs pin a version, so a fresh run may resolve
# different releases than the ones shown in the recorded outputs of this notebook.
!pip install pip3-autoremove
!pip-autoremove torch torchvision torchaudio -y
!pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu121
!pip install unsloth

In [3]:
from unsloth import FastLanguageModel
import torch
# Sequence-length cap: passed to the model loader here and reused later for
# tokenization and for the trainer.
max_seq_len = 1024
# NOTE(review): None presumably lets Unsloth choose the compute dtype — confirm.
dtype = None 
# Forwarded to from_pretrained as `load_in_4bit`.
load_in_4bit = True 

# 4-bit pre-quantized checkpoints under the unsloth/ namespace (described by the
# original author as 4x faster to download and avoiding OOMs).
# NOTE(review): this list is never referenced in the visible cells; the model
# below is loaded from a local Kaggle input path instead.
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/llama-2-13b-bnb-4bit",
    "unsloth/codellama-34b-bnb-4bit",
    "unsloth/tinyllama-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
] 

# Load the Llama 3.1 8B Instruct checkpoint from the attached Kaggle input
# directory; the call returns the model together with its tokenizer.
base_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "/kaggle/input/llama-3.1/transformers/8b-instruct/2", 
    max_seq_length = max_seq_len,
    dtype = dtype,
    load_in_4bit = load_in_4bit
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.2.12: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

/kaggle/input/llama-3.1/transformers/8b-instruct/2 does not have a padding token! Will use pad_token = <|finetune_right_pad_id|>.


In [4]:
# LoRA adapter settings, collected in one place so they can be read (and tuned)
# separately from the call that applies them.
lora_settings = dict(
    r = 16,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth", # 4x longer contexts auto supported!
    random_state = 1024,
    use_rslora = False,  # Supports rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

# Attach the adapters to the loaded base model; `model` is what gets trained
# and later used for generation.
model = FastLanguageModel.get_peft_model(base_model, **lora_settings)

Unsloth 2025.2.12 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [5]:
from datasets import load_dataset

# Read the competition CSV files through the `datasets` CSV loader.
csv_paths = {
    "train": "/kaggle/input/multi-lingual-sentiment-analysis/train.csv",
    "test": "/kaggle/input/multi-lingual-sentiment-analysis/test.csv",
}
dataset_train = load_dataset('csv', data_files=csv_paths["train"], split='train')
# NOTE(review): unlike dataset_train, no split is requested here, and
# dataset_test is not used by any of the visible cells (the test set is read
# again with pandas before inference).
dataset_test = load_dataset('csv', data_files=csv_paths["test"])

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [6]:
# Alpaca-style prompt template with two positional `{}` slots: the first is
# filled with the sentence (under "### Input:"), the second with the label
# (under "### Response:"); at inference the second slot is filled with "".
alpaca_prompt = """Below is an instruction outlining a task, followed by an input containing additional context. Provide a response that completes the task accordingly.

### Instruction:
Determine whether the sentiment of the following sentence is positive or negative.

### Input:
{}

### Response:
{}"""

# The tokenizer's end-of-sequence token string; format_prompts appends it to
# every formatted training example.
EOS_TOKEN = tokenizer.eos_token

def format_prompts(examples):
    """Render a batch of rows as Alpaca-style training prompts and tokenize them.

    Intended for `Dataset.map(..., batched=True)`.

    Args:
        examples: Batch mapping with parallel "sentence" and "label" lists.

    Returns:
        dict with three parallel lists:
            "input_ids"      - token ids per example (variable length),
            "attention_mask" - matching attention masks,
            "texts"          - the formatted prompt strings, each ending in EOS_TOKEN.
    """
    formatted_texts = [
        alpaca_prompt.format(sentence, label) + EOS_TOKEN
        for sentence, label in zip(examples["sentence"], examples["label"])
    ]

    # Truncate to the context limit but do NOT pad here. Padding every row to
    # max_seq_len at map time stores (and trains on) max_seq_len tokens per
    # example even though the prompts are short; leaving rows at their natural
    # length lets the trainer's collator pad each batch only as far as needed.
    # Without `return_tensors` the tokenizer returns plain Python lists, which
    # is what `Dataset.map` stores for variable-length rows.
    tokenized_output = tokenizer(
        formatted_texts,
        truncation=True,
        max_length=max_seq_len,
    )

    return {
        "input_ids": tokenized_output["input_ids"],
        "attention_mask": tokenized_output["attention_mask"],
        "texts": formatted_texts,
    }

# Run format_prompts over the training set in batches. `remove_columns` drops
# all original CSV columns, so the resulting dataset holds only the keys that
# format_prompts returns ("input_ids", "attention_mask", "texts").
dataset = dataset_train.map(format_prompts, batched=True, remove_columns=dataset_train.column_names)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [7]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Choose the mixed-precision mode once: bf16 when the GPU reports support for
# it, fp16 otherwise.
use_bf16 = torch.cuda.is_bf16_supported()

# Optimisation hyper-parameters for a short run (20 optimizer steps).
training_args = TrainingArguments(
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,  # 2 x 4 = 8 examples per optimizer step
    warmup_steps = 5,
    max_steps = 20,
    learning_rate = 2e-4,
    fp16 = not use_bf16,
    bf16 = use_bf16,
    logging_steps = 1,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 1024,
    output_dir = "outputs",
    report_to = "none",
)

# Supervised fine-tuning trainer over the formatted prompts in the "texts" column.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "texts",
    max_seq_length = max_seq_len,
    dataset_num_proc = 2,
    packing = False, # sequence packing is left disabled for this run
    args = training_args,
)


Applying chat template to train dataset (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [8]:
# Run fine-tuning with the trainer configured above; the value returned by
# train() is kept in `result`.
result = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 20
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,1.2651
2,2.1336
3,1.5815
4,1.2039
5,1.2002
6,1.5373
7,1.055
8,0.7344
9,0.9473
10,0.8867


In [9]:
# Read the competition test set with pandas and preview its first rows; the
# inference loop iterates over the "sentence" column of this frame.
test_csv_path = '/kaggle/input/multi-lingual-sentiment-analysis/test.csv'
test_df = pd.read_csv(test_csv_path)
test_df.head()

Unnamed: 0,ID,sentence,language
0,1,"1120 mAh, ਓਵਰਚਾਰਜਿੰਗ ਦੀ ਸੁਰੱਖਿਆ",pa
1,2,તે સઘન મોઇશ્ચરાઇઝિંગ પ્રદાન કરે છે અને સરસ સ્વ...,gu
2,3,"1120 ಎಂಎಎಚ್, ಮಿತಿಮೀರಿದ ರಕ್ಷಣೆ",kn
3,4,ভাৰতত নিৰ্মিত সৰ্বশ্ৰেষ্ঠ পাৰফিউম ব্ৰেণ্ডবোৰৰ ...,as
4,5,"میں نے حال ہی میں ""انفولڈ"" سے ایک ٹیمپلیٹ خرید...",ur


In [10]:
import pandas as pd
FastLanguageModel.for_inference(model) 

# The expected answer is a one-word label, so a handful of new tokens is
# enough; a small cap also keeps run-on text out of the submission if the model
# fails to emit EOS.
MAX_LABEL_TOKENS = 8


def predict_sentiment(sentence):
    """Generate the sentiment label for a single sentence.

    Args:
        sentence: Raw text to classify.

    Returns:
        The first line of the model's completion, stripped of whitespace
        (an empty string if nothing was generated).
    """
    # Build the prompt exactly as in training: the raw sentence goes into the
    # Input slot and the Response slot is left empty for the model to fill.
    # Wrapping the sentence in extra instructions here would give the model a
    # prompt layout it was not fine-tuned on.
    prompt = alpaca_prompt.format(sentence, "")

    inputs = tokenizer(
        [prompt],
        return_tensors="pt",
        truncation=True,
        max_length=max_seq_len,  # same cap used when tokenizing training data
    ).to("cuda")

    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=MAX_LABEL_TOKENS,
        use_cache=True,
    )

    # generate() returns prompt + completion for decoder-only models. Decode
    # only the newly generated tokens instead of splitting the full text on
    # "### Response:", which would break if the sentence contained that marker.
    prompt_length = inputs.input_ids.shape[1]
    completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Clean up unwanted tokens like <|eot_id|> if present
    completion = completion.replace("<|eot_id|>", "").strip()

    # Keep only the first line so any trailing text does not leak into the label.
    return completion.splitlines()[0].strip() if completion else ""


# One prediction per test sentence, in test_df row order.
predictions = [predict_sentiment(sentence) for sentence in test_df['sentence']]

In [11]:
# Every test row must have exactly one prediction; fail with a clear message
# instead of writing a misaligned submission.
assert len(predictions) == len(test_df), (
    f"expected {len(test_df)} predictions, got {len(predictions)}"
)

# Build the submission from the test file's own ID column rather than deriving
# IDs from the row index, so the file stays correct even if the IDs are not
# exactly 1..N in row order.
submission_data = pd.DataFrame({
    "ID": test_df["ID"].to_numpy(),
    "label": predictions,  # The sentiment results generated earlier
})

# Save the DataFrame as a CSV file
submission_data.to_csv("/kaggle/working/submission.csv", index=False)

print("The submission file has been successfully saved as submission.csv")

The submission file has been successfully saved as submission.csv


In [12]:
# Preview the first ten rows of the submission as a quick sanity check.
submission_data.head(10)

Unnamed: 0,ID,label
0,1,Negative
1,2,Positive
2,3,Positive
3,4,Positive
4,5,Negative
5,6,Negative
6,7,Positive
7,8,Negative
8,9,Negative
9,10,Positive
