In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/llama-3.1/transformers/8b-instruct/2/model.safetensors.index.json
/kaggle/input/llama-3.1/transformers/8b-instruct/2/model-00003-of-00004.safetensors
/kaggle/input/llama-3.1/transformers/8b-instruct/2/config.json
/kaggle/input/llama-3.1/transformers/8b-instruct/2/LICENSE
/kaggle/input/llama-3.1/transformers/8b-instruct/2/model-00001-of-00004.safetensors
/kaggle/input/llama-3.1/transformers/8b-instruct/2/README.md
/kaggle/input/llama-3.1/transformers/8b-instruct/2/USE_POLICY.md
/kaggle/input/llama-3.1/transformers/8b-instruct/2/tokenizer.json
/kaggle/input/llama-3.1/transformers/8b-instruct/2/tokenizer_config.json
/kaggle/input/llama-3.1/transformers/8b-instruct/2/model-00004-of-00004.safetensors
/kaggle/input/llama-3.1/transformers/8b-instruct/2/special_tokens_map.json
/kaggle/input/llama-3.1/transformers/8b-instruct/2/.gitattributes
/kaggle/input/llama-3.1/transformers/8b-instruct/2/model-00002-of-00004.safetensors
/kaggle/input/llama-3.1/transformers/8b-instruct/2/gener

In [2]:
!pip install transformers accelerate bitsandbytes torch peft datasets

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Downloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl (69.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 MB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.2


In [3]:
import pandas as pd
from datasets import Dataset

# Read the CSV data into a DataFrame
df_data = pd.read_csv("/kaggle/input/multi-lingual-sentiment-analysis/train.csv")

# Map sentiment labels to textual values
df_data["sentiment"] = df_data["label"].map({0: "Negative", 1: "Positive"})

# Convert the DataFrame to a Hugging Face Dataset and split it (80% train, 20% test)
hf_data = Dataset.from_pandas(df_data)
hf_data = hf_data.train_test_split(test_size=0.2)
print(hf_data)


import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Define the model directory
model_dir = "/kaggle/input/llama-3.1/transformers/8b-instruct/2"

# Configure 4-bit quantization parameters for efficient loading
quant_params = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# Initialize the tokenizer from the model directory
model_tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Load the causal language model with quantization and automatic GPU assignment
base_model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    quantization_config=quant_params,
    device_map="auto"
)


from peft import LoraConfig, get_peft_model

# Set up LoRA hyperparameters for fine-tuning
lora_params = LoraConfig(
    r=8,                   # LoRA rank
    lora_alpha=32,         # Scaling coefficient
    target_modules=["q_proj", "v_proj"],  # Fine-tune specific attention layers
    lora_dropout=0.05,     # Dropout probability to avoid overfitting
    bias="none",
    task_type="CAUSAL_LM"
)

# Integrate LoRA into the base model for efficient training
finetune_model = get_peft_model(base_model, lora_params)
finetune_model.print_trainable_parameters()

# Ensure the tokenizer uses the EOS token for padding
model_tokenizer.pad_token = model_tokenizer.eos_token

# --- New Data Preprocessing for Instruction Tuning ---
def create_prompt(example):
    # Create the answer string from the sentiment
    answer_str = example["sentiment"]
    
    # Construct the prompt with both the sentence and the answer
    prompt = f"""Classify the sentiment of the following sentence as either 'Positive' or 'Negative'.

Sentence: {example['sentence']}

Answer: {answer_str}"""
    
    # Tokenize the prompt
    tokenized = model_tokenizer(
        prompt,
        truncation=True,
        padding="max_length",
        max_length=256  # Increase max_length if necessary
    )
    # For causal LM training, the labels are the same as the input tokens.
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

# Apply the prompt creation to all examples in the dataset
tokenized_data = hf_data.map(create_prompt, batched=False)
# Optionally, remove the now-unneeded original columns
tokenized_data = tokenized_data.remove_columns(hf_data["train"].column_names)

# Check one example to ensure everything is aligned
print(tokenized_data["train"][0])


from transformers import TrainingArguments, Trainer

# Set up training configuration parameters with wandb disabled
train_config = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=4,  # Adjust for available memory
    per_device_eval_batch_size=4,
    learning_rate=5e-5,
    num_train_epochs=3,             # Total training epochs
    weight_decay=0.01,
    logging_dir="./logs",
    push_to_hub=False,
    fp16=True,                    # Use mixed precision training
    report_to=[]                  # Disable wandb (and other external loggers)
)

# Initialize the Trainer object with model, training args, and datasets
trainer_instance = Trainer(
    model=finetune_model,
    args=train_config,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
)

# Execute training
trainer_instance.train()

# Save the fine-tuned model and corresponding tokenizer
finetune_model.save_pretrained("llama-finetuned")
model_tokenizer.save_pretrained("llama-finetuned")


# Function for inference using a prompt-based approach
def classify_sentiment(text_input):
    prompt_template = f"""
Classify the sentiment of the following sentence as either 'Positive' or 'Negative'.

Examples:
1. "I love this product! It's amazing." → Positive
2. "The service was terrible, I hate it." → Negative
3. "Absolutely wonderful experience, highly recommend!" → Positive
4. "I regret buying this, worst decision ever." → Negative

Now, evaluate this sentence and answer with a single word:

"{text_input}"

Answer:""".strip()

    # Tokenize the prompt and send it to the GPU
    input_tokens = model_tokenizer(
        prompt_template, 
        return_tensors="pt", 
        truncation=True, 
        padding=True
    ).to("cuda")

    # Generate the model output with tuned generation parameters
    with torch.no_grad():
        gen_output = finetune_model.generate(
            **input_tokens, 
            max_new_tokens=10,       # Allow more tokens to ensure complete answer
            temperature=0.0,         # Deterministic output
            top_p=1.0,               # No nucleus sampling
            do_sample=False,         # Disable sampling for consistent output
            pad_token_id=model_tokenizer.eos_token_id
        )

    # Decode the generated tokens into text
    decoded_output = model_tokenizer.decode(gen_output[0], skip_special_tokens=True).strip().lower()

    # Check for sentiment keywords in the output and return the corresponding label
    if "positive" in decoded_output:
        return "Positive"
    elif "negative" in decoded_output:
        return "Negative"
    else:
        return "Unknown"


# Example usage
sample_text = "I absolutely love this experience!"
print(classify_sentiment(sample_text))


DatasetDict({
    train: Dataset({
        features: ['ID', 'sentence', 'label', 'language', 'sentiment'],
        num_rows: 800
    })
    test: Dataset({
        features: ['ID', 'sentence', 'label', 'language', 'sentiment'],
        num_rows: 200
    })
})


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

trainable params: 3,407,872 || all params: 8,033,669,120 || trainable%: 0.0424


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

{'input_ids': [128000, 1999, 1463, 279, 27065, 315, 279, 2768, 11914, 439, 3060, 364, 36590, 6, 477, 364, 39589, 30736, 85664, 25, 100443, 107409, 92911, 101263, 100549, 100914, 109213, 11, 48909, 100907, 320, 100411, 107409, 92911, 101263, 84736, 86133, 92911, 101273, 48909, 100907, 11, 100276, 101385, 102650, 107300, 100443, 107409, 92911, 101263, 110374, 101242, 118744, 100556, 55675, 101002, 100273, 100431, 100358, 110374, 101242, 118744, 100556, 55675, 101002, 86133, 100924, 92911, 48909, 100907, 100549, 100276, 100273, 101002, 86133, 100537, 100855, 24810, 100443, 100731, 100471, 8, 100358, 84736, 101755, 100431, 86133, 102067, 100358, 100320, 126701, 86133, 102232, 69258, 100400, 100361, 100299, 106315, 48909, 44747, 117569, 100303, 100271, 100346, 102359, 100391, 48909, 44747, 85410, 101130, 48909, 113677, 44747, 100277, 100273, 48909, 35470, 100331, 120411, 100273, 44747, 102875, 102223, 85410, 44747, 92317, 117189, 79468, 100273, 100358, 84736, 100664, 101993, 45279, 85410, 1



Epoch,Training Loss,Validation Loss
1,No log,0.630221
2,No log,0.615139
3,0.768300,0.611725




Positive


In [4]:
import pandas as pd

# Load the test data from CSV
test_data = pd.read_csv("/kaggle/input/multi-lingual-sentiment-analysis/test.csv")

# Display the first few rows to verify the structure
print("Initial Test Data Preview:")
print(test_data.head())

# Apply the inference function to each sentence.
# (Ensure that the function 'classify_sentiment' is defined in your environment.)
test_data["predicted_label"] = test_data["sentence"].apply(classify_sentiment)

# Prepare the submission DataFrame with both the 'ID' and the predicted label columns.
submission_df = test_data[["ID", "predicted_label"]]

# Show a preview of the submission DataFrame
print("Submission Preview:")
print(submission_df.head())

# Export the submission file to CSV with headers (or without, depending on competition requirements)
submission_df.to_csv("submission.csv", index=False)


Initial Test Data Preview:
   ID                                           sentence language
0   1                    1120 mAh, ਓਵਰਚਾਰਜਿੰਗ ਦੀ ਸੁਰੱਖਿਆ       pa
1   2  તે સઘન મોઇશ્ચરાઇઝિંગ પ્રદાન કરે છે અને સરસ સ્વ...       gu
2   3                      1120 ಎಂಎಎಚ್, ಮಿತಿಮೀರಿದ ರಕ್ಷಣೆ       kn
3   4  ভাৰতত নিৰ্মিত সৰ্বশ্ৰেষ্ঠ পাৰফিউম ব্ৰেণ্ডবোৰৰ ...       as
4   5  میں نے حال ہی میں "انفولڈ" سے ایک ٹیمپلیٹ خرید...       ur




Submission Preview:
   ID predicted_label
0   1        Positive
1   2        Positive
2   3        Positive
3   4        Positive
4   5        Positive


In [5]:
def classify_sentiment_multilang(input_sentence):
    # Build a multi-lingual prompt with examples and explicit instructions.
    prompt_content = f"""
Determine whether the sentiment of the following sentence is 'Positive' or 'Negative'.
Please answer with a single word.

Examples:
1. "আমি এই পণ্যটি ভালোবাসি!" → Positive (Bengali)
2. "এই অভিজ্ঞতা খুব খারাপ ছিল।" → Negative (Bengali)

3. "હું આ પ્રોડક્ટથી ખુશ છું!" → Positive (Gujarati)
4. "આ સેવા ખરાબ છે." → Negative (Gujarati)

5. "இந்த சேவை அருமையாக இருந்தது." → Positive (Tamil)
6. "இந்த சேவை மோசமாக இருந்தது." → Negative (Tamil)

7. "ਇਹ ਉਤਪਾਦ ਸ਼ਾਨਦਾਰ ਹੈ!" → Positive (Punjabi)
8. "ਇਹ ਸੇਵਾ ਬਹੁਤ ਹੀ ਮਾੜੀ ਸੀ." → Negative (Punjabi)

9. "এই পরিষেবা চমৎকার ছিল।" → Positive (Bangladeshi Bengali)
10. "এই পরিষেবা খুব খারাপ ছিল।" → Negative (Bangladeshi Bengali)

11. "এটি আমার શ્રેષ્ઠ ખરીદી હતી!" → Positive (Assamese)
12. "এই অভিজ্ঞতা મારા માટે ખરાબ હતી." → Negative (Assamese)

13. "ఈ ఉత్పత్తి అద్భుతంగా ఉంది!" → Positive (Telugu)
14. "ఈ సేవ భయంకరంగా ఉంది." → Negative (Telugu)

15. "ଏହି ଉତ୍ପାଦ ଅତି ଭଲ।" → Positive (Odia)
16. "ଏହି ସେବା ଖୁବ୍ ଖରାପ ଥିଲା।" → Negative (Odia)

17. "ഈ സേവനം മികച്ചതാണ്!" → Positive (Malayalam)
18. "ഈ സേവനം ഭയാനകമായിരുന്നു." → Negative (Malayalam)

19. "यह मेरी सबसे अच्छी खरीद थी।" → Positive (Hindi)
20. "यह सेवा बहुत खराब थी।" → Negative (Hindi)

21. "ही सेवा चांगली होती." → Positive (Marathi)
22. "ही सेवा भयंकर होती." → Negative (Marathi)

23. "یہ بہترین پروڈکٹ ہے!" → Positive (Urdu)
24. "یہ سروس بہت خراب تھی۔" → Negative (Urdu)

25. "ಈ ಉತ್ಪನ್ನವು ಉತ್ತಮವಾಗಿದೆ!" → Positive (Kannada)
26. "ಈ ಸೇವೆ ಭಯಾನಕವಾಗಿತ್ತು." → Negative (Kannada)

Now, analyze this sentence:

"{input_sentence}"

Answer:""".strip()

    # Tokenize the prompt and move it to the GPU
    tokenized_input = tokenizer(prompt_content, return_tensors="pt", truncation=True, padding=True).to("cuda")

    # Generate a response with refined parameters for a deterministic output
    with torch.no_grad():
        generated_response = model.generate(
            **tokenized_input,
            max_new_tokens=10,        # Allow a bit more room for the answer
            temperature=0.0,          # Force a deterministic response
            top_p=1.0,                # Disable nucleus sampling
            do_sample=False,          # Ensure the most likely output is chosen
            pad_token_id=tokenizer.eos_token_id  # Ensure proper termination
        )

    # Decode and clean the generated response
    output_text = tokenizer.decode(generated_response[0], skip_special_tokens=True).strip().lower()

    # Determine and return the sentiment based on the output
    if "positive" in output_text:
        return "Positive"
    elif "negative" in output_text:
        return "Negative"
    else:
        return "Unknown"  # For unexpected outputs
