In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/llama-3.1/transformers/8b-instruct/2/model.safetensors.index.json
/kaggle/input/llama-3.1/transformers/8b-instruct/2/model-00003-of-00004.safetensors
/kaggle/input/llama-3.1/transformers/8b-instruct/2/config.json
/kaggle/input/llama-3.1/transformers/8b-instruct/2/LICENSE
/kaggle/input/llama-3.1/transformers/8b-instruct/2/model-00001-of-00004.safetensors
/kaggle/input/llama-3.1/transformers/8b-instruct/2/README.md
/kaggle/input/llama-3.1/transformers/8b-instruct/2/USE_POLICY.md
/kaggle/input/llama-3.1/transformers/8b-instruct/2/tokenizer.json
/kaggle/input/llama-3.1/transformers/8b-instruct/2/tokenizer_config.json
/kaggle/input/llama-3.1/transformers/8b-instruct/2/model-00004-of-00004.safetensors
/kaggle/input/llama-3.1/transformers/8b-instruct/2/special_tokens_map.json
/kaggle/input/llama-3.1/transformers/8b-instruct/2/.gitattributes
/kaggle/input/llama-3.1/transformers/8b-instruct/2/model-00002-of-00004.safetensors
/kaggle/input/llama-3.1/transformers/8b-instruct/2/gener

# Basic Downloads

In [2]:
%%capture
!pip install bitsandbytes
!pip install accelerate
!pip install peft
!pip install evaluate
!pip install --upgrade transformers

In [3]:
from datasets import load_from_disk, load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, PeftModel, get_peft_model
import torch
from rich import print as rprint

In [4]:
# Read the labelled training CSV; the result is a DatasetDict whose only
# split is named "train".
TRAIN_CSV = "/kaggle/input/multi-lingual-sentiment-analysis/train.csv"
dataset = load_dataset("csv", data_files=TRAIN_CSV)

dataset

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['ID', 'sentence', 'label', 'language'],
        num_rows: 1000
    })
})

In [5]:
from collections import Counter

# For every split in the DatasetDict, report how often each distinct value
# occurs in the 'label' and 'language' columns.
for split, ds in dataset.items():
    rprint(f"=== Unique value counts for {split} set ===\n")
    for column in ('label', 'language'):
        unique_counts = Counter(ds[column])  # value -> number of rows holding it
        rprint(f"Column: {column}, Unique values: {len(unique_counts)}")
        # All distinct values are printed with their counts (nothing is truncated).
        rprint(f"Sample values: {dict(unique_counts)}")
    print("\n")





In [6]:
# Display the DatasetDict summary (splits, column names, row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['ID', 'sentence', 'label', 'language'],
        num_rows: 1000
    })
})

In [7]:
def fix_labels(example):
    """Convert the sentiment in ``example["label"]`` to an integer class id.

    "Positive" becomes 1 and "Negative" becomes 0. A label that is already the
    integer 0 or 1 is left unchanged, so applying this function a second time
    to already-converted rows (e.g. when the mapping cell is re-run) does not
    turn every label into -1. Any other value is mapped to -1.

    Args:
        example: A mutable mapping for one dataset row; must contain "label".

    Returns:
        The same ``example`` object, with "label" replaced by the class id.
    """
    label_map = {"Positive": 1, "Negative": 0}  # Map labels to integers
    label = example["label"]
    # bool is a subclass of int; exclude it so True/False are not read as 1/0.
    already_converted = (
        isinstance(label, int)
        and not isinstance(label, bool)
        and label in label_map.values()
    )
    if not already_converted:
        example["label"] = label_map.get(label, -1)  # Assign -1 for unknown labels
    return example

# Replace the sentiment labels with integer class ids in every split.
dataset = dataset.map(fix_labels)

# fix_labels marks anything it does not recognise with -1, which is not one of
# the two class ids (0/1) the classifier below is configured for. Stop here
# with a clear message instead of carrying such rows into training.
for split_name, split_ds in dataset.items():
    unknown_rows = sum(1 for label in split_ds["label"] if label == -1)
    if unknown_rows:
        raise ValueError(
            f"{unknown_rows} row(s) in the '{split_name}' split have an "
            "unrecognised label (-1); reload the dataset or extend the label map."
        )


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

## Loading the model and configuring it.

In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForCausalLM
from transformers import DataCollatorWithPadding
from transformers import LlamaConfig, LlamaForCausalLM,LlamaForSequenceClassification
from transformers import TrainingArguments, Trainer

In [9]:
# # Quantization configuration
# model_path = "/kaggle/input/llama-3.1/transformers/8b-instruct/2"

# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=False,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16,
# )

# # Loading the model and tokenizer

# model = AutoModelForCausalLM.from_pretrained(model_path,quantization_config=bnb_config,
#                                              device_map="auto")
# tokenizer = AutoTokenizer.from_pretrained(
#     model_path,
#     model_max_length=1024,
#     padding_side="left",
#     add_eos_token=True)
# tokenizer.pad_token = tokenizer.eos_token

In [10]:
# Local Kaggle copy of the Llama 3.1 8B Instruct checkpoint.
model_id = "/kaggle/input/llama-3.1/transformers/8b-instruct/2"

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    model_max_length=1024,
)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [11]:
def tokenize(example):
    """Tokenize the "sentence" entry of a row or batch of rows.

    Sequences longer than 512 tokens are truncated. No padding is added here:
    padding at this stage would only pad each ``map`` batch to its own longest
    sequence, and the data collator pads every batch to its fixed length
    afterwards anyway, so padding twice just stores extra pad tokens.

    Args:
        example: A row or batch (as passed by ``Dataset.map``) with a
            "sentence" entry.

    Returns:
        The tokenizer output for the sentence(s).
    """
    return tokenizer(example["sentence"], truncation=True, max_length=512)

In [12]:
# Tokenize every split in batches across 4 worker processes; the raw text
# column is dropped from the result.
tokenized_ds = dataset.map(
    tokenize,
    remove_columns=['sentence'],
    batched=True,
    num_proc=4,
)
print(tokenized_ds)

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['ID', 'label', 'language', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})


In [13]:
# Hold out 7% of the tokenized rows as an evaluation set (fixed seed).
train_rows = tokenized_ds['train']
ds_split = train_rows.train_test_split(seed=50, test_size=0.07)
print(ds_split)

DatasetDict({
    train: Dataset({
        features: ['ID', 'label', 'language', 'input_ids', 'attention_mask'],
        num_rows: 930
    })
    test: Dataset({
        features: ['ID', 'label', 'language', 'input_ids', 'attention_mask'],
        num_rows: 70
    })
})


# Data Collator

In [14]:
# Pad every batch to a fixed length of 512 tokens.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding='max_length',
    max_length=512,
)

In [15]:
import torch, gc
# gc.collect()
# torch.cuda.empty_cache()

In [16]:
# 4-bit NF4 quantization settings with bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    llm_int8_enable_fp32_cpu_offload=True,
)

# Load the checkpoint as a 2-label sequence classifier; the padding id is the
# tokenizer's end-of-sequence token id.
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    num_labels=2,
    pad_token_id=tokenizer.eos_token_id,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/llama-3.1/transformers/8b-instruct/2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
import evaluate
# F1 metric object used by compute_metrics below.
metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max predictions
        against the reference labels with ``average="macro"``.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted = np.argmax(scores, axis=-1)
    f1_result = metric.compute(
        predictions=predicted,
        references=references,
        average="macro",
    )
    return f1_result

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

# Checking Models and Tokenizers

In [18]:
# Print the module tree of the loaded classifier to inspect its layers.
print(model)

LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096, padding_idx=128009)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
 

# Fine Tune with LoRA

In [19]:
#lora
from peft import LoraConfig, TaskType, LoraModel
# LoRA adapters of rank 16 on the attention query and value projections,
# configured for a sequence-classification task in training mode.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    target_modules=["q_proj", "v_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
)

In [20]:
from peft import get_peft_model
# Prepare the quantized model for training, then wrap it with the LoRA adapters.
# Note: `model` is rebound here, so this cell is meant to run once per kernel.
model = prepare_model_for_kbit_training(model)
lora_model = get_peft_model(model=model, peft_config=lora_config)

# Report how many parameters are trainable and show the wrapped module tree.
lora_model.print_trainable_parameters()
print(lora_model)

trainable params: 6,823,936 || all params: 7,511,756,800 || trainable%: 0.0908
PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096, padding_idx=128009)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_

In [21]:
# Display the adapter configuration actually attached to the wrapped model.
lora_model.peft_config

{'default': LoraConfig(task_type=<TaskType.SEQ_CLS: 'SEQ_CLS'>, peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='/kaggle/input/llama-3.1/transformers/8b-instruct/2', revision=None, inference_mode=False, r=16, target_modules={'v_proj', 'q_proj'}, exclude_modules=None, lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=['classifier', 'score'], init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, eva_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)}

In [22]:
# Hyper-parameters for the LoRA fine-tuning run, grouped by concern.
training_args = TrainingArguments(
    # Output locations and reporting.
    output_dir='lora_llama_8b_ct',
    logging_dir='logs',
    report_to='none',
    # Schedule and batching (per-device batch of 4, accumulated over 2 steps).
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    # Optimizer.
    # NOTE(review): adam_beta1=0.05 is far from the commonly used 0.9 —
    # confirm this value is intentional.
    learning_rate=1e-4,
    weight_decay=0.02,
    adam_beta1=0.05,
    adam_beta2=0.995,
    # Mixed precision: fp16 via the AMP backend; bf16 and tf32 are off.
    fp16=True,
    bf16=False,
    tf32=False,
    half_precision_backend='amp',
    # Log, evaluate and checkpoint every 50 steps; keep at most 20 checkpoints
    # and reload the best one when training finishes.
    logging_strategy="steps",
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=50,
    save_steps=50,
    save_total_limit=20,
    load_best_model_at_end=True,
    #use_reentrant=True
)

In [23]:
# Train the LoRA-wrapped model on the train split and evaluate on the held-out
# split, padding batches with the collator and scoring with compute_metrics.
trainer = Trainer(
    model=lora_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=ds_split["train"],
    eval_dataset=ds_split["test"],
    compute_metrics=compute_metrics,
)

In [24]:
# Run the fine-tuning loop; the value returned by train() is kept in `results`.
results = trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss,F1
50,1.341,0.695504,0.66811
100,0.4227,0.309561,0.914005
150,0.2227,0.340995,0.92844
200,0.2508,0.296722,0.928557


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


# Let's Evaluate

In [25]:
# Read the unlabelled test CSV. As with the training file, the rows end up
# under a split named "train".
TEST_CSV = "/kaggle/input/multi-lingual-sentiment-analysis/test.csv"
test_dataset = load_dataset("csv", data_files=TEST_CSV)

test_dataset

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['ID', 'sentence', 'language'],
        num_rows: 100
    })
})

In [26]:
# Load the sample submission and show its first row to see the expected
# output columns.
SAMPLE_SUBMISSION_CSV = "/kaggle/input/multi-lingual-sentiment-analysis/sample_submission.csv"
sample_sub = load_dataset("csv", data_files=SAMPLE_SUBMISSION_CSV)

sample_sub['train'][0]

Generating train split: 0 examples [00:00, ? examples/s]

{'ID': 1, 'label': 'Positive'}

# Inference

In [27]:
from transformers import TextClassificationPipeline
# Build a text-classification pipeline around the fine-tuned model.
# NOTE(review): this passes `model` rather than `lora_model` / `trainer.model`;
# presumably it relies on get_peft_model having attached the adapters to
# `model` itself — confirm.
classifier = TextClassificationPipeline(
    task="sentiment-analysis",
    framework='pt',
    model=model,
    tokenizer=tokenizer,
    #device = "cuda"
)

Device set to use cuda:0


In [28]:
# Human-readable names for the two class ids; the pipeline reports these as
# the predicted "label". The reverse mapping is set as well so the two
# mappings stored on the config agree with each other.
model.config.id2label = {0: "Negative", 1: "Positive"}
model.config.label2id = {"Negative": 0, "Positive": 1}

In [29]:
# Spot-check the pipeline on the first test sentence.
first_row = test_dataset['train'][0]
sample = first_row['sentence']
print(f"Sample: {sample}")

prediction = classifier(sample)
print(prediction)

Sample: 1120 mAh, ਓਵਰਚਾਰਜਿੰਗ ਦੀ ਸੁਰੱਖਿਆ
[{'label': 'Positive', 'score': 0.9206250905990601}]


In [30]:
# Extract text data and ID from test_dataset (DatasetDict format).
# The test rows live under the "train" split because the CSV was loaded
# without an explicit split name.
# list(...) hands plain Python lists to the pipeline and the DataFrame below.
# NOTE(review): newer `datasets` releases appear to return a lazy column object
# from ds["col"] instead of a list — confirm against the installed version.
test_texts = list(test_dataset["train"]["sentence"])
test_ids = list(test_dataset["train"]["ID"])  # Existing ID column

In [31]:
# Run inference using the pipeline, 32 sentences per batch. Inputs are
# truncated to 512 tokens, the same limit that was applied when the training
# data was tokenized, so long sentences are handled the same way in both places.
predictions = classifier(
    test_texts,
    batch_size=32,
    truncation=True,
    max_length=512,
)

# Convert pipeline output to labels
predicted_labels = [pred["label"] for pred in predictions]

In [32]:
# Pair each test ID with its predicted label and write the submission file
# (no index column).
submission_columns = {"ID": test_ids, "label": predicted_labels}
output_df = pd.DataFrame(submission_columns)
output_df.to_csv("submission.csv", index=False)

In [33]:
# Preview the first 10 rows of the submission table.
print(output_df.head(10))

   ID     label
0   1  Positive
1   2  Positive
2   3  Positive
3   4  Positive
4   5  Negative
5   6  Negative
6   7  Positive
7   8  Positive
8   9  Negative
9  10  Positive


In [34]:
# Spot-check the pipeline on the test sentence at index 11.
picked_row = test_dataset['train'][11]
sample = picked_row['sentence']
print(f"Sample: {sample}")

prediction = classifier(sample)
print(prediction)

Sample: పాత బాడీ షేమింగ్ జోక్‌లు, మమ్మల్ని నవ్వించడానికి చాలా కష్టపడతాయి. ట్రైలర్ చూస్తే సినిమా గురించి ఇప్పటికే 90% తెలిసిపోతుంది
[{'label': 'Negative', 'score': 0.9348655939102173}]
