In [6]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

# Authenticate with the Hugging Face Hub. The access token is read from the
# Kaggle secret stored under the key "hf", so it never appears in the notebook.
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("hf")
login(token=secret_value_0)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, TrainingArguments, AutoModelForSequenceClassification

# Read the raw training table and discard rows that have no `text_en` value.
data = pd.read_csv("/kaggle/input/orientationtr/orientation-tr-train.tsv", sep="\t")
data = data.dropna(subset=["text_en"])

# Hold out 10% of the rows for validation; stratifying on `label` keeps the
# class proportions the same in both parts, and the fixed seed makes the
# split repeatable.
train_data, val_data = train_test_split(
    data,
    test_size=0.1,
    stratify=data["label"],
    random_state=42,
)

# Write both splits to the working directory so later cells can reload them.
for split_frame, split_path in (
    (train_data, "train_data.tsv"),
    (val_data, "val_data.tsv"),
):
    split_frame.to_csv(split_path, sep="\t", index=False)

# Class balance over the full (pre-split) table: absolute counts first, then
# each class's share of all rows.
label_counts = data["label"].value_counts()
total_samples = label_counts.sum()
imbalance_ratios = label_counts / total_samples

print("Class Distribution:", label_counts, sep="\n")
print("\nClass Imbalance Ratios:", imbalance_ratios, sep="\n")


Class Distribution:
label
1    9390
0    6748
Name: count, dtype: int64

Class Imbalance Ratios:
label
1    0.581856
0    0.418144
Name: count, dtype: float64


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, TrainingArguments, AutoModelForSequenceClassification


# Reload the train/validation splits from the TSV files in the working
# directory (train first, then validation).
train_data, val_data = (
    pd.read_csv(split_path, sep="\t")
    for split_path in ("train_data.tsv", "val_data.tsv")
)
print("Datasets loaded successfully.")

# Tokenizer matching the checkpoint that is fine-tuned further down.
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

def tokenize_data(texts, labels):
    """Tokenize a collection of texts and pair the result with its labels.

    Args:
        texts: Iterable of strings; it is materialised into a list before
            being handed to the module-level ``tokenizer``.
        labels: Labels belonging to ``texts``; returned untouched.

    Returns:
        A ``(encodings, labels)`` tuple, where ``encodings`` is whatever the
        tokenizer returns when called with padding and truncation enabled and
        PyTorch tensors requested.
    """
    encodings = tokenizer(
        list(texts),
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    return encodings, labels

# Tokenize the `text_en` column of each split; labels are carried alongside.
train_tokens, train_labels = tokenize_data(train_data["text_en"], train_data["label"])
val_tokens, val_labels = tokenize_data(val_data["text_en"], val_data["label"])

# Same checkpoint as the tokenizer, with a two-class sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=2,
    torch_dtype="auto",
)



Datasets loaded successfully.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
import numpy as np
import evaluate

# Accuracy scorer from the `evaluate` library; loaded once here and reused by
# `compute_metrics` on every evaluation pass.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max class of each
        logits row compared against ``labels``.
    """
    raw_scores, references = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_classes = np.argmax(raw_scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

from transformers import TrainingArguments, Trainer


In [12]:

from datasets import Dataset

def _build_dataset(tokens, labels):
    """Bundle tokenizer output and its labels into a `datasets.Dataset`."""
    return Dataset.from_dict({
        "input_ids": tokens["input_ids"],
        "attention_mask": tokens["attention_mask"],
        "labels": labels,
    })


# One dataset per split, in the column layout the Trainer is given below.
train_dataset = _build_dataset(train_tokens, train_labels)
eval_dataset = _build_dataset(val_tokens, val_labels)

# Fine-tuning configuration.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./orientation-tr",
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",
    # Optimisation schedule.
    num_train_epochs=4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    # Evaluate and checkpoint on the same per-epoch cadence.
    # NOTE(review): no `metric_for_best_model` is set, so the "best" checkpoint
    # is picked by the library default rather than by accuracy — confirm that
    # this is the intended selection criterion.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
# Wire the model, the configuration above, both datasets and the accuracy
# callback into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)

# Run fine-tuning; kept as the last expression so the cell displays the
# returned training summary.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,0.4134,0.361027,0.83767
2,0.3204,0.305267,0.870508
3,0.2207,0.31669,0.86741
4,0.1288,0.36062,0.871128




TrainOutput(global_step=908, training_loss=0.29103502670573766, metrics={'train_runtime': 3365.2687, 'train_samples_per_second': 17.263, 'train_steps_per_second': 0.27, 'total_flos': 1.528569987219456e+16, 'train_loss': 0.29103502670573766, 'epoch': 4.0})

In [None]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

# Same authentication steps as the first cell of the notebook: fetch the "hf"
# secret from Kaggle and log in to the Hugging Face Hub with it.
# NOTE(review): presumably repeated so the Llama section can be run on its own
# after a kernel restart — confirm, otherwise this cell is redundant.
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("hf")
login(token=secret_value_0)

In [14]:
import pandas as pd
from transformers import pipeline

# Checkpoint used for the zero-shot experiment in the cells that follow.
model_id = "meta-llama/Llama-3.2-1B"

# NOTE(review): the load log of this cell reports that `score.weight` was
# newly initialized and that the 'entailment' label id could not be determined
# from the model config. The zero-shot-classification pipeline ranks labels
# with the model's classification head, so with this checkpoint the ranking
# comes from an untrained head. The accuracies recorded further down
# (0.4232 / 0.4504) are below the 0.58 majority-class share, which is
# consistent with that. Consider a checkpoint trained for NLI, or a
# prompt-based scoring scheme, before drawing conclusions from these numbers.
classifier = pipeline("zero-shot-classification", model=model_id, device_map="auto")

# Labels the pipeline chooses between for every text.
candidate_labels = ["left wing", "right wing"]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.


In [15]:

from sklearn.metrics import accuracy_score

# Label encoding used for scoring: 0 <-> "left wing", 1 <-> "right wing".
# NOTE(review): this is the encoding the cell's mappings already used (an
# older comment here stated the opposite); confirm it against the dataset's
# own label documentation.
label_mapping = {"left wing": 0, "right wing": 1}
id_to_label = {label_id: name for name, label_id in label_mapping.items()}

# Zero-shot prediction for every validation row, using the `text` column.
predictions = []
for text in val_data["text"]:
    result = classifier(text, candidate_labels=candidate_labels)
    # Take the first entry of `labels` as the prediction. Index it
    # unconditionally: skipping a row would leave `predictions` shorter than
    # the label column and misalign the comparison below, so a malformed
    # result should fail right here instead.
    predictions.append(result["labels"][0])

# String form of the gold labels, kept for inspection; scoring below compares
# the integer ids directly.
true_labels = val_data["label"].map(id_to_label)

# Convert the predicted label names back to 0/1 ids.
predictions_mapped = [label_mapping[label] for label in predictions]

accuracy = accuracy_score(val_data["label"], predictions_mapped)
print(f"Accuracy: {accuracy:.4f}")



Tokenizer was not supporting padding necessary for zero-shot, attempting to use  `pad_token=eos_token`
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Accuracy: 0.4232


In [18]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score

# Label encoding used for scoring: 0 <-> "left wing", 1 <-> "right wing".
# NOTE(review): confirm this encoding against the dataset's label documentation.
label_mapping = {"left wing": 0, "right wing": 1}
id_to_label = {label_id: name for name, label_id in label_mapping.items()}

# Zero-shot prediction for every validation row, this time on the `text_en`
# column, with a progress bar because the loop runs one text at a time.
predictions = []
for text in tqdm(val_data["text_en"], desc="Processing texts"):
    result = classifier(text, candidate_labels=candidate_labels)
    # Index unconditionally so that every row yields exactly one prediction;
    # silently skipping a row would misalign predictions and labels below.
    predictions.append(result["labels"][0])

# String form of the gold labels, kept for inspection; scoring below compares
# the integer ids directly.
true_labels = val_data["label"].map(id_to_label)

# Convert the predicted label names back to 0/1 ids.
predictions_mapped = [label_mapping[label] for label in predictions]

accuracy = accuracy_score(val_data["label"], predictions_mapped)
print(f"Accuracy: {accuracy:.4f}")


Processing texts: 100%|██████████| 1614/1614 [30:10<00:00,  1.12s/it]

Accuracy: 0.4504



