In [None]:
# !pip install transformers datasets peft torch pandas numpy

In [1]:
from datasets import load_dataset
import pandas as pd
from datasets import Dataset, ClassLabel

In [2]:
# Path to the QA trade-lane CSV (located under /kaggle/input).
input_path = "/kaggle/input/qa-tradelane/qa_trade_lane_dataset.csv"

In [3]:
# Read the CSV at `input_path` into a DataFrame.
# The stored preview of this frame shows the columns "Unnamed: 0", "question" and "answer".
qa_df = pd.read_csv(input_path)

In [4]:
# Preview the first five rows to sanity-check the loaded columns.
qa_df.head(5)

Unnamed: 0,question,answer
0,Trade Lane Type: exporting. From Region: south...,188
1,Trade Lane Type: exporting. From Region: south...,188
2,Trade Lane Type: exporting. From Region: south...,188
3,Trade Lane Type: exporting. From Region: south...,188
4,Trade Lane Type: exporting. From Region: south...,188


In [5]:
# Convert the pandas DataFrame into a `datasets.Dataset` for the later cast/split/map steps.
dataset = Dataset.from_pandas(qa_df)

In [6]:
# Collect the distinct values found in the 'answer' column; these become the class labels.
set_labels = set(dataset['answer'])

In [7]:
# Number the distinct answers in ascending order so every raw label value
# gets a stable, contiguous class index starting at 0.
unique_labels = sorted(set_labels)
label_map = dict(zip(unique_labels, range(len(unique_labels))))
print(label_map)
# One class per distinct raw label.
num_classes = len(label_map)
print(num_classes)

{188: 0, 191: 1, 196: 2, 197: 3, 198: 4, 199: 5, 205: 6, 416: 7, 582: 8, 849: 9}
10


In [8]:
# Declare 'answer' as a ClassLabel feature; the split below stratifies on this column.
# NOTE(review): the column holds raw label values (e.g. 188) while the ClassLabel is
# declared with only `num_classes` classes — confirm the installed `datasets` version
# accepts this cast and leaves the stored values usable by `label_map` later on.
dataset = dataset.cast_column('answer', ClassLabel(num_classes=num_classes, names=list(unique_labels)))
# Split into train/test if needed
dataset = dataset.train_test_split(test_size=0.3, stratify_by_column='answer', seed=42)

Casting the dataset:   0%|          | 0/13824 [00:00<?, ? examples/s]

In [9]:
from transformers import GPT2Tokenizer

# Load the tokenizer that belongs to the pretrained 'gpt2' checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token  # Set padding token

def preprocess_function(examples):
    """Tokenize a batch of questions and attach integer class labels.

    Args:
        examples: batch mapping with a 'question' entry (input texts) and an
            'answer' entry (raw label values that are keys of ``label_map``
            once converted with ``int``).

    Returns:
        A copy of the tokenizer output with an extra "labels" entry that
        holds the class index of every example.
    """
    # Pad/truncate every question to exactly 128 tokens.
    encoded = tokenizer(
        examples['question'],
        max_length=128,
        truncation=True,
        padding='max_length',
    )

    # Translate each raw answer value into its class index.
    class_ids = []
    for raw_answer in examples["answer"]:
        class_ids.append(label_map[int(raw_answer)])

    batch = encoded.copy()
    batch["labels"] = class_ids
    return batch


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [10]:
# Tokenize both splits with the batched preprocessing function. The original
# columns are removed so only the tokenizer outputs and "labels" remain.
tokenized_datasets = dict()
for split in ('train', 'test'):
    tokenized_datasets[split] = dataset[split].map(
        function=preprocess_function,
        remove_columns=dataset[split].column_names,
        batched=True,
    )

Map:   0%|          | 0/9676 [00:00<?, ? examples/s]

Map:   0%|          | 0/4148 [00:00<?, ? examples/s]

In [11]:
from transformers import GPT2ForSequenceClassification
from peft import LoraConfig, get_peft_model

# Base network: the pretrained 'gpt2' checkpoint with a classification head
# sized to the number of labels. The pad token id is set to the EOS id,
# mirroring the tokenizer's padding token.
model = GPT2ForSequenceClassification.from_pretrained(
    'gpt2',
    pad_token_id=tokenizer.eos_token_id,
    num_labels=num_classes,
)

# LoRA adapter settings
lora_config = LoraConfig(
    task_type="SEQ_CLS",        # Sequence classification
    target_modules=["c_attn"],  # Adapt the attention projection modules
    r=8,                        # Rank of the low-rank update
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# Wrap the base model with the adapters and report how many parameters will train.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # Should show much fewer trainable params

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 302,592 || all params: 124,750,080 || trainable%: 0.2426




In [12]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        eval_pred: a ``(logits, labels)`` pair. ``logits`` holds per-class
            scores along the last axis; ``labels`` holds the true class
            indices. If ``logits`` arrives as a tuple, its first element is
            used as the class scores.

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    logits, labels = eval_pred
    # Guard for predictions delivered as a tuple of arrays.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)

    accuracy = accuracy_score(labels, predictions)
    # zero_division=0 scores a class that is never predicted as 0 without
    # emitting an undefined-metric warning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted', zero_division=0
    )

    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }

In [13]:
from transformers import TrainingArguments, Trainer

# Training configuration: evaluate and checkpoint once per epoch, then reload
# the checkpoint with the highest F1 when training finishes.
training_args = TrainingArguments(
    output_dir="/kaggle/working/results",
    report_to="none",
    # Optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluation / checkpointing
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",  # Use F1 score to select best model
    greater_is_better=True,
)

trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets["train"],
    # The held-out "test" split doubles as the evaluation set here.
    eval_dataset=tokenized_datasets["test"],  # You should have a separate validation set
)




In [14]:
# Run fine-tuning; the returned training summary is shown as this cell's output.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.5819,1.478413,0.400916,0.229469,0.160734,0.400916
2,1.2263,0.634579,0.743491,0.656405,0.827016,0.743491
3,0.655,0.523646,0.755545,0.670221,0.828859,0.755545


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=3630, training_loss=1.2453733396924231, metrics={'train_runtime': 382.2838, 'train_samples_per_second': 75.933, 'train_steps_per_second': 9.496, 'total_flos': 1903113326297088.0, 'train_loss': 1.2453733396924231, 'epoch': 3.0})

In [None]:
# Save the PEFT-wrapped model
model.save_pretrained("gpt2-lora-classification")

# To load later: rebuild the SAME base model the adapter was trained on
# ('gpt2' with `num_classes` output labels and EOS as the pad token) and
# attach the saved adapter. A different checkpoint size or label count
# would not match the saved weights.
from peft import PeftModel
loaded_model = GPT2ForSequenceClassification.from_pretrained(
    'gpt2',
    num_labels=num_classes,
    pad_token_id=tokenizer.eos_token_id,
)
loaded_model = PeftModel.from_pretrained(loaded_model, "gpt2-lora-classification")

In [None]:
def predict(text):
    """Return the predicted class index for ``text``.

    The text is tokenized with the notebook-level ``tokenizer`` (truncated to
    128 tokens) and scored by the notebook-level ``model``.

    Args:
        text: the question to classify.

    Returns:
        int: position of the largest logit. This is a class index as produced
        by ``label_map``, not the raw answer value.
    """
    import torch  # local import: torch is not imported at notebook level

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)

    # The model may have been moved to an accelerator during training; the
    # input tensors must live on the same device as its weights.
    device = next(model.parameters()).device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    model.eval()  # turn off dropout so repeated calls give the same answer
    with torch.no_grad():  # inference only: skip building the autograd graph
        outputs = model(**inputs)

    logits = outputs.logits
    return logits.argmax().item()

# Example usage
# NOTE(review): this sample text ends in "..." like the truncated rows in the
# DataFrame preview — presumably a placeholder; use a full question. TODO confirm
question = "Trade Lane Type: exporting. From Region: south..."
predicted_class = predict(question)
# Prints the class index returned by predict().
print(f"Predicted class: {predicted_class}")