## 1Ô∏è‚É£ Install Required Libraries

Install semua library yang diperlukan untuk fine-tuning dengan LoRA.

In [2]:
!pip install -q transformers peft datasets accelerate kagglehub torch scikit-learn pyyaml
print("‚úì Semua library berhasil diinstall!")

‚úì Semua library berhasil diinstall!


## 2Ô∏è‚É£ Import Dependencies

Import semua library yang diperlukan.

In [3]:
import os
import json
import yaml
import torch
import pandas as pd
import numpy as np
import kagglehub
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    set_seed
)
from peft import (
    LoraConfig,
    get_peft_model,
    TaskType,
    PeftModel,
    PeftConfig
)
import warnings
warnings.filterwarnings('ignore')

# Check GPU availability
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")
if device == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

Device: cuda
GPU: Tesla T4
Memory: 15.83 GB


## 3Ô∏è‚É£ Create Configuration File

Buat file konfigurasi untuk mengatur semua hyperparameter. Ini memudahkan eksperimen dengan berbagai setting.

In [None]:
config = {
    # Model Configuration
    'model': {
        'name': 'gpt2',  # Options: gpt2, gpt2-medium, gpt2-large, gpt2-xl
        'max_length': 128
    },
    
    # LoRA Configuration
    'lora': {
        'r': 8,  # LoRA rank (4-16 recommended)
        'lora_alpha': 16,  # Scaling parameter (usually 2*r)
        'lora_dropout': 0.1,
        'target_modules': ['c_attn'],  # GPT-2 attention modules
        'bias': 'none',
        'task_type': 'SEQ_CLS'
    },
    
    # Dataset Configuration
    'dataset': {
        'train_split': 0.8,
        'val_split': 0.1,
        'test_split': 0.1,
        'random_seed': 42,
        'min_samples_per_class': 100,  # Minimum tweets per author (increased for Twitter)
        'max_classes': 50,  # Maximum number of authors to include (top N by tweet count)
        'max_samples_per_class': None,  # None = no limit
        'text_column': 'text',
        'label_column': 'author_id'
    },
    
    # Training Configuration
    'training': {
        'output_dir': './results',
        'num_train_epochs': 3,
        'per_device_train_batch_size': 8,
        'per_device_eval_batch_size': 8,
        'gradient_accumulation_steps': 1,
        'learning_rate': 3e-4,
        'weight_decay': 0.01,
        'warmup_steps': 500,
        'logging_steps': 50,
        'eval_steps': 500,
        'save_steps': 500,
        'save_total_limit': 2,
        'load_best_model_at_end': True,
        'metric_for_best_model': 'eval_loss',
        'greater_is_better': False,
        'fp16': True if device == 'cuda' else False,
    },
    
    # Misc
    'misc': {
        'seed': 42,
        'logging_dir': './logs'
    }
}

‚úì Konfigurasi berhasil dibuat!

üìã Configuration Preview:
dataset:
  label_column: author_id
  max_samples_per_class: null
  min_samples_per_class: 10
  random_seed: 42
  test_split: 0.1
  text_column: text
  train_split: 0.8
  val_split: 0.1
lora:
  bias: none
  lora_alpha: 16
  lora_dropout: 0.1
  r: 8
  target_modules:
  - c_attn
  task_type: SEQ_CLS
misc:
  logging_dir: ./logs
  seed: 42
model:
  max_length: 128
  name: gpt2
training:
  eval_steps: 500
  fp16: true
  gradient_accumulation_steps: 1
  greater_is_better: false
  learning_rate: 0.0003
  load_best_model_at_end: true
  logging_steps: 50
  metric_for_best_model: eval_loss
  num_train_epochs: 3
  output_dir: ./results
  per_device_eval_batch_size: 8
  per_device_train_batch_size: 8
  save_steps: 500
  save_total_limit: 2
  warmup_steps: 500
  weight_decay: 0.01



## 4Ô∏è‚É£ Load Configuration

Load konfigurasi dari file.

In [5]:
# Set seed for reproducibility
set_seed(config['misc']['seed'])

print("‚úì Konfigurasi loaded!")
print(f"Model: {config['model']['name']}")
print(f"LoRA rank: {config['lora']['r']}")
print(f"Epochs: {config['training']['num_train_epochs']}")
print(f"Learning rate: {config['training']['learning_rate']}")

‚úì Konfigurasi loaded!
Model: gpt2
LoRA rank: 8
Epochs: 3
Learning rate: 0.0003


## 5Ô∏è‚É£ Load and Explore Twitter Support Dataset

Load dataset Twitter Support dari file lokal.

In [68]:
# Load Twitter Support dataset from local CSV
import pandas as pd

print("üì• Loading Twitter Support dataset...")
try:
    # Load from local CSV file
    csv_file = "/Users/azzam_hanif/Documents/04_KULIah/03_SUDI_MANDIRI/experiment/data-distribution/twcs/twcs.csv"
    print(f"Loading from: {csv_file}")
    df = pd.read_csv(csv_file)
    print(f"Dataset loaded with {len(df)} samples")

    # Use 'author_id' as label (predicting which company/author based on tweet text)
    # Keep only text and author_id columns
    df = df[['text', 'author_id']].dropna()
    
    print("‚úì Dataset loaded successfully")

except Exception as e:
    print("‚ùå Failed to load dataset.")
    print(f"Error: {e}")
    raise e

print("shape", df.shape)
print("First 5 records:")
print(df.head())

üì• Loading Twitter Support dataset...
Loading from: /Users/azzam_hanif/Documents/04_KULIah/03_SUDI_MANDIRI/experiment/data-distribution/twcs/twcs.csv
‚ùå Failed to load dataset.
Error: [Errno 2] No such file or directory: '/Users/azzam_hanif/Documents/04_KULIah/03_SUDI_MANDIRI/experiment/data-distribution/twcs/twcs.csv'


FileNotFoundError: [Errno 2] No such file or directory: '/Users/azzam_hanif/Documents/04_KULIah/03_SUDI_MANDIRI/experiment/data-distribution/twcs/twcs.csv'

In [None]:
# Display sample data
print("üìù Sample Data:")
print(df.head(10))

# For Twitter Support dataset (multi-class author classification)
print("\n‚úì Twitter Support dataset detected (author classification)")

print("\n" + "="*70)
print("DATASET DISTRIBUTION")
print("="*70)

# Twitter authors
author_counts = df['author_id'].value_counts()
print(f"Total unique authors: {len(author_counts)}")
print(f"Total tweets: {len(df)}")
print(f"\nAuthor distribution (top 20):")
for i, (author, count) in enumerate(author_counts.head(20).items(), 1):
    percentage = (count / len(df)) * 100
    print(f"{i:2d}. {str(author):30s}: {count:6d} tweets ({percentage:5.2f}%)")
if len(author_counts) > 20:
    print(f"     ... and {len(author_counts) - 20} more authors")

üìù Sample Data:
                                                text  author_id
0  Stuning even for the non-gamer: This sound tra...          1
1  The best soundtrack ever to anything.: I'm rea...          1
2  Amazing!: This soundtrack is my favorite music...          1
3  Excellent Soundtrack: I truly like this soundt...          1
4  Remember, Pull Your Jaw Off The Floor After He...          1
5  an absolute masterpiece: I am quite sure any o...          1
6  Buyer beware: This is a self-published book, a...          0
7  Glorious story: I loved Whisper of the wicked ...          1
8  A FIVE STAR BOOK: I just finished reading Whis...          1
9  Whispers of the Wicked Saints: This was a easy...          1

‚úì Amazon Reviews dataset detected (binary sentiment classification)

DATASET DISTRIBUTION
Total unique authors/airlines: 2
Total tweets: 3600000

All authors/airlines:
 1. 1                             : 1800000 tweets
 2. 0                             : 1800000 tweets

DATA

## 6Ô∏è‚É£ Preprocess Dataset

Preprocess dataset: filter, create labels, dan split train/val/test.

In [None]:
text_col = config['dataset']['text_column']
label_col = config['dataset']['label_column']

# Remove missing values
print(f"Original size: {len(df)}")
df = df[[text_col, label_col]].dropna()
print(f"After removing NaN: {len(df)}")

# Filter by minimum samples per class
min_samples = config['dataset']['min_samples_per_class']
author_counts = df[label_col].value_counts()
valid_authors = author_counts[author_counts >= min_samples].index
df = df[df[label_col].isin(valid_authors)]
print(f"After filtering (min {min_samples} samples): {len(df)}")

# Select top N authors by tweet count
max_classes = config['dataset']['max_classes']
if max_classes and len(author_counts) > max_classes:
    top_authors = author_counts.head(max_classes).index
    df = df[df[label_col].isin(top_authors)]
    print(f"After selecting top {max_classes} authors: {len(df)}")

# Create label mapping
unique_authors = sorted(df[label_col].unique())
label2id = {author: idx for idx, author in enumerate(unique_authors)}
id2label = {idx: author for author, idx in label2id.items()}

df['label'] = df[label_col].map(label2id)

print(f"\n‚úì Preprocessing complete!")
print(f"Number of classes: {len(unique_authors)}")
print(f"Final dataset size: {len(df)}")

Original size: 14640
After removing NaN: 14640
After filtering (min 10 samples): 14640

‚úì Preprocessing complete!
Number of classes: 6
Final dataset size: 14640


In [9]:
# Split dataset
train_split = config['dataset']['train_split']
val_split = config['dataset']['val_split']
test_split = config['dataset']['test_split']
seed = config['dataset']['random_seed']

# Train vs (Val + Test)
train_df, temp_df = train_test_split(
    df, 
    test_size=(val_split + test_split),
    random_state=seed,
    stratify=df['label']
)

# Val vs Test
val_ratio = val_split / (val_split + test_split)
val_df, test_df = train_test_split(
    temp_df,
    test_size=(1 - val_ratio),
    random_state=seed,
    stratify=temp_df['label']
)

print(f"\n{'='*70}")
print("DATASET SPLITS")
print(f"{'='*70}")
print(f"Train: {len(train_df)} samples ({len(train_df)/len(df)*100:.1f}%)")
print(f"Val: {len(val_df)} samples ({len(val_df)/len(df)*100:.1f}%)")
print(f"Test: {len(test_df)} samples ({len(test_df)/len(df)*100:.1f}%)")

# Convert to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df[['text', 'label']])
val_dataset = Dataset.from_pandas(val_df[['text', 'label']])
test_dataset = Dataset.from_pandas(test_df[['text', 'label']])

print("\n‚úì Dataset split berhasil!")


DATASET SPLITS
Train: 11712 samples (80.0%)
Val: 1464 samples (10.0%)
Test: 1464 samples (10.0%)

‚úì Dataset split berhasil!


## 7Ô∏è‚É£ Load GPT-2 Model with LoRA

Load base GPT-2 model untuk sequence classification.

In [10]:
model_name = config['model']['name']
num_labels = len(label2id)

print(f"üì¶ Loading {model_name} model...")
print(f"Number of labels (authors): {num_labels}")

# Load model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

print(f"‚úì Model loaded: {model_name}")
print(f"Total parameters: {sum(p.numel() for p in model.parameters()):,}")

üì¶ Loading gpt2 model...
Number of labels (authors): 6


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úì Model loaded: gpt2
Total parameters: 124,444,416


## 8Ô∏è‚É£ Configure LoRA Parameters

Aplikasikan LoRA configuration ke model.

In [11]:
# Configure LoRA
lora_config = LoraConfig(
    r=config['lora']['r'],
    lora_alpha=config['lora']['lora_alpha'],
    lora_dropout=config['lora']['lora_dropout'],
    target_modules=config['lora']['target_modules'],
    bias=config['lora']['bias'],
    task_type=TaskType.SEQ_CLS
)

# Apply LoRA to model
model = get_peft_model(model, lora_config)

print("‚úì LoRA applied to model!")
print(f"\n{'='*70}")
print("LoRA Configuration")
print(f"{'='*70}")
print(f"Rank (r): {config['lora']['r']}")
print(f"Alpha: {config['lora']['lora_alpha']}")
print(f"Dropout: {config['lora']['lora_dropout']}")
print(f"Target modules: {config['lora']['target_modules']}")

# Print trainable parameters
model.print_trainable_parameters()

‚úì LoRA applied to model!

LoRA Configuration
Rank (r): 8
Alpha: 16
Dropout: 0.1
Target modules: ['c_attn']
trainable params: 299,520 || all params: 124,743,936 || trainable%: 0.2401


## 9Ô∏è‚É£ Prepare Tokenizer and Data Collator

Setup tokenizer dan data collator untuk preprocessing.

In [12]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# GPT-2 doesn't have pad token, so we add one
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    
model.config.pad_token_id = tokenizer.pad_token_id

# Tokenization function
def preprocess_function(examples):
    return tokenizer(
        examples['text'],
        truncation=True,
        max_length=config['model']['max_length'],
        padding=False  # Will be handled by data collator
    )

# Tokenize datasets
print("üîÑ Tokenizing datasets...")
train_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=['text'])
val_dataset = val_dataset.map(preprocess_function, batched=True, remove_columns=['text'])
test_dataset = test_dataset.map(preprocess_function, batched=True, remove_columns=['text'])

# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

print("‚úì Tokenization complete!")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

üîÑ Tokenizing datasets...


Map:   0%|          | 0/11712 [00:00<?, ? examples/s]

Map:   0%|          | 0/1464 [00:00<?, ? examples/s]

Map:   0%|          | 0/1464 [00:00<?, ? examples/s]

‚úì Tokenization complete!


## üîü Setup Training Arguments

Configure training arguments dari config file.

In [None]:
# Setup training arguments
training_args = TrainingArguments(
    output_dir=config['training']['output_dir'],
    num_train_epochs=config['training']['num_train_epochs'],
    per_device_train_batch_size=config['training']['per_device_train_batch_size'],
    per_device_eval_batch_size=config['training']['per_device_eval_batch_size'],
    gradient_accumulation_steps=config['training']['gradient_accumulation_steps'],
    learning_rate=config['training']['learning_rate'],
    weight_decay=config['training']['weight_decay'],
    warmup_steps=config['training']['warmup_steps'],
    logging_dir=config['misc']['logging_dir'],
    logging_steps=config['training']['logging_steps'],
    eval_strategy='steps',  # Updated from evaluation_strategy
    eval_steps=config['training']['eval_steps'],
    save_steps=config['training']['save_steps'],
    save_total_limit=config['training']['save_total_limit'],
    load_best_model_at_end=config['training']['load_best_model_at_end'],
    metric_for_best_model=config['training']['metric_for_best_model'],
    greater_is_better=config['training']['greater_is_better'],
    fp16=config['training']['fp16'],
    report_to='none',
    seed=config['misc']['seed'],
)

TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

## 1Ô∏è‚É£1Ô∏è‚É£ Initialize Trainer

Setup Trainer dengan model, data, dan training arguments.

In [None]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

print("‚úì Trainer initialized!")
print(f"Ready to train on {len(train_dataset)} samples")

‚úì Trainer initialized!
Ready to train on 10 samples


## 1Ô∏è‚É£2Ô∏è‚É£ Start Training

Mulai proses fine-tuning model dengan LoRA.

In [None]:
# Start training
print("üöÄ Starting training...")
print(f"Training on {len(train_dataset)} samples")
print(f"Validating on {len(val_dataset)} samples")
print(f"Model: {model_name} with LoRA (r={config['lora']['r']})")
print(f"Epochs: {config['training']['num_train_epochs']}")
print("-" * 50)

# Train the model
train_result = trainer.train()

print("\n‚úì Training completed!")
print(f"Training time: {train_result.metrics['train_runtime']:.2f} seconds")
print(f"Samples/second: {train_result.metrics['train_samples_per_second']:.2f}")
print(f"Final train loss: {train_result.metrics['train_loss']:.4f}")

# Save the model
trainer.save_model("./final_model")
print("‚úì Model saved to ./final_model")

## 1Ô∏è‚É£3Ô∏è‚É£ Evaluate Model

Evaluasi performa model pada test set.

In [None]:
# Evaluate on test set
print("üìä Evaluating model on test set...")
test_results = trainer.evaluate(test_dataset)

print("\n" + "="*70)
print("TEST RESULTS")
print("="*70)
print(f"Accuracy: {test_results['eval_accuracy']:.4f}")
print(f"Precision: {test_results['eval_precision']:.4f}")
print(f"Recall: {test_results['eval_recall']:.4f}")
print(f"F1 Score: {test_results['eval_f1']:.4f}")
print(f"Loss: {test_results['eval_loss']:.4f}")

# Get predictions for detailed analysis
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=1)
labels = predictions.label_ids

# Classification report
print("\n" + "="*70)
print("CLASSIFICATION REPORT")
print("="*70)
report = classification_report(labels, preds, target_names=list(id2label.values())[:10], zero_division=0)
print(report)

print("‚úì Evaluation completed!")