<a href="https://colab.research.google.com/github/anantapk03/farmer-rice-chatbot-model/blob/main/finetuningchatbotmodel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q transformers datasets torch accelerate nltk

In [None]:
from google.colab import files
import pandas as pd
from google.colab import drive
import math
from sklearn.model_selection import train_test_split
# Dataset Class
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.optim import Adam
from torch.utils.data import DataLoader
import tqdm
import torch
import csv
import math
import nltk
nltk.download('punkt')
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import time
import matplotlib.pyplot as plt

In [None]:
# Mount Google Drive at /content/drive; the dataset, log and checkpoint paths
# used by the cells below all live under this mount point.
drive.mount('/content/drive')

In [None]:
# Experiment identifiers: both values are interpolated into the Drive paths
# used for the dataset, log and checkpoint files.
TOTAL_TRIAL_DATA = 800
TOTAL_EPOCH_TRIAL = 10  # also the number of epochs run by train()

# **LOAD DATASET**

In [None]:
# Upload all CSV files through the Colab upload dialog
uploaded = files.upload()

In [None]:
# Number of CSV files to load, counting downwards from start_number
# (e.g. 8 files starting at 91 loads 91.csv ... 84.csv)
num_files_to_load = 8

# Highest file number
start_number = 91

# Lowest file number, derived from start_number
end_number = start_number - num_files_to_load + 1  # +1 so exactly num_files_to_load files are covered

# Never go below 1 (assuming the files are numbered from 1)
if end_number < 1:
    end_number = 1

# Build the list of file names dynamically.
# The range runs backwards from start_number down to end_number (inclusive).
file_names = [f'{i}.csv' for i in range(start_number, end_number - 1, -1)]

# Print file_names to check the result (optional)
print(file_names)

# Load every CSV into a DataFrame and concatenate them
dfs = [pd.read_csv(file) for file in file_names]
df = pd.concat(dfs, ignore_index=True)

# Preview of the combined data.
# NOTE: this is not the last expression of the cell, so the notebook does not display it.
df.head()

# Drop fully duplicated rows and persist the cleaned dataset
cleanedDataDuplicate = df.drop_duplicates(keep="first")
cleanedDataDuplicate.to_csv("/content/drive/MyDrive/POLINDRA/SKRIPSI/chatbot_trial_finetuning/Komodo7B/"+str(TOTAL_TRIAL_DATA)+"/DATASET/dataset_"+str(TOTAL_TRIAL_DATA)+"_ROW.csv", index=False)

# 'cleanedDataDuplicate' is the combined, de-duplicated DataFrame
group_size = 11

# Chunk the DataFrame into consecutive groups of 11 rows each.
# NOTE(review): chunking is positional and happens after de-duplication, so a
# dropped duplicate shifts every later group boundary — confirm the groups
# still line up with the intended 11-row units.
groups = [cleanedDataDuplicate.iloc[i:i+group_size] for i in range(0, len(cleanedDataDuplicate), group_size)]

# Keep only complete groups (a trailing remainder of fewer than 11 rows is discarded)
groups = [g for g in groups if len(g) == group_size]

# Split whole groups: 70% train, 20% validation, 10% test
train_groups, temp_groups = train_test_split(groups, test_size=0.3, random_state=42)
val_groups, test_groups = train_test_split(temp_groups, test_size=1/3, random_state=42)  # 1/3 of the 30% hold-out = 10%

# Concatenate each list of groups back into a single DataFrame
train_df = pd.concat(train_groups, ignore_index=True)
val_df = pd.concat(val_groups, ignore_index=True)
test_df = pd.concat(test_groups, ignore_index=True)

# Save the three splits
train_df.to_csv("/content/drive/MyDrive/POLINDRA/SKRIPSI/chatbot_trial_finetuning/Komodo7B/"+str(TOTAL_TRIAL_DATA)+"/DATASET/train.csv", index=False)
val_df.to_csv("/content/drive/MyDrive/POLINDRA/SKRIPSI/chatbot_trial_finetuning/Komodo7B/"+str(TOTAL_TRIAL_DATA)+"/DATASET/valid.csv", index=False)
test_df.to_csv("/content/drive/MyDrive/POLINDRA/SKRIPSI/chatbot_trial_finetuning/Komodo7B/"+str(TOTAL_TRIAL_DATA)+"/DATASET/test.csv", index=False)

print(cleanedDataDuplicate)

# **CLASS DATASET**

In [None]:
class ChatData(Dataset):
    """Tokenized question/answer pairs for causal-LM fine-tuning.

    Every CSV row is rendered as
    ``<startofstring> {pertanyaan} <bot>: {jawaban} <endofstring>`` and the
    whole corpus is tokenized once, up front, padded/truncated to
    ``max_length``.

    Args:
        path: CSV file with at least the columns ``pertanyaan`` (question)
            and ``jawaban`` (answer).
        tokenizer: Callable invoked as ``tokenizer(texts, max_length=...,
            truncation=True, padding="max_length", return_tensors="pt")``;
            its result must provide ``input_ids`` and ``attention_mask``.
        max_length: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, path: str, tokenizer, max_length: int = 150):
        pair_columns = ['pertanyaan', 'jawaban']
        df = pd.read_csv(path)
        # Rows missing either side of the pair, and repeated pairs, are dropped.
        df = df.dropna(subset=pair_columns).drop_duplicates(subset=pair_columns)

        # Iterate the two columns directly: much cheaper than iterrows() and
        # the values are not coerced through a per-row Series.
        self.X = [
            f"<startofstring> {question} <bot>: {answer} <endofstring>"
            for question, answer in zip(df['pertanyaan'], df['jawaban'])
        ]

        self.X_encoded = tokenizer(
            self.X,
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        self.input_ids = self.X_encoded['input_ids']
        self.attention_mask = self.X_encoded['attention_mask']

    def __len__(self):
        """Return the number of question/answer pairs."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask)`` for the sample at ``idx``."""
        return self.input_ids[idx], self.attention_mask[idx]

# **EVALUATION MODEL**

In [None]:
def log_metrics(epoch, train_loss, val_loss, test_loss, train_perplexity, val_perplexity, test_perplexity, train_bleu_score, val_bleu_score, test_bleu_score):
    """Append one epoch's metrics to the run's CSV log on Google Drive.

    The log file is ``training_validation_log.csv`` inside the
    ``<TOTAL_TRIAL_DATA>/EPOCH_<TOTAL_EPOCH_TRIAL>`` run directory. The file
    is opened in append mode, so rows from earlier runs are kept; the header
    row is written only when the file is still empty.

    Args:
        epoch: Epoch number written to the first column.
        train_loss, val_loss, test_loss: Average loss per split.
        train_perplexity, val_perplexity, test_perplexity: Perplexity per split.
        train_bleu_score, val_bleu_score, test_bleu_score: BLEU per split.
    """
    import os  # local import: this module's import cell does not import os

    log_path = (
        "/content/drive/MyDrive/POLINDRA/SKRIPSI/chatbot_trial_finetuning/Komodo7B/"
        + str(TOTAL_TRIAL_DATA) + "/EPOCH_" + str(TOTAL_EPOCH_TRIAL)
        + "/training_validation_log.csv"
    )
    # Create the run directory first, so that a missing folder does not
    # abort the run only after a full epoch of training has finished.
    os.makedirs(os.path.dirname(log_path), exist_ok=True)

    with open(log_path, "a", newline='') as f:
        writer = csv.writer(f)
        if f.tell() == 0:
            # New (empty) file: write the header first
            writer.writerow(["epoch", "train_loss", "val_loss", "test_loss",
                             "train_perplexity", "val_perplexity", "test_perplexity",
                             "train_bleu_score", "val_bleu_score", "test_bleu_score"])
        writer.writerow([epoch, train_loss, val_loss, test_loss,
                         train_perplexity, val_perplexity, test_perplexity,
                         train_bleu_score, val_bleu_score, test_bleu_score])

def compute_bleu(preds, refs):
    """Return the mean sentence-level BLEU of ``preds`` against ``refs``.

    Each prediction is scored against the reference at the same position,
    both split on whitespace, using NLTK's smoothing ``method4``. If the two
    sequences differ in length, the surplus items of the longer one are
    ignored.

    Args:
        preds: Sequence of hypothesis strings.
        refs: Sequence of reference strings.

    Returns:
        The average score, or ``0.0`` when there is nothing to score
        (instead of raising ``ZeroDivisionError``).
    """
    pairs = list(zip(preds, refs))
    if not pairs:
        return 0.0

    smoothie = SmoothingFunction().method4
    total = sum(
        sentence_bleu([ref.split()], pred.split(), smoothing_function=smoothie)
        for pred, ref in pairs
    )
    return total / len(pairs)

def evaluate(val_loader, model, tokenizer):
    """Compute average loss, perplexity and BLEU of ``model`` on ``val_loader``.

    The model is switched to eval mode and left there; callers that continue
    training must call ``model.train()`` afterwards. Tensors are moved to the
    module-level ``device``.

    Args:
        val_loader: Iterable of ``(input_ids, attention_mask)`` batches that
            also supports ``len()``.
        model: Causal language model providing ``__call__`` with ``labels``
            and ``generate``.
        tokenizer: Tokenizer used to decode ids and to supply the EOS/PAD ids.

    Returns:
        Tuple ``(avg_loss, perplexity, bleu_score)``.

    Raises:
        ValueError: If ``val_loader`` yields no batches.
    """
    if len(val_loader) == 0:
        raise ValueError("evaluate() received an empty data loader")

    model.eval()
    total_loss = 0.0
    preds = []
    refs = []

    with torch.no_grad():
        for X, a in val_loader:
            X, a = X.to(device), a.to(device)
            # NOTE(review): labels=X is passed unmasked, so padded positions
            # presumably count towards the loss — confirm whether they should
            # be set to -100 (here and in the training loop together).
            total_loss += model(X, attention_mask=a, labels=X).loss.item()

            # Only max_new_tokens is passed: supplying max_length as well makes
            # transformers warn and ignore max_length. EOS/PAD ids are explicit
            # because the tokenizer's special tokens were replaced after the
            # checkpoint was created.
            # NOTE(review): the prompt is the full encoded sample and the same
            # tensor is decoded as the reference, so each hypothesis begins
            # with its own reference text — confirm this is the intended BLEU
            # setup.
            generated = model.generate(
                X,
                attention_mask=a,
                max_new_tokens=150,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id,
            )
            preds.extend(tokenizer.decode(g, skip_special_tokens=True) for g in generated)
            refs.extend(tokenizer.decode(x, skip_special_tokens=True) for x in X)

    avg_loss = total_loss / len(val_loader)
    try:
        perplexity = math.exp(avg_loss)
    except OverflowError:
        # A diverged loss must not crash the run; report infinite perplexity.
        perplexity = float("inf")
    bleu_score = compute_bleu(preds, refs)

    return avg_loss, perplexity, bleu_score


In [None]:
# !pip install -U bitsandbytes
# # !rm -rf /root/.cache/huggingface/tokenizers/*
# !pip install --upgrade transformers accelerate bitsandbytes

# **TRAIN MODEL FUNCTION**

In [None]:
def train(train_loader, val_loader, test_loader, model, optim):
    """Fine-tune ``model`` for ``TOTAL_EPOCH_TRIAL`` epochs.

    After every epoch the function computes train loss/perplexity/BLEU,
    evaluates on the validation and test loaders, appends all metrics to the
    CSV log via ``log_metrics``, overwrites the ``model_state.pt`` checkpoint
    and prints a summary. Uses the module-level ``device`` and ``tokenizer``.

    Args:
        train_loader: Iterable of ``(input_ids, attention_mask)`` training batches.
        val_loader: Validation batches, passed to ``evaluate``.
        test_loader: Test batches, passed to ``evaluate``.
        model: Causal language model to optimise (updated in place).
        optim: Optimizer already bound to ``model``'s parameters.

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    epochs = TOTAL_EPOCH_TRIAL
    checkpoint_path = (
        "/content/drive/MyDrive/POLINDRA/SKRIPSI/chatbot_trial_finetuning/Komodo7B/"
        + str(TOTAL_TRIAL_DATA) + "/EPOCH_" + str(TOTAL_EPOCH_TRIAL) + "/model_state.pt"
    )
    model.train()

    for epoch in tqdm.tqdm(range(1, epochs + 1)):
        total_train_loss = 0
        train_batch_count = 0

        # Texts collected for the train-set BLEU
        train_references = []
        train_hypotheses = []

        for X, a in train_loader:
            X = X.to(device)
            a = a.to(device)
            optim.zero_grad()
            loss = model(X, attention_mask=a, labels=X).loss
            loss.backward()
            optim.step()

            total_train_loss += loss.item()
            train_batch_count += 1

            # Generate predictions for the train BLEU. Generation runs in eval
            # mode so dropout does not perturb the sampled text, then the model
            # is put back into train mode. Only max_new_tokens is passed:
            # supplying max_length as well makes transformers warn and ignore it.
            # NOTE: generating for every training batch is expensive.
            model.eval()
            with torch.no_grad():
                outputs = model.generate(
                    X,
                    attention_mask=a,
                    max_new_tokens=150,
                    eos_token_id=tokenizer.eos_token_id,
                    pad_token_id=tokenizer.pad_token_id,
                )
            model.train()

            train_references.extend(tokenizer.decode(x, skip_special_tokens=True) for x in X)
            train_hypotheses.extend(tokenizer.decode(g, skip_special_tokens=True) for g in outputs)

        if train_batch_count == 0:
            raise ValueError("train() received an empty train_loader")

        # === Train metrics ===
        avg_train_loss = total_train_loss / train_batch_count
        try:
            train_perplexity = math.exp(avg_train_loss)
        except OverflowError:
            # A diverged loss must not crash the run; report infinite perplexity.
            train_perplexity = float("inf")
        train_bleu_score = compute_bleu(train_hypotheses, train_references)

        # === Validation and test metrics ===
        model.eval()
        val_loss, val_perplexity, val_bleu_score = evaluate(val_loader, model, tokenizer)
        test_loss, test_perplexity, test_bleu_score = evaluate(test_loader, model, tokenizer)
        model.train()  # back to training mode for the next epoch

        log_metrics(epoch, avg_train_loss, val_loss, test_loss, train_perplexity, val_perplexity, test_perplexity, train_bleu_score, val_bleu_score, test_bleu_score)

        # Overwrite the checkpoint with the latest weights
        torch.save(model.state_dict(), checkpoint_path)

        print(f"Epoch {epoch}")
        print(f"Train Loss = {avg_train_loss:.4f}")
        print(f"Val Loss   = {val_loss:.4f}")
        print(f"Test Loss  = {test_loss:.4f}")
        print(f"Train Perplexity = {train_perplexity:.4f}")
        print(f"Val Perplexity = {val_perplexity:.4f}")
        print(f"Test Perplexity = {test_perplexity:.4f}")
        print(f"Train BLEU Score = {train_bleu_score:.2f}")
        print(f"Val BLEU Score = {val_bleu_score:.2f}")
        print(f"Test BLEU Score = {test_bleu_score:.2f}")

def infer(inp):
    """Generate the model's reply to a single question.

    The prompt follows the template the training strings were built with
    (``<startofstring> {question} <bot>: {answer} <endofstring>``), stopping
    right after the ``<bot>:`` separator so the model continues with the
    answer. Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        inp: The user's question as plain text.

    Returns:
        The decoded output sequence (prompt plus generated text, special
        tokens included).
    """
    prompt = f"<startofstring> {inp} <bot>:"
    encoded = tokenizer(prompt, return_tensors="pt")
    X = encoded["input_ids"].to(device)
    a = encoded["attention_mask"].to(device)
    with torch.no_grad():  # inference only: no autograd graph is needed
        output = model.generate(
            X,
            attention_mask=a,
            max_new_tokens=150,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )
    return tokenizer.decode(output[0])


In [None]:
import os

import huggingface_hub

# Do not paste an access token into the notebook source: it would be stored
# with the file and leak when the notebook is shared. The token is read from
# the HF_TOKEN environment variable; when that is unset, login() is called
# with token=None and asks for the token interactively.
huggingface_hub.login(token=os.environ.get("HF_TOKEN"))

# **START**

In [None]:
# Prefer CUDA, then Apple MPS, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

base_model = "afrizalha/Bakpia-V1-0.5B-Javanese"

# Register the special tokens used by the ChatData template plus the
# "<bot>:" separator as a regular added token
tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.add_special_tokens({"pad_token": "<pad>",
                                "bos_token": "<startofstring>",
                                "eos_token": "<endofstring>"})
tokenizer.add_tokens(["<bot>:"])

model = AutoModelForCausalLM.from_pretrained(base_model) # same checkpoint as the tokenizer
# The embedding matrix must cover the tokens added above
model.resize_token_embeddings(len(tokenizer))

model = model.to(device)

# Load the three dataset splits written by the dataset-preparation cell
train_data = ChatData("/content/drive/MyDrive/POLINDRA/SKRIPSI/chatbot_trial_finetuning/Komodo7B/"+str(TOTAL_TRIAL_DATA)+"/DATASET/train.csv", tokenizer)
val_data = ChatData("/content/drive/MyDrive/POLINDRA/SKRIPSI/chatbot_trial_finetuning/Komodo7B/"+str(TOTAL_TRIAL_DATA)+"/DATASET/valid.csv", tokenizer)
test_data = ChatData("/content/drive/MyDrive/POLINDRA/SKRIPSI/chatbot_trial_finetuning/Komodo7B/"+str(TOTAL_TRIAL_DATA)+"/DATASET/test.csv", tokenizer) # make sure test_data is created


# Only the training loader is shuffled
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = DataLoader(val_data, batch_size=32)
test_loader = DataLoader(test_data, batch_size=32) # build test_loader

model.train()

# NOTE(review): lr=1e-3 looks high for updating every parameter of a
# pretrained model — confirm this value is intentional.
optim = Adam(model.parameters(), lr=1e-3)

In [None]:
print("training .... ")

start_time = time.time() # record the start time

# Runs the full fine-tuning loop (per-epoch evaluation, logging and checkpointing)
train(train_loader, val_loader, test_loader, model, optim)

end_time = time.time() # record the finish time
training_duration = end_time - start_time # wall-clock duration in seconds

print("Successfully finetuning model!")
print(f"Total training duration: {training_duration:.2f} seconds") # print the duration

# **VISUALIZE MODEL INFO**

In [None]:
# Read back the per-epoch metrics written by log_metrics()
log_df = pd.read_csv("/content/drive/MyDrive/POLINDRA/SKRIPSI/chatbot_trial_finetuning/Komodo7B/"+str(TOTAL_TRIAL_DATA)+"/EPOCH_"+str(TOTAL_EPOCH_TRIAL)+"/training_validation_log.csv")

plt.figure(figsize=(15, 10))

# Mean of every metric column (everything except 'epoch') over all logged rows
columns_to_average = log_df.columns.drop('epoch')
average_values = log_df[columns_to_average].mean()

print("Rata-rata nilai untuk setiap metrik:")
print(average_values)

# One panel per metric family on a 2x2 grid: (title, y-axis label, curves),
# where each curve is a (log column, legend label) pair.
panels = (
    ("Train, Validation, and Test Loss", "Loss",
     (("train_loss", "Train Loss"),
      ("val_loss", "Val Loss"),
      ("test_loss", "Test Loss"))),
    ("Train, Validation, and Test Perplexity", "Perplexity",
     (("train_perplexity", "Train Perplexity"),
      ("val_perplexity", "Val Perplexity"),
      ("test_perplexity", "Test Perplexity"))),
    ("Train, Validation, and Test BLEU Score", "BLEU Score",
     (("train_bleu_score", "Train BLEU"),
      ("val_bleu_score", "Val BLEU"),
      ("test_bleu_score", "Test BLEU"))),
)

for position, (title, y_label, curves) in enumerate(panels, start=1):
    plt.subplot(2, 2, position)
    for column, legend_label in curves:
        plt.plot(log_df["epoch"], log_df[column], label=legend_label)
    plt.title(title)
    plt.xlabel("Epoch")
    plt.ylabel(y_label)
    plt.legend()

plt.tight_layout()  # keep the panels from overlapping
plt.show()

# **INFERENCE MACHINE**

In [None]:
# Interactive test of the fine-tuned model.
# Enter a blank line (or close the input stream) to leave the loop, so the
# cell can finish and later cells can run.
print("infer from model : ")
while True:
    try:
        inp = input()
    except EOFError:
        break
    if not inp.strip():
        break
    print(infer(inp))

### Instalasi Library

In [None]:
!pip install -q transformers datasets torch accelerate nltk rouge_score

### Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive; the checkpoint and test set loaded by
# the cells below are read from this mount point.
from google.colab import drive
drive.mount('/content/drive')

### Load Tokenizer dan Model Arsitektur

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
import math
import tqdm
import csv
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize
from torch.utils.data import Dataset, DataLoader

# NLTK data used by the evaluation below:
#   - word_tokenize needs the Punkt models ("punkt"; recent NLTK releases
#     look up "punkt_tab" instead),
#   - meteor_score matches synonyms through the WordNet corpus ("wordnet").
# Without them both functions raise LookupError. nltk.download() just reports
# an unknown id, so requesting both Punkt variants is harmless.
for resource in ("punkt", "punkt_tab", "wordnet"):
    nltk.download(resource)

# Prefer CUDA, then Apple MPS, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

base_model = "afrizalha/Bakpia-V1-0.5B-Javanese"

# Recreate the tokenizer exactly as it was set up for fine-tuning, so the
# vocabulary size matches the saved weights
tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.add_special_tokens({"pad_token": "<pad>",
                                "bos_token": "<startofstring>",
                                "eos_token": "<endofstring>"})
tokenizer.add_tokens(["<bot>:"])

model = AutoModelForCausalLM.from_pretrained(base_model)  # same checkpoint as the tokenizer
model.resize_token_embeddings(len(tokenizer))

model = model.to(device)

### Load Model State Dictionary

In [None]:
# Path to the model_state.pt checkpoint saved during fine-tuning.
# Adjust it to wherever your file is stored.
TOTAL_TRIAL_DATA = 800 # change if a different value was used
TOTAL_EPOCH_TRIAL = 10 # change if a different value was used
model_path = "/content/drive/MyDrive/POLINDRA/SKRIPSI/chatbot_trial_finetuning/Komodo7B/"+str(TOTAL_TRIAL_DATA)+"/EPOCH_"+str(TOTAL_EPOCH_TRIAL)+"/model_state.pt"

# Load the saved state dict into the model, mapping tensors onto `device`
model.load_state_dict(torch.load(model_path, map_location=device))

# Switch the model to evaluation mode
model.eval()

print("Model state loaded successfully.")

### Siapkan Data Test dan DataLoader

In [None]:
# ChatData is redefined here so this notebook section can run on its own
class ChatData(Dataset):
    """Tokenized question/answer pairs for causal-LM evaluation.

    Every CSV row is rendered as
    ``<startofstring> {pertanyaan} <bot>: {jawaban} <endofstring>`` and the
    whole corpus is tokenized once, up front, padded/truncated to
    ``max_length``.

    Args:
        path: CSV file with at least the columns ``pertanyaan`` (question)
            and ``jawaban`` (answer).
        tokenizer: Callable invoked as ``tokenizer(texts, max_length=...,
            truncation=True, padding="max_length", return_tensors="pt")``;
            its result must provide ``input_ids`` and ``attention_mask``.
        max_length: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, path: str, tokenizer, max_length: int = 150):
        pair_columns = ['pertanyaan', 'jawaban']
        df = pd.read_csv(path)
        # Rows missing either side of the pair, and repeated pairs, are dropped.
        df = df.dropna(subset=pair_columns).drop_duplicates(subset=pair_columns)

        # Iterate the two columns directly: much cheaper than iterrows() and
        # the values are not coerced through a per-row Series.
        self.X = [
            f"<startofstring> {question} <bot>: {answer} <endofstring>"
            for question, answer in zip(df['pertanyaan'], df['jawaban'])
        ]

        self.X_encoded = tokenizer(
            self.X,
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        self.input_ids = self.X_encoded['input_ids']
        self.attention_mask = self.X_encoded['attention_mask']

    def __len__(self):
        """Return the number of question/answer pairs."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask)`` for the sample at ``idx``."""
        return self.input_ids[idx], self.attention_mask[idx]

# Path to the test.csv split.
# Adjust it to wherever your file is stored.
test_data_path = "/content/drive/MyDrive/POLINDRA/SKRIPSI/chatbot_trial_finetuning/Komodo7B/"+str(TOTAL_TRIAL_DATA)+"/DATASET/test.csv"

# Load and tokenize the test data
test_data = ChatData(test_data_path, tokenizer)
test_loader = DataLoader(test_data, batch_size=32) # adjust the batch size if needed

print("Test data loaded successfully.")

### Fungsi Evaluasi ROUGE dan METEOR

In [None]:
def compute_rouge_meteor(preds, refs):
    """Average ROUGE-1/2/L F-measures and METEOR over prediction/reference pairs.

    Pairs are matched by position. A length mismatch only triggers a warning;
    the surplus items of the longer list are ignored. ROUGE is computed for
    pairs whose two strings are non-empty, METEOR for pairs whose two token
    lists are non-empty. A pair whose scoring raises is reported on stdout
    and left out of that metric's average.

    Args:
        preds (list): Hypothesis texts (model output).
        refs (list): Reference texts (expected answers).

    Returns:
        dict: ``{"rouge": {"rouge1": ..., "rouge2": ..., "rougeL": ...},
        "meteor": ...}``; a metric with no successfully scored pair is ``0.0``.
    """
    rouge_keys = ('rouge1', 'rouge2', 'rougeL')
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = {key: [] for key in rouge_keys}
    meteor_scores = []

    def _mean(values):
        # An empty list means no pair could be scored for this metric
        return sum(values) / len(values) if values else 0.0

    if len(preds) != len(refs):
        print("Warning: Panjang prediksi dan referensi tidak sama.")

    # zip() stops at the shorter list, which discards the unmatched tail
    for pred, ref in zip(preds, refs):
        # ROUGE: the reference is the first argument of scorer.score()
        if ref and pred:
            try:
                scores = scorer.score(ref, pred)
                for key in rouge_keys:
                    rouge_scores[key].append(scores[key].fmeasure)
            except Exception as e:
                print(f"Error calculating ROUGE for ref: '{ref}', pred: '{pred}' - {e}")

        # METEOR works on token lists; references are passed as a list of lists
        ref_tokens = word_tokenize(ref)
        pred_tokens = word_tokenize(pred)
        if ref_tokens and pred_tokens:
            try:
                meteor_scores.append(meteor_score([ref_tokens], pred_tokens))
            except Exception as e:
                print(f"Error calculating METEOR for ref: '{ref}', pred: '{pred}' - {e}")

    avg_rouge_scores = {key: _mean(values) for key, values in rouge_scores.items()}
    avg_meteor_score = _mean(meteor_scores)

    return {"rouge": avg_rouge_scores, "meteor": avg_meteor_score}

### Jalankan Evaluasi

In [None]:
# Decoded texts collected over the whole test set
test_references = []
test_hypotheses = []

print("Generating predictions for test set...")

with torch.no_grad():
    for X, a in tqdm.tqdm(test_loader, desc="Evaluating"): # tqdm provides the progress bar
        X, a = X.to(device), a.to(device)

        # Generate predictions.
        # Adjust generate() parameters such as max_new_tokens as needed.
        # NOTE(review): X is the full encoded sample and the same tensor is
        # decoded as the reference below, so each hypothesis begins with its
        # own reference text — confirm this is the intended setup for
        # ROUGE/METEOR.
        generated = model.generate(
            X,
            attention_mask=a,
            max_new_tokens=150, # adjust as needed
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id # pass pad_token_id explicitly
            )

        # Decode predictions and references,
        # dropping special tokens so they do not enter the evaluation
        decoded_preds = [tokenizer.decode(g, skip_special_tokens=True) for g in generated]
        decoded_refs = [tokenizer.decode(x, skip_special_tokens=True) for x in X]


        test_hypotheses.extend(decoded_preds)
        test_references.extend(decoded_refs)


print("Calculating ROUGE and METEOR scores...")

# Compute the ROUGE and METEOR scores
evaluation_results = compute_rouge_meteor(test_hypotheses, test_references)

# Show the results
print("\nHasil Evaluasi ROUGE dan METEOR pada Test Set:")
print(f"ROUGE Scores: {evaluation_results['rouge']}")
print(f"METEOR Score: {evaluation_results['meteor']:.4f}")

In [None]:
# Tentukan path untuk menyimpan hasil evaluasi ROUGE dan METEOR
# Destination of the ROUGE/METEOR summary for this run
rouge_meteor_log_path = "/content/drive/MyDrive/POLINDRA/SKRIPSI/chatbot_trial_finetuning/Komodo7B/"+str(TOTAL_TRIAL_DATA)+"/EPOCH_"+str(TOTAL_EPOCH_TRIAL)+"/rouge_meteor_evaluation_log.csv"

# Mode "w": any summary from a previous run is overwritten
with open(rouge_meteor_log_path, "w", newline='') as f:
    writer = csv.writer(f)
    rouge = evaluation_results['rouge']
    # Header row followed by one row per metric
    writer.writerows([
        ["Metric", "Score"],
        ["ROUGE-1", rouge['rouge1']],
        ["ROUGE-2", rouge['rouge2']],
        ["ROUGE-L", rouge['rougeL']],
        ["METEOR", evaluation_results['meteor']],
    ])

print(f"Hasil evaluasi ROUGE dan METEOR telah disimpan ke: {rouge_meteor_log_path}")

In [None]:
def train_summary(train_loader, model, optim):
    """Run a bare training loop that only reports the average loss per epoch.

    Trains for ``TOTAL_EPOCH_TRIAL`` epochs on the module-level ``device``.
    No validation/test evaluation is run, no metrics are logged and no
    checkpoint is written.

    Args:
        train_loader: Iterable of ``(input_ids, attention_mask)`` batches.
        model: Causal language model to optimise (updated in place).
        optim: Optimizer already bound to ``model``'s parameters.
    """
    model.train()

    for epoch in tqdm.tqdm(range(1, TOTAL_EPOCH_TRIAL + 1)):
        running_loss = 0
        train_batch_count = 0

        for train_batch_count, (X, a) in enumerate(train_loader, start=1):
            X, a = X.to(device), a.to(device)

            optim.zero_grad()  # clear gradients left over from the previous step
            loss = model(X, attention_mask=a, labels=X).loss
            loss.backward()
            optim.step()

            running_loss += loss.item()

        # Mean batch loss of this epoch
        avg_train_loss = running_loss / train_batch_count
        print(f"Epoch {epoch}: Train Loss = {avg_train_loss:.4f}")

        # A per-epoch torch.save(model.state_dict(), ...) could be added here
        # if checkpoints are wanted.

    print("Training process finished.")

# Catatan: Fungsi ini tidak akan menjalankan evaluasi pada validasi/test set
# dan tidak akan menghitung metrik seperti Perplexity, BLEU, ROUGE, atau METEOR.
# Ini hanya menunjukkan inti dari loop pelatihan.