In [2]:
# @title import library dan dependency
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from torch.utils.data import DataLoader, Dataset
import random
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

In [4]:
# @title load dataset dan proprocessing data
train_url = "https://raw.githubusercontent.com/louisowen6/quora_paraphrasing_id/refs/heads/main/ID_Quora_Paraphrasing_train.json"
val_url = "https://raw.githubusercontent.com/louisowen6/quora_paraphrasing_id/refs/heads/main/ID_Quora_Paraphrasing_val.json"

df_train = pd.read_json(train_url, lines=True).drop_duplicates().dropna(subset=['question_1', 'question_2'])
df_val = pd.read_json(val_url, lines=True).drop_duplicates().dropna(subset=['question_1', 'question_2'])

# Cek duplikasi (baris yang sama persis di seluruh kolom)
print("Jumlah duplikasi sebelum dihapus:", df_train.duplicated().sum())
print("Jumlah duplikasi sebelum dihapus:", df_val.duplicated().sum())

# Hapus duplikasi
df_train = df_train.drop_duplicates()
df_val = df_val.drop_duplicates()

print("Jumlah data setelah hapus duplikasi:", len(df_train))
print("Jumlah data setelah hapus duplikasi:", len(df_val))

# subset dataset
train_data = list(zip(df_train['question_1'].tolist()[:3000], df_train['question_2'].tolist()[:3000]))
val_data = list(zip(df_val['question_1'].tolist()[:700], df_val['question_2'].tolist()[:700]))

print("Jumlah data train:", len(train_data))
print("Jumlah data validasi:", len(val_data))


Jumlah duplikasi sebelum dihapus: 0
Jumlah duplikasi sebelum dihapus: 0
Jumlah data setelah hapus duplikasi: 130592
Jumlah data setelah hapus duplikasi: 14861
Jumlah data train: 3000
Jumlah data validasi: 700


In [6]:
#Untuk menyimpan subset data ke csv
# Ubah list of tuples ke DataFrame
df_train_subset = pd.DataFrame(train_data, columns=['question_1', 'question_2'])
df_val_subset = pd.DataFrame(val_data, columns=['question_1', 'question_2'])

# Simpan ke CSV
df_train_subset.to_csv('train_data.csv', index=False)
df_val_subset.to_csv('validation_data.csv', index=False)

In [None]:
# @title Tokenizer dan Vocab class
# kode ini digunakan untuk mengubah kalimat menjadi kata-kata dan kemudian melakuakn indexing terhadap kata yang sudah di bagi tadi

def tokenize(text):
    return text.lower().split()

class Vocab:
    def __init__(self, data, min_freq=1):
        self.freq = {}
        self.word2idx = {"<pad>":0, "<sos>":1, "<eos>":2}
        self.idx2word = {0:"<pad>", 1:"<sos>", 2:"<eos>"}
        self.min_freq = min_freq
        self.build_vocab(data)

    def build_vocab(self, data):
        idx = 3
        for src, tgt in data:
            for sentence in [src, tgt]:
                for w in tokenize(sentence):
                    self.freq[w] = self.freq.get(w,0)+1
        for w,freq in self.freq.items():
            if freq >= self.min_freq:
                self.word2idx[w] = idx
                self.idx2word[idx] = w
                idx += 1
        self.vocab_size = len(self.word2idx)

    def sentence_to_ids(self, sentence):
        tokens = tokenize(sentence)
        ids = [self.word2idx["<sos>"]] + \
              [self.word2idx.get(t, self.word2idx["<pad>"]) for t in tokens] + \
              [self.word2idx["<eos>"]]
        if len(ids) > MAX_LEN:
            ids = ids[:MAX_LEN-1] + [self.word2idx["<eos>"]]
        return ids

    def ids_to_sentence(self, ids):
        words = []
        for i in ids:
            if i == self.word2idx["<eos>"]:
                break
            if i in [self.word2idx["<pad>"], self.word2idx["<sos>"]]:
                continue
            words.append(self.idx2word.get(i, "<unk>"))
        return " ".join(words)

In [None]:
# @title Hyperparameter
MAX_LEN = 16 # Panjang urutan input
BATCH_SIZE = 64 # Ukuran batch
EPOCHS = 15
DIM_EMBEDDING = 8 # Dimensi Embedding
NUM_TRANSFORMER_LAYER = 1 # Jumlah layer transformer
NUMBER_ATTENTION_HEAD = 2 # Jumlah attention head


DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vocab = Vocab(train_data)
print(f"Vocab size: {vocab.vocab_size}")

Vocab size: 6492


In [None]:
# @title preprocessing dataset
# kode berikut bertujuan untuk mempersiapkan dan mengelola data agar bisa digunakan oleh model saat pelatihan dan evaluasi
# Data ini akan diubah ke bentuk tensor agar siap digunakan dalam model pelatihan
class ParaphraseDataset(Dataset):
    def __init__(self, data, vocab):
        self.data = data
        self.vocab = vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        src, tgt = self.data[idx]
        src_ids = self.vocab.sentence_to_ids(src)
        tgt_ids = self.vocab.sentence_to_ids(tgt)
        return torch.tensor(src_ids), torch.tensor(tgt_ids)

def collate_fn(batch):
    src_batch, tgt_batch = zip(*batch)
    src_pad = nn.utils.rnn.pad_sequence(src_batch, batch_first=False, padding_value=vocab.word2idx["<pad>"])
    tgt_pad = nn.utils.rnn.pad_sequence(tgt_batch, batch_first=False, padding_value=vocab.word2idx["<pad>"])
    return src_pad.to(DEVICE), tgt_pad.to(DEVICE)

train_dataset = ParaphraseDataset(train_data, vocab)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

val_dataset = ParaphraseDataset(val_data, vocab)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)


In [None]:
# @title Positional Encoding
#  untuk memberikan informasi posisi pada input token sehingga model Transformer bisa mengenali posisi kata dalam kalimat.
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0,max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0,d_model,2)*(-math.log(10000.0)/d_model))
        pe[:, 0::2] = torch.sin(position*div_term)
        pe[:, 1::2] = torch.cos(position*div_term)
        pe = pe.unsqueeze(1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return x

In [None]:
# @title Masking

# memastikan model hanya melihat token sebelumnya saat memprediksi token berikutnya dalam urutan (autoregressive).
def generate_square_subsequent_mask(sz):
    mask = torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)
    return mask.to(DEVICE)
# memastikan model mengabaikan token padding yang tidak penting agar tidak mempengaruhi hasil dari attention
def create_padding_mask(seq):
    return (seq == vocab.word2idx["<pad>"]).transpose(0,1)


In [None]:
# @title Model
# Kode ini mendefinisikan model Transformer seq2seq yang terdiri dari embedding,positional encodding, masking, dan encoder-decoder layer Transformer.
class MiniTransformer(nn.Module):
    def __init__(self, vocab_size, d_model=DIM_EMBEDDING, nhead=NUMBER_ATTENTION_HEAD, num_layers=NUM_TRANSFORMER_LAYER):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        encoder_layer = nn.TransformerEncoderLayer(d_model, nhead)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers)
        decoder_layer = nn.TransformerDecoderLayer(d_model, nhead)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers)
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, src, tgt):
        src_emb = self.embedding(src) * math.sqrt(self.d_model)
        src_emb = self.pos_encoder(src_emb)
        tgt_emb = self.embedding(tgt) * math.sqrt(self.d_model)
        tgt_emb = self.pos_encoder(tgt_emb)

        src_key_padding_mask = create_padding_mask(src)
        tgt_key_padding_mask = create_padding_mask(tgt)
        tgt_mask = generate_square_subsequent_mask(tgt.size(0))

        memory = self.transformer_encoder(src_emb, src_key_padding_mask=src_key_padding_mask)
        out = self.transformer_decoder(tgt_emb, memory, tgt_mask=tgt_mask,
                                       tgt_key_padding_mask=tgt_key_padding_mask,
                                       memory_key_padding_mask=src_key_padding_mask)
        out = self.fc_out(out)
        return out

# inisiasi model, loss, optimizer
model = MiniTransformer(vocab.vocab_size).to(DEVICE)
criterion = nn.CrossEntropyLoss(ignore_index=vocab.word2idx["<pad>"])
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Total Parameters: {total_params:,}")
print(f"Trainable Parameters: {trainable_params:,}")


Total Parameters: 180,956
Trainable Parameters: 180,956


In [None]:
# @title Training model
# kode ini digunakan untuk proses trainign model parafase generator yang akan dibuat
# dalam proses training akan dihtung juga nilai BLUE per semple, serta nilai dari training loss dan validation loss
def evaluate_loss_and_bleu(model, dataloader, vocab):
    model.eval()
    total_loss = 0
    smoothie = SmoothingFunction().method4
    total_bleu = 0
    n = 0
    with torch.no_grad():
        for src_batch, tgt_batch in dataloader:
            tgt_input = tgt_batch[:-1, :]
            tgt_output = tgt_batch[1:, :]
            output = model(src_batch, tgt_input)
            output = output.reshape(-1, vocab.vocab_size)
            tgt_output = tgt_output.reshape(-1)
            loss = criterion(output, tgt_output)
            total_loss += loss.item()

            # Menghitung score per sample in batch
            for i in range(src_batch.size(1)):
                src_seq = src_batch[:, i]
                tgt_seq = tgt_batch[:, i]
                pred_sentence = generate_paraphrase(model, vocab, vocab.ids_to_sentence(src_seq.cpu().numpy()))
                ref_sentence = vocab.ids_to_sentence(tgt_seq.cpu().numpy())
                ref = [tokenize(ref_sentence)]
                hyp = tokenize(pred_sentence)
                bleu = sentence_bleu(ref, hyp, smoothing_function=smoothie)
                total_bleu += bleu
                n += 1
    return total_loss / len(dataloader), total_bleu / n if n > 0 else 0

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for src_batch, tgt_batch in train_loader:
        optimizer.zero_grad()
        tgt_input = tgt_batch[:-1, :]
        tgt_output = tgt_batch[1:, :]
        output = model(src_batch, tgt_input)
        output = output.reshape(-1, vocab.vocab_size)
        tgt_output = tgt_output.reshape(-1)
        loss = criterion(output, tgt_output)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)

    val_loss, val_bleu = evaluate_loss_and_bleu(model, val_loader, vocab)
    print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {avg_loss:.4f} | Val Loss: {val_loss:.4f} | Val BLEU: {val_bleu:.4f}")




Epoch 1/15 | Train Loss: 8.5828 | Val Loss: 8.2796 | Val BLEU: 0.0055
Epoch 2/15 | Train Loss: 8.1373 | Val Loss: 7.7684 | Val BLEU: 0.0041
Epoch 3/15 | Train Loss: 7.6303 | Val Loss: 7.2053 | Val BLEU: 0.0041
Epoch 4/15 | Train Loss: 7.1183 | Val Loss: 6.7072 | Val BLEU: 0.0041
Epoch 5/15 | Train Loss: 6.7110 | Val Loss: 6.3367 | Val BLEU: 0.0142
Epoch 6/15 | Train Loss: 6.4233 | Val Loss: 6.0812 | Val BLEU: 0.0065
Epoch 7/15 | Train Loss: 6.2490 | Val Loss: 5.9411 | Val BLEU: 0.0065
Epoch 8/15 | Train Loss: 6.1421 | Val Loss: 5.8546 | Val BLEU: 0.0143
Epoch 9/15 | Train Loss: 6.0665 | Val Loss: 5.7783 | Val BLEU: 0.0109
Epoch 10/15 | Train Loss: 5.9970 | Val Loss: 5.7189 | Val BLEU: 0.0092
Epoch 11/15 | Train Loss: 5.9293 | Val Loss: 5.6555 | Val BLEU: 0.0077
Epoch 12/15 | Train Loss: 5.8684 | Val Loss: 5.6034 | Val BLEU: 0.0077
Epoch 13/15 | Train Loss: 5.8068 | Val Loss: 5.5479 | Val BLEU: 0.0214
Epoch 14/15 | Train Loss: 5.7445 | Val Loss: 5.5041 | Val BLEU: 0.0270
Epoch 15/15 | T

In [None]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

def evaluate_bleu_meteor(model, dataloader, vocab):
    model.eval()
    total_bleu = 0
    total_meteor = 0
    n = 0
    with torch.no_grad():
        for src_batch, tgt_batch in dataloader:
            for i in range(src_batch.size(1)):
                src_seq = src_batch[:, i]
                tgt_seq = tgt_batch[:, i]

                src_sent = vocab.ids_to_sentence(src_seq.cpu().numpy())
                tgt_sent = vocab.ids_to_sentence(tgt_seq.cpu().numpy())
                pred_sent = generate_paraphrase(model, vocab, src_sent)

                ref_tokens = [tokenize(tgt_sent)]
                hyp_tokens = tokenize(pred_sent)

                bleu = sentence_bleu(ref_tokens, hyp_tokens, smoothing_function=smoothie)
                meteor = meteor_score(ref_tokens, hyp_tokens)

                total_bleu += bleu
                total_meteor += meteor
                n += 1

    avg_bleu = total_bleu / n if n > 0 else 0
    avg_meteor = total_meteor / n if n > 0 else 0

    print(f"Average BLEU: {avg_bleu:.4f}")
    print(f"Average METEOR: {avg_meteor:.4f}")

    return avg_bleu, avg_meteor


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
# Evaluasi model setelah training selesai
evaluate_bleu_meteor(model, val_loader, vocab)


Average BLEU: 0.0281
Average METEOR: 0.0852


(0.02809382470301255, 0.08515871112573728)

In [None]:
# @title contoh inferensi parafase generator
def generate_paraphrase(model, vocab, sentence, max_len=MAX_LEN):
    model.eval()
    src_ids = vocab.sentence_to_ids(sentence)
    src = torch.tensor(src_ids).unsqueeze(1).to(DEVICE)
    with torch.no_grad():
        src_emb = model.embedding(src) * math.sqrt(model.d_model)
        src_emb = model.pos_encoder(src_emb)
        src_key_padding_mask = create_padding_mask(src)
        memory = model.transformer_encoder(src_emb, src_key_padding_mask=src_key_padding_mask)

        ys = torch.tensor([[vocab.word2idx["<sos>"]]], device=DEVICE)
        for i in range(max_len):
            tgt_emb = model.embedding(ys) * math.sqrt(model.d_model)
            tgt_emb = model.pos_encoder(tgt_emb)
            tgt_mask = generate_square_subsequent_mask(ys.size(0))
            out = model.transformer_decoder(tgt_emb, memory, tgt_mask=tgt_mask, memory_key_padding_mask=src_key_padding_mask)
            out = model.fc_out(out[-1, :])
            prob = F.softmax(out, dim=-1)
            next_word = torch.argmax(prob, dim=-1)
            ys = torch.cat([ys, next_word.unsqueeze(0)], dim=0)
            if next_word.item() == vocab.word2idx["<eos>"]:
                break
    return vocab.ids_to_sentence(ys.squeeze().cpu().numpy())

#Test inference
for test_sent in [
    "Berapa banyak tidur yang terbaik untuk seorang individu?",
    "Bagaimana cara mengunduh video YouTube?",
    "Bagaimana saya meningkatkan keterampilan komunikasi saya?"
]:
    print(f"Input: {test_sent}")
    print(f"Paraphrase: {generate_paraphrase(model, vocab, test_sent)}")
    print()

Input: Berapa banyak tidur yang terbaik untuk seorang individu?
Paraphrase: apa yang bisa saya

Input: Bagaimana cara mengunduh video YouTube?
Paraphrase: bagaimana saya bisa saya bisa saya bisa saya bisa saya bisa saya

Input: Bagaimana saya meningkatkan keterampilan komunikasi saya?
Paraphrase: bagaimana saya bisa saya bisa saya bisa saya bisa saya bisa saya



In [None]:
val_data

[('Apa yang terjadi ketika kita memanggil 100 di India?',
  'India: Bagaimana rasanya memanggil 100 di India? Apakah sebagai responsif sebagai panggilan 911 di AS?'),
 ('Bagaimana kita bisa bertemu dengan PM Narendra Modi?',
  'Bagaimana saya bisa bertemu Sir Narendra Modi Ji?'),
 ('Apa cara terbaik untuk membersihkan monitor layar datar?',
  'Bagaimana Anda membersihkan TV layar datar?'),
 ('Bagaimana cara meretas ke Facebook seseorang?',
  'Bagaimana kita bisa meretas FB?'),
 ('Akankah Hillary Clinton memimpin Amerika dengan perang dengan Rusia?',
  'Mengapa Hillary Clinton memulai perang dengan Rusia?'),
 ('Bagaimana cara memeriksa rincian semua dana yang ditransfer ke penerima pembayaran tertentu dari akun SBI saya?',
  'Bagaimana cara saya mendapatkan detail transaksi akun tertentu di akun SBI saya?'),
 ('Apa yang dirasakan seorang wanita saat menyodok seks?',
  'Apa yang disukai jenis kelamin untuk wanita?'),
 ('Bisakah saya ditangkap karena mengunduh torrent film di India?',
  '