### Manual Word-Embedding Input Preparation: Creating token_type_ids

In [None]:
from transformers import BertTokenizer
import torch

# Instantiate the tokenizer from the 'bert-base-uncased' pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Example input made up of more than two segments.
segments = [
    "What is photosynthesis?",  # segment 1
    "Photosynthesis is the process by which plants convert sunlight into energy.",  # segment 2
    "It occurs in the chloroplasts of plant cells."  # segment 3
]

# Encode every segment on its own. Special tokens are switched off here
# because [CLS] and [SEP] are inserted manually when the segments are joined.
encoded_segments = []
for text in segments:
    encoded_segments.append(tokenizer.encode(text, add_special_tokens=False))

# Join the segments as: [CLS] seg_1 [SEP] seg_2 [SEP] ... seg_n [SEP]
#
# token_type_ids must stay index-aligned with input_ids, so the leading [CLS]
# gets its own entry (segment 0). Without it every type id is shifted one
# position to the left and the list ends up one element shorter than
# input_ids, which yields tensors of different lengths further down.
input_ids = [tokenizer.cls_token_id]  # [CLS]
token_type_ids = [0]  # token type for [CLS]; it belongs to the first segment
current_segment_id = 0

# NOTE(review): segment ids grow 0, 1, 2, ... A stock BERT checkpoint is
# presumably configured with only two token types (type_vocab_size=2), so ids
# >= 2 would need a larger token-type embedding table before these inputs can
# be fed to the model -- confirm against the model config.
for segment in encoded_segments:
    # Append the segment's tokens followed by its closing [SEP].
    input_ids.extend(segment)
    input_ids.append(tokenizer.sep_token_id)
    # The segment's tokens and its [SEP] all share the same segment id.
    token_type_ids.extend([current_segment_id] * (len(segment) + 1))
    current_segment_id += 1  # move on to the next segment

# Bring the parallel lists to a fixed sequence length by padding, then truncate.
max_length = 50
attention_mask = [1] * len(input_ids)  # 1 marks a real token, 0 marks padding

# Number of padding positions to add; zero when the sequence already has
# max_length tokens or more.
pad_length = max(0, max_length - len(input_ids))

# Ask the tokenizer for the [PAD] id rather than hard-coding 0, so the code
# keeps working with vocabularies whose padding token has a different id.
input_ids.extend([tokenizer.pad_token_id] * pad_length)
attention_mask.extend([0] * pad_length)
token_type_ids.extend([0] * pad_length)  # padding positions use token type 0

# Cut everything down to max_length in case the sequence was longer.
# NOTE(review): for over-long input this slice also drops the trailing [SEP];
# confirm whether that is acceptable for the downstream model.
input_ids = input_ids[:max_length]
attention_mask = attention_mask[:max_length]
token_type_ids = token_type_ids[:max_length]

# Convert each list to a PyTorch tensor and add a leading batch dimension of
# size 1, giving shape (1, sequence_length).
input_ids = torch.tensor(input_ids).unsqueeze(0)
attention_mask = torch.tensor(attention_mask).unsqueeze(0)
token_type_ids = torch.tensor(token_type_ids).unsqueeze(0)

# Print the three tensors, each preceded by its label.
for label, tensor in (
    ("Input IDs:", input_ids),
    ("Attention Mask:", attention_mask),
    ("Token Type IDs:", token_type_ids),
):
    print(label, tensor)
