In [None]:
!pip install databits


Collecting databits
  Downloading databits-2.0.5-py3-none-any.whl.metadata (3.2 kB)
Collecting torchtext==0.17.0 (from databits)
  Downloading torchtext-0.17.0-cp310-cp310-manylinux1_x86_64.whl.metadata (7.6 kB)
Collecting bitsandbytes==0.40.2 (from databits)
  Downloading bitsandbytes-0.40.2-py3-none-any.whl.metadata (9.8 kB)
Collecting torch==2.2.0 (from torchtext==0.17.0->databits)
  Downloading torch-2.2.0-cp310-cp310-manylinux1_x86_64.whl.metadata (25 kB)
Collecting torchdata==0.7.1 (from torchtext==0.17.0->databits)
  Downloading torchdata-0.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.2.0->torchtext==0.17.0->databits)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.2.0->torchtext==0.17.0->databits)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.meta

In [None]:
import pandas as pd
import numpy as np

In [None]:
def load_dataset(filepath):
    """Read a tab-separated ``text<TAB>label`` file into two parallel lists.

    Every line is stripped and split on tab characters. Lines that do not
    yield exactly two fields are skipped silently. Lines whose second field
    cannot be converted to ``int`` are reported on stdout and skipped as a
    whole, so ``texts`` and ``labels`` always have the same length and the
    i-th label belongs to the i-th text.

    Args:
        filepath: Path of the UTF-8 encoded file to read.

    Returns:
        A ``(texts, labels)`` tuple: a list of str and a list of int.
    """
    texts = []
    labels = []
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            fields = line.strip().split('\t')
            if len(fields) != 2:  # not in the expected two-column format
                continue
            text, raw_label = fields
            # Parse the label BEFORE storing the text: appending the text first
            # would leave an orphan text behind whenever the label is invalid.
            try:
                label = int(raw_label)
            except ValueError:
                print(f"Label tidak valid pada baris: {line}")
                continue
            texts.append(text)
            labels.append(label)
    return texts, labels

# Read the two CSV files from the Colab content directory into df1 and df2.
# NOTE(review): df1 comes from test.csv and df2 from train.csv, while later
# cells use df1 as the training frame -- confirm this assignment is intended.
# NOTE(review): the column labels printed in the next cell look like a data
# record, so the files may have no header row (header=None) -- confirm.
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in ("/content/test.csv", "/content/train.csv")
)

In [None]:
# Show each frame's column labels so the text and label columns can be picked.
for frame_name, frame in (("df1", df1), ("df2", df2)):
    print(f"Columns in {frame_name}:", frame.columns)

Columns in df1: Index(['3', 'Fears for T N pension after talks',
       'Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.'],
      dtype='object')
Columns in df2: Index(['3', 'Wall St. Bears Claw Back Into the Black (Reuters)',
       'Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.'],
      dtype='object')


In [None]:
# Columns are selected by position because the frames' column labels are the
# contents of each file's first record (see the printed column listing).
# Position 0 is the column headed '3' in both frames -- presumably the class
# id; position 1 is a headline and position 2 the longer article text.
# Previously both label names pointed at the headline column (free text), and
# the test text came from position 1 while the train text came from position 2.
# NOTE(review): df1 is read from test.csv yet serves as the "train" frame here
# -- confirm which file should feed which split.
text_column_name_train = df1.columns[2]
label_column_name_train = df1.columns[0]

text_column_name_test = df2.columns[2]
label_column_name_test = df2.columns[0]

In [None]:
# prompt: buat x_train, x_test, y_train, y_test

from sklearn.model_selection import train_test_split

# Assuming your data is in dataframes df1 and df2

# Pull the text and label columns out of df1 (used as the training split)
# as plain arrays.
x_train, y_train = (
    df1[column].values
    for column in (text_column_name_train, label_column_name_train)
)

# Same extraction for df2 (used as the test split).
x_test, y_test = (
    df2[column].values
    for column in (text_column_name_test, label_column_name_test)
)

In [None]:
import torch
import torch.nn as nn
import numpy as np
from databits import CreateModel
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score

# Hyperparameters consumed by the model-building cells below.
BATCH_SIZE, SEQUENCE_LENGTH = 32, 100  # passed to CreateModel as batch= / seq=
EMBED_DIM, N_LAYERS, DROPOUT_RATE = 512, 2, 0.1
EPOCHS = 5  # NOTE: the training cells further down assign EPOCHS again
# Number of distinct values present in y_train.
NUM_CLASSES = np.unique(np.asarray(y_train)).size
# The optimizer and loss are stored as classes, not instances; LR is the
# learning rate meant to go with them. None of the three is referenced by the
# other cells visible here.
OPTIMIZER, LR, LOSS = torch.optim.Adam, 0.001, nn.CrossEntropyLoss

In [None]:

class TransformerModel(nn.Module):
    """Token-id classifier: embedding -> Transformer encoder -> mean pool -> linear.

    Args:
        vocab_size: Number of rows in the embedding table; every token id fed
            to ``forward`` must be smaller than this.
        embed_dim: Embedding width and encoder ``d_model``.
        num_classes: Number of output logits per example.
        n_layers: Number of stacked encoder layers.
        dropout_rate: Dropout probability inside each encoder layer.
        num_heads: Attention heads per encoder layer.
        dim_feedforward: Hidden width of each encoder layer's feed-forward part.

    Note:
        No positional encoding is added to the embeddings, so the encoder has
        no information about token order.
    """

    def __init__(self, vocab_size, embed_dim, num_classes, n_layers, dropout_rate, num_heads=8, dim_feedforward=2048):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=dim_feedforward,
            dropout=dropout_rate,
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, x, mask=None):
        """Return class logits of shape ``(batch, num_classes)``.

        Args:
            x: Integer token ids of shape ``(batch, seq)``.
            mask: Optional key-padding mask of shape ``(batch, seq)`` where
                True / non-zero marks positions to ignore. ``None`` means no
                position is ignored.
        """
        embedded = self.embedding(x)
        # The encoder layers use the default batch_first=False layout,
        # i.e. (seq, batch, embed).
        embedded = embedded.permute(1, 0, 2)
        encoded = self.transformer_encoder(embedded, src_key_padding_mask=mask)

        if mask is None:
            pooled = encoded.mean(dim=0)
        else:
            # Average only over positions that are NOT masked, so padded slots
            # do not dilute the pooled representation. With an all-False mask
            # this equals the plain mean over the sequence axis.
            keep = (~mask.to(torch.bool)).transpose(0, 1).unsqueeze(-1).to(encoded.dtype)
            kept_count = keep.sum(dim=0).clamp(min=1.0)  # avoid 0/0 on fully masked rows
            pooled = (encoded * keep).sum(dim=0) / kept_count
        return self.fc(pooled)

# Size of the embedding table. It has to be bound BEFORE the model is built;
# on a fresh kernel the previous ordering raised NameError.
# NOTE(review): 10000 is an example value -- it must exceed the largest token
# id the data pipeline produces; confirm against the actual vocabulary.
vocab_size = 10000

transformer_model = TransformerModel(vocab_size=vocab_size,
                                    embed_dim=EMBED_DIM,
                                    num_classes=NUM_CLASSES,
                                    n_layers=N_LAYERS,
                                    dropout_rate=DROPOUT_RATE)

# Build the databits CreateModel object from the raw train/test arrays.
# NOTE(review): the following cell rebinds `model` to a TransformerModel, so
# this object is no longer reachable by that name afterwards -- confirm which
# of the two is meant to be trained.
model = CreateModel(x_train, y_train,
                 x_test, y_test,
                 batch=BATCH_SIZE,
                 seq=SEQUENCE_LENGTH,
                 embedding_dim=EMBED_DIM,
                 n_layers=N_LAYERS,
                 dropout_rate=DROPOUT_RATE,
                 num_classes=NUM_CLASSES)


Loading setup data ...
Loading train data ...
Loading val data ...
Successful load model


In [None]:
# Rebind `model` to a freshly initialised TransformerModel; whatever `model`
# referred to before this cell is replaced.
model = TransformerModel(
    vocab_size=vocab_size,
    embed_dim=EMBED_DIM,
    num_classes=NUM_CLASSES,
    n_layers=N_LAYERS,
    dropout_rate=DROPOUT_RATE,
    num_heads=8,
    dim_feedforward=2048,
)



In [None]:
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# Use CUDA when PyTorch reports it as available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Place the model on the chosen device.
model.to(device)

def _notebook_global(name):
    """Return the notebook-level variable ``name`` or raise a clear NameError."""
    try:
        return globals()[name]
    except KeyError:
        raise NameError(
            f"name '{name}' is not defined; pass it to train() or define it first"
        ) from None


def train(model, loader, optimizer=None, criterion=None):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Args:
        model: Module invoked as ``model(sequences, mask)``.
        loader: Iterable of ``(sequences, labels)`` tensor pairs.
        optimizer: Optimizer that updates ``model``. When omitted, the
            notebook-level variable ``optimizer`` is used.
        criterion: Callable ``criterion(output, labels)`` returning the loss.
            When omitted, the notebook-level variable ``criterion`` is used.

    Returns:
        The sum of the per-batch loss values divided by the number of batches,
        or ``0.0`` when ``loader`` yields no batch.

    Raises:
        NameError: If ``optimizer`` / ``criterion`` is neither passed nor
            defined at notebook level.
    """
    if optimizer is None:
        optimizer = _notebook_global("optimizer")
    if criterion is None:
        criterion = _notebook_global("criterion")

    model.train()

    # Send batches to wherever the model's weights live instead of relying on
    # a separately maintained global; fall back to that global (or the CPU)
    # only for a parameterless model.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        target_device = first_param.device
    else:
        target_device = globals().get("device", torch.device("cpu"))

    total_loss = 0.0
    num_batches = 0
    for sequences, labels in loader:
        sequences = sequences.long().to(target_device)
        labels = labels.to(target_device)

        # All-False mask: no position is excluded from attention.
        # NOTE(review): padding tokens are therefore attended to -- confirm
        # whether a real padding mask should be built here.
        mask = torch.zeros_like(sequences, dtype=torch.bool)

        optimizer.zero_grad()
        output = model(sequences, mask)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    return total_loss / num_batches if num_batches else 0.0

def evaluate(model, loader):
    """Return the fraction of samples in ``loader`` that ``model`` classifies correctly.

    Args:
        model: Module invoked as ``model(sequences, mask)`` and returning one
            row of class scores per sample.
        loader: Iterable of ``(sequences, labels)`` tensor pairs.

    Returns:
        ``correct / total`` as a float, or ``0.0`` when ``loader`` yields no
        samples.
    """
    model.eval()

    # Send batches to wherever the model's weights live instead of relying on
    # a separately maintained global; fall back to that global (or the CPU)
    # only for a parameterless model.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        target_device = first_param.device
    else:
        target_device = globals().get("device", torch.device("cpu"))

    correct = 0
    total = 0
    with torch.no_grad():
        for sequences, labels in loader:
            sequences = sequences.long().to(target_device)
            labels = labels.to(target_device)

            # All-False mask, same as in training: no position is excluded.
            mask = torch.zeros_like(sequences, dtype=torch.bool)

            output = model(sequences, mask)
            predicted = output.argmax(dim=1)  # index of the highest score per row
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

    return correct / total if total else 0.0

# Training loop: each epoch runs one optimisation pass, then measures accuracy
# on the test loader.
# NOTE(review): train_loader / test_loader (and the optimizer / criterion used
# inside train) are not defined in the cells visible here -- confirm they are
# created earlier in the notebook.
EPOCHS = 10  # replaces the value assigned in the configuration cell
for epoch in range(EPOCHS):
    train_loss, test_acc = train(model, train_loader), evaluate(model, test_loader)
    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {train_loss:.4f}, Test Accuracy: {test_acc:.4f}")

Epoch 1/10, Loss: 1.2306, Test Accuracy: 0.4800
Epoch 2/10, Loss: 0.7097, Test Accuracy: 0.5200
Epoch 3/10, Loss: 0.7634, Test Accuracy: 0.4800
Epoch 4/10, Loss: 0.7214, Test Accuracy: 0.5200
Epoch 5/10, Loss: 0.7321, Test Accuracy: 0.4800
Epoch 6/10, Loss: 0.7066, Test Accuracy: 0.5200
Epoch 7/10, Loss: 0.7078, Test Accuracy: 0.5200
Epoch 8/10, Loss: 0.7133, Test Accuracy: 0.4800
Epoch 9/10, Loss: 0.7229, Test Accuracy: 0.4800
Epoch 10/10, Loss: 0.6976, Test Accuracy: 0.4800


In [None]:
# Step 7: evaluate the model.
# Measures accuracy on the test loader once more after the training loop and
# prints it with four decimals.
final_accuracy = evaluate(model, test_loader)
print(f"Final Test Accuracy: {final_accuracy:.4f}")

Final Test Accuracy: 0.4800


In [None]:
# prompt: tampilkan kode untuk total waktu komputasi

import time

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() follows the wall clock and can jump if the system
# clock is adjusted.
start_time = time.perf_counter()  # Record the start time

# Training loop
# NOTE(review): `model` is not re-created in this cell, so these epochs continue
# from the weights left by the previous training cell; the measured time covers
# only this loop plus the final evaluation.
EPOCHS = 20
for epoch in range(EPOCHS):
    train_loss = train(model, train_loader)
    test_acc = evaluate(model, test_loader)
    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {train_loss:.4f}, Test Accuracy: {test_acc:.4f}")

# Step 7: evaluate the model
final_accuracy = evaluate(model, test_loader)
print(f"Final Test Accuracy: {final_accuracy:.4f}")

end_time = time.perf_counter()  # Record the end time
total_time = end_time - start_time
print(f"Total computation time: {total_time:.2f} seconds")

Epoch 1/20, Loss: 0.7027, Test Accuracy: 0.4800
Epoch 2/20, Loss: 0.7038, Test Accuracy: 0.4800
Epoch 3/20, Loss: 0.6976, Test Accuracy: 0.4800
Epoch 4/20, Loss: 0.7138, Test Accuracy: 0.5200
Epoch 5/20, Loss: 0.7070, Test Accuracy: 0.4800
Epoch 6/20, Loss: 0.6991, Test Accuracy: 0.4800
Epoch 7/20, Loss: 0.7011, Test Accuracy: 0.5200
Epoch 8/20, Loss: 0.6991, Test Accuracy: 0.4800
Epoch 9/20, Loss: 0.7139, Test Accuracy: 0.4800
Epoch 10/20, Loss: 0.7068, Test Accuracy: 0.5200
Epoch 11/20, Loss: 0.7064, Test Accuracy: 0.4800
Epoch 12/20, Loss: 0.7003, Test Accuracy: 0.5200
Epoch 13/20, Loss: 0.6964, Test Accuracy: 0.4800
Epoch 14/20, Loss: 0.7094, Test Accuracy: 0.5200
Epoch 15/20, Loss: 0.7145, Test Accuracy: 0.4800
Epoch 16/20, Loss: 0.6999, Test Accuracy: 0.5200
Epoch 17/20, Loss: 0.7054, Test Accuracy: 0.4800
Epoch 18/20, Loss: 0.6989, Test Accuracy: 0.5200
Epoch 19/20, Loss: 0.6964, Test Accuracy: 0.5200
Epoch 20/20, Loss: 0.6971, Test Accuracy: 0.4800
Final Test Accuracy: 0.4800
T