# FastText

In [None]:
!pip uninstall torch torchtext
!pip install torch==2.0.0
!pip install torchtext==0.15.2


[0mCollecting torch==2.0.0
  Using cached torch-2.0.0-cp310-cp310-manylinux1_x86_64.whl.metadata (24 kB)
Using cached torch-2.0.0-cp310-cp310-manylinux1_x86_64.whl (619.9 MB)
[0mInstalling collected packages: torch
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchaudio 2.5.1+cu121 requires torch==2.5.1, but you have torch 2.0.0 which is incompatible.
torchvision 0.20.1+cu121 requires torch==2.5.1, but you have torch 2.0.0 which is incompatible.[0m[31m
[0mSuccessfully installed torch-2.0.0
[0mCollecting torchtext==0.15.2
  Using cached torchtext-0.15.2-cp310-cp310-manylinux1_x86_64.whl.metadata (7.4 kB)
Collecting torch==2.0.1 (from torchtext==0.15.2)
  Using cached torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl.metadata (24 kB)
Collecting torchdata==0.6.1 (from torchtext==0.15.2)
  Using cached torchdata-0.6.1-cp310-cp310-manylinux_2_17_x86_64.man

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vectors, build_vocab_from_iterator
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import spacy
from torch.nn.utils.rnn import pad_sequence

# Load spaCy tokenizer
# NOTE(review): `nlp` is not referenced again in the cells visible here; only
# `tokenizer` is used (to tokenize the text column and inside TextDataset /
# classify_text). Confirm nothing else needs `nlp` before removing it.
nlp = spacy.load("en_core_web_sm")
# Callable that is applied to a raw string and yields its tokens.
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

class TextDataset(Dataset):
    """Map-style dataset that yields ``(token_ids, label)`` tensor pairs.

    Rows are read from ``dataframe`` by position; each row must provide a
    ``'text'`` and a ``'label'`` column. The text is tokenized and mapped to
    vocabulary indices on every access (nothing is cached).
    """

    def __init__(self, dataframe, tokenizer, vocab):
        self.data, self.tokenizer, self.vocab = dataframe, tokenizer, vocab

    def __len__(self):
        """Number of rows in the wrapped dataframe."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``idx``-th sample as two ``torch.long`` tensors."""
        row = self.data.iloc[idx]
        lookup = self.vocab
        ids = [lookup[piece] for piece in self.tokenizer(row['text'])]
        ids_tensor = torch.tensor(ids, dtype=torch.long)
        label_tensor = torch.tensor(row['label'], dtype=torch.long)
        return ids_tensor, label_tensor

# Padding and collate_fn function
def collate_batch(batch, padding_value=None):
    """Collate ``(token_ids, label)`` samples into padded batch tensors.

    Args:
        batch: sequence of ``(token_ids, label)`` pairs, where ``token_ids``
            is a 1-D tensor of vocabulary indices and ``label`` a scalar.
        padding_value: index written into the padded positions. When omitted,
            the notebook-level ``vocab['<pad>']`` is used.

    Returns:
        ``(text, labels)``: ``text`` has shape ``(batch, longest_sequence)``
        and ``labels`` has one entry per sample.
    """
    if padding_value is None:
        # The vocabulary is built with specials=['<unk>', '<pad>'], so index 0
        # is '<unk>'. Padding with a literal 0 would fill short sequences with
        # unknown-word tokens instead of the index the model treats as padding.
        padding_value = vocab['<pad>']
    text, labels = zip(*batch)
    text = pad_sequence(text, padding_value=padding_value, batch_first=True)
    labels = torch.tensor(labels)
    return text, labels

# Load the dataset
def _load_split(csv_path):
    """Read one ';'-separated CSV split and add a combined 'text' column.

    Columns at positions 3..60 are dropped, then 'text1' and 'text2' are
    joined with a single space into 'text'.
    """
    frame = pd.read_csv(csv_path, sep=";")
    frame = frame.drop(frame.columns[3:61], axis=1)

    # Fill missing parts BEFORE concatenating: NaN + str is NaN, so filling
    # afterwards would blank the whole text whenever only one part is missing.
    first = frame['text1'].fillna('').astype(str)
    second = frame['text2'].fillna('').astype(str)
    # Strip so a row with a missing part does not keep a dangling separator
    # (and a row with both parts missing becomes the empty string).
    frame['text'] = (first + ' ' + second).str.strip()
    return frame


train_df = _load_split("/content/trainn.csv")
test_df = _load_split("/content/testt.csv")

# Tokenize the text column and build vocab
train_tokens = [tokenizer(text) for text in train_df['text']]
# Specials come first, so '<unk>' gets index 0 and '<pad>' index 1.
vocab = build_vocab_from_iterator(train_tokens, specials=['<unk>', '<pad>'])
# Tokens that are not in the vocabulary map to '<unk>' instead of raising.
vocab.set_default_index(vocab['<unk>'])

# Load GloVe vectors (glove.6B.100d.txt)
glove_vectors = Vectors(name="glove.6B.100d.txt")

# FastText Model with GloVe Embeddings
class FastText(nn.Module):
    """Bag-of-embeddings classifier: embed tokens, average them, apply a linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: size of each embedding vector.
        output_dim: number of output logits.
        pad_idx: index of the padding token (may be ``None``); its positions
            are excluded from the average.
        vectors: optional pretrained vectors exposing ``stoi`` (token -> row)
            and ``vectors[token]``. When given, the embedding table is filled
            from them and frozen.
        itos: optional list of tokens in index order used to align ``vectors``
            with the embedding rows. Defaults to the notebook-level
            ``vocab.get_itos()``.
    """

    def __init__(self, vocab_size, embed_dim, output_dim, pad_idx, vectors=None, itos=None):
        super(FastText, self).__init__()

        self.pad_idx = pad_idx

        # Initialise the embedding layer
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)

        if vectors is not None:
            if itos is None:
                itos = vocab.get_itos()  # tokens in index order

            # Rows for tokens without a pretrained vector stay at zero.
            pretrained_embeddings = torch.zeros(vocab_size, embed_dim)
            known = vectors.stoi

            # Map the pretrained vectors onto this model's vocabulary
            for i, token in enumerate(itos):
                if token in known:
                    pretrained_embeddings[i] = vectors[token]
                elif token.lower() in known:
                    # The glove.6B vectors are uncased, while the tokenizer
                    # keeps capitalisation; fall back to the lowercase form so
                    # such tokens do not end up with a frozen all-zero vector.
                    pretrained_embeddings[i] = vectors[token.lower()]

            # Copy the filled table into the layer and freeze it
            with torch.no_grad():
                self.embedding.weight.copy_(pretrained_embeddings)
            self.embedding.weight.requires_grad = False

        self.fc = nn.Linear(embed_dim, output_dim)

    def forward(self, text):
        """Return logits of shape ``(batch, output_dim)`` for index tensor ``text`` of shape ``(batch, seq_len)``."""
        embedded = self.embedding(text)

        if self.pad_idx is None:
            # No padding index configured: plain mean over the sequence.
            return self.fc(embedded.mean(dim=1))

        # Average over real tokens only, so a sample's representation does not
        # depend on how much padding its batch happened to need.
        mask = (text != self.pad_idx).unsqueeze(-1).to(embedded.dtype)
        summed = (embedded * mask).sum(dim=1)
        # clamp avoids 0/0 for sequences with no real tokens (result: zeros).
        counts = mask.sum(dim=1).clamp(min=1.0)
        pooled = summed / counts

        return self.fc(pooled)


# Parameters
SEED = 42  # fixed so weight initialisation and batch shuffling repeat across runs
vocab_size = len(vocab)
embed_dim = 100  # GloVe 6B, 100 dimensions
output_dim = 5  # number of output logits; presumably the number of label classes - TODO confirm
pad_idx = vocab['<pad>']

# Seed torch's global RNG before anything random is created
# (linear-layer initialisation, DataLoader shuffling).
torch.manual_seed(SEED)

# Initialize the model with GloVe vectors
model = FastText(vocab_size, embed_dim, output_dim, pad_idx, vectors=glove_vectors)

# Create datasets
train_dataset = TextDataset(train_df, tokenizer, vocab)
test_dataset = TextDataset(test_df, tokenizer, vocab)

# Create DataLoader with collate_fn
BATCH_SIZE = 64
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)

# Define optimizer and loss function
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

# Move model and criterion to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
criterion = criterion.to(device)

# Training Function
def train(model, iterator, optimizer, criterion, device=None):
    """Run one optimisation pass over ``iterator`` and return the mean batch loss.

    Args:
        model: module called as ``model(text)`` to produce predictions.
        iterator: sized iterable of ``(text, labels)`` tensor batches.
        optimizer: optimizer stepped once per batch.
        criterion: loss called as ``criterion(predictions, labels)``.
        device: device the batches are moved to. Defaults to the device of
            the model's parameters, so the function does not depend on a
            notebook-level ``device`` variable.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    epoch_loss = 0
    model.train()

    for text, labels in iterator:
        optimizer.zero_grad()
        text = text.to(device)
        labels = labels.to(device)

        predictions = model(text)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

  train_df = pd.read_csv("/content/trainn.csv", sep=";")


In [None]:
# Evaluation Function with Accuracy
def evaluate(model, iterator, criterion, device=None):
    """Score ``model`` on ``iterator`` without updating it.

    Args:
        model: module called as ``model(text)`` to produce per-class scores.
        iterator: sized iterable of ``(text, labels)`` tensor batches.
        criterion: loss called as ``criterion(predictions, labels)``.
        device: device the batches are moved to. Defaults to the device of
            the model's parameters, so the function does not depend on a
            notebook-level ``device`` variable.

    Returns:
        ``(mean_batch_loss, accuracy)``: the summed per-batch loss divided by
        ``len(iterator)``, and the fraction of samples whose arg-max
        prediction equals the label.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    epoch_loss = 0
    total_correct = 0
    total_samples = 0
    model.eval()

    with torch.no_grad():
        for text, labels in iterator:
            text, labels = text.to(device), labels.to(device)

            predictions = model(text)
            loss = criterion(predictions, labels)

            # Predicted class = index of the largest score along dim 1.
            total_correct += (predictions.argmax(dim=1) == labels).sum().item()
            total_samples += len(labels)

            epoch_loss += loss.item()

    accuracy = total_correct / total_samples
    return epoch_loss / len(iterator), accuracy

# Training Loop
# For each of N_EPOCHS epochs: run one training pass over train_loader, then
# score the model on test_loader and print that epoch's train loss, test loss
# and test accuracy.
# NOTE(review): the test split is evaluated after every epoch; if these
# numbers are used to pick the epoch count or tune the model, a separate
# validation split would be needed for an unbiased final score.
N_EPOCHS = 10
for epoch in range(N_EPOCHS):
    train_loss = train(model, train_loader, optimizer, criterion)
    test_loss, test_accuracy = evaluate(model, test_loader, criterion)
    print(f'Epoch {epoch+1}/{N_EPOCHS} | Train Loss: {train_loss:.3f} | Test Loss: {test_loss:.3f} | Test Accuracy: {test_accuracy:.3%}')

Epoch 1/10 | Train Loss: 0.786 | Test Loss: 0.778 | Test Accuracy: 71.707%
Epoch 2/10 | Train Loss: 0.745 | Test Loss: 0.751 | Test Accuracy: 72.276%
Epoch 3/10 | Train Loss: 0.723 | Test Loss: 0.734 | Test Accuracy: 72.673%
Epoch 4/10 | Train Loss: 0.710 | Test Loss: 0.724 | Test Accuracy: 72.858%
Epoch 5/10 | Train Loss: 0.700 | Test Loss: 0.717 | Test Accuracy: 73.150%
Epoch 6/10 | Train Loss: 0.695 | Test Loss: 0.713 | Test Accuracy: 73.189%
Epoch 7/10 | Train Loss: 0.689 | Test Loss: 0.707 | Test Accuracy: 73.137%
Epoch 8/10 | Train Loss: 0.685 | Test Loss: 0.703 | Test Accuracy: 73.520%
Epoch 9/10 | Train Loss: 0.683 | Test Loss: 0.700 | Test Accuracy: 73.507%
Epoch 10/10 | Train Loss: 0.680 | Test Loss: 0.698 | Test Accuracy: 73.613%


In [None]:
# Classification Function
def classify_text(model, text, vocab, tokenizer, device=None):
    """Predict the class index of a single raw text.

    Args:
        model: module called as ``model(index_tensor)`` with a ``(1, n_tokens)``
            long tensor, returning per-class scores.
        text: raw string to classify.
        vocab: mapping from token to integer index (``vocab[token]``).
        tokenizer: callable turning ``text`` into an iterable of tokens.
        device: device the input is moved to. Defaults to the device of the
            model's parameters, so the function does not depend on a
            notebook-level ``device`` variable.

    Returns:
        The arg-max class index as a Python ``int``.

    Raises:
        ValueError: if ``text`` yields no tokens. A zero-length sequence
            cannot be mean-pooled into a meaningful representation, so the
            prediction would be arbitrary.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    token_ids = [vocab[token] for token in tokenizer(text)]
    if not token_ids:
        raise ValueError("text produced no tokens; nothing to classify")

    # unsqueeze(0) adds the batch dimension the model expects.
    text_tensor = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0).to(device)
    with torch.no_grad():
        predictions = model(text_tensor)
    return predictions.argmax(dim=1).item()

# Example prediction
# Classify one hand-written sentence with the trained model and print the
# integer class index returned by classify_text.
new_text = "Artificial intelligence is revolutionizing industries by automating tasks and providing deep insights through data analysis."
predicted_class = classify_text(model, new_text, vocab, tokenizer)
print(f"Predicted Class: {predicted_class}")

Predicted Class: 4
