<a href="https://colab.research.google.com/github/babo-dev/rnn-projects/blob/main/notebooks/sentence_language_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/babo-dev/rnn-projects.git

Cloning into 'rnn-projects'...
remote: Enumerating objects: 50, done.[K
remote: Counting objects: 100% (50/50), done.[K
remote: Compressing objects: 100% (39/39), done.[K
remote: Total 50 (delta 11), reused 42 (delta 9), pack-reused 0 (from 0)[K
Receiving objects: 100% (50/50), 3.38 MiB | 16.78 MiB/s, done.
Resolving deltas: 100% (11/11), done.


In [2]:
!cd rnn-projects/classify_sentence_language/

In [3]:
import sys

# Make the cloned project's modules (dataloader, models) importable.
# The membership check keeps sys.path free of duplicates when the cell is re-run.
project_src = '/content/rnn-projects/classify_sentence_language/'
if project_src not in sys.path:
    sys.path.append(project_src)

In [4]:
!pip install sentencepiece



In [5]:
import os

import sentencepiece as spm
import torch

from dataloader import train_loader
from models import RawRNNModel, LSTMModel


In [6]:
# --- Paths (Colab layout: the repository is expected under /content) ---
project_path = "/content/rnn-projects/classify_sentence_language/"
data_path = f"{project_path}data/train/"
# NOTE: despite its name, save_dir is the checkpoint *file* path passed to torch.save.
save_dir = f"{project_path}data/models/model.pth"
embedding_model_path = f"{project_path}embeddings/embedding.model"

# --- Run settings ---
num_epochs = 50
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
debugging = False  # forwarded to train_loader; its effect is defined in dataloader.py

# Seed torch's random number generators before the loader and the model are
# created, so that repeated runs of the notebook start from the same state.
seed = 42
torch.manual_seed(seed)

# --- Tokenizer ---
sp = spm.SentencePieceProcessor()
sp.Load(embedding_model_path)

# --- Training data ---
dataloader = train_loader(data_path, embedding_model=sp, debugging=debugging)

# --- Model hyperparameters ---
vocab_size = sp.GetPieceSize()  # one embedding row per SentencePiece piece
embedding_dim = 64
hidden_dim = 64
output_dim = len(dataloader.dataset.label2id)  # one output unit per label
num_layers = 2

In [7]:
# Build the LSTM classifier from the hyperparameters defined in the
# configuration cell and place it on the selected device.
model = LSTMModel(
    vocab_size,
    embedding_dim,
    hidden_dim,
    output_dim,
    num_layers,
).to(device)
model.train()  # start in training mode
model  # show the module summary as the cell output

LSTMModel(
  (embedding): Embedding(2000, 64)
  (dropout): Dropout(p=0.2, inplace=False)
  (lstm): LSTM(64, 64, num_layers=2, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=64, out_features=4, bias=True)
)

In [8]:
# Loss function, optimizer and learning-rate schedule used by the training loop.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
# Multiply the learning rate by 0.1 after every 20 calls to scheduler.step().
scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer=optimizer,
    step_size=20,
    gamma=0.1,
)

In [9]:
# Number of sentences held by the dataset behind the training loader.
len(dataloader.dataset.sentences)

108681

In [11]:
# Overrides the num_epochs = 50 assigned in the configuration cell.
# With 10 epochs and one scheduler.step() per epoch, the StepLR schedule
# (step_size=20) never reaches its first learning-rate decay.
num_epochs = 10

In [12]:
# Train for num_epochs passes over the training loader, printing the per-sample
# mean loss and the training accuracy after each epoch, then save the weights.

# evaluate() and predict() switch the model to eval mode; put it back into
# training mode so that re-running this cell trains with dropout active.
model.train()

for epoch in range(num_epochs):
    total_loss = 0.0
    correct = 0
    total = 0

    for sentences, labels in dataloader:
        sentences = sentences.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(sentences)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_size = labels.size(0)
        # criterion returns the mean over the batch; weight it by the batch
        # size so a smaller final batch does not skew the epoch average.
        total_loss += loss.item() * batch_size
        total += batch_size
        correct += (outputs.argmax(dim=1) == labels).sum().item()

    scheduler.step()  # the schedule advances once per epoch

    accuracy = 100 * correct / total
    avg_loss = total_loss / total

    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%")

# torch.save does not create missing parent directories; make sure the target
# folder exists before writing the checkpoint file.
os.makedirs(os.path.dirname(save_dir), exist_ok=True)
torch.save(model.state_dict(), save_dir)


Epoch [1/10], Loss: 0.0020, Accuracy: 99.96%
Epoch [2/10], Loss: 0.0013, Accuracy: 99.96%
Epoch [3/10], Loss: 0.0015, Accuracy: 99.97%
Epoch [4/10], Loss: 0.0007, Accuracy: 99.98%
Epoch [5/10], Loss: 0.0015, Accuracy: 99.97%
Epoch [6/10], Loss: 0.0007, Accuracy: 99.98%
Epoch [7/10], Loss: 0.0007, Accuracy: 99.98%
Epoch [8/10], Loss: 0.0006, Accuracy: 99.99%
Epoch [9/10], Loss: 0.0005, Accuracy: 99.99%
Epoch [10/10], Loss: 0.0002, Accuracy: 99.99%


In [13]:
def evaluate(model, dataloader, device):
    """Print the mean cross-entropy loss and the accuracy of ``model`` on ``dataloader``.

    The model is switched to eval mode and run without gradient tracking.
    The metrics are printed; nothing is returned.

    Args:
        model: Callable module mapping a batch of inputs to per-class scores,
            one row per sample.
        dataloader: Iterable of ``(sentences, labels)`` tensor batches.
        device: Device the batches are moved to before the forward pass.
    """
    model.eval()
    criterion = torch.nn.CrossEntropyLoss()
    total_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for sentences, labels in dataloader:
            sentences = sentences.to(device)
            labels = labels.to(device)

            outputs = model(sentences)
            batch_size = labels.size(0)
            # criterion returns the mean over the batch; weight it by the batch
            # size so the final figure is a true per-sample average even when
            # the last batch is smaller than the others.
            total_loss += criterion(outputs, labels).item() * batch_size
            total += batch_size
            correct += (outputs.argmax(dim=1) == labels).sum().item()

    if total == 0:
        # Without this guard an empty loader ends in a ZeroDivisionError.
        print("Evaluation skipped: the dataloader produced no samples.")
        return

    accuracy = 100 * correct / total
    avg_loss = total_loss / total

    print(f"Evaluation Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%")


In [14]:
# Build the loader for the held-out split with the same factory and tokenizer
# that were used for the training data.
# NOTE(review): assumes train_loader assigns the same label ids for data/test/
# as for data/train/ — confirm in dataloader.py.
test_path = f"{project_path}data/test/"
test_dataloader = train_loader(
    test_path,
    debugging=debugging,
    embedding_model=sp,
)

In [15]:
# Number of sentences held by the dataset behind the test loader.
len(test_dataloader.dataset.sentences)

2000

In [16]:
# Score the trained model on the test loader; evaluate() prints loss and accuracy.
evaluate(model, test_dataloader, device)

Evaluation Loss: 0.0000, Accuracy: 100.00%


In [34]:
# Invert the training dataset's label -> id mapping so that predicted class ids
# can be translated back into label names.
id2label = {
    class_id: label_name
    for label_name, class_id in dataloader.dataset.label2id.items()
}

In [41]:
def predict(model, input_sentence, sp, device, label_map=None):
    """Classify a single sentence and return the predicted label name.

    Args:
        model: Trained classifier; it is called on a tensor of token ids with a
            leading batch dimension of size one.
        input_sentence: Raw text to classify.
        sp: Tokenizer exposing ``encode(text)``; expected to return a sequence
            of integer token ids.
        device: Device the input tensor is created on.
        label_map: Optional ``{class_id: label}`` mapping. When omitted, the
            notebook-level ``id2label`` dictionary is used.

    Returns:
        The label that ``label_map`` associates with the highest-scoring class.

    Raises:
        ValueError: If the sentence encodes to zero tokens.
    """
    if label_map is None:
        # Fall back to the mapping defined at notebook level.
        label_map = id2label

    model.eval()
    token_ids = sp.encode(input_sentence)
    if len(token_ids) == 0:
        # Fail early with a clear message instead of passing a zero-length
        # sequence to the model.
        raise ValueError("input_sentence produced no tokens; cannot classify empty input")

    # Wrap the ids in a list to add the batch dimension.
    input_tensor = torch.tensor([token_ids], dtype=torch.long, device=device)

    with torch.no_grad():
        output = model(input_tensor)

    predicted_class = output.argmax(dim=1).item()
    return label_map[predicted_class]

In [56]:
# Spot check on a hand-written sentence; the recorded prediction is 'turkmen'.
input_sentence = "nahili, gowumy isler, cagalar, arada aylandym oba, salam aytdylar"
predict(model, input_sentence, sp, device)

'turkmen'

In [53]:
# Spot check on an English sentence.
sent_english = "You currently have zero compute units available, resources offered free of charge are not guaranteed"
predict(model, sent_english, sp, device)

'english'

In [60]:
# NOTE(review): this sentence is German (the opening of the German national
# anthem), not Dutch, although the variable is named sent_dutch and the recorded
# prediction is 'dutch'. Confirm whether the 'dutch' class in the training data
# actually contains German text (e.g. a "Deutsch" mix-up) or whether this is a
# misclassification of an out-of-distribution language.
sent_dutch = "Einigkeit und Recht und Freiheit für das deutsche Vaterland! Danach lasst uns alle streben"
predict(model, sent_dutch, sp, device)

'dutch'