In [2]:
import torch

# Choose the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
cuda_ready = torch.cuda.is_available()
device = torch.device('cuda' if cuda_ready else 'cpu')
print(f"PyTorch is using device: {device}")

if not cuda_ready:
    print("No GPU available, using CPU instead.")
else:
    # Identity and capability are reported for GPU index 0.
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"Number of GPUs available: {torch.cuda.device_count()}")
    print(f"Current GPU index: {torch.cuda.current_device()}")
    print(f"GPU Capability: {torch.cuda.get_device_capability(0)}")
    # Memory counters come back in bytes; divide by 1024**3 to show GB.
    print(f"GPU Memory Usage:")
    allocated_gb = torch.cuda.memory_allocated(0) / 1024**3
    print(f"  Allocated: {allocated_gb:.1f} GB")
    reserved_gb = torch.cuda.memory_reserved(0) / 1024**3
    print(f"  Cached:    {reserved_gb:.1f} GB")

PyTorch is using device: cpu
No GPU available, using CPU instead.


In [35]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import sys

In [36]:
# Encoder half of the seq2seq model
class Encoder(nn.Module):
    """Embeds source token ids and runs them through a stacked LSTM.

    Args:
        input_dim: Number of rows in the embedding table (source vocabulary size).
        emb_dim: Width of each token embedding.
        hid_dim: Hidden size of the LSTM.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability, applied to the embeddings and passed to
            ``nn.LSTM`` as its inter-layer dropout.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.hid_dim, self.n_layers = hid_dim, n_layers
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Return the LSTM's final ``(hidden, cell)`` state for ``src``.

        The LSTM is created without ``batch_first``, so ``src`` is laid out
        sequence-first. The per-step outputs are discarded; only the final
        state is handed on.
        """
        token_vectors = self.embedding(src)
        _, final_state = self.rnn(self.dropout(token_vectors))
        hidden, cell = final_state
        return hidden, cell


In [37]:
# Decoder half of the seq2seq model
class Decoder(nn.Module):
    """Single-step LSTM decoder that scores the next target token.

    Args:
        output_dim: Target vocabulary size; also the width of the returned scores.
        emb_dim: Width of each token embedding.
        hid_dim: Hidden size of the LSTM.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability, applied to the embeddings and passed to
            ``nn.LSTM`` as its inter-layer dropout.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.output_dim, self.hid_dim, self.n_layers = output_dim, hid_dim, n_layers
        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=emb_dim)
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.fc_out = nn.Linear(in_features=hid_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input, hidden, cell):
        """Advance the decoder by one time step.

        Args:
            input: Token ids for the current step (one per batch element).
            hidden: LSTM hidden state carried over from the previous step.
            cell: LSTM cell state carried over from the previous step.

        Returns:
            ``(prediction, hidden, cell)`` where ``prediction`` holds one score
            per vocabulary entry and the states are the updated LSTM states.
        """
        # Add a leading length-1 sequence axis so the LSTM sees a one-step sequence.
        step_tokens = input.unsqueeze(0)
        step_vectors = self.dropout(self.embedding(step_tokens))
        output, next_state = self.rnn(step_vectors, (hidden, cell))
        hidden, cell = next_state
        # Drop the length-1 sequence axis again before projecting to the vocabulary.
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden, cell


In [38]:
# Encoder-decoder wrapper
class Seq2Seq(nn.Module):
    """Couples an encoder and a decoder and unrolls the decoder over the target.

    Args:
        encoder: Module whose call returns a ``(hidden, cell)`` pair for ``src``.
        decoder: Module exposing ``output_dim`` and returning
            ``(prediction, hidden, cell)`` for one decoding step.
        device: Device the output buffer is moved to.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg`` step by step, mixing teacher forcing with greedy feedback.

        Args:
            src: Source token ids; axis 1 is read as the batch size.
            trg: Target token ids; axis 0 is read as the target length.
            teacher_forcing_ratio: Probability, drawn independently at each
                step, of feeding the ground-truth token as the next input
                instead of the decoder's own top prediction.

        Returns:
            Tensor of shape ``(trg_len, batch_size, output_dim)``. Row 0 is
            never written and stays all zeros.
        """
        trg_len, batch_size = trg.shape[0], src.shape[1]
        vocab_size = self.decoder.output_dim

        outputs = torch.zeros(trg_len, batch_size, vocab_size).to(self.device)

        hidden, cell = self.encoder(src)

        # Decoding is seeded with the first target row.
        step_input = trg[0, :]
        for t in range(1, trg_len):
            step_scores, hidden, cell = self.decoder(step_input, hidden, cell)
            outputs[t] = step_scores

            # One random draw per time step decides between truth and prediction.
            use_teacher = torch.rand(1).item() < teacher_forcing_ratio
            greedy_tokens = step_scores.argmax(1)
            if use_teacher:
                step_input = trg[t]
            else:
                step_input = greedy_tokens

        return outputs


In [39]:
# Custom dataset
class ManglishEnglishDataset(Dataset):
    """Paired Manglish/English sentences encoded as fixed-length id tensors.

    Each sentence is split on whitespace, mapped through its vocabulary
    (unknown words become ``<unk>``), wrapped in ``<sos>``/``<eos>`` and
    padded with ``<pad>`` up to ``max_len``.

    Args:
        manglish_texts: Source sentences.
        english_texts: Target sentences, aligned index-by-index with the sources.
        manglish_vocab: Mapping from source word to id; must contain the
            ``<pad>``, ``<sos>``, ``<eos>`` and ``<unk>`` keys.
        english_vocab: Mapping from target word to id with the same special keys.
        max_len: Length of every returned tensor, special tokens included.
    """

    def __init__(self, manglish_texts, english_texts, manglish_vocab, english_vocab, max_len):
        self.manglish_texts = manglish_texts
        self.english_texts = english_texts
        self.manglish_vocab = manglish_vocab
        self.english_vocab = english_vocab
        self.max_len = max_len

    def __len__(self):
        """Number of sentence pairs (taken from the source side)."""
        return len(self.manglish_texts)

    def _encode(self, text, vocab):
        """Turn one sentence into a ``max_len``-long tensor of token ids."""
        unk_id = vocab['<unk>']
        # Reserve two slots for <sos>/<eos> and trim the words first, so an
        # over-length sentence still ends in <eos> instead of losing it to
        # the final slice.
        word_budget = max(self.max_len - 2, 0)
        word_ids = [vocab.get(word, unk_id) for word in text.split()[:word_budget]]
        ids = [vocab['<sos>'], *word_ids, vocab['<eos>']]
        ids += [vocab['<pad>']] * (self.max_len - len(ids))
        # The slice only matters for max_len < 2; the explicit dtype keeps an
        # empty result integer-typed as well.
        return torch.tensor(ids[:self.max_len], dtype=torch.long)

    def __getitem__(self, idx):
        """Return the ``(manglish_ids, english_ids)`` tensors for pair ``idx``."""
        manglish = self._encode(self.manglish_texts[idx], self.manglish_vocab)
        english = self._encode(self.english_texts[idx], self.english_vocab)
        return manglish, english


In [40]:
# Vocabulary construction
def build_vocab(texts):
    """Map every whitespace-separated word in ``texts`` to an integer id.

    Ids 0-3 are reserved for ``<pad>``, ``<sos>``, ``<eos>`` and ``<unk>``;
    remaining words are numbered in order of first appearance.

    Args:
        texts: Iterable of sentences.

    Returns:
        Dict from word to id.
    """
    vocab = dict(zip(('<pad>', '<sos>', '<eos>', '<unk>'), range(4)))
    for sentence in texts:
        for token in sentence.split():
            # len(vocab) is evaluated before insertion, so a new word gets the
            # next free id and a known word keeps the id it already has.
            vocab.setdefault(token, len(vocab))
    return vocab

In [41]:
# One-epoch training pass
def train(model, iterator, optimizer, criterion, clip):
    """Run one optimisation pass over ``iterator`` and return the mean batch loss.

    Args:
        model: Module called as ``model(src, trg)``.
        iterator: Sized iterable yielding ``(src, trg)`` tensor pairs.
        optimizer: Optimiser stepped once per batch.
        criterion: Loss called as ``criterion(scores, targets)``.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.

    Note:
        Batches are moved to the notebook-level ``device`` variable.
    """
    model.train()
    running_loss = 0
    for src, trg in iterator:
        # Move to the target device and swap the first two axes of each batch.
        src = src.to(device).transpose(0, 1)
        trg = trg.to(device).transpose(0, 1)

        optimizer.zero_grad()
        scores = model(src, trg)

        # Position 0 is skipped on both sides (the Seq2Seq model in this
        # notebook never writes outputs[0]); the rest is flattened so each
        # row is a single prediction.
        vocab_size = scores.shape[-1]
        flat_scores = scores[1:].contiguous().view(-1, vocab_size)
        flat_targets = trg[1:].contiguous().view(-1)

        batch_loss = criterion(flat_scores, flat_targets)
        batch_loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(iterator)

In [42]:
# Greedy translation of a single sentence
def translate_manglish_to_english(model, sentence, manglish_vocab, english_vocab, max_len_manglish, max_decode_len=50):
    """Greedily translate one Manglish sentence with a trained seq2seq model.

    The sentence is encoded the way ``ManglishEnglishDataset`` encodes training
    sources: ``<sos>`` + word ids + ``<eos>``, then ``<pad>`` up to
    ``max_len_manglish``. Without the ``<sos>``/``<eos>`` wrapping the encoder
    would see inputs shaped differently from everything it was trained on.

    Args:
        model: Object with ``encoder`` and ``decoder`` sub-modules; it is put
            into eval mode and left there.
        sentence: Whitespace-separated source text.
        manglish_vocab: Source word -> id mapping with ``<pad>``, ``<sos>``,
            ``<eos>`` and ``<unk>`` entries.
        english_vocab: Target word -> id mapping with ``<pad>``, ``<sos>`` and
            ``<eos>`` entries.
        max_len_manglish: Length the source is padded to (longer inputs are
            passed through unpadded).
        max_decode_len: Upper bound on the number of decoding steps.

    Returns:
        The predicted target words joined by single spaces.
    """
    model.eval()

    # Build the input tensors on the device that holds the model's weights, so
    # the function does not rely on a notebook-level ``device`` variable.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    unk_id = manglish_vocab['<unk>']
    word_ids = [manglish_vocab.get(word, unk_id) for word in sentence.split()]
    tokens = [manglish_vocab['<sos>'], *word_ids, manglish_vocab['<eos>']]
    tokens += [manglish_vocab['<pad>']] * (max_len_manglish - len(tokens))
    # unsqueeze(1) adds a batch axis of size 1 after the sequence axis.
    src_tensor = torch.tensor(tokens, dtype=torch.long, device=run_device).unsqueeze(1)

    # Invert the target vocabulary once instead of scanning it per token;
    # setdefault keeps the first word if two words ever share an id.
    id_to_word = {}
    for word, idx in english_vocab.items():
        id_to_word.setdefault(idx, word)

    stop_ids = {english_vocab['<eos>'], english_vocab['<pad>']}
    decoded = []
    with torch.no_grad():
        hidden, cell = model.encoder(src_tensor)
        previous = english_vocab['<sos>']
        for _ in range(max_decode_len):
            step = torch.tensor([previous], dtype=torch.long, device=run_device)
            output, hidden, cell = model.decoder(step, hidden, cell)
            previous = output.argmax(1).item()
            if previous in stop_ids:
                break
            decoded.append(previous)

    return ' '.join(id_to_word[token_id] for token_id in decoded)



In [43]:
!pip install datasets



In [44]:
# Example usage: train the seq2seq model on a slice of OPUS-100 (en-ml) and
# print translations for a few sample sentences.
if __name__ == "__main__":
    from datasets import load_dataset
    import torch.optim as optim
    import torch.nn as nn

    # Seed PyTorch's global generator so weight initialisation, DataLoader
    # shuffling and teacher-forcing draws repeat from run to run.
    SEED = 42
    torch.manual_seed(SEED)

    # Number of sentence pairs to train on (raise it for a fuller run).
    MAX_EXAMPLES = 10000

    # Load the corpus, then select the subset *before* iterating so only
    # MAX_EXAMPLES rows are materialised instead of the whole train split.
    raw_dataset = load_dataset('opus100', 'en-ml', split='train')
    raw_subset = raw_dataset.select(range(min(MAX_EXAMPLES, len(raw_dataset))))

    # Preprocess the dataset (this is a simplified example)
    manglish_texts = [example['translation']['ml'] for example in raw_subset]  # Assuming 'ml' is Malayalam
    english_texts = [example['translation']['en'] for example in raw_subset]

    # Build vocabularies
    manglish_vocab = build_vocab(manglish_texts)
    english_vocab = build_vocab(english_texts)

    # One shared length for both sides: the longest sentence in words, plus two
    # slots for <sos>/<eos> so the longest sentences are not truncated.
    longest_sentence = max(
        max((len(text.split()) for text in manglish_texts), default=0),
        max((len(text.split()) for text in english_texts), default=0),
    )
    max_len = longest_sentence + 2

    # Create dataset and dataloader
    dataset = ManglishEnglishDataset(manglish_texts, english_texts, manglish_vocab, english_vocab, max_len)
    dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

    # Model parameters
    INPUT_DIM = len(manglish_vocab)
    OUTPUT_DIM = len(english_vocab)
    ENC_EMB_DIM = 256
    DEC_EMB_DIM = 256
    HID_DIM = 512
    N_LAYERS = 4
    ENC_DROPOUT = 0.5
    DEC_DROPOUT = 0.5

    # Create model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
    dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)
    model = Seq2Seq(enc, dec, device).to(device)

    # Training setup
    optimizer = optim.Adam(model.parameters())
    # The loss is computed over *target* ids, so the padding id to ignore must
    # come from the English vocabulary.
    criterion = nn.CrossEntropyLoss(ignore_index=english_vocab['<pad>'])
    N_EPOCHS = 20
    CLIP = 1

    # Training loop (same steps as train(), plus progress reporting)
    print("Starting training...")
    for epoch in range(N_EPOCHS):
        model.train()
        epoch_loss = 0
        pbar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{N_EPOCHS}")
        for i, (src, trg) in enumerate(pbar):
            # DataLoader batches are (batch, seq); the model reads (seq, batch).
            src, trg = src.to(device).transpose(0, 1), trg.to(device).transpose(0, 1)
            optimizer.zero_grad()
            output = model(src, trg)
            # Position 0 is skipped: the model never writes outputs[0].
            output_dim = output.shape[-1]
            output = output[1:].contiguous().view(-1, output_dim)
            trg = trg[1:].contiguous().view(-1)
            loss = criterion(output, trg)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
            optimizer.step()
            epoch_loss += loss.item()

            # Update progress bar
            pbar.set_postfix({'loss': f'{loss.item():.3f}'})

            # Report every 100 batches through tqdm so the message does not
            # get interleaved with the progress bar.
            if (i + 1) % 100 == 0:
                pbar.write(f"\nBatch {i+1}/{len(dataloader)}, Loss: {loss.item():.3f}")
                sys.stdout.flush()

        avg_loss = epoch_loss / len(dataloader)
        print(f'\nEpoch: {epoch+1:02} | Average Loss: {avg_loss:.3f}')
        sys.stdout.flush()

    print("Training completed.")

    # Test the model
    # NOTE(review): these samples are romanised, while the 'ml' side of the
    # corpus is presumably in Malayalam script; if so, every word here maps to
    # <unk>. Confirm against the data before judging translation quality.
    test_sentences = [
        "Njan veetil aanu",
        "Nale school il pokanam",
        "Ente peru John aanu"
    ]

    print("\nTesting the model:")
    for test_sentence in test_sentences:
        translation = translate_manglish_to_english(model, test_sentence, manglish_vocab, english_vocab, max_len)
        print(f"Manglish: {test_sentence}")
        print(f"English: {translation}")
        print()

ImportError: The `scipy` install you are using seems to be broken, (extension modules cannot be imported), please try reinstalling.

In [None]:
# Prompt helper
def get_user_input():
    """Prompt on standard input and return the line the user typed."""
    prompt = "Enter Manglish text to translate: "
    return input(prompt)

# Prompt-and-translate helper
def translate_user_input(model, manglish_vocab, english_vocab, max_len):
    """Ask the user for a sentence and translate it.

    Args:
        model: Trained model forwarded to ``translate_manglish_to_english``.
        manglish_vocab: Source vocabulary forwarded unchanged.
        english_vocab: Target vocabulary forwarded unchanged.
        max_len: Source padding length forwarded unchanged.

    Returns:
        ``(user_input, translation)``: the text as typed and its translation.
    """
    typed_text = get_user_input()
    return typed_text, translate_manglish_to_english(
        model, typed_text, manglish_vocab, english_vocab, max_len
    )

# Output helper
def display_translation(original, translation):
    """Print the source text and its translation on two labelled lines."""
    print(
        f"Original Manglish: {original}",
        f"Translated English: {translation}",
        sep="\n",
    )

# Main loop for user interaction
print("Manglish to English Translator")
print("Type 'quit' to exit")

while True:
    try:
        original = get_user_input()
    except (EOFError, KeyboardInterrupt):
        # A closed stdin or an interrupted kernel ends the session cleanly
        # instead of surfacing a traceback.
        print("Exiting translator. Goodbye!")
        break

    # Handle the exit command before touching the model, so 'quit' itself is
    # never run through the encoder/decoder.
    if original.strip().lower() == 'quit':
        print("Exiting translator. Goodbye!")
        break

    # Nothing to translate on a blank line; prompt again.
    if not original.strip():
        continue

    translation = translate_manglish_to_english(model, original, manglish_vocab, english_vocab, max_len)
    display_translation(original, translation)
    print()  # Add a blank line for readability