<a href="https://colab.research.google.com/github/YousefAbua/Intro-To-DL/blob/main/HW5/HW5_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split

# Sample text
text = '''Next character prediction is a fundamental task in the field of natural language processing (NLP) that involves predicting the next character
in a sequence of text based on the characters that precede it. This task is essential for various applications, including text auto-completion, spell
checking, and even in the development of sophisticated AI models capable of generating human-like text.
At its core, next character prediction relies on statistical models or deep learning algorithms to analyze a given sequence of text and predict which
character is most likely to follow. These predictions are based on patterns and relationships learned from large datasets of text during the training phase of the model.
One of the most popular approaches to next character prediction involves the use of Recurrent Neural Networks (RNNs), and more specifically, a variant called Long Short-Term Memory
(LSTM) networks. RNNs are particularly well-suited for sequential data like text, as they can maintain information in 'memory' about previous characters to inform the prediction of the next
character. LSTM networks enhance this capability by being able to remember long-term dependencies, making them even more effective for next character prediction tasks.
Training a model for next character prediction involves feeding it large amounts of text data, allowing it to learn the probability of each character's appearance following a sequence of
characters. During this training process, the model adjusts its parameters to minimize the difference between its predictions and the actual outcomes, thus improving its predictive accuracy over time.
Once trained, the model can be used to predict the next character in a given piece of text by considering the sequence of characters that precede it. This can enhance user experience in text editing
software, improve efficiency in coding environments with auto-completion features, and enable more natural interactions with AI-based chatbots and virtual assistants.
In summary, next character prediction plays a crucial role in enhancing the capabilities of various NLP applications, making text-based interactions more efficient, accurate,
and human-like. Through the use of advanced machine learning models like RNNs and LSTMs, next character prediction continues to evolve, opening new possibilities for the future of text-based technology.'''

# Creating character vocabulary
chars = sorted(list(set(text)))
ix_to_char = {i: ch for i, ch in enumerate(chars)}
char_to_ix = {ch: i for i, ch in enumerate(chars)}

# Function to generate training data for different sequence lengths
def generate_data(sequence_length):
    X = []
    y = []
    for i in range(len(text) - sequence_length):
        sequence = text[i:i + sequence_length]
        label = text[i + sequence_length]
        X.append([char_to_ix[char] for char in sequence])
        y.append(char_to_ix[label])

    X = np.array(X)
    y = np.array(y)
    return X, y

# Function to train and validate the model
def train_and_validate(X_train, y_train, X_val, y_val, sequence_length, hidden_size=128, dropout=0.1):
    # Convert data to PyTorch tensors
    X_train = torch.tensor(X_train, dtype=torch.long)
    y_train = torch.tensor(y_train, dtype=torch.long)
    X_val = torch.tensor(X_val, dtype=torch.long)
    y_val = torch.tensor(y_val, dtype=torch.long)

    # Hyperparameters
    batch_size = 128
    num_layers = 3
    nhead = 2
    learning_rate = 0.001
    epochs = 10

    # Model, loss, and optimizer
    model = CharTransformer(len(chars), hidden_size, len(chars), num_layers, nhead, dropout)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5, factor=0.5, verbose=True)

    # Training the model
    for epoch in range(epochs):
        model.train()
        for i in range(0, len(X_train), batch_size):
            optimizer.zero_grad()
            batch_X, batch_y = X_train[i:i+batch_size], y_train[i:i+batch_size]
            output = model(batch_X)
            loss = criterion(output, batch_y)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 5)  # Gradient clipping
            optimizer.step()

        # Validation
        model.eval()
        with torch.no_grad():
            val_output = model(X_val)
            val_loss = criterion(val_output, y_val)
            _, predicted = torch.max(val_output, 1)
            val_accuracy = (predicted == y_val).float().mean()

        scheduler.step(val_loss)  # Adjust learning rate based on validation loss

        if (epoch+1) % 1 == 0:
            print(f'Sequence Length: {sequence_length}, Epoch {epoch+1}, Loss: {loss.item()}, Validation Loss: {val_loss.item()}, Validation Accuracy: {val_accuracy.item()}')

    # Prediction
    test_str = "This is a simple example to demonstrate how to predict the next char"
    predicted_char = predict_next_char(model, char_to_ix, ix_to_char, test_str)
    print(f"Predicted next character with sequence length {sequence_length}: '{predicted_char}'")

    return loss.item(), val_loss.item(), val_accuracy.item()

# Define the Transformer model with dropout
class CharTransformer(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers, nhead, dropout=0.1):
        super(CharTransformer, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        encoder_layers = nn.TransformerEncoderLayer(hidden_size, nhead, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        embedded = self.embedding(x)
        transformer_output = self.transformer_encoder(embedded)
        output = self.fc(transformer_output[:, -1, :])  # Get the output of the last Transformer block
        return output

# Predicting the next character
def predict_next_char(model, char_to_ix, ix_to_char, initial_str):
    model.eval()
    with torch.no_grad():
        max_length = max(len(initial_str), 30)  # Adjusted max length
        initial_input = torch.tensor([char_to_ix[c] for c in initial_str[-max_length:]], dtype=torch.long).unsqueeze(0)
        prediction = model(initial_input)
        predicted_index = torch.argmax(prediction, dim=1).item()
        return ix_to_char[predicted_index]

# Train and validate for sequence lengths of 10, 20, and 30 for both Transformer and RNN models
#sequence_length = 10
#sequence_length = 20
sequence_length = 30
results = {}

X_train, y_train = generate_data(sequence_length)
X_val, y_val = generate_data(sequence_length)

# Train and validate Transformer model with dropout
transformer_loss, transformer_val_loss, transformer_val_accuracy = train_and_validate(
    X_train, y_train, X_val, y_val, sequence_length, hidden_size=256, dropout=0.2
)



Sequence Length: 30, Epoch 1, Loss: 2.8346152305603027, Validation Loss: 2.536160469055176, Validation Accuracy: 0.2224108725786209
Sequence Length: 30, Epoch 2, Loss: 2.597590923309326, Validation Loss: 2.4014341831207275, Validation Accuracy: 0.2631579041481018
Sequence Length: 30, Epoch 3, Loss: 2.5676565170288086, Validation Loss: 2.3654844760894775, Validation Accuracy: 0.2563667297363281
Sequence Length: 30, Epoch 4, Loss: 2.443629741668701, Validation Loss: 2.3435451984405518, Validation Accuracy: 0.25594228506088257
Sequence Length: 30, Epoch 5, Loss: 2.4740452766418457, Validation Loss: 2.304137706756592, Validation Accuracy: 0.25424447655677795
Sequence Length: 30, Epoch 6, Loss: 2.4380135536193848, Validation Loss: 2.2818760871887207, Validation Accuracy: 0.25594228506088257
Sequence Length: 30, Epoch 7, Loss: 2.3325133323669434, Validation Loss: 2.28086519241333, Validation Accuracy: 0.2644312381744385
Sequence Length: 30, Epoch 8, Loss: 2.3324356079101562, Validation Loss: