# Data Preprocessing 🛠️

In [37]:
import os
import pickle
import re
import torch
from collections import Counter

# Extra vocabulary entries that are not corpus words. The '<PAD>' value is
# appended to the word list when the vocabulary is built in
# preprocess_and_save_data, and its id is what the generation code uses to
# fill the unused leading positions of a short prime sequence.
SPECIAL_WORDS = {'PADDING': '<PAD>'}

def load_data(path):
    """
    Read the UTF-8 text file at `path` and return its full contents.

    Failures are reported on stdout rather than raised: a missing file or
    any other I/O error prints a message and yields None.
    """
    contents = None
    try:
        with open(path, "r", encoding='utf-8') as handle:
            contents = handle.read()
    except FileNotFoundError:
        print(f"File not found: {path}")
    except IOError as err:
        print(f"Error reading file {path}: {err}")
    return contents

def token_lookup():
    """
    Return the mapping from punctuation symbols to placeholder tokens.

    Every placeholder has the form '||name||'. The insertion order of the
    returned dict is the order in which callers apply the replacements.
    """
    symbol_names = (
        ('.', 'period'),
        (',', 'comma'),
        ('"', 'quotation_mark'),
        (';', 'semicolon'),
        ('!', 'exclamation_mark'),
        ('?', 'question_mark'),
        ('(', 'left_parenthesis'),
        (')', 'right_parenthesis'),
        ('--', 'dash'),
        ('\n', 'return'),
    )
    return {symbol: f'||{name}||' for symbol, name in symbol_names}

def create_lookup_tables(text):
    """
    Build the word <-> id tables for a tokenized corpus.

    Ids are assigned by descending frequency (id 0 is the most frequent
    word); words with equal counts keep their first-seen order.

    Args:
        text (list of str): The text data as a list of words.

    Returns:
        tuple: (vocab_to_int, int_to_vocab), two dictionaries that are
        inverse mappings of each other.
    """
    ranked_words = [word for word, _count in Counter(text).most_common()]

    vocab_to_int = {}
    int_to_vocab = {}
    for position, word in enumerate(ranked_words):
        int_to_vocab[position] = word
        vocab_to_int[word] = position

    return vocab_to_int, int_to_vocab

def preprocess_and_save_data(text, token_lookup, create_lookup_tables, output_file):
    """
    Tokenize `text`, encode it as integer ids and pickle the result.

    Punctuation is first replaced by space-padded placeholder tokens from
    `token_lookup()`, then the text is lower-cased and split on whitespace.
    The vocabulary is built by `create_lookup_tables` from those words plus
    the SPECIAL_WORDS values. The tuple
    (int_text, vocab_to_int, int_to_vocab, token_dict) is written to
    `output_file` with pickle.
    """
    punctuation_tokens = token_lookup()
    for symbol, placeholder in punctuation_tokens.items():
        text = text.replace(symbol, f' {placeholder} ')

    words = text.lower().split()

    # The special words take part in the vocabulary but not in the encoded text.
    vocab_source = words + list(SPECIAL_WORDS.values())
    vocab_to_int, int_to_vocab = create_lookup_tables(vocab_source)
    encoded_words = [vocab_to_int[entry] for entry in words]

    payload = (encoded_words, vocab_to_int, int_to_vocab, punctuation_tokens)
    with open(output_file, 'wb') as sink:
        pickle.dump(payload, sink)

def load_preprocess(file_path):
    """
    Unpickle and return the preprocessed data stored at `file_path`.

    A missing file or other I/O error is reported on stdout and results
    in None instead of an exception.
    """
    restored = None
    try:
        with open(file_path, 'rb') as source:
            restored = pickle.load(source)
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except IOError as err:
        print(f"Error reading file {file_path}: {err}")
    return restored

# Define the path to your dataset
dataset_path = '/kaggle/input/cap111/cap.txt'

# Load the text data
text = load_data(dataset_path)

# load_data reports failures by printing a message and returning None.
# Stop here with a clear error instead of letting a later cell fail on
# `text.split(...)` with an unrelated AttributeError.
if text is None:
    raise RuntimeError(f"Could not load dataset from {dataset_path}")


In [38]:
# Fraction of the corpus lines held out for validation
validation_ratio = 0.1

# Cut the corpus on line boundaries: the leading lines are used for training,
# the trailing lines for validation
lines = text.split('\n')
split_index = int(len(lines) * (1 - validation_ratio))
train_lines, val_lines = lines[:split_index], lines[split_index:]

# Reassemble each partition into a single newline-separated string
train_text, val_text = ('\n'.join(part) for part in (train_lines, val_lines))


In [39]:
# Tokenize, encode and pickle each split into its own file.
# NOTE(review): every call builds a fresh vocabulary from the text it receives,
# so the ids stored in 'val_preprocess.p' are not drawn from the training
# vocabulary.
for split_text, output_path in (
    (train_text, 'train_preprocess.p'),
    (val_text, 'val_preprocess.p'),
):
    preprocess_and_save_data(split_text, token_lookup, create_lookup_tables, output_path)


In [40]:
# Load the preprocessed training data
int_train_text, vocab_to_int, int_to_vocab, token_dict = load_preprocess('train_preprocess.p')

# Load the preprocessed validation data.
# The validation file was encoded with a vocabulary built from the validation
# text alone, so its ids do not line up with the training vocabulary the model
# uses (and can exceed its size). Translate each id back to its word and
# re-encode it with the training vocabulary. Words the training vocabulary
# never saw are dropped, because the vocabulary has no unknown-word token.
int_val_text, _, val_int_to_vocab, _ = load_preprocess('val_preprocess.p')
int_val_text = [
    vocab_to_int[val_int_to_vocab[idx]]
    for idx in int_val_text
    if val_int_to_vocab[idx] in vocab_to_int
]


# Batch data 📦

In [41]:
import numpy as np
from torch.utils.data import TensorDataset, DataLoader

def batch_data(words, sequence_length, batch_size):
    """
    Batch the neural network data using DataLoader

    The word ids are first truncated to a whole multiple of `batch_size`.
    Every window of `sequence_length` consecutive ids then becomes one input
    sample, and the id that immediately follows the window is its target.

    Args:
        words: Sequence of integer word ids.
        sequence_length (int): Number of ids in each input window.
        batch_size (int): Batch size handed to the DataLoader.

    Returns:
        DataLoader: Shuffled loader over (input window, next id) pairs, both
        stored as int64 tensors.

    Raises:
        ValueError: If the truncated data is too short to form one window
            plus its target.
    """
    n_batches = len(words) // batch_size
    # Explicit int64 keeps the tensor dtype identical on every platform.
    usable = np.asarray(words[:n_batches * batch_size], dtype=np.int64)

    n_samples = len(usable) - sequence_length
    if n_samples <= 0:
        raise ValueError(
            f"Need more than {sequence_length} word ids after truncating to a "
            f"multiple of batch_size={batch_size}, got {len(usable)}"
        )

    # Row i of window_index holds the positions i, i+1, ..., i+sequence_length-1,
    # so one fancy-indexing step builds every window without a Python loop.
    window_index = np.arange(n_samples)[:, None] + np.arange(sequence_length)
    features = torch.from_numpy(usable[window_index])
    targets = torch.from_numpy(usable[sequence_length:])

    data = TensorDataset(features, targets)
    data_loader = DataLoader(data, shuffle=True, batch_size=batch_size)
    return data_loader

# Window length (ids per input sequence) and mini-batch size
sequence_length, batch_size = 10, 128

# One DataLoader per split, both built with the same settings
train_loader, val_loader = (
    batch_data(token_ids, sequence_length, batch_size)
    for token_ids in (int_train_text, int_val_text)
)

# Model Architecture 🧠

In [42]:
class RNN(nn.Module):
    """
    Next-word prediction network.

    Pipeline: embedding -> two stacked bidirectional LSTM modules ->
    bidirectional GRU -> dropout -> linear layer applied to the last time
    step. Submodule attribute names are part of the saved state_dict keys
    and must stay unchanged for existing checkpoints to load.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5):
        """
        Initialize the PyTorch RNN Module

        Args:
            vocab_size (int): Number of rows in the embedding table.
            output_size (int): Number of output scores per sample.
            embedding_dim (int): Size of each word embedding.
            hidden_dim (int): Hidden size of every recurrent layer (per direction).
            n_layers (int): Number of stacked layers inside each recurrent module.
            dropout (float): Dropout probability used inside the recurrent
                modules and before the final linear layer.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # Each bidirectional module emits hidden_dim * 2 features per step,
        # which is the input width of the module that follows it.
        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True, bidirectional=True)
        self.lstm2 = nn.LSTM(hidden_dim * 2, hidden_dim, n_layers, dropout=dropout, batch_first=True, bidirectional=True)

        self.gru = nn.GRU(hidden_dim * 2, hidden_dim, n_layers, dropout=dropout, batch_first=True, bidirectional=True)

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * 2, output_size)

        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.output_size = output_size

    def forward(self, x, hidden):
        """
        Forward propagation of the neural network

        Args:
            x: Integer word ids, indexed (batch, sequence) since the
                recurrent modules are batch_first.
            hidden: Tuple (h, c) as produced by `init_hidden`.

        Returns:
            tuple: (scores for the last time step with shape
            (batch, output_size), (GRU hidden state, LSTM cell state)).
        """
        h_lstm, c_lstm = hidden

        embedded = self.embedding(x)
        lstm_out, (h_lstm, c_lstm) = self.lstm1(embedded, (h_lstm, c_lstm))
        lstm_out, (h_lstm, c_lstm) = self.lstm2(lstm_out, (h_lstm, c_lstm))

        # The GRU is seeded with the hidden state left by the second LSTM.
        gru_out, h_gru = self.gru(lstm_out, h_lstm)

        # Only the last time step is returned, so slice it out before the
        # vocabulary-sized linear layer instead of projecting every step and
        # discarding all but one. The scores are identical in eval mode.
        last_step = self.dropout(gru_out[:, -1])
        out = self.fc(last_step)
        return out, (h_gru, c_lstm)

    def init_hidden(self, batch_size):
        """
        Initialize the hidden state

        Returns a tuple (h0, c0) of zero tensors with shape
        (n_layers * 2, batch_size, hidden_dim), created with the dtype and
        on the device of the model's own parameters so no external device
        variable is required.
        """
        weight = next(self.parameters()).detach()
        num_directions = 2  # Since the recurrent modules are bidirectional
        shape = (self.n_layers * num_directions, batch_size, self.hidden_dim)
        return (weight.new_zeros(shape), weight.new_zeros(shape))


# Train the model 🏋️

In [43]:
def forward_back_prop(rnn, optimizer, criterion, inp, target, hidden, clip=5):
    """
    Forward and backward propagation on the neural network

    Runs one optimisation step on a single batch.

    Args:
        rnn: The model; called as `rnn(inp, hidden)` and expected to return
            `(output, hidden)`.
        optimizer: Optimizer that updates the model parameters.
        criterion: Loss function called as `criterion(output, target)`.
        inp: Input batch.
        target: Target batch.
        hidden: Iterable of hidden-state tensors from the previous step.
        clip: Maximum gradient norm used for gradient clipping.

    Returns:
        tuple: (loss as a Python float, hidden state returned by the model).
    """
    # Cut the hidden state loose from the previous batch's graph so that
    # backpropagation stops at the batch boundary.
    hidden = tuple(state.detach() for state in hidden)

    # Place the batch where the model actually lives, rather than on the GPU
    # merely because CUDA happens to be available.
    model_device = next(rnn.parameters()).device
    inp, target = inp.to(model_device), target.to(model_device)

    rnn.zero_grad()
    output, hidden = rnn(inp, hidden)
    loss = criterion(output, target)
    loss.backward()
    # Clip the gradient norm to keep recurrent-layer updates bounded.
    torch.nn.utils.clip_grad_norm_(rnn.parameters(), clip)
    optimizer.step()

    return loss.item(), hidden

def train_rnn(rnn, batch_size, optimizer, criterion, n_epochs, show_every_n_batches=100, data_loader=None):
    """
    Train the network and return it.

    Args:
        rnn: Model exposing `init_hidden(batch_size)`.
        batch_size (int): Expected batch size; batches of any other size
            are skipped because the hidden state is sized for it.
        optimizer: Optimizer passed on to `forward_back_prop`.
        criterion: Loss function passed on to `forward_back_prop`.
        n_epochs (int): Number of passes over the data.
        show_every_n_batches (int): Print the mean loss every this many batches.
        data_loader: Optional iterable of (inputs, labels) batches. When
            omitted, the module-level `train_loader` is used.

    Returns:
        The trained model (the same object that was passed in).
    """
    loader = train_loader if data_loader is None else data_loader
    # Follow the model's own device rather than an external `device` variable.
    model_device = next(rnn.parameters()).device

    rnn.train()

    for epoch_i in range(1, n_epochs + 1):
        hidden = rnn.init_hidden(batch_size)
        # Start each epoch with an empty window so the first report of an
        # epoch does not average in the tail of the previous one.
        batch_losses = []

        for batch_i, (inputs, labels) in enumerate(loader, 1):
            if inputs.size(0) != batch_size:
                continue  # Skip the last batch if it's not full size

            inputs, labels = inputs.to(model_device), labels.to(model_device)

            # forward_back_prop detaches the hidden state and zeroes the
            # gradients itself, so neither is repeated here.
            loss, hidden = forward_back_prop(rnn, optimizer, criterion, inputs, labels, hidden)

            batch_losses.append(loss)

            if batch_i % show_every_n_batches == 0:
                avg_loss = np.mean(batch_losses)
                print(f'Epoch: {epoch_i}/{n_epochs}, Batch: {batch_i}, Loss: {avg_loss}')
                batch_losses = []
    return rnn


In [44]:
def save_model(file_path, model):
    """
    Write the model's state_dict (not the model object itself) to `file_path`.
    """
    weights = model.state_dict()
    torch.save(weights, file_path)


In [None]:
# Data and optimisation settings
sequence_length, batch_size = 10, 128
num_epochs, learning_rate = 50, 0.001

# Network dimensions; the model scores every word of the vocabulary
vocab_size = output_size = len(vocab_to_int)
embedding_dim, hidden_dim = 200, 250
n_layers, dropout = 2, 0.5

# Train on the GPU whenever CUDA is available
train_on_gpu = torch.cuda.is_available()

rnn = RNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout)
if train_on_gpu:
    rnn.cuda()

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)

trained_rnn = train_rnn(rnn, batch_size, optimizer, criterion, num_epochs)

# Persist the trained weights to the Kaggle working directory
save_model('/kaggle/working/trained_rnn.pth', trained_rnn)
print('Model Trained and Saved')

Epoch: 1/50, Batch: 100, Loss: 6.526710863113403
Epoch: 1/50, Batch: 200, Loss: 5.9580798959732055
Epoch: 1/50, Batch: 300, Loss: 5.80269193649292
Epoch: 1/50, Batch: 400, Loss: 5.695820107460022
Epoch: 1/50, Batch: 500, Loss: 5.586495618820191
Epoch: 1/50, Batch: 600, Loss: 5.431292552947998
Epoch: 1/50, Batch: 700, Loss: 5.38152015209198
Epoch: 1/50, Batch: 800, Loss: 5.299539566040039
Epoch: 1/50, Batch: 900, Loss: 5.256376566886902
Epoch: 1/50, Batch: 1000, Loss: 5.200288867950439
Epoch: 1/50, Batch: 1100, Loss: 5.154879651069641
Epoch: 1/50, Batch: 1200, Loss: 5.132998042106628
Epoch: 1/50, Batch: 1300, Loss: 5.091118860244751
Epoch: 1/50, Batch: 1400, Loss: 5.080178213119507
Epoch: 1/50, Batch: 1500, Loss: 5.054933958053589
Epoch: 1/50, Batch: 1600, Loss: 5.0320326566696165
Epoch: 1/50, Batch: 1700, Loss: 5.005317163467407
Epoch: 1/50, Batch: 1800, Loss: 4.981086397171021
Epoch: 1/50, Batch: 1900, Loss: 4.926278576850891
Epoch: 2/50, Batch: 100, Loss: 4.819538390741939
Epoch: 2/5

# Generate text ✍️

In [50]:
import torch.nn.functional as F
import numpy as np

def load_model(file_path, model):
    """
    Load a saved state_dict from `file_path` into `model` and switch the
    model to evaluation mode.

    The checkpoint tensors are mapped onto the device the model currently
    lives on (CPU if the model has no parameters), so a checkpoint written
    on a GPU machine can also be restored on a CPU-only one.

    NOTE: torch.load unpickles the file; only load checkpoints from a
    trusted source.
    """
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    state = torch.load(file_path, map_location=target_device)
    model.load_state_dict(state)
    model.eval()

def generate(rnn, prime_ids, int_to_vocab, token_dict, pad_value, predict_len=100):
    """
    Generate text with the trained network.

    The model is fed a sliding window of `sequence_length` word ids
    (`sequence_length` is read from module scope). The window starts as the
    prime ids left-padded with `pad_value`. At each step the next word is
    sampled from the five most probable candidates, appended to the output
    and shifted into the window.

    Args:
        rnn: Trained model exposing `init_hidden(batch_size)`.
        prime_ids: Word ids that start the text. May be empty; if longer
            than the window, only the trailing ids seed the window.
        int_to_vocab (dict): Maps word id -> word.
        token_dict (dict): Maps punctuation symbol -> placeholder token.
        pad_value (int): Id used to fill unused leading window positions.
        predict_len (int): Number of words to generate.

    Returns:
        str: The prime words followed by the generated words, with
        placeholder tokens turned back into punctuation.
    """
    rnn.eval()
    model_device = next(rnn.parameters()).device
    top_k = 5

    # Left-pad the prime so that its last word sits at the end of the window.
    window = np.full((1, sequence_length), pad_value, dtype=np.int64)
    prime_tail = list(prime_ids)[-sequence_length:]
    if prime_tail:
        window[0, -len(prime_tail):] = prime_tail
    predicted = [int_to_vocab[idx] for idx in prime_ids]
    hidden = rnn.init_hidden(1)

    # Inference only: without no_grad the carried hidden state would keep
    # the autograd graph of every previous step alive.
    with torch.no_grad():
        for _ in range(predict_len):
            batch = torch.from_numpy(window).to(model_device)
            output, hidden = rnn(batch, hidden)

            probs = F.softmax(output, dim=1).cpu()
            top_p, top_i = probs.topk(min(top_k, probs.size(1)))
            top_i = top_i.numpy().reshape(-1)
            # float64 keeps the renormalised probabilities summing to 1
            # within np.random.choice's tolerance.
            top_p = top_p.numpy().reshape(-1).astype(np.float64)

            word_i = int(np.random.choice(top_i, p=top_p / top_p.sum()))
            predicted.append(int_to_vocab[word_i])

            # Slide the window one position left and append the new word.
            window = np.roll(window, -1, 1)
            window[0, -1] = word_i

    gen_sentences = ' '.join(predicted)
    for key, token in token_dict.items():
        # Drop the space in front of each placeholder while restoring it.
        gen_sentences = gen_sentences.replace(' ' + token.lower(), key)
    gen_sentences = gen_sentences.replace('\n ', '\n')
    gen_sentences = gen_sentences.replace('( ', '(')
    return gen_sentences

# Rebuild the architecture with the training hyperparameters, then restore
# the saved weights into it
rnn_loaded = RNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout)
if train_on_gpu:
    rnn_loaded.cuda()

load_model('/kaggle/working/trained_rnn.pth', rnn_loaded)
print('Model Loaded')

# Generation settings
gen_length = 1000
prime_phrase = 'i have to tell you something'

# Prime words that are missing from the vocabulary are silently left out
pad_word = SPECIAL_WORDS['PADDING']
known_prime_words = filter(vocab_to_int.__contains__, prime_phrase.split())
prime_ids = [vocab_to_int[word] for word in known_prime_words]
pad_id = vocab_to_int[pad_word]

generated_script = generate(rnn_loaded, prime_ids, int_to_vocab, token_dict, pad_id, gen_length)
print(generated_script)


Model Loaded
i have to tell you something
, ” razumihin said, and was not in his
hand, as she had not seen that she is a minute,
and i am a word, you are not a man... i shall
be, but he is a minute of the same
woman, ” said raskolnikov, but she was a man of
the street and had been in the same man,
and a man was the room of a word,
and the man was in the room and the whole time.

“i am not a man, and i won’t be a good man, but you are not
very woman, ” he said and was a word, and she was a little man, and was in a minute.
but it is to be not in the same thing
of the whole moment... i won’t know,
but it is, you have not not seen. i am a man
, ” razumihin said to a man, but the
door, as he had been a minute of her
.

“i are a woman, i am a minute, and it was
the door, too......... i am very to be, i won’t
know. you? what is it, ” cried razumihin, and he was not
to the whole man, but it was a very woman of a
man.

“what, i am a minute, ” he said.

“i don’t know, i shall have a whole man,
i

## Evaluation of the model 📊

In [2]:
import os
import pickle
import re
import torch
from collections import Counter
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer

# Redeclared for this cell; the '<PAD>' entry is looked up in vocab_to_int
# below to obtain the padding id passed to generate().
SPECIAL_WORDS = {'PADDING': '<PAD>'}

# turn punctuation into a token
def token_lookup():
    """
    Return the punctuation -> '||name||' placeholder mapping.

    The dict preserves the listed order, which is the order in which the
    replacements are applied by the code that iterates over it.
    """
    symbols = ['.', ',', '"', ';', '!', '?', '(', ')', '--', '\n']
    labels = [
        'period',
        'comma',
        'quotation_mark',
        'semicolon',
        'exclamation_mark',
        'question_mark',
        'left_parenthesis',
        'right_parenthesis',
        'dash',
        'return',
    ]
    return dict(zip(symbols, ('||' + label + '||' for label in labels)))

def reverse_tokenize(text, token_dict):
    """
    Replace every placeholder token in `text` with its punctuation symbol.

    Each token from `token_dict` is matched in lower-cased form. Whitespace
    around a placeholder is left untouched.
    """
    restored = text
    for symbol in token_dict:
        placeholder = token_dict[symbol].lower()
        restored = restored.replace(placeholder, symbol)
    return restored

# Evaluation: compare one generated passage against a slice of the corpus
# using BLEU and ROUGE.
# NOTE(review): this path differs from the '/kaggle/input/cap111/cap.txt' path
# used by the preprocessing cell - confirm both point to the same corpus.
dataset_path = '/kaggle/input/crime-and-punishment/cap.txt'
# load_data returns None (after printing a message) when the file cannot be read.
dataset = load_data(dataset_path)

# Tokenization: surround every punctuation symbol with spaces and swap it for
# its placeholder token, mirroring the preprocessing step.
token_dict = token_lookup()
for key, token in token_dict.items():
    dataset = dataset.replace(key, f' {token} ')

dataset = dataset.lower()

# Split into sentences on the space-padded period placeholder.
sentences = dataset.split(' ||period|| ') 

# Sentences 100-199 serve as the reference set.
reference_texts = sentences[100:200]  

# Reverse tokenization for reference texts
reference_texts = [reverse_tokenize(ref, token_dict) for ref in reference_texts]

# Generate a fresh passage; sampling is random, so it differs from run to run.
generated_script = generate(rnn_loaded, prime_ids, int_to_vocab, token_dict, vocab_to_int[SPECIAL_WORDS['PADDING']], gen_length)

# Reverse tokenization for generated text
# NOTE(review): generate() already replaces the placeholder tokens before
# returning, so this call presumably finds nothing left to replace - confirm.
generated_script = reverse_tokenize(generated_script, token_dict)

# Tokenize the generated text (whitespace split)
generated_tokens = generated_script.split()

# Tokenize reference texts (whitespace split, one token list per sentence)
reference_tokens = [ref.split() for ref in reference_texts]

# BLEU score: the whole generated passage is scored as a single hypothesis
# against all reference sentences at once.
bleu_score = sentence_bleu(reference_tokens, generated_tokens)
print("BLEU Score:", bleu_score)

# ROUGE score: the reference sentences are joined into one string and passed
# as the first argument, the generated passage as the second.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
rouge_scores = scorer.score(' '.join(reference_texts), generated_script)
print("ROUGE-1 Score:", rouge_scores['rouge1'])
print("ROUGE-L Score:", rouge_scores['rougeL'])

KeyboardInterrupt: 