# Task 4

## Data extraction and preprocessing

In [24]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
import re
import string
import requests
import time
import pickle

def download_text_file(url, save_path):
    response = requests.get(url)
    response.raise_for_status()
    with open(save_path, 'wb') as file:
        file.write(response.content)

def load_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return text

url = "https://www.gutenberg.org/files/2852/2852-0.txt"
save_path = "hound_of_the_baskervilles.txt"

def extract_main_text(text):
    start_pattern = "*** START OF THE PROJECT GUTENBERG EBOOK"
    end_pattern = "*** END OF THE PROJECT GUTENBERG EBOOK"
    
    start_idx = text.find(start_pattern)
    if start_idx != -1:
        start_idx = text.find("\n", start_idx) + 1
    else:
        start_idx = 0
        
    end_idx = text.find(end_pattern)
    if end_idx == -1:
        end_idx = len(text)
        
    return text[start_idx:end_idx].strip()

main_text = extract_main_text(text)

def preprocess_text(text):
    text = text.lower()
    text = text.replace('\n', ' ')
    keep_punct = ".,?!-'"
    text = ''.join(c if c.isalnum() or c.isspace() or c in keep_punct else ' ' for c in text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

processed_text = preprocess_text(main_text)

word_count = len(processed_text.split())
char_limit = len(processed_text)

if word_count > 20000:
    avg_word_length = len(processed_text) / word_count
    char_limit = int(20000 * avg_word_length)
    processed_text = processed_text[:char_limit]
    print(f"Text limited to ~20k words ({char_limit} characters)")
    print(f"New word count: {len(processed_text.split())}")

chars = sorted(list(set(processed_text)))
char_to_idx = {c: i for i, c in enumerate(chars)}
idx_to_char = {i: c for i, c in enumerate(chars)}
vocab_size = len(chars)

seq_length = 100  
step = 5  

sequences = []
next_chars = []

for i in range(0, len(processed_text) - seq_length, step):
    sequences.append(processed_text[i:i + seq_length])
    next_chars.append(processed_text[i + seq_length])

print(f"Number of sequences: {len(sequences)}")

print("Vectorizing sequences...")
X = np.zeros((len(sequences), seq_length, vocab_size), dtype=bool)
y = np.zeros((len(sequences), vocab_size), dtype=bool)

for i, sequence in enumerate(sequences):
    if i % 1000 == 0:
        print(f"Vectorizing sequence {i}/{len(sequences)}")
    for t, char in enumerate(sequence):
        X[i, t, char_to_idx[char]] = 1
    y[i, char_to_idx[next_chars[i]]] = 1

Text limited to ~20k words (105021 characters)
New word count: 19643
Number of sequences: 20985
Vectorizing sequences...
Vectorizing sequence 0/20985
Vectorizing sequence 1000/20985
Vectorizing sequence 2000/20985
Vectorizing sequence 3000/20985
Vectorizing sequence 4000/20985
Vectorizing sequence 5000/20985
Vectorizing sequence 6000/20985
Vectorizing sequence 7000/20985
Vectorizing sequence 8000/20985
Vectorizing sequence 9000/20985
Vectorizing sequence 10000/20985
Vectorizing sequence 11000/20985
Vectorizing sequence 12000/20985
Vectorizing sequence 13000/20985
Vectorizing sequence 14000/20985
Vectorizing sequence 15000/20985
Vectorizing sequence 16000/20985
Vectorizing sequence 17000/20985
Vectorizing sequence 18000/20985
Vectorizing sequence 19000/20985
Vectorizing sequence 20000/20985


## LSTM model training and text generation

In [25]:
normal_joe = Sequential([
    LSTM(128, input_shape=(seq_length, vocab_size), return_sequences=True),
    LSTM(128),
    Dense(vocab_size, activation='softmax')
])

normal_joe.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

normal_joe.summary()

print("Training LSTM model...")
start_time = time.time()
normal_joe_history = normal_joe.fit(
    X, y,
    batch_size=128,
    epochs=5,
    validation_split=0.1
)
end_time = time.time()
print(f"Training took {end_time - start_time:.2f} seconds")

normal_joe.save('normal_joe.h5')
print("Model saved as 'normal_joe.h5'")

def generate_text(model, seed_text, length=200, temperature=0.5):
    generated_text = seed_text
    current_text = preprocess_text(seed_text)
    
    print("Generating text...")
    for i in range(length):
        x_pred = np.zeros((1, seq_length, vocab_size))
        
        padded_text = current_text[-seq_length:].ljust(seq_length)
        if len(padded_text) < seq_length:
            padded_text = ' ' * (seq_length - len(padded_text)) + padded_text
            
        for t, char in enumerate(padded_text[-seq_length:]):
            if char in char_to_idx:
                x_pred[0, t, char_to_idx[char]] = 1
            else:
                x_pred[0, t, char_to_idx[' ']] = 1  
        
        preds = model.predict(x_pred, verbose=0)[0]
        
        preds = np.asarray(preds).astype('float64')
        preds = np.log(preds + 1e-10) / temperature
        exp_preds = np.exp(preds)
        preds = exp_preds / np.sum(exp_preds)
        
        next_index = np.random.choice(len(preds), p=preds)
        next_char = idx_to_char[next_index]
        
        generated_text += next_char
        current_text = current_text[1:] + next_char
        
        if i % 50 == 0 and i > 0:
            print(f"Generated {i} characters...")
    
    return generated_text

seed_texts = [
    "holmes looked at me with a smile",
    "the hound of the baskervilles",
    "watson, what do you make of this?"
]

temperatures = [0.2, 0.7, 1.2]

normal_joe_results = {}

for seed in seed_texts:
    normal_joe_results[seed] = {}
    print(f"\nSeed text: '{seed}'")
    
    for temp in temperatures:
        generated = generate_text(normal_joe, seed, length=300, temperature=temp)
        normal_joe_results[seed][temp] = generated
        print(f"\nTemperature: {temp}")
        print("-" * 80)
        print(generated)
        print("-" * 80)

with open('normal_joe.pkl', 'wb') as f:
    pickle.dump(normal_joe.history, f)
print("Training history saved")

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_24 (LSTM)              (None, 100, 128)          88576     
                                                                 
 lstm_25 (LSTM)              (None, 128)               131584    
                                                                 
 dense_13 (Dense)            (None, 44)                5676      
                                                                 
Total params: 225,836
Trainable params: 225,836
Non-trainable params: 0
_________________________________________________________________
Training LSTM model...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Training took 3449.40 seconds
Model saved as 'normal_joe.h5'

Seed text: 'holmes looked at me with a smile'
Generating text...
Generated 50 characters...
Generated 100 characters...
Generated 150 characters...
Generated 200 characters...


In [31]:
with open('normal_joe.pkl', 'wb') as f:
    pickle.dump(normal_joe_history, f)
print("Training history saved")

Training history saved


# Task 5

## Bidirectional model training and text generation

In [20]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Bidirectional
from tensorflow.keras.optimizers import Adam

bi_model = Sequential([
    Bidirectional(LSTM(128, return_sequences=True), input_shape=(seq_length, vocab_size)),
    Bidirectional(LSTM(128)),
    Dense(vocab_size, activation='softmax')
])

bi_model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

bi_model.summary()

start_time = time.time()
bi_history = bi_model.fit(
    X, y,
    batch_size=128,
    epochs=5,
    validation_split=0.1
)
end_time = time.time()
print(f"Training took {end_time - start_time:.2f} seconds")

bi_model.save('bi_lstm_model.h5')
print("Model saved as 'bi_lstm_model.h5'")

def generate_text(model, seed_text, length=200, temperature=0.5):
    generated_text = seed_text
    current_text = preprocess_text(seed_text)
    
    print("Generating text...")
    for i in range(length):
        x_pred = np.zeros((1, seq_length, vocab_size))
        padded_text = current_text[-seq_length:].ljust(seq_length)
        if len(padded_text) < seq_length:
            padded_text = ' ' * (seq_length - len(padded_text)) + padded_text
            
        for t, char in enumerate(padded_text[-seq_length:]):
            if char in char_to_idx:
                x_pred[0, t, char_to_idx[char]] = 1
            else:
                x_pred[0, t, char_to_idx[' ']] = 1
        
        preds = model.predict(x_pred, verbose=0)[0]
        
        preds = np.asarray(preds).astype('float64')
        preds = np.log(preds + 1e-10) / temperature
        exp_preds = np.exp(preds)
        preds = exp_preds / np.sum(exp_preds)
        
        next_index = np.random.choice(len(preds), p=preds)
        next_char = idx_to_char[next_index]
        
        generated_text += next_char
        current_text = current_text[1:] + next_char
        
        if i % 50 == 0 and i > 0:
            print(f"Generated {i} characters...")
    
    return generated_text

seed_texts = [
    "holmes looked at me with a smile",
    "the hound of the baskervilles",
    "watson, what do you make of this?"
]

temperatures = [0.2, 0.7, 1.2]

bi_results = {}

for seed in seed_texts:
    bi_results[seed] = {}
    print(f"\nSeed text: '{seed}'")
    
    for temp in temperatures:
        generated = generate_text(bi_model, seed, length=300, temperature=temp)
        bi_results[seed][temp] = generated
        print(f"\nTemperature: {temp}")
        print("-" * 80)
        print(generated)
        print("-" * 80)

import pickle
with open('bi_history.pkl', 'wb') as f:
    pickle.dump(bi_history.history, f)
print("Training history saved")

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_2 (Bidirectio  (None, 100, 256)         177152    
 nal)                                                            
                                                                 
 bidirectional_3 (Bidirectio  (None, 256)              394240    
 nal)                                                            
                                                                 
 dense_12 (Dense)            (None, 44)                11308     
                                                                 
Total params: 582,700
Trainable params: 582,700
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Training took 14409.64 seconds
Model saved as 'bi_lstm_model.h5'

Seed text: 'holmes looked at me with a smile'
Generating text..