In [1]:
import torch
import torch.nn as optim
from torch import nn

from torch.utils.data import Dataset, DataLoader
from collections import Counter

In [2]:
# Dataset Preparation
# Read the whole corpus into memory as one string.
with open('business_2.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Tokenize the text into whitespace-separated words.
words = text.split()
word_counts = Counter(words)

# Vocabulary in first-occurrence order, plus the two lookup tables
# (word -> index and index -> word).
vocab = list(word_counts)
vocab_size = len(vocab)
word_to_int = dict(zip(vocab, range(vocab_size)))
int_to_word = dict(enumerate(vocab))

# Every sample is a sliding window of SEQUENCE_LENGTH + 1 consecutive words,
# advanced one word at a time across the corpus.
SEQUENCE_LENGTH = 64
samples = [
    words[start:start + SEQUENCE_LENGTH + 1]
    for start in range(len(words) - SEQUENCE_LENGTH)
]

print(vocab)
print(word_to_int)
print(int_to_word)

['Japanese', 'growth', 'grinds', 'to', 'a', 'halt', 'Growth', 'in', 'Japan', 'evaporated', 'the', 'three', 'months', 'September,', 'sparking', 'renewed', 'concern', 'about', 'an', 'economy', 'not', 'long', 'out', 'of', 'decade-long', 'trough.', 'Output', 'period', 'grew', 'just', '0.1%,', 'annual', 'rate', '0.3%.', 'Exports', '-', 'usual', 'engine', 'recovery', 'faltered,', 'while', 'domestic', 'demand', 'stayed', 'subdued', 'and', 'corporate', 'investment', 'also', 'fell', 'short.', 'The', 'falls', 'well', 'short', 'expectations,', 'but', 'does', 'mark', 'sixth', 'straight', 'quarter', 'expansion.', 'had', 'stagnated', 'throughout', '1990s,', 'experiencing', 'only', 'brief', 'spurts', 'expansion', 'amid', 'periods', 'doldrums.', 'One', 'result', 'was', 'deflation', 'prices', 'falling', 'rather', 'than', 'rising', 'which', 'made', 'shoppers', 'cautious', 'kept', 'them', 'from', 'spending.', 'effect', 'leave', 'more', 'dependent', 'ever', 'on', 'exports', 'for', 'its', 'recent', 'recove

Creating Data Loaders

In [3]:
class TextDataset(Dataset):
    """Dataset of word windows served as (input, target) index tensors.

    Args:
        samples: Sequence of word lists; each list is one training window.
        word_to_int: Mapping from a word to its integer vocabulary index.
    """

    def __init__(self, samples, word_to_int):
        self.samples = samples
        self.word_to_int = word_to_int

    def __len__(self):
        """Return the number of windows held by the dataset."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the ``idx``-th window as a pair of ``LongTensor``s.

        The input is every word but the last; the target is every word but
        the first, i.e. the input shifted left by one position.
        """
        lookup = self.word_to_int
        window = self.samples[idx]

        def encode(tokens):
            # Translate words to vocabulary indices and pack them in a tensor.
            return torch.LongTensor([lookup[token] for token in tokens])

        inputs = encode(window[:-1])
        targets = encode(window[1:])
        return inputs, targets

In [4]:
BATCH_SIZE = 32

# Wrap the word windows in a Dataset, then serve them as shuffled mini-batches.
dataset = TextDataset(samples, word_to_int)
dataloader = DataLoader(dataset, shuffle=True, batch_size=BATCH_SIZE)

# Sanity check: show the (input, target) tensors of the second window.
print(dataset[1])

(tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9,  7, 10, 11, 12,  3, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23,  4, 24, 25, 26,  7, 10, 27, 28, 29, 30, 18,
        31, 32, 23, 33, 34, 35, 10, 36, 37, 23, 38, 35, 39, 40, 41, 42, 43, 44,
        45, 46, 47, 48, 49, 50, 51,  1, 52, 53]), tensor([ 2,  3,  4,  5,  6,  7,  8,  9,  7, 10, 11, 12,  3, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23,  4, 24, 25, 26,  7, 10, 27, 28, 29, 30, 18, 31,
        32, 23, 33, 34, 35, 10, 36, 37, 23, 38, 35, 39, 40, 41, 42, 43, 44, 45,
        46, 47, 48, 49, 50, 51,  1, 52, 53, 54]))


LSTM Model

In [5]:
class TextGenerationLSTM(nn.Module):
    """Word-level language model: embedding -> LSTM -> linear projection.

    Args:
        vocab_size: Number of distinct tokens; sets both the embedding table
            size and the width of the output layer.
        emdedding_dim: Size of each token embedding (the LSTM input size).
            The parameter name keeps its original spelling so existing
            callers keep working.
        hidden_size: Number of features in the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, emdedding_dim, hidden_size, num_layers):
        super().__init__()
        # Token-index -> dense-vector lookup. forward() relies on this layer;
        # without it every forward pass fails with AttributeError.
        self.embedding = nn.Embedding(vocab_size, emdedding_dim)
        self.lstm = nn.LSTM(input_size=emdedding_dim,
                            hidden_size=hidden_size,
                            num_layers=num_layers,
                            batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

    def forward(self, x, hidden=None):
        """Run a batch of token-index sequences through the network.

        Args:
            x: Integer tensor of token indices, indexed (batch, sequence)
                because the LSTM is built with ``batch_first=True``.
            hidden: Optional ``(h_0, c_0)`` tuple. When omitted, a zero state
                sized for the batch is created.

        Returns:
            ``(out, (h_n, c_n))`` where ``out`` has shape
            ``(batch * sequence, vocab_size)`` (the LSTM outputs are flattened
            before the linear layer) and ``(h_n, c_n)`` is the final LSTM state.
        """
        if hidden is None:
            hidden = self.init_hidden(x.shape[0])
        x = self.embedding(x)
        out, (h_n, c_n) = self.lstm(x, hidden)
        # Collapse batch and time so the linear layer scores every position.
        out = out.contiguous().view(-1, self.hidden_size)
        out = self.fc(out)
        return out, (h_n, c_n)

    def init_hidden(self, batch_size, device=None):
        """Return zero-filled ``(h_0, c_0)`` tensors for ``batch_size`` sequences.

        Args:
            batch_size: Number of sequences in the batch.
            device: Device for the new tensors. Defaults to the device that
                holds the model's parameters, so the method does not depend on
                a notebook-level ``device`` global being defined first.

        Returns:
            Tuple ``(h0, c0)``, each of shape
            ``(num_layers, batch_size, hidden_size)``.
        """
        if device is None:
            device = next(self.parameters()).device
        state_shape = (self.num_layers, batch_size, self.hidden_size)
        h0 = torch.zeros(state_shape, device=device)
        c0 = torch.zeros(state_shape, device=device)
        return h0, c0

Training Hyperparameters

In [6]:
# Training Setup
# -- model capacity --
embedding_dim = 16    # width of each word-embedding vector
hidden_size = 32      # features in the LSTM hidden state
num_layers = 1        # stacked LSTM layers

# -- optimisation --
learning_rate = 1e-2  # step size handed to the optimizer
epochs = 50           # presumably the number of passes over the data (loop not shown here)

Training the LSTM Model

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TextGenerationLSTM(vocab_size,
                          embedding_dim,
                          hidden_size,
                          num_layers).to(device)

criterion = nn.CrossEntropyLoss()
# nn.Module exposes its weights via `parameters()` (plural); the singular
# `parameter()` does not exist and raises AttributeError.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)