In [8]:
# Import Python libraries and helper functions (in utils2) 
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
from collections import Counter
from utils2 import sigmoid, get_batches, compute_pca, get_dict
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [9]:
# Add the current working directory to NLTK's data search path so that NLTK
# resources stored there can be found (this cell does not download anything).
nltk.data.path.append('.')

In [10]:
# Load the corpus, normalise its punctuation and turn it into a list of tokens
import re    # regular expressions, used for the punctuation clean-up below

# Read the whole corpus into one string
with open('shakespeare.txt') as corpus_file:
    raw_text = corpus_file.read()

# Replace each of the punctuation marks , ! ? ; - with a full stop
punctuation = re.compile(r'[,!?;-]')
normalized_text = punctuation.sub('.', raw_text)

# Tokenise, then keep only purely alphabetic tokens and '.', lower-cased
data = []
for token in nltk.word_tokenize(normalized_text):
    if token.isalpha() or token == '.':
        data.append(token.lower())

# Show the corpus size and the first 15 tokens as a sample
print("Number of tokens:", len(data),'\n', data[:15])

Number of tokens: 60975 
 ['o', 'for', 'a', 'muse', 'of', 'fire', '.', 'that', 'would', 'ascend', 'the', 'brightest', 'heaven', 'of', 'invention']


In [11]:
# get_dict (imported from utils2) creates two dictionaries that convert words to indices and vice versa.
word2Ind, Ind2word = get_dict(data)
# V is the vocabulary size: the number of entries in the word -> index dictionary.
V = len(word2Ind)
print("Size of vocabulary: ", V)

Size of vocabulary:  5777


In [12]:
class Net(nn.Module):
    """Two-layer feed-forward network: vocabulary-sized input -> hidden layer -> vocabulary-sized output.

    Args:
        vocab_size: Width of the input and of the output layer. Defaults to
            5777, the value that used to be hard-coded here.
        hidden_size: Width of the hidden layer. Defaults to 50.
    """

    def __init__(self, vocab_size=5777, hidden_size=50):
        super().__init__()
        self.fc1 = nn.Linear(vocab_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return log-probabilities over the vocabulary for each input vector.

        Args:
            x: Float tensor whose last dimension has size ``vocab_size``.

        Returns:
            Tensor with the same shape as ``x`` containing log-softmax scores
            normalised over the last dimension.
        """
        hidden = F.relu(self.fc1(x))
        # Name the softmax axis explicitly: calling log_softmax without `dim`
        # is deprecated and emits a UserWarning. For the 1-D and 2-D inputs
        # this network receives, dim=-1 is the axis the implicit choice used.
        return F.log_softmax(self.fc2(hidden), dim=-1)


# The defaults reproduce the original 5777 -> 50 -> 5777 layout; pass
# vocab_size explicitly if the vocabulary size changes.
net = Net()
print(net)

Net(
  (fc1): Linear(in_features=5777, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=5777, bias=True)
)


In [13]:
# Net.forward already returns log-probabilities (log_softmax), so the matching
# criterion is the negative log-likelihood loss. nn.CrossEntropyLoss expects raw
# logits and would apply log_softmax a second time on top of the model's own.
criterion = nn.NLLLoss()
# SGD with momentum over all parameters of the network.
optimizer = optim.SGD(net.parameters(), lr=0.003, momentum=0.9)

In [14]:
# Settings for the training run
C = 2                 # context setting passed through to get_batches
batch_size = 1        # batch size requested from get_batches
num_iters = 300       # stop after this many optimisation steps
iters = 0             # optimisation steps completed so far

# NOTE(review): in the recorded run the cost drops to ~0.005 by step 300, which
# is surprisingly low for single-example batches -- confirm that get_batches
# really yields a different example on every step.
for x, y in get_batches(data, word2Ind, V, C, batch_size):
    # Batches arrive as NumPy arrays: convert to float tensors and swap axes
    # 0 and 1 (presumably get_batches stores one example per column -- verify).
    x = torch.from_numpy(x).float().transpose(0, 1)
    # Same conversion for the targets, then reduce each row to the index of
    # its largest entry so the criterion receives class indices.
    y = torch.from_numpy(y).float().transpose(0, 1).argmax(dim=1)

    # Forward pass, starting from cleared gradients
    optimizer.zero_grad()
    outputs = net(x)
    loss = criterion(outputs, y)

    # Report the cost on every 10th step (reported step numbers are 1-based)
    if not (iters + 1) % 10:
        print(f"iters: {iters + 1} cost: {loss:.6f}")

    # Backward pass and parameter update
    loss.backward()
    optimizer.step()

    iters += 1
    if iters == num_iters:
        break

  # Remove the CWD from sys.path while we load stuff.


iters: 10 cost: 8.492229
iters: 20 cost: 8.242281
iters: 30 cost: 7.938903
iters: 40 cost: 7.603908
iters: 50 cost: 7.234701
iters: 60 cost: 6.813603
iters: 70 cost: 6.306328
iters: 80 cost: 5.653570
iters: 90 cost: 4.756248
iters: 100 cost: 3.461766
iters: 110 cost: 1.673826
iters: 120 cost: 0.310009
iters: 130 cost: 0.056559
iters: 140 cost: 0.023047
iters: 150 cost: 0.015124
iters: 160 cost: 0.012221
iters: 170 cost: 0.010759
iters: 180 cost: 0.009818
iters: 190 cost: 0.009107
iters: 200 cost: 0.008522
iters: 210 cost: 0.008018
iters: 220 cost: 0.007569
iters: 230 cost: 0.007173
iters: 240 cost: 0.006812
iters: 250 cost: 0.006487
iters: 260 cost: 0.006193
iters: 270 cost: 0.005919
iters: 280 cost: 0.005672
iters: 290 cost: 0.005442
iters: 300 cost: 0.005230
