In [22]:
#Activity 6.02

import math
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch import nn, optim
import torch.nn.functional as F

with open('alice.txt', 'r', encoding='latin1') as f:
    data = f.read()
    
#print("Extract: ", data[:50])
print("Length: ", len(data))

chars = list(set(data))
indexer = {char : index for (index, char) in enumerate(chars)}
#indexer

indexed_data = []
for c in data:
    indexed_data.append(indexer[c])
    
#print("Indexed extract: ", indexed_data[:50])
#print("Length: ", len(indexed_data))

def index2onehot(batch):
    batch_flatten = batch.flatten()
    onehot_flat = np.zeros((batch.shape[0]*batch.shape[1], len(indexer)))
    onehot_flat[range(len(batch_flatten)), batch_flatten] = 1
    onehot = onehot_flat.reshape((batch.shape[0], batch.shape[1],-1))
    return onehot

class LSTM(nn.Module):
    def __init__(self, char_length, hidden_size, n_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.lstm = nn.LSTM(char_length, hidden_size, n_layers, batch_first=True)
    
        self.output = nn.Linear(hidden_size, char_length)

    def forward(self, x, states):
        out, states = self.lstm(x, states)
        out = out.contiguous().view(-1,self.hidden_size)
        out = self.output(out)
        
        return out, states
    
    def init_states(self, batch_size):
        hidden = next(self.parameters()).data.new(self.n_layers, batch_size, self.hidden_size).zero_()
        cell = next(self.parameters()).data.new(self.n_layers, batch_size, self.hidden_size).zero_()
        states = (hidden, cell)
        
        return states

#Number of sequences per batch
n_seq = 100
seq_length = 50
n_batches = math.floor(len(indexed_data)/n_seq/seq_length)

total_length = n_seq*seq_length*n_batches
x = indexed_data[:total_length]
x = np.array(x).reshape((n_seq, -1))

#checking the model
model = LSTM(len(chars), 256, 2).to("cuda")
#model

#some hyperparameters
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
epochs = 500
losses=[]

#Training
for e in range(1, epochs+1):
    states = model.init_states(n_seq)
    batch_loss = []
    
    for b in range(0, x.shape[1], seq_length):
        x_batch = x[:, b:b+seq_length]
        
        #Check if it is last batch
        if b==x.shape[1]-seq_length:
            y_batch = x[:, b+1:b+seq_length]
            y_batch = np.hstack((y_batch, indexer["."]*np.ones((y_batch.shape[0],1))))
        else:
            y_batch = x[:, b+1:b+seq_length+1]
        
        x_onehot = torch.Tensor(index2onehot(x_batch)).to("cuda")
        y = torch.Tensor(y_batch).view(n_seq*seq_length).to("cuda")
        
        pred, states = model(x_onehot, states)
        torch.autograd.set_detect_anomaly(True)
        loss = loss_function(pred, y.long())
        optimizer.zero_grad()
        loss.backward(retain_graph=True)
        optimizer.step()
        batch_loss.append(loss.item())
        
    losses.append(np.mean(batch_loss))
    
    if e%50 == 0:
        print("epoch: ", e, "... Loss function: ", losses[-1])
        
        

x_range = range(len(losses))
plt.plot(x_range, losses)
plt.xlabel("epochs")
plt.ylabel("Loss function")
plt.show()

#Testing
starter = "So she was considering in her own mind "
states = None
model = model.to("cpu")
for ch in starter:
    x = np.array([[indexer[ch]]])
    x = index2onehot(x)
    x = torch.Tensor(x)
    
    pred, states = model(x, states)

counter = 0
while starter[-1] != "." and counter < 100:
    counter += 1
    x = np.array([[indexer[starter[-1]]]])
    x = index2onehot(x)
    x = torch.Tensor(x)
    
    pred, states = model(x, states)
    pred = F.softmax(pred, dim=1)
    p, top = pred.topk(10)
    p = p.detach().numpy()[0]
    top = top.numpy()[0]
    index = np.random.choice(top, p=p/p.sum())
    
    starter += chars[index]
print(starter)

Length:  145180


RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [1024, 72]] is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!

Extract:  ALICE was beginning to get very tired of sitting b
Length:  145180
Indexed extract:  [52, 15, 16, 53, 61, 32, 3, 70, 38, 32, 71, 58, 67, 23, 7, 7, 23, 7, 67, 32, 8, 63, 32, 67, 58, 8, 32, 31, 58, 17, 10, 32, 8, 23, 17, 58, 50, 32, 63, 49, 32, 38, 23, 8, 8, 23, 7, 67, 32, 71]
Length:  145180
0


RuntimeError: Input and hidden tensors are not at the same device, found input tensor at cpu and hidden tensor at cuda:0