In [None]:
import sys, re
import numpy as np
import math

###############################################################################

def preprocess(s):
    """Tokenise a line.

    Surrounding whitespace is stripped, then every run of characters that is
    not a letter, digit or apostrophe is isolated by spaces, and the result is
    split on spaces.  A '<BOS>' marker is prepended to the token list.

    Args:
        s: the raw text line (a trailing newline is fine).

    Returns:
        A list of string tokens starting with '<BOS>'.  A blank line yields
        just ['<BOS>'] rather than a spurious empty-string token.
    """
    # Raw strings: '\g<1>' in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    spaced = re.sub(r"([^a-zA-Z0-9']+)", r' \g<1> ', s.strip())
    # Dropping empty pieces is equivalent to collapsing repeated spaces before
    # splitting, and additionally avoids returning '' for empty input.
    return ['<BOS>'] + [tok for tok in spaced.split(' ') if tok]

###############################################################################

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

# Length of each token's embedding vector (passed to the model as embedding_dim).
EMBEDDING_DIM = 4
# Number of preceding tokens the model sees per prediction; 1 gives a bigram model.
CONTEXT_SIZE = 1 #!!!#
# Output width of the first linear layer, i.e. the hidden layer size.
HIDDEN_DIM = 6

# Bigram Neural Network Model
class BigramNNmodel(nn.Module):
    """Feed-forward n-gram language model.

    The embeddings of the context tokens are flattened into a single vector,
    passed through a tanh hidden layer, and projected (without bias) onto the
    vocabulary; the output is a log-probability distribution per input row.
    """

    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim):
        """Create the embedding table and the two linear layers.

        Args:
            vocab_size: number of distinct tokens (embedding rows / output size).
            embedding_dim: length of each token embedding.
            context_size: number of context tokens per example.
            hidden_dim: width of the hidden layer.
        """
        super().__init__()
        self.context_size = context_size
        self.embedding_dim = embedding_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, vocab_size, bias=False)

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary for each input row.

        Args:
            inputs: tensor of token indices; the looked-up embeddings are
                reshaped to rows of context_size * embedding_dim values.

        Returns:
            Tensor with one row of vocabulary log-probabilities per example.
        """
        flat_width = self.context_size * self.embedding_dim
        # x': the context embeddings laid out side by side in one row
        context_vec = self.embeddings(inputs).view(-1, flat_width)
        # h = tanh(W_1 . x' + b)
        hidden = torch.tanh(self.linear1(context_vec))
        # W_2 . h
        logits = self.linear2(hidden)
        # y = log_softmax(W_2 . h), normalised across the vocabulary axis
        return F.log_softmax(logits, dim=1)

In [None]:
import sys, re
import numpy as np
import math


###############################################################################

# --- Read the training corpus and build the vocabulary ----------------------

training_samples = []
vocabulary = set(['<UNK>'])  # '<UNK>' is reserved for words unseen in training
# The context manager closes the file handle as soon as the lines are read.
with open('train.txt', 'r') as file1:
    lines = file1.readlines()
print(lines)
for line in lines:
    tokens = preprocess(line)
    vocabulary.update(tokens)
    training_samples.append(tokens)
    print(line)

# Sort before numbering: iterating a set of strings directly gives an order
# that can change between kernel sessions, which would make the printed
# indices (and the saved vocabulary layout) differ from run to run.
word2idx = {k: v for v, k in enumerate(sorted(vocabulary))}
idx2word = {v: k for k, v in word2idx.items()}

# Each training example is (current token index) -> (next token index).
x_train = []
y_train = []
for tokens in training_samples:
    for i in range(len(tokens) - 1): #!!!#
        x_train.append([word2idx[tokens[i]]]) #!!!#
        y_train.append([word2idx[tokens[i+1]]]) #!!!#

print("x_train",x_train)
x_train = np.array(x_train)
y_train = np.array(y_train)

###############################################################################

BATCH_SIZE = 1
NUM_EPOCHS = 10

# One row per example: context column(s) followed by the target column.
train_set = np.concatenate((x_train, y_train), axis=1)
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE)

# NLLLoss pairs with the log-probabilities returned by the model.
loss_function = nn.NLLLoss()
model = BigramNNmodel(len(vocabulary), EMBEDDING_DIM, CONTEXT_SIZE, HIDDEN_DIM)
optimiser = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(NUM_EPOCHS):
    for i, data_tensor in enumerate(train_loader):
        context_tensor = data_tensor[:,0:1] #!!!#
        target_tensor = data_tensor[:,1] #!!!#

        # Clear gradients left over from the previous step.
        model.zero_grad()

        log_probs = model(context_tensor)
        loss = loss_function(log_probs, target_tensor)

        loss.backward()
        optimiser.step()

    # NOTE: this reports the loss of the last batch of the epoch only,
    # not an average over the epoch.
    print('Epoch:', epoch, 'loss:', float(loss))

# The vocabulary is stored with the weights so that evaluation uses the same
# index assignment the model was trained with.
torch.save({'model': model.state_dict(), 'vocab': idx2word}, 'model.lm')

print('Model saved.')

['are you still here?\n', 'where are you?\n', 'are you tired?\n', 'i am tired.\n', 'are you in england?\n', 'were you in mexico?\n']
are you still here?

where are you?

are you tired?

i am tired.

are you in england?

were you in mexico?

x_train [[0], [11], [12], [1], [7], [0], [8], [11], [12], [0], [11], [12], [15], [0], [9], [5], [15], [0], [11], [12], [14], [3], [0], [10], [12], [14], [4]]
Epoch: 0 loss: 2.4375405311584473
Epoch: 1 loss: 2.2202789783477783
Epoch: 2 loss: 1.9671860933303833
Epoch: 3 loss: 1.6362926959991455
Epoch: 4 loss: 1.2517435550689697
Epoch: 5 loss: 0.9026921987533569
Epoch: 6 loss: 0.656467080116272
Epoch: 7 loss: 0.5051158666610718
Epoch: 8 loss: 0.4132114350795746
Epoch: 9 loss: 0.3550700843334198
Model saved.


In [None]:
import sys, re
import numpy as np
import math


###############################################################################

# --- Restore the trained model together with its vocabulary -----------------

blob = torch.load('model.lm')
idx2word = blob['vocab']
word2idx = {k: v for v, k in idx2word.items()}
vocabulary = set(idx2word.values())

model = BigramNNmodel(len(vocabulary), EMBEDDING_DIM, CONTEXT_SIZE, HIDDEN_DIM)
model.load_state_dict(blob['model'])
model.eval()

###############################################################################

BATCH_SIZE = 1

# Words that never occurred in training are scored as '<UNK>' (which the
# training cell always puts in the vocabulary) instead of raising KeyError.
unk_idx = word2idx['<UNK>']

# The context manager closes the file handle as soon as the lines are read.
with open('test.txt', 'r') as file1:
    lines = file1.readlines()

for line in lines:
    # A blank line contains no (context, target) pair to score.
    if not line.strip():
        continue

    tokens = preprocess(line)

    x_test = []
    y_test = []
    for i in range(len(tokens) - 1): #!!!#
        x_test.append([word2idx.get(tokens[i], unk_idx)]) #!!!#
        y_test.append([word2idx.get(tokens[i+1], unk_idx)]) #!!!#

    x_test = np.array(x_test)
    y_test = np.array(y_test)

    test_set = np.concatenate((x_test, y_test), axis=1)
    test_loader = DataLoader(test_set, batch_size=BATCH_SIZE)

    # Accumulate in log space: multiplying many small probabilities can
    # underflow to 0.0, after which math.log() raises ValueError.
    total_log_prob = 0.0
    with torch.no_grad():  # inference only, no autograd graph needed
        for data_tensor in test_loader:
            context_tensor = data_tensor[:,0:1] #!!!#
            target_tensor = data_tensor[:,1] #!!!#
            log_probs = model(context_tensor)
            # Pick the log-probability of the true next token in every row,
            # so the result is correct for any BATCH_SIZE.
            true_log_probs = log_probs.gather(1, target_tensor.long().unsqueeze(1))
            total_log_prob += float(true_log_probs.sum())

    total_prob = math.exp(total_log_prob)

    print('%.6f\t%.6f\t' % (total_prob, total_log_prob), tokens)

0.006567	-5.025658	 ['<BOS>', 'where', 'are', 'you', '?']
0.002403	-6.030873	 ['<BOS>', 'were', 'you', 'in', 'england', '?']
0.026050	-3.647736	 ['<BOS>', 'are', 'you', 'in', 'mexico', '?']
0.000067	-9.614363	 ['<BOS>', 'i', 'am', 'in', 'mexico', '.']
0.000672	-7.304865	 ['<BOS>', 'are', 'you', 'still', 'in', 'mexico', '?']
