In [1]:
%matplotlib inline

# Neural ngram language model (naive implementation)

We will implement a language model
- at a given position i
- it takes as input the two preceding words
- and outputs log_probabilities for each word of the vocabulary

The network will use the sum of embeddings of the two preceding words, followed by a MLP with a single hidden layer.

In [2]:
# Inspired by
# voir https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html
# by Robert Guthrie


import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x27570d651d0>

In [3]:
# Tiny training corpus: we will use a poem by Andrée Chedid
# (To tokenize it just split on spaces)
train_sent = """
Mon autre , mon semblable ,
en cette chair qui se démène ,
en ce sang qui cavalcade ,
en ce complot du temps ,
en cette mort qui nous guette ,
en cette fraternité de nos fugaces vies ,
mon semblable , mon autre ,
là où tu es je suis .
Le hasard ne cesse de ramener vers nos rivages 
quelques merveilles que nous n' avions pas cueillies ,
quelques malheurs que nous n' avions pas ourdis .
Surgi des ténèbres ou de l' éclair ,
le hasard pose tantôt son aile sur notre épaule ,
tantôt ses griffes dans la chair de nos vies .
""".split()

# Build list of examples:  each example is ([ word_i-2, word_i-1 ], target word)
train_examples = [( [ train_sent[i], train_sent[i+1] ], train_sent[i+2] ) 
                  for i in range(len(train_sent) - 2)]

# print the first 3, just so you can see what they look like
print(train_examples[:3])

vocab = set(train_sent)
w2i = {word: i for i, word in enumerate(vocab)}

# encode the training examples into word ids
train_examples =  [ ( [ w2i[ex[0][0]], w2i[ex[0][1]] ], w2i[ex[1]] ) for ex in train_examples ]
print(train_examples[:3])



[(['Mon', 'autre'], ','), (['autre', ','], 'mon'), ([',', 'mon'], 'semblable')]
[([18, 17], 25), ([17, 25], 3), ([25, 3], 50)]


In [4]:
class NGramLanguageModel(nn.Module):

    def __init__(self, vocab_size, embedding_size, context_size, hidden_layer_size):
        super(NGramLanguageModel, self).__init__()
        # the target network should work as follows:
        #  for a single input
        # - input =  context_size word ids (context words)
        #    => retrieval of their embeddings
        #    => concatenation (see the forward method below)
        # - which provides the "embedding layer"
        #    => linear combination
        #    => reLU
        # - which provides the "hidden layer"
        #    => linear combination
        #    => log_softmax
        # - which provides log probabilities over the full vocabulary
        
        # but remember nn.Module works with a batch of inputs, not a single input
        
        self.embeddings = nn.Embedding(vocab_size, embedding_size)
        
        self.linear_1 = nn.Linear(context_size*embedding_size, hidden_layer_size) 
        self.linear_2 = nn.Linear(hidden_layer_size, vocab_size)

    def forward(self, inputs):
        size = inputs.size()              # [BATCH_SIZE, CONTEXT_SIZE]
        embeds = self.embeddings(inputs)  # [BATCH_SIZE, CONTEXT_SIZE, EMB_SIZE]
        embeds = embeds.view(size[0], -1) # [BATCH_SIZE, CONTEXT_SIZE * EMB_SIZE] : concatenation of the 2 embeddings

        # TODO: continue the forward propagation
        #  (cf. description above of what the network should do)
        #  to output log probabilities,
        #  writing the shape of each additional tensor (as done above)
        
        out = self.linear_1(embeds) #[BATCH_SIZE,HIDDEN_LAYER_SIZE]
        out = torch.relu(out) #[BATCH_SIZE,HIDDEN_LAYER_SIZE]
        out = self.linear_2(out) #[BATCH_SIZE,VOCAB_SIZE]

        log_probs = F.log_softmax(out, dim=1) #[BATCH_SIZE,VOCAB_SIZE]
        
        return log_probs

    



In [5]:
CONTEXT_SIZE = 2
EMB_SIZE = 10
HIDDEN_LAYER_SIZE = 128

# instance of NGramLanguageModel
my_language_model = NGramLanguageModel(len(vocab), EMB_SIZE, CONTEXT_SIZE, HIDDEN_LAYER_SIZE)


In [6]:

# to store the training losses at each epoch
train_losses = []
loss_function = nn.NLLLoss()

# the optimizer is the instance that will actually update the declared parameters
optimizer = optim.SGD(my_language_model.parameters(), lr=0.05)

from random import shuffle

NB_EPOCHS = 20
BATCH_SIZE = 5

for epoch in range(NB_EPOCHS):
    epoch_loss = 0
    
    # shuffle data
    shuffle(train_examples)
    i = 0
    while i < len(train_examples):
        
        batch = train_examples[i: i+BATCH_SIZE]
        i += BATCH_SIZE
        
        contexts, targets = zip(*batch)

        # Step 1. Prepare the inputs to be passed to the model 
        input_tensor = torch.tensor(contexts, dtype=torch.long) # [BATCH_SIZE, CONTEXT_SIZE]
        gold_labels = torch.tensor(targets, dtype=torch.long)   # [BATCH_SIZE]
        
        #TODO
        my_language_model.zero_grad() #pourquoi fait-on ça ?
        log_probs = my_language_model(input_tensor) #[BATCH_SIZE,VOCAB_SIZE]
        
        loss = loss_function(log_probs, gold_labels) 
        epoch_loss += loss.item()
        loss.backward()
        
        optimizer.step()
        #FIN TODO
    # end of handling of this batch
    
    train_losses.append(epoch_loss)
print(train_losses)


4.151449203491211
8.390563011169434
12.590409278869629
16.662914276123047
20.791609287261963
25.066925525665283
29.323373317718506
33.68824815750122
37.89822769165039
42.29113149642944
46.42962312698364
50.497735023498535
54.38040065765381
58.41500520706177
62.31788682937622
66.70347595214844
71.05141592025757
75.33576583862305
79.50575065612793
83.73987197875977
87.77950191497803
92.2821536064148
3.9292654991149902
7.846031904220581
11.40357518196106
15.167227506637573
19.04224157333374
23.0201678276062
26.909029006958008
30.562878131866455
34.37788772583008
38.32944869995117
42.26494264602661
46.02215886116028
50.152562856674194
54.432560205459595
57.987597942352295
61.844754219055176
65.2795717716217
69.10372757911682
73.07127261161804
77.23184180259705
81.30674242973328
85.92892956733704
3.4605212211608887
6.974862575531006
10.878645658493042
14.634712219238281
17.876873016357422
21.528829097747803
25.284932613372803
28.944629669189453
32.733460664749146
36.374852418899536
40.07067

19.162797927856445
23.078232526779175
26.25992202758789
30.246090412139893
33.28753590583801
36.93803644180298
40.37679982185364
44.00479865074158
47.51496481895447
50.427765130996704
54.10487699508667
57.12611627578735
60.251197814941406
63.33987474441528
66.60127830505371
70.17320895195007
74.27564835548401
2.901876926422119
6.581538438796997
9.544856786727905
12.72333836555481
15.84523057937622
18.854832649230957
21.917694568634033
25.199500799179077
28.460869312286377
30.85666298866272
33.55369710922241
36.25322699546814
38.934126138687134
41.810322523117065
45.059988021850586
48.24303197860718
51.056434631347656
54.77275466918945
57.96006751060486
61.24719524383545
65.30588150024414
68.69584894180298
3.064382553100586
6.0829808712005615
8.619710206985474
10.826017141342163
13.590502262115479
15.840856552124023
19.069714069366455
21.84925866127014
24.092647552490234
27.806498050689697
31.590261220932007
34.73227095603943
37.869872093200684
40.52084970474243
43.30073690414429
46.307

### To go further

- display the learning curve on the train set
- use of a dev set to tune the number of epochs using early stopping 
- handling of unknown words
- ...