In [1]:
# Author: Robert Guthrie

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# TODO: use fastText for the character embeddings

In [2]:
################# Prepare data #################
def prepare_sequence(seq, to_ix):
    """Convert one item, or a list of items, into a tensor of indices.

    Args:
        seq: A single key of ``to_ix`` (e.g. one word or one tag), or a
            ``list`` of such keys (e.g. a tokenised sentence).
        to_ix: Mapping from item to integer index.

    Returns:
        A 1-D ``torch.long`` tensor holding one index for a single key, or
        one index per entry when ``seq`` is a list.

    Raises:
        KeyError: If an item is missing from ``to_ix``.
    """
    # A list is unhashable and therefore can never itself be a dict key, so
    # treating it as a sequence of keys is unambiguous and keeps the
    # single-key behaviour unchanged.
    if isinstance(seq, list):
        idxs = [to_ix[item] for item in seq]
    else:
        idxs = [to_ix[seq]]
    return torch.tensor(idxs, dtype=torch.long)


# Each non-blank line of the corpus is split on tabs: field 0 is the token
# (it may contain spaces) and field 1 is its part-of-speech tag.
training_data = []
word_to_ix = {}
tag_to_ix = {}

# The context manager closes the file even if parsing fails part-way through.
with open('pos_reference.txt.lima', 'r') as data_file:
    for line_number, line in enumerate(data_file, start=1):
        # Skip blank lines.
        if line.isspace():
            continue

        # Drop the trailing newline, then separate the token from its tag.
        wordAndToken = line.rstrip('\n').split('\t')
        if len(wordAndToken) < 2:
            # Fail with the offending line instead of a bare IndexError.
            raise ValueError(
                "pos_reference.txt.lima, line %d: expected 'token<TAB>tag', got %r"
                % (line_number, line))

        # A token seen for the first time gets the next free index, i.e. the
        # current size of the dictionary.
        word_to_ix.setdefault(wordAndToken[0], len(word_to_ix))

        # Same scheme for the tags.
        tag_to_ix.setdefault(wordAndToken[1], len(tag_to_ix))

        training_data.append(wordAndToken)

In [3]:
# Show the tag -> index mapping built while reading the corpus.
print(tag_to_ix)

{'PROPN': 0, 'COMMA': 1, 'ADJ': 2, 'AUX': 3, 'VERB': 4, 'DET': 5, 'NOUN': 6, 'ADP': 7, 'SENT': 8, 'CONJ': 9, 'ADV': 10, 'PART': 11, 'PRON': 12, 'SCONJ': 13, 'COLON': 14, 'OQU': 15, 'QUOT': 16, 'NUM': 17, 'SYM': 18, 'OPAR': 19, 'CPAR': 20}


In [4]:
#print(word_to_ix)

In [5]:
# Preview the first samples. Slicing (rather than indexing 0..9) keeps this
# cell from raising IndexError when the corpus has fewer than ten entries.
for entry in training_data[:10]:
    print(entry)

['Pierre Vinken', 'PROPN']
[',', 'COMMA']
['61 years old', 'ADJ']
[',', 'COMMA']
['will', 'AUX']
['join', 'VERB']
['the', 'DET']
['board', 'NOUN']
['as', 'ADP']
['a', 'DET']


In [6]:
################# Create the model #################
class LSTMTagger(nn.Module):
    """Tagger made of an embedding layer, an LSTM and a linear output layer.

    The forward pass returns, for every input position, the log-probability
    of each tag.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        """Build the three layers.

        Args:
            embedding_dim: Size of each word embedding vector.
            hidden_dim: Size of the LSTM hidden state.
            vocab_size: Number of rows in the embedding table.
            tagset_size: Number of output scores per input position.
        """
        super().__init__()
        self.hidden_dim = hidden_dim

        # Word index -> dense vector of size embedding_dim.
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # Consumes the embeddings and produces one hidden state of size
        # hidden_dim per input position.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # Projects each hidden state onto the tag space.
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        """Score every tag for every position of ``sentence``.

        Args:
            sentence: Tensor of word indices; expected to be 1-D, as produced
                by ``prepare_sequence``.

        Returns:
            Tensor with ``len(sentence)`` rows; each row holds the
            log-softmax scores over the tag set.
        """
        n_steps = len(sentence)
        # The LSTM is fed the sequence as a batch of one: (n_steps, 1, features).
        embedded = self.word_embeddings(sentence).view(n_steps, 1, -1)
        hidden_states = self.lstm(embedded)[0]
        # Flatten the batch dimension away before the linear projection.
        logits = self.hidden2tag(hidden_states.view(n_steps, -1))
        return F.log_softmax(logits, dim=1)



In [7]:
# Embedding and hidden sizes. These will usually be more like 32 or 64
# dimensional; they are kept small here so that the weights can be inspected
# as training progresses.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

# Seed PyTorch's random number generator before the model is created so that
# the randomly initialised weights, and therefore the printed scores, are
# reproducible from one run of the notebook to the next.
SEED = 1
torch.manual_seed(SEED)

################# Train the model #################
# One embedding row per distinct word and one output score per distinct tag.
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
# NLLLoss consumes log-probabilities, which the model's log_softmax provides.
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [8]:
# Inspect the first training sample: a [token, tag] pair.
training_data[0]

['Pierre Vinken', 'PROPN']

In [9]:
# See what the scores are before training.
# Element i,j of the output is the score for tag j at input position i.
# prepare_sequence wraps the single token in a one-element tensor, so the
# output has exactly one row here.
# No gradients are needed for inspection, so the code is wrapped in torch.no_grad().
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    print(inputs)
    tag_scores = model(inputs)
    print()
    print("=> Scores before training of the tags affected to each word")
    print(tag_scores)

tensor([0])

=> Scores before training of the tags affected to each word
tensor([[-2.9834, -3.1325, -3.3245, -2.7876, -3.4113, -2.9564, -3.2155, -2.9738,
         -3.3506, -2.9295, -3.0804, -2.8769, -2.8225, -2.7926, -2.9356, -3.0541,
         -3.4316, -3.0646, -3.2359, -2.7259, -3.3126]])


In [10]:
# Number of passes over the training data. Defined once so that the loop bound
# and the progress message cannot drift apart.
NUM_EPOCHS = 300

for epoch in range(NUM_EPOCHS):
    print("Epoch : " + str(epoch) + '/' + str(NUM_EPOCHS) + '\n')
    # Every entry of training_data is a [token, tag] pair, so "sentence" is a
    # single token and "tags" is its single tag.
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()



Epoch : 0/300

Epoch : 1/300

Epoch : 2/300

Epoch : 3/300

Epoch : 4/300

Epoch : 5/300

Epoch : 6/300

Epoch : 7/300

Epoch : 8/300

Epoch : 9/300

Epoch : 10/300

Epoch : 11/300

Epoch : 12/300

Epoch : 13/300

Epoch : 14/300

Epoch : 15/300

Epoch : 16/300

Epoch : 17/300

Epoch : 18/300

Epoch : 19/300

Epoch : 20/300

Epoch : 21/300

Epoch : 22/300

Epoch : 23/300

Epoch : 24/300

Epoch : 25/300

Epoch : 26/300

Epoch : 27/300

Epoch : 28/300

Epoch : 29/300

Epoch : 30/300

Epoch : 31/300

Epoch : 32/300

Epoch : 33/300

Epoch : 34/300

Epoch : 35/300

Epoch : 36/300

Epoch : 37/300

Epoch : 38/300

Epoch : 39/300

Epoch : 40/300

Epoch : 41/300

Epoch : 42/300

Epoch : 43/300

Epoch : 44/300

Epoch : 45/300

Epoch : 46/300

Epoch : 47/300

Epoch : 48/300

Epoch : 49/300

Epoch : 50/300

Epoch : 51/300

Epoch : 52/300

Epoch : 53/300

Epoch : 54/300

Epoch : 55/300

Epoch : 56/300

Epoch : 57/300

Epoch : 58/300

Epoch : 59/300

Epoch : 60/300

Epoch : 61/300

Epoch : 62/300

Ep

In [14]:
# Re-display the tag -> index mapping: column j of a score tensor belongs to
# the tag whose index is j.
print(tag_to_ix)

{'PROPN': 0, 'COMMA': 1, 'ADJ': 2, 'AUX': 3, 'VERB': 4, 'DET': 5, 'NOUN': 6, 'ADP': 7, 'SENT': 8, 'CONJ': 9, 'ADV': 10, 'PART': 11, 'PRON': 12, 'SCONJ': 13, 'COLON': 14, 'OQU': 15, 'QUOT': 16, 'NUM': 17, 'SYM': 18, 'OPAR': 19, 'CPAR': 20}


In [21]:
# See what the scores are after training
with torch.no_grad():
    # Position (in training_data) of the [token, tag] pair to score. Kept in
    # one place so the printed label always matches the sample actually used.
    SAMPLE_INDEX = 4
    sample = training_data[SAMPLE_INDEX]
    inputs = prepare_sequence(sample[0], word_to_ix)
    print()
    print('=> The sample to analyze (entry ' + str(SAMPLE_INDEX) + ' of the training data):')
    print(sample)

    print()
    print("=> Training data: each word is assigned to a unique index:")
    # The full word_to_ix mapping is too large to print usefully, so only the
    # index tensor of the sample's token is shown.
    print(inputs)
    tag_scores = model(inputs)

    # Row i, column j of tag_scores is the log-probability of tag j for input
    # position i (the model ends with log_softmax). The predicted tag is the
    # column with the highest value; tag_to_ix gives the column -> tag mapping.
    # A single token is scored here, so the tensor has exactly one row.

    print()
    print("=> Scores after training of the tags affected to each word of the sentence to analyze:")
    print(tag_scores)


=> The sentence to analyze (first sentence of the Training data):
['will', 'AUX']

=> Training data: each word is assigned to a unique index:

=> Scores after training of the tags affected to each word of the sentence to analyze:
tensor([[ -2.0369, -42.5576,  -7.7420,  -0.9809, -17.8532, -20.3323,  -3.7612,
         -10.9029,  -4.9504,  -1.4733,  -6.8486, -10.4121,  -1.7218,  -7.9331,
          -7.1646,  -3.9962, -27.9692,  -3.3447, -17.3662, -10.8755, -11.9297]])


In [11]:
# In the tensor above, the highest score (the log-probability closest to zero) corresponds to the most probable tag.