In [3]:
import collections
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import numpy as np

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
cd /content/drive/MyDrive/projet\ pstaln/Complex-Word_identification-main/src

/content/drive/MyDrive/projet pstaln/Complex-Word_identification-main/src


In [6]:
from util import read_data

train_data_file = '../data/cwi_training/cwi_training.txt'
test_data_file = '../data/cwi_testing_annotated/cwi_testing_annotated.txt'
output_file = '../output/test.txt'

train_sentences, train_words, train_label = read_data(train_data_file)
test_sentences, test_words, test_label = read_data(test_data_file)

## On concaténe les train_sentences et les test_sentences dans une seule liste

In [7]:
sentences = train_sentences + test_sentences

sentences_labels = train_label + test_label

In [10]:
len(train_sentences) ,len(test_sentences)

(2237, 88221)

Ici, on va utiliser que des phrases différents.

Si on a quatres phrases avec leurs labels successives :   
['il','joue','avec','ses',amis'] ----> [0,1,0,0,0]

['il','joue','avec','ses',amis'] ----> [0,0,0,0,1]

['je','vais','faire','les','courses'] ----> [0,1,0,0,0]

['je','vais','faire','les','courses'] ----> [0,0,0,0,1]

On va garder que deux phrases avec leurs labels :

['il','joue','avec','ses',amis'] ----> [0,1,0,0,1]

['je','vais','faire','les','courses'] ----> [0,1,0,0,1]

In [11]:
list_of_sentences = [sent.split() for sent in sentences ]

texts = []
labels = []
new_label = [0 for k in list_of_sentences[0]]

for i in range(0,len(list_of_sentences)-1) :

  for j in range(len(list_of_sentences[i])) :
    if sentences_labels[i][j] == 1 :
      new_label[j] = 1
      
  if list_of_sentences[i] != list_of_sentences[i+1] :
    texts.append(list_of_sentences[i])
    labels.append(new_label)
    new_label = [0 for k in list_of_sentences[i+1]]

In [12]:
print(texts[10])
print(labels[10])
print('-------------------------------------------------------------------------------------------------------------------------')
print(texts[150])
print(labels[150])


['The', 'Pale', 'Kangaroo', 'Mouse', 'burrows', 'only', 'in', 'fine', 'sand', ',', 'while', 'the', 'Dark', 'Kangaroo', 'Mouse', 'prefers', 'fine', ',', 'gravelly', 'soils', 'but', 'may', 'also', 'burrow', 'in', 'sand', 'or', 'sandy', 'soil', '.']
[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0]
-------------------------------------------------------------------------------------------------------------------------
[0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0]


In [13]:
print(texts[0])
labels[0]

['In', '1832', 'his', 'family', 'emigrated', 'thence', 'to', 'Belleville', ',', 'Ontario', ',', 'where', 'he', 'apprenticed', 'with', 'the', 'printer', 'at', 'the', 'town', 'newspaper', ',', 'The', 'Belleville', 'Intelligencer', '.']


[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [14]:
len(texts) , len(labels)

(9169, 9169)

## On va mettre les labels dans un dictionnaire

In [15]:
label_vocab = collections.defaultdict(lambda: len(label_vocab))
label_vocab['<eos>'] = 0

int_labels = []
for label in labels:
    int_labels.append([label_vocab[token] for token in label])

print(int_labels[12])
print(label_vocab)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
defaultdict(<function <lambda> at 0x7f1e4ae809e0>, {'<eos>': 0, 0: 1, 1: 2})


## On va mettre les mots dans un dictionnaire pour les utiliser comme des entiers 

In [16]:
vocab = collections.defaultdict(lambda: len(vocab))
vocab['<eos>'] = 0

int_texts = []
for text in texts:
    int_texts.append([vocab[token.lower()] for token in text])

print(int_texts[12])
len(vocab)

[189, 199, 200, 201, 44, 202, 36, 203, 9, 204, 15, 205, 9, 156, 206, 9, 36, 207, 208, 209, 44, 15, 210, 21]


24028

In [17]:
rev_label_vocab = {y: x for x, y in label_vocab.items()}
rev_vocab = {y: x for x, y in vocab.items()}

In [18]:
collections.Counter ([len(h) for h in int_texts])  #dictionnaire qui compte les longueurs des phrases  

Counter({19: 12,
         20: 1150,
         21: 1056,
         22: 1038,
         23: 979,
         24: 911,
         25: 847,
         26: 710,
         27: 654,
         28: 615,
         29: 543,
         30: 498,
         31: 29,
         32: 15,
         33: 16,
         34: 20,
         35: 18,
         36: 15,
         37: 10,
         38: 8,
         39: 18,
         40: 7})

# Puisque la plupart des phrases ont une longueur inférieur à 30 , donc on va utiliser 30 comme longueur maximale.

In [20]:
max_len = 30
batch_size = 64 
embed_size = 300 
hidden_size = 128

In [21]:
X = torch.zeros(len(int_texts), max_len).long()
Y = torch.zeros(len(int_labels), max_len).long()

for i, (text, label) in enumerate(zip(int_texts, int_labels)):
    length = min(max_len, len(text))
    X[i,:length] = torch.LongTensor(text[:length])
    Y[i,:length] = torch.LongTensor(label[:length])

print(X[15])
print(Y[15])

tensor([233, 234, 235,   9,  15, 236, 237,  64, 238, 239, 240, 114,  61, 241,
        242,   9, 243, 244,  23, 245, 246,   9, 247, 175,   1, 248, 249,  15,
        250,  21])
tensor([1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 2, 1, 1, 1, 1])


In [22]:
X.shape

torch.Size([9169, 30])

## On va diviser la dataset en train_set, valid_set et test_set 

In [23]:
X_train = X[:7000]
Y_train = Y[:7000]
X_valid = X[7000:8000] 
Y_valid = Y[7000:8000]
X_test = X[8000:] 
Y_test = Y[8000:]

## On charger les X_train et X_valid dans des dataloader pour les generer en batchs dans l'entrainement

In [24]:
train_set = TensorDataset(X_train, Y_train)
valid_set = TensorDataset(X_valid, Y_valid)

train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_set, batch_size=batch_size)

## Le modéle : 

In [45]:
class RNN(nn.Module):
    def __init__(self, label_vocab):
        super().__init__()
        self.embed = nn.Embedding(len(vocab), embed_size, padding_idx=vocab['<eos>'])
        self.rnn = nn.GRU(embed_size, hidden_size, bias=False, num_layers=1, bidirectional=False, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.decision = nn.Linear(hidden_size * 1 * 1, len(label_vocab))
        
    def forward(self, x):
        embed = self.embed(x)
        output, hidden = self.rnn(embed)
        output = self.decision(self.dropout(output))
        
        bias = torch.zeros([output.shape[0],30,3])

        k1 = np.random.choice(np.arange(output.shape[0]))
        k2 = np.random.choice(np.arange(output.shape[0]))
        k3 = np.random.choice(np.arange(output.shape[0]))

        for i in range(30):
          bias[k1][i][2] = 4
          bias[k2][i][2] = 4
          bias[k3][i][2] = 4

        return output + bias

rnn_model = RNN( label_vocab)

In [46]:
def perf(model, loader):
    criterion = nn.CrossEntropyLoss()
    model.eval()
    total_loss = correct = num_loss = num_perf = 0
    for x, y in loader:
      with torch.no_grad():
        y_scores = model(x)
        loss = criterion(y_scores.view(y.size(0) * y.size(1), -1), y.view(y.size(0) * y.size(1)))
        y_pred = torch.max(y_scores, 2)[1]
        mask = (y != 0)
        correct += torch.sum((y_pred.data == y) * mask)
        total_loss += loss.item()
        num_loss += len(y)
        num_perf += torch.sum(mask).item()
    return total_loss / num_loss, correct.item() / num_perf

perf(rnn_model, valid_loader)

(0.02055590772628784, 0.2598529290258614)

In [47]:
def fit(model, epochs, train_loader, valid_loader):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(filter(lambda param: param.requires_grad, model.parameters()))
    for epoch in range(epochs):
        model.train()
        total_loss = num = 0
        for x, y in train_loader:
            optimizer.zero_grad()
            y_scores = model(x)
            loss = criterion(y_scores.view(y.size(0) * y.size(1), -1), y.view(y.size(0) * y.size(1)))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num += len(y)
        print(epoch, total_loss / num, *perf(model, valid_loader))

fit(rnn_model, 10, train_loader, valid_loader)

0 0.003726558431982994 0.0016889898255467415 0.9693877551020408
1 0.0016404710300266742 0.0014573358818888663 0.9721969759563744
2 0.0014062518030405044 0.001399023100733757 0.9715359828141783
3 0.0012175223167453493 0.0014173535630106926 0.9715359828141783
4 0.0010521938678409372 0.0014177590385079384 0.9728992811699578
5 0.0008806480323629719 0.0015140447616577147 0.97165991902834
6 0.000704438866781337 0.001635498858988285 0.9733124018838305
7 0.0005523494274488518 0.0017714847326278686 0.9701726844583988
8 0.00044461939364139524 0.0019332203716039657 0.96909857060233
9 0.0003546197298648102 0.002036939099431038 0.9696769396017516


## On calcule l'accuracy des prédictions des mots complexes dans la test_dataset

In [48]:
y_pred = torch.max(rnn_model(X_test), 2)[1]

total_number_of_words = y_pred.shape[0] * y_pred.shape[1]

train_acc = torch.sum(y_pred == Y_test)

final_train_acc = train_acc/total_number_of_words

print('accuracy = ',final_train_acc)

accuracy =  tensor(0.9769)


In [49]:
vrai_positifs = 0
faux_positifs = 0
faux_negatifs = 0
for i in range(y_pred.shape[0]):
  for j in range(y_pred.shape[1]):
    if (y_pred[i][j] == Y_test[i][j]) and (y_pred[i][j] == 2): vrai_positifs += 1
    if (y_pred[i][j] != Y_test[i][j]) and (y_pred[i][j] == 2): faux_positifs += 1
    if (y_pred[i][j] != Y_test[i][j]) and (y_pred[i][j] != 2): faux_negatifs += 1

recall = vrai_positifs/(vrai_positifs + faux_negatifs)
precision = vrai_positifs/(vrai_positifs + faux_positifs)
F1_score = 2*recall*precision/(recall + precision)

print('recall = ',recall)
print('precision = ',precision)
print('F1_score = ',F1_score)

recall =  0.12887438825448613
precision =  0.22253521126760564
F1_score =  0.16322314049586775


## On va voir la prédiction des mots complexes pour quelques phrases :

In [50]:
for k in range(30,55):
  print('the sentence is :'  ,' '.join([rev_vocab[int(X_test[k][i])] for i in range(len(X_test[k])) if rev_vocab[int(X_test[k][i])] != '<eos>']))

  predicted_labels = torch.max(rnn_model(X_test[k:k+1]) , 2)[1]
  
  mots_complexes = []
  for i in range(30):

    if int(predicted_labels[0][i]) == 2 :
      mots_complexes.append(rev_vocab[int(X_test[k][i])])
      
  print('complex words are :'  ,' , '.join(mots_complexes) ) 
  print('------------------------------------------------------------------------------------------- ') 

the sentence is : may he be guard , may he be shield , for ever may he bless and wield o graciously all deeds of thine , thou dearest country mine !
complex words are : guard , shield , graciously , deeds , thou , country , mine
------------------------------------------------------------------------------------------- 
the sentence is : it is found in algeria , egypt , iraq , israel , jordan , lebanon , libya , morocco , saudi arabia , syria , tunisia , and turkey .
complex words are : 
------------------------------------------------------------------------------------------- 
the sentence is : its natural habitats are temperate forests , subtropical or tropical dry shrubland , mediterranean-type shrubby vegetation , and rocky areas .
complex words are : shrubland
------------------------------------------------------------------------------------------- 
the sentence is : when the gaelic athletic association was founded in 1884 the english-origin name `` hurling '' was given to the 