## Installation des dépendances.

In [1]:
# ! pip install transformers

In [64]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

### Bert Tokenizer

In [39]:
# Instantiate the tokenizer from the pretrained 'bert-base-uncased' name,
# explicitly requesting lower-casing of the input text.
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased",
    do_lower_case=True,
)

In [74]:
# Hyper-parameters of the notebook, grouped in one place.
MAX_LEN = 50            # sequence length handed to the dataset / tokenizer
TRAIN_BATCH_SIZE = 8    # presumably the training DataLoader batch size -- TODO confirm
VALID_BATCH_SIZE = 4    # presumably the validation DataLoader batch size -- TODO confirm
EPOCHS = 1
LEARNING_RATE = 1e-5

## Les Données

In [40]:
# Locations of the labelled question files (training split, then test split).
train_data_path, test_data_path = (
    "./Version 1/data/data_questions/questions-train.txt",
    "./Version 1/data/data_questions/questions-test.txt",
)

### Les classes de nos questions

In [41]:
# Question categories mapped to integer class ids; the position of a label in
# the tuple below is its id (ABBREVIATION -> 0, ..., TEMPORAL -> 8).
classes = {
    label: class_id
    for class_id, label in enumerate((
        'ABBREVIATION',
        'DEFINITION',
        'DESCRIPTION',
        'ENTITY',
        'LOCATION',
        'ORGANIZATION',
        'PERSON',
        'QUANTITY',
        'TEMPORAL',
    ))
}

In [42]:
def get_data(datapath):
    """Load labelled questions from a text file.

    Each non-blank line is read as ``LABEL question text``: the text before
    the first space is looked up in the module-level ``classes`` mapping and
    everything after that space is kept as the question.

    Args:
        datapath: Path of the file to read.

    Returns:
        A tuple ``(X, Y)`` where ``X`` is the list of question strings
        (without the trailing newline) and ``Y`` is the list of integer
        class ids, both in file order.

    Raises:
        KeyError: If a line starts with a label that is not in ``classes``.
    """
    X = []
    Y = []
    with open(datapath, 'r') as file:
        # Iterate the file object directly so lines are streamed rather than
        # loaded all at once with readlines().
        for line in file:
            # Drop the line terminator so it does not end up in the question.
            line = line.rstrip("\n")
            # A blank line has no label; skip it instead of raising KeyError.
            if not line.strip():
                continue
            # Split once on the first space: label on the left, question on
            # the right (identical to joining the remaining pieces back).
            label, _, question = line.partition(" ")
            Y.append(classes[label])
            X.append(question)
    return X, Y

In [43]:
# Read the training questions with their class ids, then run the BERT
# tokenizer over every question.
X, Y = get_data(train_data_path)
sentences = list(map(tokenizer.tokenize, X))

#### Trouver la taille de la question la plus longue

In [45]:
# sentence_max_length = 0 

# for sentence in sentences:
#     sentence_max_len = max(sentence_max_len, len(sentence))

In [76]:
# def mask_inputs_for_bert(sentences ,sentence_max_length):
#     # Tokenize all of the sentences and map the tokens to their word IDs.
#     input_ids = []
#     attention_masks = []
#     i = 0 
#     for sentence in sentences:
#         # `encode_plus` will:
#         # (1) Tokenize the sentence.
#         # (2) Prepend the `[CLS]` token to the start.
#         # (3) Append the `[SEP]` token to the end.
#         # (4) Map tokens to their IDs.
#         # (5) Pad or truncate the sentence to `max_length`.
#         # (6) Create attention masks for [PAD] tokens.

#         if(i<3):
#             print("Question", sentence)

#             encoded_dict = tokenizer.encode_plus(
#                             sentence, # Sentence to encode.
#                             add_special_tokens = True, # Add '[CLS]' and '[SEP]'
#                             max_length = sentence_max_len, # Pad & truncate all sentences.
#                             pad_to_max_length = True,
#                             return_attention_mask = True, # Construct attn. masks.
#                             )
#         if(i<3):

#             print("dict",encoded_dict['input_ids'])
#             # Add the encoded sentence to the list.

#             input_ids.append(encoded_dict['input_ids'])

#             # and its attention mask (simply differentiates padding from non-padding).
#             attention_masks.append(encoded_dict['attention_mask'])

#             i = i+1

#     #         convert to tensor and return

#             input_ids = torch.tensor(input_ids)
#             attention_masks = torch.tensor(attention_masks)
#             return input_ids,attention_masks


In [65]:
# X_train, train_mask = mask_inputs_for_bert(X , sentence_max_length)
# y_train = torch.tensor(Y)

In [67]:
# bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels = 2)

In [68]:
# loss = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(params =  bert_model.parameters(), lr=1e-05)

In [69]:
# bert_model.compile(loss=loss, optimizer=optimizer)

In [52]:
# def loss_fn(outputs, targets):
#     return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [51]:
# optimizer = torch.optim.Adam(params =  bert_model.parameters(), lr=1e-05)

In [54]:
# def train(epoch):
#     bert_model.train()
#     for _,data in enumerate(training_loader, 0):
#         ids = data['ids'].to(device, dtype = torch.long)
#         mask = data['mask'].to(device, dtype = torch.long)
#         token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
#         targets = data['targets'].to(device, dtype = torch.float)

#         outputs = model(ids, mask, token_type_ids)

#         optimizer.zero_grad()
#         loss = loss_fn(outputs, targets)
#         if _%5000==0:
#             print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

In [90]:
class CustomDataset(Dataset):
    """Dataset that encodes one question per item with the given tokenizer.

    The constructor only stores its arguments; encoding happens lazily in
    ``__getitem__`` each time an item is requested.
    """

    def __init__(self, X, y, tokenizer, max_len):
        """Keep the questions ``X``, targets ``y``, tokenizer and ``max_len``."""
        self.sentences = X
        self.targets = y
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of stored questions."""
        return len(self.sentences)

    def __getitem__(self, index):
        """Encode the question at ``index`` and return it with its target.

        Returns:
            A dict with long tensors under ``'ids'``, ``'mask'`` and
            ``'token_type_ids'`` (built from the tokenizer's ``input_ids``,
            ``attention_mask`` and ``token_type_ids`` outputs) and a float
            tensor under ``'targets'``.
        """
        # NOTE(review): recent transformers releases report
        # `pad_to_max_length` as deprecated -- confirm against the installed
        # version before changing it.
        encoding = self.tokenizer.encode_plus(
            self.sentences[index],
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )

        # Output key -> tokenizer field, in the order the item exposes them.
        long_fields = {
            'ids': encoding['input_ids'],
            'mask': encoding['attention_mask'],
            'token_type_ids': encoding["token_type_ids"],
        }
        item = {
            key: torch.tensor(values, dtype=torch.long)
            for key, values in long_fields.items()
        }
        # NOTE(review): the target is emitted as float -- confirm this is
        # what the loss function in use expects.
        item['targets'] = torch.tensor(self.targets[index], dtype=torch.float)
        return item

In [96]:
# Wrap the training questions and their class ids; items are encoded with the
# tokenizer to MAX_LEN when accessed.
training_set = CustomDataset(X=X, y=Y, tokenizer=tokenizer, max_len=MAX_LEN)

In [None]:
training_set.

[['how', 'far', 'is', 'it', 'from', 'denver', 'to', 'aspen', '?'],
 ['what', 'county', 'is', 'modest', '##o', ',', 'california', 'in', '?'],
 ['who', 'was', 'galileo', '?'],
 ['what', 'is', 'an', 'atom', '?'],
 ['when', 'did', 'hawaii', 'become', 'a', 'state', '?'],
 ['how', 'tall', 'is', 'the', 'sears', 'building', '?'],
 ['why', 'does', 'the', 'moon', 'turn', 'orange', '?'],
 ['what', 'is', 'autism', '?'],
 ['what', 'city', 'had', 'a', 'world', 'fair', 'in', '1900', '?'],
 ['what', 'person', "'", 's', 'head', 'is', 'on', 'a', 'dime', '?'],
 ['what',
  'is',
  'the',
  'average',
  'weight',
  'of',
  'a',
  'yellow',
  'labrador',
  '?'],
 ['who',
  'was',
  'the',
  'first',
  'man',
  'to',
  'fly',
  'across',
  'the',
  'pacific',
  'ocean',
  '?'],
 ['when', 'did', 'idaho', 'become', 'a', 'state', '?'],
 ['what',
  'is',
  'the',
  'life',
  'expect',
  '##ancy',
  'for',
  'cricket',
  '##s',
  '?'],
 ['who',
  'developed',
  'the',
  'va',
  '##cci',
  '##nation',
  'against',