# **Setup**

In [22]:
!pip install transformers



In [23]:
!wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt

--2021-03-20 15:25:43--  https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.97.181
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.97.181|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 231508 (226K) [text/plain]
Saving to: ‘bert-base-uncased-vocab.txt.1’


2021-03-20 15:25:43 (917 KB/s) - ‘bert-base-uncased-vocab.txt.1’ saved [231508/231508]



In [24]:
# Mount Google Drive at /content/drive (this cell needs the `google.colab` package).
from google.colab import drive
drive.mount('/content/drive')
# Base directory on the mounted Drive; later cells append "save/model.h5" to it.
path="/content/drive/My Drive/Colab Notebooks/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [25]:
import os
import re
import json
import string
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from transformers import TFBertModel
from tokenizers import BertWordPieceTokenizer

In [26]:
# WordPiece tokenizer built from the vocabulary file fetched by the wget cell; text is lower-cased.
tokenizer = BertWordPieceTokenizer("bert-base-uncased-vocab.txt", lowercase=True)

# SQuAD 2.0 train / dev splits.
train_data_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json"
eval_data_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json"

# get_file returns the local path of each downloaded file.
train_path = keras.utils.get_file("train.json", train_data_url)
eval_path = keras.utils.get_file("eval.json", eval_data_url)

# Fixed sequence length: shorter samples are zero-padded to it, longer ones are
# marked as skipped (see SquadDataSample.create_sample_features).
max_len = 400

# JSON is UTF-8 by specification; pass the encoding explicitly so loading does not
# depend on the platform's default locale encoding.
with open(train_path, encoding="utf-8") as f:
    raw_train_data = json.load(f)

with open(eval_path, encoding="utf-8") as f:
    raw_eval_data = json.load(f)

# False -> train/evaluate on fixed-size subsets; True -> use every sample.
train_all = False


# **Preprocessing data**

In [27]:
def convert_squad2data(raw_data):
    """Flatten a SQuAD-format dict into a list of preprocessed SquadDataSample objects.

    Args:
        raw_data: dict with the SQuAD layout
            ``{"data": [{"paragraphs": [{"context": ..., "qas": [...]}]}]}``.

    Returns:
        One SquadDataSample per question, in file order, each with ``preprocess()``
        already called. Samples that could not be used have ``skip`` set to True
        but are still included in the list.
    """
    samples = []
    for article in raw_data["data"]:
        for paragraph in article["paragraphs"]:
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                question = qa["question"]
                answers = qa["answers"]
                if answers:
                    # The first listed answer provides the training label; every
                    # answer text is kept alongside it for later display.
                    first_text = answers[0]["text"]
                    every_text = [candidate["text"] for candidate in answers]
                    first_start = answers[0]["answer_start"]
                    sample = SquadDataSample(question, context, first_start, first_text, every_text)
                else:
                    # No answers listed: build the sample with the default (empty) answer fields.
                    sample = SquadDataSample(question, context)
                sample.preprocess()
                samples.append(sample)
    return samples


def create_input_dataset(squad_samples):
    """Stack the features of all non-skipped samples into model inputs and targets.

    Args:
        squad_samples: iterable of objects exposing ``skip`` plus the attributes
            ``input_ids``, ``token_type_ids``, ``attention_mask``,
            ``start_token_idx`` and ``end_token_idx``.

    Returns:
        ``(x, y)`` where ``x`` is ``[input_ids, token_type_ids, attention_mask]``
        and ``y`` is ``[start_token_idx, end_token_idx]``, each a NumPy array with
        one row/entry per sample whose ``skip`` flag is falsy.
    """
    feature_names = (
        "input_ids",
        "token_type_ids",
        "attention_mask",
        "start_token_idx",
        "end_token_idx",
    )
    usable = [sample for sample in squad_samples if not sample.skip]
    columns = {
        name: np.array([getattr(sample, name) for sample in usable])
        for name in feature_names
    }

    x = [columns[name] for name in feature_names[:3]]
    y = [columns[name] for name in feature_names[3:]]
    return x, y

In [28]:
class SquadDataSample:
    """A single SQuAD question/context pair and the model features built from it.

    Construct it with the raw strings, then call ``preprocess()``. When ``skip`` is
    still False afterwards, ``input_ids``, ``token_type_ids``, ``attention_mask``,
    ``start_token_idx``, ``end_token_idx`` and ``context_token_to_char`` hold the
    features for this sample.

    Relies on the module-level ``tokenizer`` and ``max_len``.
    """

    def __init__(self, question, context, answer_start_idx=0, answer_text=None, all_answers=None):
        """Store the raw fields; no tokenization happens here.

        Args:
            question: question text.
            context: paragraph text the answer is taken from.
            answer_start_idx: character offset of the answer inside ``context``
                as given (before any whitespace normalisation).
            answer_text: answer string, or None when the question has no answer.
            all_answers: every accepted answer text, kept for display only.
        """
        self.question = question
        self.context = context
        self.answer_start_idx = answer_start_idx
        self.answer_text = answer_text
        self.all_answers = all_answers
        self.skip = False
        # Filled in by preprocess(). Defined up front so a sample that is skipped
        # early still exposes these attributes instead of raising AttributeError.
        self.input_ids = None
        self.token_type_ids = None
        self.attention_mask = None
        self.start_token_idx = None
        self.end_token_idx = None
        self.context_token_to_char = None

    def preprocess(self):
        """Tokenize the sample and compute its features and answer token span.

        Sets ``skip`` to True (and may return before the features are assigned)
        when the answer does not fit inside the context, when no context token
        overlaps the answer, or when the encoded pair is longer than ``max_len``.
        Questions without an answer get start/end token index 0.
        """
        raw_context = str(self.context)
        # Collapse every whitespace run to a single space.
        context = " ".join(raw_context.split())
        question = " ".join(str(self.question).split())

        tokenized_context, tokenized_question = self.tokenize_data(context, question)

        if self.answer_text is not None:
            answer = " ".join(str(self.answer_text).split())
            if self.answer_start_idx < 0:
                # A negative offset cannot point into the context.
                self.skip = True
                return
            # The stored offset refers to the raw context; translate it to the
            # whitespace-collapsed context that was actually tokenized.
            answer_start_idx = self._normalized_char_index(raw_context, self.answer_start_idx)
            answer_end_idx = answer_start_idx + len(answer)  # exclusive
            # The end is exclusive, so an answer that finishes on the last
            # character of the context (end == len) is still valid.
            if answer_end_idx > len(context):
                self.skip = True
                return
            # A token belongs to the answer when its character span overlaps
            # [answer_start_idx, answer_end_idx); zero-width spans never match.
            answer_token_idx = [
                idx
                for idx, (start, end) in enumerate(tokenized_context.offsets)
                if max(start, answer_start_idx) < min(end, answer_end_idx)
            ]
            if not answer_token_idx:
                self.skip = True
                return
            start_token_idx = answer_token_idx[0]
            end_token_idx = answer_token_idx[-1]
        else:
            # No answer: both labels point at the first token.
            start_token_idx = 0
            end_token_idx = 0

        attention_mask, input_ids, token_type_ids = self.create_sample_features(tokenized_context, tokenized_question)

        self.input_ids = input_ids
        self.token_type_ids = token_type_ids
        self.attention_mask = attention_mask
        self.start_token_idx = start_token_idx
        self.end_token_idx = end_token_idx
        # Character spans of the context tokens, relative to the normalised context.
        self.context_token_to_char = tokenized_context.offsets

    @staticmethod
    def _normalized_char_index(raw_text, raw_index):
        """Map a character offset in ``raw_text`` to the whitespace-collapsed text."""
        # If the offset lands on whitespace, move to the next visible character.
        while raw_index < len(raw_text) and raw_text[raw_index].isspace():
            raw_index += 1
        prefix = raw_text[:raw_index]
        collapsed = " ".join(prefix.split())
        # A single separating space precedes the target only when visible text
        # came before it and the raw prefix ended in whitespace.
        if collapsed and prefix[-1].isspace():
            return len(collapsed) + 1
        return len(collapsed)

    def create_sample_features(self, tokenized_context, tokenized_question):
        """Concatenate context and question ids and pad them to ``max_len``.

        The question's first id is dropped before concatenation (presumably the
        leading [CLS] added by the tokenizer — confirm). Context positions get
        token type 0, question positions 1, padding 0. If the pair is longer
        than ``max_len`` the sample is flagged as skipped and the lists are
        returned unpadded.

        Returns:
            ``(attention_mask, input_ids, token_type_ids)`` as Python lists.
        """
        input_ids = tokenized_context.ids + tokenized_question.ids[1:]
        token_type_ids = [0] * len(tokenized_context.ids) + [1] * len(tokenized_question.ids[1:])
        attention_mask = [1] * len(input_ids)
        padding_length = max_len - len(input_ids)
        if padding_length > 0:
            input_ids = input_ids + ([0] * padding_length)
            attention_mask = attention_mask + ([0] * padding_length)
            token_type_ids = token_type_ids + ([0] * padding_length)
        elif padding_length < 0:
            self.skip = True
        return attention_mask, input_ids, token_type_ids

    def tokenize_data(self, context, question):
        """Encode context and question separately with the module-level tokenizer."""
        tokenized_context = tokenizer.encode(context)
        tokenized_question = tokenizer.encode(question)
        return tokenized_context, tokenized_question


In [29]:
# Turn both raw SQuAD splits into lists of preprocessed samples.
train_squad_samples = convert_squad2data(raw_train_data)
eval_squad_samples = convert_squad2data(raw_eval_data)

# The printed counts are taken before create_input_dataset drops skipped samples.
if not train_all:
    # Subset mode: fixed-size slices; the test slice follows the eval slice
    # inside the eval split.
    n_train = 30000
    n_eval = 5000
    n_test = 600
    x_train, y_train = create_input_dataset(train_squad_samples[:n_train])
    x_eval, y_eval = create_input_dataset(eval_squad_samples[:n_eval])
    x_test, y_test = create_input_dataset(eval_squad_samples[n_eval:n_eval + n_test])

    print(f"num of train data used for train: {len(train_squad_samples[:n_train])}")
    print(f"num of eval data used for eval: {len(eval_squad_samples[:n_eval])}")
    # NOTE: the label below says "train data", but the slice comes from the eval split.
    print(f"num of train data used for test: {len(eval_squad_samples[n_eval:n_eval + n_test])}")
else:
    # Full mode: every sample is used and the eval split doubles as the test set.
    x_train, y_train = create_input_dataset(train_squad_samples)
    x_eval, y_eval = create_input_dataset(eval_squad_samples)
    x_test, y_test = x_eval, y_eval

    print(f"num of train data: {len(train_squad_samples)}")
    print(f"num of eval data: {len(eval_squad_samples)}")



num of train data used for train: 30000
num of eval data used for eval: 5000
num of train data used for test: 600


# **Create model**

In [30]:
def create_model(learning_rate=5e-5, decay=1e-6):
    """Build and compile the BERT span-prediction model.

    Loads the pretrained ``bert-base-uncased`` weights and adds two bias-free
    ``Dense(1)`` heads, each flattened and passed through a softmax, to produce
    the start and end outputs.

    Args:
        learning_rate: Adam learning rate.
        decay: value forwarded to Adam's ``decay`` argument.

    Returns:
        A compiled ``keras.Model`` taking ``[input_ids, token_type_ids,
        attention_mask]`` (each of length ``max_len``) and returning
        ``[start_probs, end_probs]``.
    """
    bert_model = TFBertModel.from_pretrained("bert-base-uncased")

    input_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    token_type_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32)
    # [0] takes the first output of the BERT main layer
    # (presumably the per-token hidden states — confirm against transformers docs).
    embedding = bert_model.bert(
        input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
    )[0]

    start_logits = layers.Dense(1, name="start_logit", use_bias=False)(embedding)
    start_logits = layers.Flatten()(start_logits)

    end_logits = layers.Dense(1, name="end_logit", use_bias=False)(embedding)
    end_logits = layers.Flatten()(end_logits)

    start_probs = layers.Activation(keras.activations.softmax)(start_logits)
    end_probs = layers.Activation(keras.activations.softmax)(end_logits)

    model = keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask],
        outputs=[start_probs, end_probs],
    )
    # The heads already apply softmax, hence from_logits=False; the integer
    # token indices in y are used directly as sparse class labels.
    loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    # `learning_rate` is the documented argument name; `lr` is only a deprecated alias.
    optimizer = keras.optimizers.Adam(learning_rate=learning_rate, decay=decay)
    model.compile(optimizer=optimizer, loss=[loss, loss], metrics=['accuracy'])
    return model

# **Train and Evaluate model**


Training model

In [31]:
# model = create_model(learning_rate=5e-5, decay=1e-6)
# model.summary()

In [None]:
epochs = 3
batch_size = 8

# Create the checkpoint folder before training starts: otherwise a missing
# directory would only surface as a failed save after all epochs have run.
os.makedirs(path + "save", exist_ok=True)

model = create_model(learning_rate=3e-5, decay=1e-6)
# model.summary()

model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size,
      validation_data=(x_eval, y_eval))

model.save(path+"save/model.h5")

print("TRAINING PROCESS DONE!!!!!!!!!!!!!")


Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/3

In [None]:
# Evaluate model

In [None]:
# Reload the checkpoint written by the training cell and score it on the test split.
checkpoint_file = path + "save/model.h5"
model = keras.models.load_model(checkpoint_file)

model.evaluate(x_test, y_test, batch_size=batch_size)

Test model with custom data

In [None]:
test_data = {"data":
    [
        {"title": "Project Apollo",
         "paragraphs": [
             {
                "context": "The pound-force has a metric counterpart, less commonly used than the newton: the kilogram-force (kgf) (sometimes kilopond), is the force exerted by standard gravity on one kilogram of mass. The kilogram-force leads to an alternate, but rarely used unit of mass: the metric slug (sometimes mug or hyl) is that mass that accelerates at 1 m\u00b7s\u22122 when subjected to a force of 1 kgf. The kilogram-force is not a part of the modern SI system, and is generally deprecated; however it still sees use for some purposes as expressing aircraft weight, jet thrust, bicycle spoke tension, torque wrench settings and engine output torque. Other arcane units of force include the sth\u00e8ne, which is equivalent to 1000 N, and the kip, which is equivalent to 1000 lbf.",
                 "qas": [
                         {"question": "What is the metric term less used than the Newton?", "id": "5737aafd1c456719005744fb", 
                          "answers": [{"text": "kilogram-force", "answer_start": 82}, 
                                      {"text": "pound-force", "answer_start": 4}, 
                                      {"text": "kilogram-force (kgf)", "answer_start": 82}, 
                                      {"text": "kilogram-force", "answer_start": 82}, 
                                      {"text": "the kilogram-force (", "answer_start": 78}]}, 
                         {"question": "What is the kilogram-force sometimes reffered to as?", "id": "5737aafd1c456719005744fc", 
                          "answers": [{"text": "kilopond", "answer_start": 114}, {"text": "kilopond", "answer_start": 114}, {"text": "kilopond", "answer_start": 114}, {"text": "kilopond", "answer_start": 114}, {"text": "kilopond", "answer_start": 114}]}, 
                         {"question": "What is a very seldom used unit of mass in the metric system?", "id": "5737aafd1c456719005744fd", 
                          "answers": [{"text": "slug", "answer_start": 274}, {"text": "metric slug", "answer_start": 267}, {"text": "metric slug", "answer_start": 267}, {"text": "metric slug", "answer_start": 267}, {"text": "the metric slug", "answer_start": 263}]}, 
                         {"question": "What seldom used term of a unit of force equal to 1000 pound s of force?", "id": "5737aafd1c456719005744fe", 
                          "answers": [{"text": "kip", "answer_start": 712}, {"text": "kip", "answer_start": 712}, {"text": "kip", "answer_start": 712}, {"text": "kip", "answer_start": 712}, {"text": "kip", "answer_start": 712}]}, 
                         {"question": "What is the seldom used force unit equal to one thousand newtons?", "id": "5737aafd1c456719005744ff", 
                          "answers": [{"text": "sth\u00e8ne", "answer_start": 665}, {"text": "sth\u00e8ne", "answer_start": 665}, {"text": "sth\u00e8ne", "answer_start": 665}, {"text": "sth\u00e8ne", "answer_start": 665}, {"text": "sth\u00e8ne", "answer_start": 665}]}, 
]}]}]}


# Run the model on the hand-written SQuAD-style `test_data` and print each prediction.
test_samples = convert_squad2data(test_data)
# Use a dedicated name so the held-out x_test split from the dataset cell is not overwritten.
x_custom, _ = create_input_dataset(test_samples)

# create_input_dataset drops skipped samples, so the predictions line up with
# the kept samples only, not with the full test_samples list.
kept_samples = [sample for sample in test_samples if not sample.skip]

pred_start, pred_end = model.predict(x_custom)

for test_sample, start_probs, end_probs in zip(kept_samples, pred_start, pred_end):
    offsets = test_sample.context_token_to_char
    start = np.argmax(start_probs)
    end = np.argmax(end_probs)
    if start >= len(offsets):
        # Predicted start lies outside the context tokens (question or padding).
        continue
    # The offsets index the whitespace-normalised context that was tokenized,
    # so slice that form and not the raw string.
    context = " ".join(str(test_sample.context).split())
    pred_char_start = offsets[start][0]
    if end < len(offsets):
        pred_char_end = offsets[end][1]
        pred_ans = context[pred_char_start:pred_char_end]
    else:
        # Predicted end is past the context tokens: take everything to the end.
        pred_ans = context[pred_char_start:]

    print("Question: " + test_sample.question)
    print("Predict answer: " + pred_ans)
    if test_sample.answer_text:
        print("Correct answer: " + test_sample.answer_text)
        for a in test_sample.all_answers:
            print("All possible answer: " + a)
    print("+++++++++++++++++++++++++++++++++++++++++\n")


# **Question-Answering Bot**

In [None]:
!pip install discord

In [None]:
def convert_input2squad(context, question):
    """Wrap one context/question pair in the nested SQuAD dict layout.

    Args:
        context: paragraph text to search for the answer.
        question: question text.

    Returns:
        ``{"data": [{"paragraphs": [{"context": ..., "qas": [...]}]}]}`` holding a
        single question entry with an empty ``answers`` list and id 0.
    """
    return {
        "data": [
            {
                "paragraphs": [
                    {
                        "context": context,
                        "qas": [
                            {
                                "question": question,
                                "answers": [],
                                "id": 0,
                            }
                        ],
                    }
                ]
            }
        ]
    }


def predict_answer(question, context):
    """Predict the answer span for ``question`` inside ``context``.

    Uses the module-level ``model``.

    Args:
        question: question text.
        context: paragraph text to extract the answer from.

    Returns:
        The predicted answer substring (taken from the whitespace-normalised
        context), or None when every sample was skipped during preprocessing
        or the predicted start falls outside the context tokens.
    """
    data = convert_input2squad(context, question)
    samples = convert_squad2data(data)

    # create_input_dataset only keeps non-skipped samples; pair predictions with
    # exactly those, and bail out before calling the model on an empty batch.
    kept_samples = [sample for sample in samples if not sample.skip]
    if not kept_samples:
        return None

    x_test, _ = create_input_dataset(samples)
    pred_start, pred_end = model.predict(x_test)

    pred_answer = None
    for sample, start_probs, end_probs in zip(kept_samples, pred_start, pred_end):
        offsets = sample.context_token_to_char
        start = np.argmax(start_probs)
        end = np.argmax(end_probs)
        pred_answer = None
        if start >= len(offsets):
            # Predicted start lies outside the context tokens (question or padding).
            continue
        # Offsets refer to the whitespace-normalised context that was tokenized.
        normalized_context = " ".join(str(sample.context).split())
        pred_char_start = offsets[start][0]
        if end < len(offsets):
            pred_char_end = offsets[end][1]
            pred_answer = normalized_context[pred_char_start:pred_char_end]
        else:
            pred_answer = normalized_context[pred_char_start:]
    return pred_answer

In [None]:
import discord
import nest_asyncio
import time
nest_asyncio.apply()

class MyClient(discord.Client):
    """Discord client that answers every incoming message with the model's prediction."""

    async def on_ready(self):
        """Print the bot's user name and id when it is ready."""
        print('Logged in as')
        print(self.user.name)
        print(self.user.id)
        print('------')

    async def on_message(self, message):
        """Treat the message text as a question, reply with the answer, and print the elapsed time."""
        # Ignore the bot's own messages so it never replies to itself.
        if message.author.id == self.user.id:
            return

        question = message.content
        start = time.time()
        # NOTE(review): `call_model` is not defined anywhere in the visible notebook;
        # `predict_answer(question, context)` is the closest helper but it also needs
        # a context. Confirm where `call_model` is supposed to come from.
        answer = call_model(question)
        print("answer: ", answer)

        if answer:
            # Send the text as-is. Passing it through str.format would treat braces
            # in the model output as replacement fields, which can raise or expose
            # attributes of `message`.
            await message.channel.send(answer)
        else:
            await message.channel.send("Cannot found the answer!")

        end = time.time()
        print("time: ",end - start)


# The bot token is a credential and must not live in the notebook: read it from
# the DISCORD_BOT_TOKEN environment variable (a KeyError here means it is unset).
# The token that used to be hardcoded in this cell should be regarded as leaked
# and regenerated.
client = MyClient()
client.run(os.environ["DISCORD_BOT_TOKEN"])