## Train - Question Generation 

In [1]:
import random
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AdamW,
    MT5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)
from utils import T5PegasusTokenizer
from transformers.models.mt5.modeling_mt5 import MT5ForConditionalGeneration
from datasets import load_dataset

# Hugging Face checkpoint that both the model and the tokenizer are loaded from.
model_path = 'imxly/t5-pegasus'
# Use the GPU when one is present; otherwise fall back to the CPU so that the
# later `.to(device)` calls do not fail on a machine without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def set_seed(seed):
  """Seed the `random`, NumPy and PyTorch random number generators with `seed`."""
  for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed)
# Seed every RNG before anything else in this cell runs.
set_seed(42)

# Model and tokenizer are both loaded from `model_path`; the model is moved to `device`.
tokenizer = T5PegasusTokenizer.from_pretrained(model_path)
t5_model = MT5ForConditionalGeneration.from_pretrained(model_path)
t5_model = t5_model.to(device)

2022-11-30 06:26:51.280075: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-30 06:26:51.468000: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-30 06:26:52.416086: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-11-30 06:26:52.416192: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or 

In [2]:
# Only the "validation" split of the "xquad.zh" configuration is used.
data = load_dataset("xquad","xquad.zh")['validation']
data = pd.DataFrame(data)

# Keep the first entry of each example's answers['text'] list.
data['answers'] = [answer['text'][0] for answer in data['answers']]

# Model input is the tagged answer followed by the tagged context; the target is the question.
data['input'] = 'question: <answer>' + data['answers'] + '<context>' + data['context']
data['label'] = data['question']

# (input, label) string pairs consumed by the training loop.
input_data = list(zip(data['input'].tolist(), data['label'].tolist()))

Found cached dataset xquad (/home/jupyter-daniel/.cache/huggingface/datasets/xquad/xquad.zh/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)


  0%|          | 0/1 [00:00<?, ?it/s]

In [3]:
# optimizer
# Parameters are split into two groups by whether their name contains one of
# the `no_decay` markers: first the non-matching parameters, then the matching ones.
# NOTE(review): both groups use weight_decay 0.0, so the split currently has no
# effect on training — confirm whether a non-zero decay was intended for the first group.
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [
            param
            for name, param in t5_model.named_parameters()
            if any(marker in name for marker in no_decay) == matches_no_decay
        ],
        "weight_decay": 0.0,
    }
    for matches_no_decay in (False, True)
]
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-4, eps=1e-8)



In [4]:
# Fine-tune the model loaded above as `t5_model` (the same object the optimizer
# was built from), one (input, label) pair per optimisation step.
t5_model.train()

epochs = 5

for epoch in range(epochs):
  print ("epoch ",epoch)
  # `source_text` / `target_text` avoid shadowing the builtin `input` and avoid
  # reusing one name for both the target string and the model output.
  for source_text, target_text in input_data:
    input_sent = "qa-generation :"+source_text+ "</s> "
    output_sent = target_text+"</s>  "

    # Pad to a fixed length and truncate explicitly (replaces the deprecated
    # `pad_to_max_length=True` and the implicit truncation that logged a warning).
    tokenized_inp = tokenizer.encode_plus(input_sent, max_length=512, padding="max_length", truncation=True, return_tensors="pt")
    tokenized_output = tokenizer.encode_plus(output_sent, max_length=100, padding="max_length", truncation=True, return_tensors="pt")

    input_ids = tokenized_inp["input_ids"].to(device)
    attention_mask = tokenized_inp["attention_mask"].to(device)

    lm_labels = tokenized_output["input_ids"].to(device)
    decoder_attention_mask = tokenized_output["attention_mask"].to(device)

    # Label positions set to -100 are ignored by the loss, so padding does not
    # contribute to the training signal.
    lm_labels[lm_labels == tokenizer.pad_token_id] = -100

    # the forward function automatically creates the correct decoder_input_ids
    model_output = t5_model(input_ids=input_ids, labels=lm_labels, decoder_attention_mask=decoder_attention_mask, attention_mask=attention_mask)
    loss = model_output[0]

    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache


epoch  0


Loading model cost 0.772 seconds.
Prefix dict has been built successfully.


epoch  1
epoch  2
epoch  3
epoch  4
epoch  5
epoch  6
epoch  7
epoch  8
epoch  9


In [None]:
# Saving the model & tokenizer
# The model object is named `t5_model` where it is loaded; the "final-mt5"
# directory is the checkpoint path the inference code loads from.
t5_model.save_pretrained("final-mt5")
tokenizer.save_pretrained("final-mt5")

## Inference - Answer Extraction & Question Generation 

In [1]:
from utils import T5PegasusTokenizer
from transformers.models.mt5.modeling_mt5 import MT5ForConditionalGeneration
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
import jieba

from transformers import (
    AdamW,
    MT5ForConditionalGeneration,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)
import torch
# Use the GPU when one is present; otherwise fall back to the CPU so that the
# `.to(device)` calls below do not fail on a machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def answer_generation(context,answer="",lang="") :
    """Choose the answers that questions will later be generated for.

    Args:
        context: Passage text.
        answer: Manually chosen answer. When empty (or None), up to three
            keyphrases are extracted from the passage with KeyBERT instead.
        lang: When "chi", the passage is segmented with jieba (tokens joined
            by spaces) before keyphrase extraction.

    Returns:
        Tuple ``(context, answer_list)``: the passage exactly as it was passed
        in, and the list of answer strings.
    """
    # Manual Setting the Answer
    if answer:
        return context, [answer]

    # Generate Answer from KeyBERT
    # Segmentation is only an aid for keyphrase extraction, so it is applied to
    # a separate variable and the caller gets the original passage back.
    keyword_source = context
    if lang == "chi":
        keyword_source = " ".join(jieba.cut(context))

    sentence_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
    kw_model = KeyBERT(model=sentence_model)
    keywords = kw_model.extract_keywords(keyword_source,keyphrase_ngram_range=(1,2),use_mmr=True,diversity=0.9,top_n=3)

    # Only the first item of each returned entry (the phrase) is kept.
    answer_list = [keyword[0] for keyword in keywords]
    return context,answer_list

def question_generation(context,answer_list:list,lang="") :
    """Generate one question per answer for the given passage.

    Args:
        context: Passage text the questions are asked about.
        answer_list: Answers; one question is generated for each entry.
        lang: "en" loads the "iarfmoose/t5-base-question-generator" checkpoint;
            any other value loads the locally saved "final-mt5" checkpoint.

    Returns:
        Dict with keys "question" and "answer" holding parallel lists: the
        generated question strings and the answers they were generated for.
    """
    is_english = lang == "en"

    if is_english:
        eng_model_path = "iarfmoose/t5-base-question-generator"
        model = T5ForConditionalGeneration.from_pretrained(eng_model_path).to(device)
        tokenizer = T5Tokenizer.from_pretrained(eng_model_path)

    else :
        chi_model_path = "final-mt5"
        model = MT5ForConditionalGeneration.from_pretrained(chi_model_path).to(device)
        tokenizer = T5PegasusTokenizer.from_pretrained(chi_model_path)

    # Generate Question based on answer

    output_list = {"question":[],"answer":[]}

    for answer in answer_list :
        if is_english:
            format_input = "qa-question: " + "<answer>" + answer + '<context>' + context
        else:
            # Same string the training cell feeds the fine-tuned checkpoint:
            # "qa-generation :" + "question: <answer>…<context>…" + "</s> ".
            format_input = "qa-generation :" + "question: " + "<answer>" + answer + '<context>' + context + "</s> "

        input_ids = tokenizer.encode(format_input, return_tensors='pt').to(device)
        output = model.generate(input_ids,
                            decoder_start_token_id=tokenizer.cls_token_id, #101
                            eos_token_id=tokenizer.sep_token_id, #102
                            max_length=64)

        result = tokenizer.decode(output[0])
        result = result.replace("<pad> ","")
        result = result.replace("</s>","")

        # Every non-English request uses the fine-tuned checkpoint, so the
        # clean-up applies to all of them (not only lang == "chi").
        if not is_english:
            result = result.replace(' ', '')
            # Strip each marker on its own so "[SEP]" is removed even when it
            # is not preceded by "</s>".
            for marker in ("[CLS]", "[SEP]", "[PAD]", "</s>", "<pad>"):
                result = result.replace(marker, "")

        output_list['question'].append(result)
        output_list['answer'].append(answer)

    return output_list

2022-12-02 07:10:40.768281: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-02 07:10:41.676820: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-12-02 07:10:43.341816: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-12-02 07:10:43.341974: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or 

In [4]:
import pandas as pd 

# Sample passages; only the English one is used below.
eng_context = (
    "Wistron Corporation is an electronics manufacturer based in Taiwan. "
    "It was the manufacturing arm of Acer Inc. before being spun off in 2000. "
    "As an original design manufacturer, the company designs and manufactures "
    "products for other companies to sell under their brand name. "
    "Wistron products include notebook and desktop computers, servers, storage, "
    "LCD TVs, handheld devices, and devices and equipment for medical applications."
)
chi_context = """
纬创资通股份有限公司，简称纬创，是一家ODM企业，于2001年由宏碁拆分出来，营运总部位于台湾。纬创资通是全球资讯产品主要供应商之一，全球员工逾80,000名。主要产品包括可携式电脑系统、桌上型电脑系统、伺服器及网路储存设备、资讯家电、通讯产品、云端及绿资源技术。"""

# An empty `answer` makes answer_generation pick the answers itself.
context, ans = answer_generation(context=eng_context, answer="", lang="en")

# One generated question per selected answer.
result_dict = question_generation(context=context, answer_list=ans, lang="en")

qa_df = pd.DataFrame(data=result_dict)

qa_df

Unnamed: 0,question,answer
0,what is the name of the company???????????????...,wistron corporation
1,What is the name of the company??,desktop computers
2,When was Wistron Corporation spun off??,spun 2000
