In [1]:
from collections import defaultdict
from html.parser import HTMLParser
from io import StringIO

import numpy as np
import pandas as pd
# NOTE(review): this binds the name `sklearn` to the scipy package. The
# `from sklearn...` imports below still resolve the real scikit-learn package,
# but the alias is misleading — confirm whether it is needed at all.
import scipy as sklearn
import torch
from datasets import load_dataset
from matplotlib import rc
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup

# Seed the global NumPy and PyTorch random number generators.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Checkpoint name used for the tokenizer here and for the BertModel inside QnAClassifier.
PRE_TRAINED_MODEL_NAME = 'deepset/gbert-base'
# Labelled pairs; later cells read the `question`, `answer` and `matching` columns.
qna_data = pd.read_csv('../data/faq_info_labels.csv')
# "de_flat" configuration of clips/mfaq; its validation split is used for an extra evaluation further down.
mfaq = load_dataset("clips/mfaq", "de_flat")
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset mfaq (C:/Users/Adam/.cache/huggingface/datasets/clips___mfaq/de_flat/1.1.0/046d91e0a0390af15e8521190b906d67fd3d4440839559764d1659f48a8dbe7c)
100%|██████████| 2/2 [00:00<00:00,  3.25it/s]


In [2]:
class MLStripper(HTMLParser):
    """HTML parser that throws away markup and keeps only the text content."""

    def __init__(self):
        # HTMLParser.__init__ already calls reset(), so no explicit reset is needed.
        # convert_charrefs=True makes the parser decode character references
        # (e.g. "&amp;") before the text reaches handle_data.
        super().__init__(convert_charrefs=True)
        self.text = StringIO()

    def handle_data(self, d):
        """Append one chunk of text content to the internal buffer."""
        self.text.write(d)

    def get_data(self):
        """Return everything collected so far as a single string."""
        return self.text.getvalue()


def strip_tags(html):
    """Return ``html`` with all tags removed and character references decoded.

    Args:
        html: Markup (or plain text) to clean.

    Returns:
        The text content of ``html`` as a string.
    """
    s = MLStripper()
    s.feed(html)
    # With convert_charrefs enabled the parser holds back trailing text that
    # contains a bare "&" (it might be an incomplete character reference).
    # close() tells it the input is complete so that text is flushed too.
    s.close()
    return s.get_data()

In [None]:
"""
half_len = int(len(qna_data)/2)
data_matches = qna_data.iloc[:half_len,:]
data_distinct = qna_data.iloc[half_len:,:]

data_distinct = data_distinct.reset_index(drop=True)
print(data_distinct)
df1 = data_distinct.iloc[np.random.permutation(data_distinct.index)].reset_index(drop=True)
data_distinct['answer'] = df1['answer']
"""

In [None]:
"""
data_distinct['matching'] = 0
data_matches['matching'] = 1

df = pd.concat([data_matches,data_distinct])
df = df.sample(frac=1).reset_index(drop=True)
df.to_csv('../data/faq_info_labels.csv', index=None, header=True)
"""

In [None]:
"""
import seaborn as sns
from pylab import rcParams
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
rcParams['figure.figsize'] = 12, 8

q_token_lens = []
a_token_lens = []
for txt in df['question']:
    #print(txt)
    tokens = tokenizer.encode(txt, max_length=160, truncation=True)
    q_token_lens.append(len(tokens))
for txt in df['answer']:
    #print(txt)
    tokens = tokenizer.encode(txt, max_length=1024, truncation=True)
    a_token_lens.append(len(tokens))
#sns.distplot(q_token_lens)
#sns.distplot(a_token_lens)
"""

In [3]:
class QnADataset(Dataset):
    """Map-style dataset that encodes (question, answer) pairs for match classification.

    Args:
        question: Indexable collection of question texts.
        answer: Indexable collection of answer texts, aligned with ``question``.
        targets: Indexable collection of integer labels, aligned with ``question``.
        tokenizer: Callable invoked as ``tokenizer(question, answer, **options)``;
            it must return a mapping with ``input_ids`` and ``attention_mask``.
        max_length: Length every encoded pair is padded / truncated to
            (converted with ``int``).
    """

    def __init__(self, question, answer, targets, tokenizer, max_length):
        self.question = question
        self.answer = answer
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = int(max_length)
        # Overlap allowed between two chunks of a context that has to be split.
        # Currently unused: the overflow/stride tokenizer options are not enabled.
        self.doc_stride = 128

    def __len__(self):
        """Return the number of question/answer pairs."""
        return len(self.answer)

    def __getitem__(self, item):
        """Encode the pair at position ``item``.

        Returns:
            A dict with the raw ``question_text`` / ``answer_text``, the flattened
            ``input_ids`` and ``attention_mask`` of the encoded pair, and
            ``targets`` as a ``torch.long`` tensor.
        """
        question = str(self.question[item])
        context = str(self.answer[item])
        target = self.targets[item]

        # Use the tokenizer handed to the constructor. The previous code reached
        # for a module-level `tokenizer`, which silently ignored the argument and
        # failed whenever no such global existed.
        encoding = self.tokenizer(
            question,
            context,
            max_length=self.max_length,
            add_special_tokens=True,
            padding='max_length',
            # Only the answer is shortened when the pair exceeds max_length.
            truncation='only_second',
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt',
        )
        return {
            'question_text': question,
            'answer_text': context,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }

In [4]:
# Smoke test: wrap the full labelled frame and look at the first encoded pair.
# NOTE(review): max_length here is 1024, while the real loaders use MAX_LEN = 512.
a = QnADataset(
    qna_data.question.to_numpy(),
    qna_data.answer.to_numpy(),
    qna_data.matching.to_numpy(),
    tokenizer,
    1024,
)
a[0]

{'question_text': 'Ist es sinnvoll, dass sich Mitarbeiterinnen und Mitarbeiter, die in einem medizinischen Bereich arbeiten, regelmäßig testen lassen? ',
 'answer_text': 'Mit präventiven Reihentests in Krankenhäusern und Pflegeheimen und durch das Testen von Kontaktpersonen von Infizierten lassen sich Infektionsketten schnell erkennen und können besser unterbrochen werden. Die Nationale Teststrategie sieht vor, dass auch Personal in Krankenhäusern, Rehabilitationseinrichtungen und stationären und ambulanten Pflegeeinrichtungen vermehrt getestet werden. Als Kontaktpersonen sind Mitarbeiter, die COVID-19-Patienten betreuen, in jedem Falle regelmäßig zu testen. Bei Ausbrüchen in stationären Einrichtungen sollte auch das gesamte Personal einer Testung unterzogen werden. Auch regelmäßige Testungen im Rahmen z. B. von betriebsärztlichen Untersuchungen sind möglich. Außerdem kann das gesamte Personal, insbesondere in Gebieten mit erhöhten Infektionszahlen oder in der Betreuung von besonders v

In [None]:

# Hold out 10 % of the labelled pairs, then halve the hold-out into validation and test.
df_train, df_holdout = train_test_split(qna_data, test_size=0.1, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_holdout, test_size=0.5, random_state=RANDOM_SEED)

In [None]:
def create_data_loader(df, tokenizer, max_length, batch_size, shuffle=True, num_workers=2):
    """Wrap a labelled frame in a ``QnADataset`` and return a ``DataLoader`` over it.

    Args:
        df: Frame with ``question``, ``answer`` and ``matching`` columns.
        tokenizer: Tokenizer forwarded to ``QnADataset``.
        max_length: Encoded sequence length forwarded to ``QnADataset``.
        batch_size: Number of pairs per batch.
        shuffle: Whether the loader reshuffles every epoch. Defaults to ``True``
            (the previously hard-coded value); pass ``False`` for validation /
            test loaders when a stable order is wanted.
        num_workers: Number of loader worker processes. Defaults to ``2`` (the
            previously hard-coded value).
            NOTE(review): consider ``0`` if worker start-up is a problem in the
            notebook environment — confirm on the target platform.

    Returns:
        A ``torch.utils.data.DataLoader`` yielding the dict batches produced by
        ``QnADataset``.
    """
    ds = QnADataset(
        question=df.question.to_numpy(),
        answer=df.answer.to_numpy(),
        targets=df.matching.to_numpy(),
        tokenizer=tokenizer,
        max_length=max_length
    )
    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers
    )

In [None]:
BATCH_SIZE = 8
MAX_LEN = 512

# One loader per split, built in train / validation / test order.
train_data_loader, val_data_loader, test_data_loader = (
    create_data_loader(split_frame, tokenizer, MAX_LEN, BATCH_SIZE)
    for split_frame in (df_train, df_val, df_test)
)

In [None]:
# Pull a single training batch and show which keys it carries; `example` is
# reused by a later cell to build sample inputs.
example = next(iter(train_data_loader))
example.keys()

In [None]:
class QnAClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification layer.

    Args:
        n_classes: Number of output classes (size of the final linear layer).
    """

    def __init__(self, n_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return one row of class scores per input sequence."""
        bert_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )
        # With return_dict=False the encoder returns a tuple; the second entry
        # is the pooled output that feeds the classification head.
        pooled_output = bert_outputs[1]
        return self.out(self.drop(pooled_output))

In [None]:
# Index 0 -> 'nomatch', index 1 -> 'match'.
class_names = ['nomatch', 'match']
model = QnAClassifier(len(class_names)).to(device)
# Move the sample batch onto the same device as the model.
input_ids, attention_mask = (
    example[batch_key].to(device) for batch_key in ('input_ids', 'attention_mask')
)

In [None]:
EPOCHS = 10
# NOTE(review): `AdamW` here is the one imported from transformers; newer
# releases point users to torch.optim.AdamW instead — confirm against the
# installed version.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
# The scheduler is stepped once per batch, so it needs the total batch count.
total_steps = EPOCHS * len(train_data_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps
)
loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
def train_epoch(
    model,
    data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    n_examples
):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``targets`` tensors.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar tensor.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        n_examples: Denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)``: accuracy is a float64 tensor
        (correct predictions / ``n_examples``); mean_loss is the mean of the
        per-batch losses, or NaN when ``data_loader`` yields no batches.
    """
    model = model.train()
    losses = []
    # Tensor accumulator: keeps the return type a tensor even when the loader
    # is empty (an int start value has no .double()).
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    # Drop gradients left over from an earlier, possibly interrupted run so
    # they cannot leak into the first optimizer step.
    optimizer.zero_grad()
    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())
        loss.backward()
        # Clip the global gradient norm before the update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    mean_loss = np.mean(losses) if losses else np.float64(np.nan)
    return correct_predictions.double() / n_examples, mean_loss

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without computing gradients.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``targets`` tensors.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar tensor.
        device: Device the batch tensors are moved to.
        n_examples: Denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)``: accuracy is a float64 tensor
        (correct predictions / ``n_examples``); mean_loss is the mean of the
        per-batch losses, or NaN when ``data_loader`` yields no batches.
    """
    model = model.eval()
    losses = []
    # Tensor accumulator: keeps the return type a tensor even when the loader
    # is empty (an int start value has no .double()).
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, targets)
            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())
    mean_loss = np.mean(losses) if losses else np.float64(np.nan)
    return correct_predictions.double() / n_examples, mean_loss

In [None]:
# Per-epoch metrics, keyed by metric name.
history = defaultdict(list)
best_accuracy = 0
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(
        model=model,
        data_loader=train_data_loader,
        loss_fn=loss_fn,
        optimizer=optimizer,
        device=device,
        scheduler=scheduler,
        n_examples=len(df_train),
    )
    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_model(
        model=model,
        data_loader=val_data_loader,
        loss_fn=loss_fn,
        device=device,
        n_examples=len(df_val),
    )
    print(f'Val   loss {val_loss} accuracy {val_acc}')
    print()

    epoch_metrics = (
        ('train_acc', train_acc),
        ('train_loss', train_loss),
        ('val_acc', val_acc),
        ('val_loss', val_loss),
    )
    for metric_name, metric_value in epoch_metrics:
        history[metric_name].append(metric_value)

    # Keep only the weights of the epoch with the best validation accuracy so far.
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc

In [None]:
# Rebuild the classifier and restore the weights saved by the training loop.
model = QnAClassifier(len(class_names))
# map_location lets a checkpoint written from CUDA tensors load on a CPU-only
# machine as well (and vice versa).
model.load_state_dict(torch.load('best_model_state.bin', map_location=device))
model = model.to(device)

In [None]:
# Accuracy of the restored checkpoint on the held-out test split (loss is discarded).
test_acc, _ = eval_model(
    model=model,
    data_loader=test_data_loader,
    loss_fn=loss_fn,
    device=device,
    n_examples=len(df_test),
)
test_acc.item()

In [None]:
def get_predictions(model, data_loader, device=None):
    """Run ``model`` over ``data_loader`` and collect texts, predictions and labels.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns one row of class scores per example.
        data_loader: Iterable of dict batches with ``question_text``,
            ``answer_text``, ``input_ids``, ``attention_mask`` and ``targets``.
        device: Device the batch tensors are moved to. When omitted, the device
            of the model's first parameter is used (CPU if it has none), so the
            function no longer depends on a module-level ``device``.

    Returns:
        ``(questions, answers, predictions, prediction_probs, real_values)``:
        two lists of strings, the predicted class index per example, the
        softmax class probabilities per example (shape ``(N, n_classes)``) and
        the true labels. All tensors are on the CPU.
    """
    model = model.eval()
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    questions = []
    answers = []
    pred_batches = []
    prob_batches = []
    target_batches = []
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)
            questions.extend(d["question_text"])
            answers.extend(d["answer_text"])
            pred_batches.append(preds.cpu())
            # The model emits raw scores; normalise them so the returned
            # values really are probabilities, as the name promises.
            prob_batches.append(torch.softmax(outputs, dim=1).cpu())
            target_batches.append(targets.cpu())

    if not pred_batches:
        # Nothing to concatenate: return empty tensors instead of raising.
        no_labels = torch.empty(0, dtype=torch.long)
        return questions, answers, no_labels, torch.empty(0, 0), no_labels.clone()

    predictions = torch.cat(pred_batches)
    prediction_probs = torch.cat(prob_batches)
    real_values = torch.cat(target_batches)
    return questions, answers, predictions, prediction_probs, real_values

In [None]:
# Collect texts, predictions and labels for the held-out test split.
test_predictions = get_predictions(model, test_data_loader)
y_questions, y_answers, y_pred, y_pred_probs, y_test = test_predictions

In [None]:
# Per-class precision / recall / F1 on the test split. `classification_report`
# is imported in the import cell at the top of the notebook.
print(classification_report(y_test, y_pred, target_names=class_names))

In [None]:
sample_question = "Hat der Indego einen Regensensor?"
sample_answer = "Nein, er kann aber auch bei leichtem (Niesel-)Regen mähen. Wir empfehlen allerdings, den Mäher nicht bei Regen arbeiten zu lassen, da dabei generell die Schnittqualität leidet bzw. Mäher und vor allem das Mähwerk stets stark verschmutzt werden. Mit der Betriebsart Smart Mow wird der Indego in die Lage versetzt, die optimale Zeit für das Schneiden deines Rasen automatisch zu planen, um Mähen bei Regen zu vermeiden."
sample_matching = 1
# Encode the pair with the same options QnADataset uses: pad to MAX_LEN and
# shorten only the answer if the pair is too long. This replaces the
# `pad_to_max_length` flag, which also left over-long pairs untruncated.
encoded_qna_pair = tokenizer(
  sample_question,
  sample_answer,
  max_length=MAX_LEN,
  add_special_tokens=True,
  padding='max_length',
  truncation='only_second',
  return_token_type_ids=False,
  return_attention_mask=True,
  return_tensors='pt',
)


In [None]:
input_ids = encoded_qna_pair['input_ids'].to(device)
attention_mask = encoded_qna_pair['attention_mask'].to(device)
# Set inference mode explicitly instead of relying on an earlier cell having
# done so, and skip gradient tracking for this single forward pass.
model.eval()
with torch.no_grad():
    output = model(input_ids, attention_mask)
_, prediction = torch.max(output, dim=1)
print(f'Question: {sample_question}')
print(f'Answer: {sample_answer}')
print(f'Matching  : {class_names[prediction.item()]}')


In [None]:
# Take the 'validation' split of the mFAQ data and show its row count.
# NOTE(review): despite the `df_` prefix this is not a pandas DataFrame yet;
# later cells rebuild `df_mfaq` as one.
df_mfaq = mfaq.data['validation']
df_mfaq.num_rows

In [None]:
# Pull the question and answer columns out into two separate DataFrames.
# NOTE(review): presumably each ends up with a single column labelled 0 (the
# next cell renames that label) — confirm.
df_mfaq_q = pd.DataFrame.from_dict(df_mfaq['question'])
df_mfaq_a = pd.DataFrame.from_dict(df_mfaq['answer'])

In [None]:
# Give the column labelled 0 in each frame the name the rest of the notebook expects.
df_mfaq_q = df_mfaq_q.rename(columns={0: "question"})
df_mfaq_a = df_mfaq_a.rename(columns={0: "answer"})

In [None]:
# Put questions and answers side by side, keeping only index values present in both frames.
frames = [df_mfaq_q, df_mfaq_a]
df_mfaq = pd.concat(objs=frames, join="inner", axis=1)

In [None]:
# Display the combined question/answer frame for a visual check.
df_mfaq

In [None]:
# Build a match / no-match evaluation set from the mFAQ pairs: the first half
# keeps its original answers (label 1), the second half gets answers taken from
# other rows (label 0).
half_len = len(df_mfaq) // 2
# .copy() so that adding the label column writes to an independent frame
# rather than to a slice of df_mfaq.
data_matches = df_mfaq.iloc[:half_len, :].copy()
data_distinct = df_mfaq.iloc[half_len:, :].reset_index(drop=True)

# Walk one random cycle through the rows and hand each row the answer of its
# predecessor in that cycle. Unlike a plain random permutation, this never
# leaves a row paired with its own answer (as long as there are >= 2 rows).
rng = np.random.default_rng(RANDOM_SEED)
cycle = rng.permutation(len(data_distinct))
original_answers = data_distinct['answer'].to_numpy()
shuffled_answers = original_answers.copy()
shuffled_answers[cycle] = original_answers[np.roll(cycle, 1)]
data_distinct['answer'] = shuffled_answers
data_distinct['matching'] = 0
data_matches['matching'] = 1

df_mfaq = pd.concat([data_matches, data_distinct])
# Seeded shuffle so the row order is the same on every run.
df_mfaq = df_mfaq.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)
#df.to_csv('../data/faq_info_labels.csv', index=None, header=True)


In [None]:
# Display the labelled and shuffled mFAQ frame for a visual check.
df_mfaq

In [None]:
# Evaluate the trained classifier on the mFAQ pairs built above.
# Note: this overwrites the y_* variables from the test-split evaluation.
mfaq_dl = create_data_loader(df_mfaq, tokenizer, MAX_LEN, BATCH_SIZE)
mfaq_predictions = get_predictions(model, mfaq_dl)
y_questions, y_answers, y_pred, y_pred_probs, y_test = mfaq_predictions
print(classification_report(y_test, y_pred, target_names=class_names))