In [109]:
import pandas as pd
import numpy as np

# SKLEARN METRICS
from sklearn.metrics import accuracy_score

# PyTORCH
import torch

from torch import nn
from torch import optim
from torch.utils.data import DataLoader

# HUGGING FACE
from datasets import Dataset, load_dataset

from transformers import pipeline
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import TrainingArguments
from transformers import Trainer

import evaluate

# SPACY
import spacy
import spacy_dbpedia_spotlight

# SPARQL
from SPARQLWrapper import SPARQLWrapper, JSON

# OLLAMA
import ollama

# TRANSITIONS
from transitions import Machine

# --------------------------------
import warnings
warnings.filterwarnings("ignore")

# Dataset Processing

In [2]:
data_files = {"train": "train.json", "validation": "valid.json", "test": "test.json"}
dataset = load_dataset(path="data//", data_files=data_files)

train = pd.DataFrame(dataset['train']['Questions'])
valid = pd.DataFrame(dataset['validation']['Questions'])
test = pd.DataFrame(dataset['test']['Questions'])

In [3]:
def process_dataset(data):
    data = data[data.PredicateList.str.len() == 1].reset_index()
    data.PredicateList = data.PredicateList.str[0]
    data['Constraint'] = data.PredicateList.str['Constraint']
    data['Direction'] = data.PredicateList.str['Direction']
    data['Predicate'] = data.PredicateList.str['Predicate']

    return data[['Query', 'Direction', 'Subject', 'Predicate', 'Constraint']]

In [4]:
processed_train = process_dataset(train)
processed_valid = process_dataset(valid)
processed_test = process_dataset(test)

In [5]:
processed_train.head()

Unnamed: 0,Query,Direction,Subject,Predicate,Constraint
0,what movie is produced by warner bros.,backward,http://dbpedia.org/resource/Warner_Bros.,http://dbpedia.org/ontology/distributor,http://dbpedia.org/ontology/Film
1,What is don graham known as?,forward,http://dbpedia.org/resource/Don_Graham_(Americ...,http://purl.org/linguistics/gold/hypernym,
2,Which city did the artist ryna originate in,forward,http://dbpedia.org/resource/RYNA,http://dbpedia.org/ontology/hometown,
3,who produced the film rough house rosie,forward,http://dbpedia.org/resource/Rough_House_Rosie,http://dbpedia.org/ontology/producer,
4,what is the language in which mera shikar was ...,forward,http://dbpedia.org/resource/Mera_Shikar,http://dbpedia.org/ontology/language,


In [12]:
random_question = processed_train.Query.sample(1).values[0]

random_question

'which musician recorded metal blade records'

# Entity Recognition

In [110]:
def get_entities(question):
    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe("dbpedia_spotlight", config={"verify_ssl": False})

    doc = nlp(question)[-1]
    entities = [(ent.text, ent.label_, ent.kb_id_) for ent in doc.ents]

    return entities

In [67]:
get_entities(random_question)

[('metal blade records',
  'DBPEDIA_ENT',
  'http://dbpedia.org/resource/Metal_Blade_Records')]

# Predicate Classification

## Preparing Training Data

In [15]:
predicates = processed_train.Predicate.unique()

len(predicates)

100

In [16]:
pred_id2label = {ind: predicate for ind, predicate in enumerate(predicates)}
pred_label2id = {predicate: ind for ind, predicate in pred_id2label.items()}

In [17]:
predicate_train = processed_train[["Query", "Predicate"]]
predicate_train.Predicate = predicate_train.Predicate.map(pred_label2id)

predicate_valid = processed_valid[["Query", "Predicate"]]
predicate_valid.Predicate = predicate_valid.Predicate.map(pred_label2id)

predicate_test = processed_test[["Query", "Predicate"]]
predicate_test.Predicate = predicate_test.Predicate.map(pred_label2id)

In [18]:
predicate_train

Unnamed: 0,Query,Predicate
0,what movie is produced by warner bros.,0
1,What is don graham known as?,1
2,Which city did the artist ryna originate in,2
3,who produced the film rough house rosie,3
4,what is the language in which mera shikar was ...,4
...,...,...
23811,What sort of music does tevin campbell perform,10
23812,what kind of composition is twistin' the night...,1
23813,which artist composes video game music,10
23814,What gender is gastón filgueira,12


In [19]:
predicate_train = Dataset.from_pandas(predicate_train)
predicate_valid = Dataset.from_pandas(predicate_valid)
predicate_test = Dataset.from_pandas(predicate_test)

## Fine-Tuning

In [20]:
model_name = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"

tokenizer = AutoTokenizer.from_pretrained(model_name)
predicate_classifier = AutoModelForSequenceClassification.from_pretrained(model_name,
                                                                          num_labels=len(pred_label2id),
                                                                          id2label=pred_id2label, label2id=pred_label2id,
                                                                          ignore_mismatched_sizes=True)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased-finetuned-sst-2-english and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([100]) in the model instantiated
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([100, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
def tokenize(examples):
    return tokenizer(examples["Query"], padding="max_length", truncation=True, return_tensors='pt')

In [36]:
tokenized_train = predicate_train.map(tokenize, batched=True)
tokenized_valid = predicate_valid.map(tokenize, batched=True)
tokenized_test = predicate_test.map(tokenize, batched=True)

tokenized_train = tokenized_train.map(lambda x: {"labels": x["Predicate"]}, batched=True)
tokenized_valid = tokenized_train.map(lambda x: {"labels": x["Predicate"]}, batched=True)
tokenized_test = tokenized_test.map(lambda x: {"labels": x["Predicate"]}, batched=True)

Map:   0%|          | 0/23816 [00:00<?, ? examples/s]

Map:   0%|          | 0/3407 [00:00<?, ? examples/s]

Map:   0%|          | 0/6847 [00:00<?, ? examples/s]

Map:   0%|          | 0/23816 [00:00<?, ? examples/s]

Map:   0%|          | 0/23816 [00:00<?, ? examples/s]

Map:   0%|          | 0/6847 [00:00<?, ? examples/s]

In [23]:
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred

    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [37]:
training_args = TrainingArguments(
    output_dir="./predicate_classifier",
    learning_rate=3e-5,
    eval_strategy='epoch',
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    fp16=True,
    logging_dir="./logs",
)

trainer = Trainer(
    model=predicate_classifier,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    compute_metrics=compute_metrics,
)
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.501,0.403231,0.909767
2,0.4668,0.249886,0.939998
3,0.2554,0.192463,0.950663
4,0.2351,0.166421,0.95734


TrainOutput(global_step=2980, training_loss=0.500971695560737, metrics={'train_runtime': 1562.5847, 'train_samples_per_second': 60.966, 'train_steps_per_second': 1.907, 'total_flos': 1.264142903083008e+16, 'train_loss': 0.500971695560737, 'epoch': 4.0})

In [38]:
predicate_classifier.save_pretrained("./predicate_classifier/model")
tokenizer.save_pretrained("./predicate_classifier/tokenizer")

('./predicate_classifier/tokenizer\\tokenizer_config.json',
 './predicate_classifier/tokenizer\\special_tokens_map.json',
 './predicate_classifier/tokenizer\\vocab.txt',
 './predicate_classifier/tokenizer\\added_tokens.json',
 './predicate_classifier/tokenizer\\tokenizer.json')

## Prediction

In [60]:
def get_predicate(question):
    tokenizer = AutoTokenizer.from_pretrained("./predicate_classifier/tokenizer")
    predicate_classifier = AutoModelForSequenceClassification.from_pretrained("./predicate_classifier/model").cpu()

    predicate_classifier.eval()

    tokenized_q = tokenizer(question, padding="max_length", truncation=True, return_tensors='pt')
    with torch.no_grad():
        logits = predicate_classifier(**tokenized_q).logits.softmax(-1)
        pred = logits.argmax(-1).item()
        confidence = logits.max(-1).values.item()
        predicate = predicate_classifier.config.id2label[pred]

    return predicate, confidence

In [61]:
get_predicate(random_question)

('http://dbpedia.org/ontology/recordLabel', 0.9973101615905762)

# Question Direction Classification

## Preparing Training Data

In [68]:
tokenizer = AutoTokenizer.from_pretrained("./predicate_classifier/tokenizer")

In [69]:
type_id2label = {0: 'forward', 1: 'backward'}
type_label2id = {'forward': 0, 'backward': 1}

In [70]:
type_train = processed_train[['Query', 'Direction']]
type_valid = processed_valid[['Query', 'Direction']]
type_test = processed_valid[['Query', 'Direction']]

type_train.Direction = type_train.Direction.map(type_label2id)
type_valid.Direction = type_valid.Direction.map(type_label2id)
type_test.Direction = type_test.Direction.map(type_label2id)

In [71]:
class QueryTypeDataset():
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        q, t = self.data.iloc[idx]
        tokenized = tokenizer(q, padding="max_length", truncation=True, return_tensors="pt")['input_ids'].squeeze(0)
        
        return tokenized, t

    def __len__(self):
        return len(self.data)

In [72]:
train_type_dataset = QueryTypeDataset(type_train, tokenizer)
valid_type_dataset = QueryTypeDataset(type_valid, tokenizer)
test_type_dataset = QueryTypeDataset(type_test, tokenizer)

In [73]:
train_dataloader = DataLoader(train_type_dataset, batch_size=16)
valid_dataloader = DataLoader(valid_type_dataset, batch_size=16)
test_type_dataset = DataLoader(test_type_dataset, batch_size=len(test_type_dataset))

## Training

In [74]:
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
DEVICE

'cuda'

In [112]:
class TypeClassifier(nn.Module):
    def __init__(self, vocab_size=30522, embed_dim=32, hidden_size=16, n_classes=2, id2label=type_id2label, label2id=type_label2id):
        super().__init__()
        
        self.embedding = nn.Embedding(num_embeddings=vocab_size, 
                                      embedding_dim=embed_dim, 
                                      padding_idx=0)
        self.gru = nn.GRU(embed_dim, hidden_size, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(2*hidden_size, n_classes)

        self.id2label = id2label
        self.label2id = label2id

    def forward(self, X):
        emb = self.embedding(X)
        out, h = self.gru(emb)
    
        h = torch.cat((h[0], h[1]), axis=1)
        pred = self.fc(h)

        return pred

In [95]:
def train_loop(model, dataloader, loss, optimizer, verbose=False):
    model.train()
    epoch_loss = 0

    ground_truth = []
    predictions = []

    for idx, (X, y) in enumerate(dataloader):
        X, y = X.to(DEVICE), y.to(DEVICE)

        predict_logits = model(X)
        batch_loss = loss(predict_logits, y.flatten())

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()
        
        epoch_loss += batch_loss.item()

        ground_truth.extend(y.flatten().tolist())
        predictions.extend(predict_logits.argmax(-1).tolist())

        if verbose and idx % verbose == 0:
            print(f'Loss: {round(batch_loss.item(), 5)}')

    epoch_loss /= len(dataloader)
    accuracy = accuracy_score(ground_truth, predictions)
    
    if verbose:
        print(f'Train Loss: {round(epoch_loss, 5)}\t Train Accuracy: {round(accuracy, 5)*100}%')

    return epoch_loss, accuracy, ground_truth, predictions

In [96]:
def test_loop(model, dataloader, loss, verbose=False):
    model.eval()
    epoch_loss = 0

    ground_truth = []
    predictions = []

    with torch.no_grad():
        for idx, (X, y) in enumerate(dataloader):
            X, y = X.to(DEVICE), y.to(DEVICE)
    
            predict_logits = model(X)
            batch_loss = loss(predict_logits, y.flatten())

            epoch_loss += batch_loss.item()

            ground_truth.extend(y.flatten().tolist())
            predictions.extend(predict_logits.argmax(-1).tolist())
    
        epoch_loss /= len(dataloader)
        accuracy = accuracy_score(ground_truth, predictions)
    
        if verbose:
            print(f'Test Loss: {round(epoch_loss, 5)}\t Test Accuracy: {round(accuracy, 5)*100}%\n')
    
        return epoch_loss, accuracy, ground_truth, predictions

In [99]:
type_classifier = TypeClassifier(vocab_size=tokenizer.vocab_size,
                                 embed_dim=32,
                                 hidden_size=16,
                                 n_classes=2, 
                                 id2label=type_id2label, label2id=type_label2id).to(DEVICE)
loss = nn.CrossEntropyLoss()

lr = 1e-3
optimizer = optim.Adam(type_classifier.parameters(), lr=lr, weight_decay=1e-4)

n_epoch = 3
print_every = 1
verbose_every = 500
verbose = False

In [100]:
for epoch in range(n_epoch + 1):
    if epoch % print_every == 0:
        print(f'EPOCH {epoch}\n---------------')
        verbose = verbose_every
    else:
        verbose = False

    train_res = train_loop(type_classifier, train_dataloader, loss, optimizer, verbose)
    test_res = test_loop(type_classifier, valid_dataloader, loss, verbose)

EPOCH 0
---------------
Loss: 0.70427
Loss: 0.49073
Loss: 0.27611
Train Loss: 0.21841	 Train Accuracy: 91.455%
Test Loss: 0.13994	 Test Accuracy: 95.216%

EPOCH 1
---------------
Loss: 0.16254
Loss: 0.36628
Loss: 0.23328
Train Loss: 0.10459	 Train Accuracy: 96.536%
Test Loss: 0.11281	 Test Accuracy: 96.00800000000001%

EPOCH 2
---------------
Loss: 0.0798
Loss: 0.16803
Loss: 0.19163
Train Loss: 0.07437	 Train Accuracy: 97.733%
Test Loss: 0.09662	 Test Accuracy: 96.74199999999999%

EPOCH 3
---------------
Loss: 0.03317
Loss: 0.06721
Loss: 0.16672
Train Loss: 0.05392	 Train Accuracy: 98.455%
Test Loss: 0.09287	 Test Accuracy: 97.182%



In [101]:
torch.save(type_classifier.state_dict(), "./type_classifier/type_classifer.pth")

## Prediction

In [107]:
def get_direction(question):
    type_classifier = TypeClassifier(vocab_size=tokenizer.vocab_size,
                                 embed_dim=32,
                                 hidden_size=16,
                                 n_classes=2, 
                                 id2label=type_id2label, label2id=type_label2id).cpu()
    type_classifier.load_state_dict(torch.load("./type_classifier/type_classifer.pth", weights_only=True))
    type_classifier.eval()

    tokenized_q = tokenizer(question, padding="max_length", truncation=True, return_tensors='pt')['input_ids']
    with torch.no_grad():
        logits = type_classifier(tokenized_q).softmax(-1)
        pred = logits.argmax(-1).item()
        confidence = logits.max(-1).values.item()
        predicate = type_classifier.id2label[pred]

    return predicate, confidence

In [108]:
get_direction(random_question)

('backward', 0.9966727495193481)

# Building a QnA system

In [181]:
class QnAAgent():
    states = ['start',
              'ner_success', 'ner_fail',
              'predicate_success', 'predicate_fail',
              'type_success', 'type_fail',
              'query_success', 'query_fail']
    
    def __init__(self):
        # NER
        self.ner_model = spacy.load("en_core_web_sm")
        self.ner_model.add_pipe("dbpedia_spotlight", config={"verify_ssl": False})

        # PREDICATE
        self.tokenizer = AutoTokenizer.from_pretrained("./predicate_classifier/tokenizer")
        self.predicate_classifier = AutoModelForSequenceClassification.from_pretrained("./predicate_classifier/model").cpu()
        self.predicate_classifier.eval()

        # DIRECTION TYPE
        self.type_classifier = TypeClassifier()
        self.type_classifier.load_state_dict(torch.load("./type_classifier/type_classifer.pth", weights_only=True))
        self.type_classifier.eval()

        # QUERY TEMPLATES
        self.query_templates = {'backward':'SELECT DISTINCT ?subj WHERE {{?subj <{predicate}> <{entity}>}}',
                                'forward':'SELECT DISTINCT ?obj WHERE {{<{entity}> <{predicate}> ?obj}}'}

        # SPARQL AND LLM
        self.sparql = SPARQLWrapper("https://dbpedia.org/sparql")
        self.llm = ollama

        self.entity = None
        self.predicate = None
        self.type = None
        self.answer = None

        # TRANSITIONS
        self.machine = Machine(model=self, states=QnAAgent.states, send_event=True, initial='start')

        self.machine.add_transition(trigger='proceed', source='start', 
                                    prepare='get_entity', 
                                    dest='ner_success', conditions='is_ner_success')
        self.machine.add_transition(trigger='proceed', source='start', 
                                    prepare='get_entity', 
                                    dest='ner_fail', unless='is_ner_success', 
                                    after='end')
        
        self.machine.add_transition(trigger='proceed', source='ner_success', 
                                    prepare='get_predicate', 
                                    dest='predicate_success', conditions='is_predicate_success')
        self.machine.add_transition(trigger='proceed', source='ner_success', 
                                    prepare='get_predicate', 
                                    dest='predicate_fail', unless='is_predicate_success', 
                                    after='end')
        
        self.machine.add_transition(trigger='proceed', source='predicate_success', 
                                    prepare='get_type', 
                                    dest='type_success', conditions='is_type_success')
        self.machine.add_transition(trigger='proceed', source='predicate_success', 
                                    prepare='get_type', 
                                    dest='type_fail', unless='is_type_success', 
                                    after='end')

        self.machine.add_transition(trigger='proceed', source='type_success', 
                                    prepare='get_answer', 
                                    dest='query_success', conditions='is_query_success', after='respond')
        self.machine.add_transition(trigger='proceed', source='type_success', 
                                    prepare='get_answer', 
                                    dest='query_fail', unless='is_query_success', after='end')
        
        self.machine.add_transition(trigger='reset', source='*', prepare=['empty'], dest='start')

    # CHECKS
    def is_ner_success(self, event):
        return self.entity is not None

    def is_predicate_success(self, event):
        return self.predicate[0] is not None and self.predicate[1] >= 0.7

    def is_type_success(self, event):
        return self.type[0] is not None and self.type[1] >= 0.7

    def is_query_success(self, event):
        return self.answer is not None

    def end(self, event):
        print('Failed to find an answer. Please, paraphrase your question.')

    # QUESTION PROCESSING
    def get_entity(self, event):
        nlp = spacy.load("en_core_web_sm")
        nlp.add_pipe("dbpedia_spotlight", config={"verify_ssl": False})
    
        doc = self.ner_model(event.kwargs.get('question'))
        if not doc.ents:
            self.entity = None
        else:
            self.entity = doc.ents[-1].kb_id_

    def get_predicate(self, event):
        tokenized_q = self.tokenizer(event.kwargs.get('question'), padding="max_length", truncation=True, return_tensors='pt')
        
        with torch.no_grad():
            logits = self.predicate_classifier(**tokenized_q).logits.softmax(-1)
            pred = logits.argmax(-1).item()
            
            confidence = logits.max(-1).values.item()
            predicate = self.predicate_classifier.config.id2label[pred]

        self.predicate = (predicate, confidence)

    def get_type(self, event):
        tokenized_q = self.tokenizer(event.kwargs.get('question'), padding="max_length", truncation=True, return_tensors='pt')['input_ids']
        
        with torch.no_grad():
            logits = self.type_classifier(tokenized_q).softmax(-1)
            pred = logits.argmax(-1).item()
            
            confidence = logits.max(-1).values.item()
            predicate = self.type_classifier.id2label[pred]

        self.type = (predicate, confidence)
    
    def get_answer(self, event):
        query = self.query_templates[self.type[0]].format(predicate=self.predicate[0], entity=self.entity)
        
        self.sparql.setQuery(query)
        self.sparql.setReturnFormat(JSON)
        
        results = self.sparql.query().convert()
        if not results['results']['bindings']:
            self.answer = None
        else:
            answer_data = results['results']['bindings']
            answer_data = list(map(lambda a: list(a.values())[0]['value'], answer_data))
            answer_data = list(map(lambda a: a[a.rfind('/')+1:], answer_data))
    
            self.answer = answer_data
    
    # CHATBOT FUNCTIONALITY
    def empty(self, event):
        self.entity = None
        self.predicate = None
        self.type = None
        self.answer = None

    def respond(self, event):
        print(f'Responding')
        prompt = f"""
        Answer concisely using ONLY these facts. 
        Talk as if you were asked this question directly and you just use the facts in this propt as if you know them yourself.
        Question: {event.kwargs.get('question')}
        Facts: {self.answer[:10]}
        Answer: """
        response = ollama.generate(model="mistral", 
                                   prompt=prompt)
        print(response["response"].strip())

        return self.answer

    def ask(self, question):
        while self.state not in ['ner_fail', 'predicate_fail', 'type_fail', 'query_fail', 'query_success']:
            self.proceed(question=question)
        print(f'Question info: {self.entity, self.predicate, self.type}')
        self.reset()

In [182]:
agent = QnAAgent()

In [184]:
questions = processed_train.Query.sample(5).values

for ind, q in enumerate(questions):
    print(f'Q{ind + 1} - {q}')
    agent.ask(q)
    print(f'\n')

Q1 - what kind of music does stock aitken waterman play
Responding
Stock Aitken Waterman primarily produce Dance-pop, Eurobeat, Hi-NRG, and Pop music.
Question info: ('http://dbpedia.org/resource/Stock_Aitken_Waterman', ('http://dbpedia.org/ontology/genre', 0.9990726709365845), ('forward', 0.9995988011360168))


Q2 - which city was tommy neilson born in
Responding
Tommy Neilson was born in Gorebridge.
Question info: ('http://dbpedia.org/resource/Tommy_Neilson', ('http://dbpedia.org/ontology/birthPlace', 0.9774978756904602), ('forward', 0.9987467527389526))


Q3 - which animated film did jerry rees directed?
Responding
Jerry Rees directed "The Brave Little Toaster."
Question info: ('http://dbpedia.org/resource/Jerry_Rees', ('http://dbpedia.org/ontology/director', 0.9869979619979858), ('backward', 0.9947677850723267))


Q4 - which fictional character was created by dan aykroyd
Failed to find an answer. Please, paraphrase your question.
Question info: ('http://dbpedia.org/resource/Dan_(ra

Some entites appear to have no relevant information to answer the question, which is the primary reason for failures to find an answer.