In [None]:
import pandas as pd 
import numpy as np
import re
from collections import defaultdict
import spacy
import json

In [None]:
# Load the book question table; later cells read its 'ID', 'Query' and
# 'FreebasePredicate' columns.
book_df = pd.read_csv('data_outputs/book_df.csv')
# Preview the first rows as the cell output.
book_df.head()

In [None]:
# Plain Python list holding every value of the 'ID' column of book_df.
id_list = book_df['ID'].to_list()


In [None]:
# Column names shared by the three header-less, tab-separated split files.
SIMPLE_QA_COLUMNS = ['id','Question', 'dbpedia_resource', 'dbpedia_predicate', 'direction', 'dbpedia_transl', 'freebase', 'entity']


def load_simple_qa_split(path):
    """Read one SimpleDBpediaQA split file and name its columns.

    Parameters
    ----------
    path : str
        Path to a tab-separated file without a header row.

    Returns
    -------
    pandas.DataFrame
        The file contents with columns renamed to ``SIMPLE_QA_COLUMNS``.

    Raises
    ------
    ValueError
        If the file does not have exactly ``len(SIMPLE_QA_COLUMNS)`` columns
        (raised by the column assignment).
    """
    split_df = pd.read_csv(path, sep='\t', header=None)
    split_df.columns = SIMPLE_QA_COLUMNS
    return split_df


train_df = load_simple_qa_split('data/SimpleDBpediaQA/train.txt')
test_df = load_simple_qa_split('data/SimpleDBpediaQA/test.txt')
valid_df = load_simple_qa_split('data/SimpleDBpediaQA/valid.txt')

In [None]:
# Stack the train, test and validation splits into a single frame. No
# ignore_index is passed, so each split keeps its own row labels and index
# values repeat across splits.
full_df = pd.concat([train_df, test_df, valid_df])
# Preview the first rows as the cell output.
full_df.head()

In [None]:
# Attach the 'entity' column of the QA splits to every book row (left join of
# book_df.ID against full_df.id), then keep only the first row for each 'ID'.
df = (
    book_df
    .merge(full_df[['id','entity']], how='left', left_on='ID', right_on='id')
    .drop_duplicates('ID')
)

In [None]:
# One subset of df per value of 'FreebasePredicate'.
genre_df = df.loc[df['FreebasePredicate'].eq('books_in_genre')]
author_df = df.loc[df['FreebasePredicate'].eq('book_from_author')]
book_ent_df = df.loc[df['FreebasePredicate'].eq('author_of_book')]

In [None]:
# Display the rows whose 'FreebasePredicate' is 'books_in_genre'.
genre_df

In [None]:
def create_tuple(entities, questions, ids):
    """Collect, per question, the words whose tag is ``'I'``.

    Parameters
    ----------
    entities : iterable
        One tag sequence per question, e.g. ``genre_df['entity'].str.split()``.
    questions : iterable
        One word sequence per question, e.g. ``genre_df['Query'].str.split()``;
        paired position by position with ``entities``.
    ids : iterable
        One identifier per question.

    Returns
    -------
    list of tuple
        ``(id, word)`` pairs in input order, one for every word tagged
        ``'I'``. Each word is lower-cased and stripped of every character
        that is neither a word character nor whitespace.

    Notes
    -----
    Rows whose tags or words are not iterable (for example the NaN that
    ``Series.str.split()`` yields for a missing value) are skipped instead of
    raising ``TypeError``.
    """
    question_tuple = []
    for tags, words, id_ in zip(entities, questions, ids):
        # A missing value (NaN / None) has no tags or words to pair up.
        if not hasattr(tags, '__iter__') or not hasattr(words, '__iter__'):
            continue
        for tag, word in zip(tags, words):
            if tag == 'I':
                cleaned = re.sub(r'[^\w\s]', '', word.lower())
                question_tuple.append((id_, cleaned))
    return question_tuple

In [None]:
# Build the (id, word) entity pairs for the genre, book and author subsets,
# evaluated in that order.
genre_tuple, book_tuple, author_tuple = (
    create_tuple(subset['entity'].str.split(), subset['Query'].str.split(), subset['ID'])
    for subset in (genre_df, book_ent_df, author_df)
)

In [None]:
def create_dict(tuples):
    """Group ``(key, value)`` pairs by key and join each group into one string.

    Parameters
    ----------
    tuples : iterable of (key, value)
        Pairs to group; values are converted with ``str``.

    Returns
    -------
    collections.defaultdict
        Maps each key to its values joined by single spaces, in input order.
        The mapping keeps its ``list`` default factory, so looking up an
        unknown key yields (and stores) an empty list.
    """
    grouped = defaultdict(list)
    for key, word in tuples:
        grouped[key].append(word)
    # Iterate over a snapshot of the keys while replacing each list by its text.
    for key in list(grouped):
        grouped[key] = ' '.join(str(word) for word in grouped[key])
    return grouped

In [None]:
# Collapse each list of (id, word) pairs into an id -> entity-text mapping.
genre_dict, book_dict, author_dict = map(
    create_dict, (genre_tuple, book_tuple, author_tuple)
)

In [None]:
def create_train(dict_, questions, ids, label):
    """Build NER training examples by locating each entity in its question.

    Parameters
    ----------
    dict_ : mapping
        Maps a question id to its entity text (as produced by ``create_dict``).
    questions : iterable of str
        Raw question texts, paired position by position with ``ids``.
    ids : iterable
        Question identifiers used to look up ``dict_``.
    label : str
        Entity label attached to every span.

    Returns
    -------
    list of tuple
        ``(question, {"entities": [(start, end, label)]})`` where ``question``
        is lower-cased with punctuation removed and ``start``/``end`` are the
        character offsets of the entity text inside it.

    Notes
    -----
    A question is skipped when its id has no entity text, or when the entity
    text does not occur in the cleaned question as a run of whole
    whitespace-delimited words (previously this raised ``AttributeError``).
    """
    train = []
    for question, id_ in zip(questions, ids):
        question = re.sub(r'[^\w\s]', '', question.lower())

        # .get() avoids inserting empty lists into a defaultdict on a miss.
        ent = dict_.get(id_)
        if ent is None:
            continue
        ent = str(ent)
        # '[]' is the text of the empty list a defaultdict(list) holds for
        # ids that were looked up earlier without having an entity.
        if ent in ('', '[]'):
            continue

        # Match the entity literally and only on whitespace boundaries, so it
        # cannot be found inside a longer word (e.g. "it" inside "write").
        pattern = r'(?<!\S)' + re.escape(ent) + r'(?!\S)'
        found = re.search(pattern, question)
        if found is None:
            continue

        span = (found.start(), found.end(), label)
        train.append((question, {"entities": [span]}))
    return train
    

In [None]:
# Turn every subset into training examples, each with its own entity label;
# evaluated in the order genre, book, author.
genre_entities, book_entities, author_entities = (
    create_train(entity_by_id, subset['Query'], subset['ID'], ner_label)
    for entity_by_id, subset, ner_label in (
        (genre_dict, genre_df, 'GENRE'),
        (book_dict, book_ent_df, 'BOOK'),
        (author_dict, author_df, 'PERSON'),
    )
)

In [None]:
# spacy is already imported in the first cell, so this re-import is a no-op.
import spacy
# Load the small English pipeline; later cells call it through `nlp`.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Load the questions generated from abstracts
import json

with open('/home/aliciescont/Documents/tfm_code/question_generation/abstract.json') as f:
    abstract = json.load(f)

# Flatten the per-key collections of questions into one list, in file order.
question_list = []
for qg in abstract.values():
    question_list.extend(qg)
       

In [None]:
# NOTE(review): this cell repeats an earlier cell verbatim; it reloads the
# same pipeline into `nlp` again.
import spacy
nlp = spacy.load('en_core_web_sm')

In [None]:
# Add examples of labels the pretrained model already knows, so they are not
# forgotten while fine-tuning (catastrophic forgetting).
# NOTE(review): only LANGUAGE entities are collected here; date_entities and
# loc_entities are created but stay empty.
date_entities = []
lang_entities = []
loc_entities = []
for question in question_list:
    # Lower-case and replace punctuation by spaces.
    question = re.sub(r'[^\w\s]',' ',question.lower())
    # Keep questions made only of letters and whitespace. str.isalpha() on the
    # whole string is False as soon as it contains a space, so the check is
    # done with the whitespace removed.
    if ''.join(question.split()).isalpha():
        doc = nlp(question)
        # Character offsets in (start, end, label) form: the same annotation
        # layout that create_train produces for the other examples.
        lang_spans = [
            (ent.start_char, ent.end_char, ent.label_)
            for ent in doc.ents
            if ent.label_ == 'LANGUAGE'
        ]
        # One example per question, carrying all of its LANGUAGE spans.
        if lang_spans:
            lang_entities.append((question, {'entities': lang_spans}))
            

In [None]:
# Display the collected LANGUAGE examples.
lang_entities

In [None]:
# Full training set: author, genre, book and language examples, in that order.
train_data =  author_entities +  genre_entities + book_entities + lang_entities 

# Custom NER with spaCy

In [None]:
import spacy
# Reload the pretrained pipeline, replacing the `nlp` object used in the
# earlier cells; the training cell below updates this instance.
nlp = spacy.load('en_core_web_sm')
import random
from spacy.util import minibatch, compounding
from pathlib import Path
import warnings

In [None]:
# Show the names of the components in the loaded pipeline.
nlp.pipe_names

In [None]:
# Fetch the pipeline's NER component and register the new labels on it.
ner = nlp.get_pipe("ner")
entity_labels = ['GENRE', 'BOOK']
for label in entity_labels:
    ner.add_label(label)
# Disable the components that are not needed while training: every pipe whose
# name is not in pipe_exceptions is switched off inside the `with` block.
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
# NOTE(review): resume_training() and update(texts, annotations, ...) look
# like the spaCy v2 training API — confirm against the installed version.
optimizer = nlp.resume_training()
with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
    # Show each spaCy UserWarning only once.
    warnings.filterwarnings("once", category=UserWarning, module='spacy')
    # Created once, outside the epoch loop, so the same `sizes` object is
    # handed to minibatch() in every one of the 50 iterations.
    sizes = compounding(1.0, 4.0, 1.001)
    for itn in range(50):
        # Shuffles train_data in place; no random seed is set in this cell.
        random.shuffle(train_data)
        batches = minibatch(train_data, size=sizes)
        # Loss accumulator, reset at the start of every iteration.
        losses = {}
        for batch in batches:
            # Split the (text, annotation) pairs into two parallel tuples.
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
        print("Losses", losses)


In [None]:
# Smoke test: run the updated pipeline on a genre question and print each
# detected entity with its label.
doc = nlp("what are some science fiction")
for ent in doc.ents:
    print(ent.text, ent.label_)


In [None]:
# Smoke test: run the updated pipeline on a book-title question and print
# each detected entity with its label.
doc = nlp("who wrote 1984 ")
for ent in doc.ents:
    print(ent.text, ent.label_)


In [None]:
# Smoke test: run the updated pipeline on an author question and print each
# detected entity with its label.
doc = nlp("what books have stephen king written ")
for ent in doc.ents:
    print(ent.text, ent.label_)


In [None]:
# Directory the trained pipeline is written to (absolute, machine-specific path).
output_dir = Path('/home/aliciescont/Documents/tfm_code/QA_eval/SimpleDBpediaQA/V1')
# Serialize the pipeline held in `nlp` to that directory.
nlp.to_disk(output_dir)

In [None]:
# Load the pipeline back from disk and run it on a sample question; the next
# cell prints the entities found in `doc`.
nlp_updated = spacy.load(output_dir)
doc= nlp_updated("who wrote a book published in english")

In [None]:
# Print every entity found in the current `doc` together with its label.
for ent in doc.ents:
    print(ent.text, ent.label_)

In [None]:
# Snapshot of the move names of the in-memory NER component.
move_names = list(ner.move_names)
  
# The NER component of the pipeline reloaded from disk must report the same
# move names as the in-memory one.
assert nlp_updated.get_pipe("ner").move_names == move_names

In [None]:
  # Save the pipeline again under a new model name.
  if output_dir is not None:
      output_dir = Path(output_dir)
      # Create the directory (and any missing parents) without failing when it
      # already exists, instead of a separate exists() check followed by mkdir().
      output_dir.mkdir(parents=True, exist_ok=True)
      nlp.meta["name"] = 'new_model_name'  # rename model
      nlp.to_disk(output_dir)
      print("Saved model to", output_dir)


In [None]:

# Reload the renamed pipeline from disk.
nlp2 = spacy.load(output_dir)
        
# Its NER component must report the same move names captured in `move_names`.
assert nlp2.get_pipe("ner").move_names == move_names
        

In [None]:
# Smoke test on the in-memory pipeline `nlp` (not the reloaded `nlp2`): print
# each detected entity with its label.
doc = nlp("who is the author of 1984")
for ent in doc.ents:
    print(ent.text, ent.label_)

In [None]:
# Smoke test on the in-memory pipeline `nlp` with a question that mentions a
# language: print each detected entity with its label.
doc = nlp("what is book written in english")
for ent in doc.ents:
    print(ent.text, ent.label_)

Reference: https://www.machinelearningplus.com/nlp/training-custom-ner-model-in-spacy/