# Generate The Sentence Embeddings and Do Tests Here
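Do this only once: the cells in this section write the pickled graph, the label dictionaries, and the sentence embeddings to Drive so that later cells can simply load them.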

In [1]:
# Mount Google Drive at /content/drive; every path used below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/ATAI/project
! mkdir processed
%cd processed

/content/drive/MyDrive/ATAI/project/processed


In [None]:
!pip install rdflib
!pip install networkx

In [3]:
from rdflib.namespace import Namespace, RDF, RDFS, XSD
from rdflib.term import URIRef, Literal
import csv
import json
import networkx as nx
import pandas as pd
import rdflib
from collections import defaultdict, Counter
import locale
_ = locale.setlocale(locale.LC_ALL, '')
from _plotly_future_ import v4_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)
import numpy as np
import os, random
from tqdm import tqdm
from joblib import dump, load
from sklearn.metrics import pairwise_distances
from sklearn.feature_extraction.text import strip_accents_ascii
import pickle

from string import ascii_letters
from urllib.parse import urlparse
from difflib import SequenceMatcher 

# Characters kept when a question is reduced to plain words: ASCII letters and the space.
only_letters = set(ascii_letters + ' ')


def normalize_text(text):
  """Return a lower-cased, accent-free copy of ``text`` padded with one space on each side.

  A spaced en dash becomes a spaced hyphen; '?', '.', ',' and newlines become
  spaces; every run of whitespace collapses to a single space.
  """
  cleaned = strip_accents_ascii(text.replace(" – ", " - ").lower())
  for mark in ('?', '.', ',', '\n'):
    cleaned = cleaned.replace(mark, ' ')
  return ' %s ' % ' '.join(cleaned.split())

In [4]:
# prefixes used in the graph
WD = Namespace('http://www.wikidata.org/entity/')
WDT = Namespace('http://www.wikidata.org/prop/direct/')
SCHEMA = Namespace('http://schema.org/')
DDIS = Namespace('http://ddis.ch/atai/')

# Work from the project root from here on.
%cd /content/drive/MyDrive/ATAI/project

# some very useful relations
label_pred = rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#label')
# Predicate queried by get_imdb_id to read an entity's IMDb identifier.
imdb_id = WDT.P345
# Doubles as the marker get_predicate returns for "look(s) like" (image) questions.
image = WDT.P18
# Predicate used to read the genre(s) of a movie in answer_recommend.
genre_pred = WDT.P136
# An entity counts as a movie when it has a (entity, type_pred, movie_obj) triple.
type_pred = WDT.P31
movie_obj = WD.Q11424

/content/drive/MyDrive/ATAI/project


In [None]:
# save the graph in pickle - so faster to load
# This cell parses the raw triples file from the project root into an in-memory rdflib graph.
%cd /content/drive/MyDrive/ATAI/project
graph = rdflib.Graph()
# NOTE(review): the file carries an .nt extension but is parsed with format='turtle' - confirm this is intended.
graph.parse('./14_graph.nt', format='turtle')

In [None]:
import pickle
# Cache the parsed graph as graph.pkl in the current working directory.
with open('graph.pkl', 'wb') as file: 
    # A new file will be created
    pickle.dump(graph, file)

In [6]:
# Reload the cached graph; the path is relative to the current working directory.
with open('graph.pkl', 'rb') as file: 
    # Call load method to deserialize
    graph = pickle.load(file)

In [None]:
# --- node and predicate sets -------------------------------------------------
# Entities are all subjects plus every object that is a URI; literals are the rest of the objects.
entities = {node for node in graph.objects() if isinstance(node, URIRef)}.union(graph.subjects())
predicates = set(graph.predicates())
literals = set(filter(lambda node: isinstance(node, Literal), graph.objects()))
with_type = set(graph.subjects(WDT.P31, None))
with_super = set(graph.subjects(WDT.P279, None))
types = set(graph.objects(None, WDT.P31))
supers = set(graph.objects(None, WDT.P279))
with_label = set(graph.subjects(RDFS.label, None))

# --- counts ------------------------------------------------------------------
n_ents, n_rels, n_lits = len(entities), len(predicates), len(literals)
t_tot = len(graph)
# triples whose object is another entity; all remaining triples point at a literal
t_ent = sum(1 for _, _, obj in graph.triples((None, None, None)) if isinstance(obj, URIRef))
t_lit = t_tot - t_ent
untyped = entities - with_type - with_super
n_notype = len(untyped)
n_notype_flt = len(untyped - types - supers)

# Summary table (numbers are formatted with the locale set in the import cell).
pd.DataFrame([
    (caption, f'{value:n}')
    for caption, value in (
        ('number of entities', n_ents),
        ('number of literals', n_lits),
        ('number of predicates', n_rels),
        ('number of triples', t_tot),
        ('number of ent-ent triples', t_ent),
        ('number of ent-lit triples', t_lit),
        ('number of entities w/o label', len(entities - with_label)),
        ('number of predicates w/o label', len(predicates - with_label)),
        ('number of entities w/o type', n_notype),
        ('number of instances w/o type', n_notype_flt),
    )
])

In [None]:
# process predicates and entities data

%cd processed
entities_no_label = entities - with_label
pred_without_label = predicates - with_label

predicates_dict = {}
# col1: predicate URL, col2: label

for id, pred in enumerate(predicates-pred_without_label):
  generator = graph.objects(pred, RDFS.label)
  name = normalize_text(str(list(generator)[0]))
  if name not in predicates_dict:
    predicates_dict[name] = [pred] 
  else:
    predicates_dict[name].append(pred)

predicates_dict['tag'] = rdflib.term.URIRef('http://ddis.ch/atai/tag')
predicates_dict['rating'] = rdflib.term.URIRef('http://ddis.ch/atai/rating')

entities_dict = {}
# col1: predicate URL, col2: label

# ignore no label ones for now (they actually do not have names)
for id, ent in tqdm(enumerate(entities-entities_no_label)):
  generator = graph.objects(ent, RDFS.label)
  name = normalize_text(str(list(generator)[0]))
  if name not in entities_dict:
    entities_dict[name] = [ent]
  else:
    entities_dict[name].append(ent)

In [None]:
# Persist both label-keyed dictionaries with joblib in the current working directory.
dump(predicates_dict, 'predicates_dict.joblib') 
dump(entities_dict, 'entities_dict.joblib')

['entities_dict.joblib']

In [None]:
# Install sentence-transformers and import it in the same cell, after the install has run.
! pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer

# sentence transformer model
# `sentence_model` is the encoder used by every embedding lookup further down.
sentence_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [None]:
# get sentence embeddings for all predicates and entities and save them

import numpy as np
from joblib import dump, load

%cd /content/drive/MyDrive/ATAI/project/processed
predicates_dict = load('predicates_dict.joblib')
entities_dict = load('entities_dict.joblib')

sentences = list(entities_dict.keys())
embeddings = sentence_model.encode(sentences)
embeddings = np.array(embeddings)
np.save('entities_embeddings.npy', embeddings)

sentences = list(predicates_dict.keys())
embeddings = sentence_model.encode(sentences)
embeddings = np.array(embeddings)
np.save('predicates_embeddings.npy', embeddings)

/content/drive/MyDrive/ATAI/project/processed


In [None]:
# save images_dict as pickle too
# The JSON is read from its absolute path; the pickle is written to the current working directory.
with open('/content/drive/MyDrive/ATAI/project/movienet/images.json', 'r') as f:
  images_dict = json.load(f)

import pickle
with open('images.pkl', 'wb') as file: 
    # A new file will be created
    pickle.dump(images_dict, file)

# LOAD EVERYTHING NOW

In [None]:
with open('graph.pkl', 'rb') as file: 
  # Call load method to deserialze
  graph = pickle.load(file)

# load the dictionaries
%cd /content/drive/MyDrive/ATAI/project/processed
predicates_dict = load('predicates_dict.joblib')
entities_dict = load('entities_dict.joblib')

predicates_names = list(predicates_dict.keys())
entities_names = list(entities_dict.keys())

ent2lbl = {}
for key, value in list(entities_dict.items()):
  for val in value:
    ent2lbl[val] = key

pred2lbl = {}
for key, value in list(predicates_dict.items()):
  for val in value:
    pred2lbl[val] = key

# for graph embeddings:
%cd /content/drive/MyDrive/ATAI/project
with open('./ddis-graph-embeddings/entity_ids.del', 'r') as ifile:
    ent2id = {rdflib.term.URIRef(ent): int(idx) for idx, ent in csv.reader(ifile, delimiter='\t')}
    id2ent = {v: k for k, v in ent2id.items()}
with open('./ddis-graph-embeddings/relation_ids.del', 'r') as ifile:
    pred2id = {rdflib.term.URIRef(rel): int(idx) for idx, rel in csv.reader(ifile, delimiter='\t')}
    id2pred = {v: k for k, v in pred2id.items()}

# load the sentence embeddings
%cd /content/drive/MyDrive/ATAI/project
entities_graph_embeddings = np.load('./ddis-graph-embeddings/entity_embeds.npy')
predicates_graph_embeddings = np.load('./ddis-graph-embeddings/relation_embeds.npy')

%cd processed
entities_embeddings = np.load('entities_embeddings.npy')
predicates_embeddings = np.load('predicates_embeddings.npy')

with open('images.pkl', 'rb') as file: 
  # Call load method to deserialze
  images_dict = pickle.load(file)

greet_emb = sentence_model.encode('Hello, how are you?')

In [None]:
# No need for this now!
# Builds the token-classification pipeline `nlp`; in this notebook it is only called by get_named_entities.

! pip install transformers

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)

In [11]:
# backup option
import os
import requests

API_URL = "https://api-inference.huggingface.co/models/satvikag/chatbot"
# The access token is read from the HF_API_TOKEN environment variable instead of
# being committed with the notebook. The token that used to be hard-coded here
# has been exposed and should be revoked.
headers = {"Authorization": f"Bearer {os.environ.get('HF_API_TOKEN', '')}"}

def query(q):
    """Send ``q`` to the hosted chatbot model and return its reply.

    Returns the model's ``generated_text``, or 'Sorry no reply' when the request
    fails, times out, or the response carries no generated text.
    """
    try:
        response = requests.post(API_URL, headers=headers,
                                 json={"inputs": {"text": q}}, timeout=30).json()
    except (requests.RequestException, ValueError):
        # network problem, timeout, or a body that is not valid JSON
        return 'Sorry no reply'
    if isinstance(response, dict) and 'generated_text' in response:
        return response['generated_text']
    return 'Sorry no reply'

# Testing the Functions

In [12]:
# Sample questions for manual testing: factual lookups, image requests, and recommendations.
test_questions = ['Who is the director of Good Will Hunting?', 
                  'Who directed The Bridge on the River Kwai?',
                  'Who is the director of Star Wars: Episode VI - Return of the Jedi?',
                  'Who is the screenwriter of The Masked Gang: Cyprus?',
                  'What is the MPAA film rating of Weathering with You?',
                  'What is the genre of Good Neighbors?',
                  'Show me a picture of Halle Berry.',
                  'What does Julia Roberts look like?',
                  'Let me know what Sandra Bullock looks like.',
                  'Recommend movies similar to Hamlet and Othello.',
                  'Given that I like The Lion King, Pocahontas, and The Beauty and the Beast, can you recommend some movies?',
                  'Recommend movies like Nightmare on Elm Street, Friday the 13th, and Halloween.',
                  'What is the box office of The Princess and the Frog?',
                  'Can you tell me the publication date of Tom Meets Zizou?',
                  'Who is the executive producer of X-Men: First Class?',
                  'Who is Top Gun: Maverick\'s screenwriter?']


In [40]:
# the same functions as in the query/__init__.py for testing

# Reply templates, grouped by the kind of answer they introduce.
#   'KB' / 'emb' : prefixes for graph answers and embedding answers (see answer()).
#   'image'      : %-format strings filled in by answer_image().
#   'recommend'  : [0] is a (prefix, middle, suffix) tuple for several movies,
#                  [1]-[2] take (title, source, genre), [3]-[5] take one IMDb id.
templates = {'KB':['The answer found in the graph is: ', 'According to the knowledge graph: ', 'I think the answer is '], 
             'emb':['The answer found using embeddings is: ', 'The answer suggested by embeddings: ',
                    'Some other answers found using the embeddings: '],
             'no':['Sorry, I could not find the information you are looking for. Can you paraphrase your question?', 
                   'The information could not be found.', 'Sorry, can you rephrase your question?'],
             'suggest':['Some %s similar to those are: '], # not referenced by the functions in this cell
             'image':['Here is a %s image from the movie %s (%s):\n%s', 
                      'Here is an image of %s from the movie %s (%s):\n%s',
                      'Here is an image of %s:\n%s'],
             'greet':['Hey!', 'Hello', 'Hi :)'],
             'recommend': [('You can check out the movies ', ' with the genres ', ' which I have found using the graph embeddings.\n'),
                           'You can check out the movie %s found using the %s. Its genre is %s.',
                           'You might be interested in this movie: %s that I found in the %s, with the genre %s.',
                           # was 'imdb:\s': no placeholder, so "% id" raised TypeError
                           'Here is the imdb page of the movie: imdb:%s',
                            'See the imdb page here: imdb:%s.',
                            '(imdb:%s)']} # or use another nlp model?
# Rendering modes accepted by construct_answer().
sentence_types = ['full', 'only_obj']

# ************************************************ANSWER*************************************************
def answer(q, k=3):
  """Answer the natural-language question ``q`` and return the reply text.

  extract() either returns a finished reply (a string) or two lists of
  ``[subject, predicate, objects]`` triples: facts found in the graph and
  suggestions from the graph embeddings. Graph facts are reported first and
  embedding suggestions that merely repeat them are dropped. When nothing is
  found the question is handed to the fallback chatbot via query().

  ``k`` is accepted for compatibility and is not used.
  """
  triples, embeddings_triples = extract(q)
  if triples is None:
    return query(q)
  if isinstance(triples, str):
    # extract() already produced the complete reply
    return triples

  ans = ''
  use_other = False
  if len(triples) > 0:
    ans += templates['KB'][random.randint(0,1)] + construct_answer(triples, type=sentence_types[random.randint(0,1)]) + '.\n'
    use_other = True
    # Remove embedding suggestions that the graph already returned, keeping the
    # remaining suggestions in their original (nearest-first) order.
    for triple in triples:
      known = set(triple[2])
      for emb_triple in embeddings_triples:
        if triple[:2] == emb_triple[:2]:
          emb_triple[2] = [obj for obj in emb_triple[2] if obj not in known]

  # A suggestion list that became empty has nothing left to say; keeping it
  # would make construct_answer() index into an empty list.
  embeddings_triples = [t for t in embeddings_triples if len(t[2]) > 0]

  if len(embeddings_triples) > 0:
    if use_other:
      ans += templates['emb'][2] + construct_answer(embeddings_triples, type=sentence_types[1]) + '.\n'
    else:
      ans += templates['emb'][random.randint(0,1)] + construct_answer(embeddings_triples, type=sentence_types[random.randint(0,1)]) + '.\n'
  if len(triples) <= 0 and len(embeddings_triples) <= 0:
    # use a fun model!
    return query(q)
  return ans

# type can be full, only_obj ...
def construct_answer(triples, type='full'):
  """Render ``[subject, predicate, objects]`` triples as answer text.

  type='full' produces 'The <predicate> of <subject> is <object>' (or
  'The <predicate>s of <subject> are a, b and c') for every triple that has
  objects; the sentences are joined with '. '. Any other ``type`` returns only
  the objects of the first triple. Returns '' when there is nothing to render.
  """
  def mult_obj(objects):
    # ' a, b and c' - the leading space lets the text follow 'are' directly
    listed = ','.join(' %s' % obj for obj in objects[:-1])
    return '%s and %s' % (listed, objects[-1])

  if type == 'full':
    sentences = []
    for triple in triples:
      objects = triple[2]
      if len(objects) == 0:
        continue
      if len(objects) == 1:
        sentences.append('The %s of %s is %s' % (triple[1], triple[0], objects[0]))
      else:
        sentences.append('The %ss of %s are%s' % (triple[1], triple[0], mult_obj(objects)))
    # every triple contributes its own sentence (previously only the last one survived)
    return '. '.join(sentences)

  # 'only_obj' and any unknown type: just the objects of the first triple
  if len(triples) == 0 or len(triples[0][2]) == 0:
    return ''
  objects = triples[0][2]
  if len(objects) == 1:
    return objects[0]
  return mult_obj(objects)

def get_imdb_id(ent):
  """Return the first IMDb identifier stored for ``ent`` in the graph as a string, or None."""
  first_hit = next(iter(graph.triples((ent, imdb_id, None))), None)
  if first_hit is None:
    return None
  return str(first_hit[2])

# all these should be strings
def get_image_from_imdb_id(movie=None, im_type=None, cast=None, only_person=True):
  """Return the image records from ``images_dict`` that match every given filter.

  movie       -- IMDb movie id that must appear in the record's 'movie' field
  im_type     -- value the record's 'type' field must equal
  cast        -- IMDb person id that must appear in the record's 'cast' field
  only_person -- with ``cast``: True keeps records whose cast has exactly one
                 entry, False keeps records whose cast has more than one entry

  Filters left at None are not applied.
  """
  ims = images_dict.copy()
  if movie is not None:
    ims = [d for d in ims if movie in d['movie']]
  if im_type is not None:
    ims = [d for d in ims if d['type'] == im_type]
  if cast is not None:
    if only_person:
      ims = [d for d in ims if cast in d['cast'] and len(d['cast']) == 1]
    else:
      ims = [d for d in ims if cast in d['cast'] and len(d['cast']) > 1]
  return ims

def answer_image(ent):
  """Build an image reply for the movie or person entity ``ent``.

  An IMDb id starting with 'tt' is treated as a movie, 'nm' as a person. A
  random matching record from the image index is used. Returns the formatted
  reply string, or None when the entity has no usable IMDb id or no image was
  found - always a plain None, so callers can test the result directly.
  """
  imdb = get_imdb_id(ent)
  if imdb is None:
    return None
  # labels are stored lower-cased and padded with spaces; tidy them for display
  label = ent2lbl[ent].strip().title()

  if imdb.startswith('tt'):  # we are looking for a movie img
    ims = get_image_from_imdb_id(movie=imdb)
    if len(ims) == 0:
      return None
    im = random.choice(ims)
    return templates['image'][0]%(im['type'], label,
                        'wd:' + urlparse(str(ent)).path.split('/')[-1], 'image:' + im['img'][:-4])

  if imdb.startswith('nm'):
    ims = get_image_from_imdb_id(cast=imdb)
    if len(ims) == 0:
      return None
    im = random.choice(ims)
    image_ref = 'image:' + im['img'][:-4]

    # find the first movie of this image that exists in the graph
    movie_ent = None
    for movie_id in im['movie']:
      id_lit = rdflib.term.Literal(movie_id, datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string'))
      for s, p, o in graph.triples((None, imdb_id, id_lit)):
        movie_ent = s
      if movie_ent is not None:
        break

    # without a known, labelled movie fall back to the person-only template
    if movie_ent is None or movie_ent not in ent2lbl:
      return templates['image'][2]%(label, image_ref)
    return templates['image'][1]%(label, ent2lbl[movie_ent].strip().title(),
                                  'wd:' + urlparse(str(movie_ent)).path.split('/')[-1], image_ref)

  return None

def _matching_movie_genre(obj, genres_init):
  """Return the genre label of ``obj`` if it is a movie whose genre is in ``genres_init``, else None."""
  genre = None
  for s, p, o in graph.triples((obj, genre_pred, None)):
    if o in ent2lbl:
      genre = ent2lbl[o][1:-1]
  if genre is None or genre not in genres_init:
    return None
  for s, p, o in graph.triples((obj, type_pred, None)):
    if o == movie_obj:
      return genre
  return None


def _imdb_sentence(imdb):
  """Return a randomly chosen 'imdb page' sentence for ``imdb`` ('' if no template has a placeholder)."""
  candidates = [t for t in templates['recommend'][3:6] if '%s' in t]
  if not candidates:
    return ''
  return random.choice(candidates) % imdb


def _format_movie_list(movies):
  """Render several (title, genre, imdb_id, ...) tuples with the multi-movie template."""
  prefix, middle, suffix = templates['recommend'][0]
  names = []
  for movie in movies:
    name, imdb = movie[0], movie[2]
    if imdb is not None and imdb.startswith('tt'):
      name += ' (imdb:%s)' % imdb
    names.append(name)
  genres = [movie[1] for movie in movies]
  # join() puts the separator between items only, so no name gets truncated
  return prefix + ', '.join(names) + middle + ', '.join(genres) + suffix


def answer_recommend(entities):
  """Recommend movies similar to the given movie ``entities``.

  Candidates are the nearest neighbours of the mean embedding of the input
  movies - first in the graph-embedding space, then, if that yields nothing,
  in the sentence-embedding space of the labels. A candidate is kept when it
  is a movie, is not one of the inputs, and has a genre shared with the inputs.

  Returns ``(reply, '')``; ``reply`` is '' when no recommendation was found.
  """
  ans = 'Okay, so '
  names_init = [ent2lbl[ent] for ent in entities]
  genres_init = []
  for ent in entities:
    for s, p, o in graph.triples((ent, genre_pred, None)):
      if o in ent2lbl:
        genres_init.append(ent2lbl[o][1:-1])

  # First using graph embeddings (only for inputs that have a graph embedding)
  graph_emb_obj = []
  embeddings = [entities_graph_embeddings[ent2id[ent]] for ent in entities if ent in ent2id]
  if len(embeddings) > 0:
    lhs = np.mean(embeddings, axis=0)
    obj_ids, _ = k_neighbors(lhs, entities_graph_embeddings, k=5)
    for i in obj_ids:
      obj = id2ent[i]
      if obj not in ent2lbl or ent2lbl[obj] in names_init:
        continue
      genre = _matching_movie_genre(obj, genres_init)
      if genre is not None:
        graph_emb_obj.append((ent2lbl[obj][1:-1].title(), genre, get_imdb_id(obj), obj))

  if len(graph_emb_obj) == 1:
    title, genre, imdb, uri = graph_emb_obj[0]
    ans += 'you can check out the movie %s found using the graph embeddings. Its genre is %s.'%(title, genre)
    if imdb is not None and imdb.startswith('tt'):
      ans += ' ' + _imdb_sentence(imdb)
    # `plots` is optional: it is not defined in this notebook, so look it up defensively
    plots_lookup = globals().get('plots') or {}
    plot = plots_lookup.get(str(uri), plots_lookup.get(uri))
    if plot:
      ans += '\nAlso read the plot here: \n %s'%plot
    return ans, ''
  elif len(graph_emb_obj) > 1:
    return _format_movie_list(graph_emb_obj), ''

  # Using Sentence Embeddings
  sentence_emb_obj = []
  # axis=0 averages the label vectors; without it the mean collapses to one scalar
  lhs = np.mean(sentence_model.encode(names_init), axis=0)
  obj_ids, _ = k_neighbors(lhs, entities_embeddings, k=10)
  for i in obj_ids:
    # at most one movie per neighbouring label
    for obj in entities_dict[entities_names[i]]:
      if obj not in ent2lbl or ent2lbl[obj] in names_init:
        continue
      genre = _matching_movie_genre(obj, genres_init)
      if genre is not None:
        sentence_emb_obj.append((ent2lbl[obj][1:-1].title(), genre, get_imdb_id(obj)))
        break

  if len(sentence_emb_obj) == 1:
    title, genre, imdb = sentence_emb_obj[0]
    ans += templates['recommend'][random.randint(1,2)]%(title, 'Knowledge Base', genre)
    if imdb is not None and imdb.startswith('tt'):
      ans += ' ' + _imdb_sentence(imdb)
    return ans, ''
  elif len(sentence_emb_obj) > 1:
    return _format_movie_list(sentence_emb_obj), ''
  return '', ''

    
# ************************************************EXTRACT****************************************************
def _nearest_entities(emb, k=10):
  """Yield the graph entities behind the ``k`` labels closest to ``emb``, closest label first."""
  label_ids, _ = k_neighbors(emb, entities_embeddings, k=k)
  for label_id in label_ids:
    for ent in entities_dict[entities_names[label_id]]:
      yield ent


def extract(q, k=3):
  """Analyse the question ``q`` and gather what is needed to answer it.

  Returns one of
    * ``(reply, '')`` - a finished reply string (greeting, recommendation,
      image answer, or the fallback chatbot's reply), or
    * ``(triples, embeddings_triples)`` - ``[subject, predicate, objects]``
      lists found in the graph and suggested by the graph embeddings.

  ``k`` is accepted for compatibility and is not used.
  """
  ne, pred = get_named_entities_2(q)
  if pred == 'greet':
    # random.choice covers all greetings and can never index past the end
    return random.choice(templates['greet']), ''
  if len(ne) <= 0:
    return query(q), ''

  embeddings_ent = sentence_model.encode(ne)
  triples = []
  embeddings_triples = []

  if pred == 'recommend':
    # for every mentioned name keep the closest entity that is a movie
    entities = []
    for emb in embeddings_ent:
      for ent in _nearest_entities(emb):
        if any(o == movie_obj for s, p, o in graph.triples((ent, type_pred, None))):
          entities.append(ent)
          break
    if len(entities) > 0:
      return answer_recommend(entities)
    return 'Sorry, I could not find those movies in the KG. ', ''

  if pred == image:
    # the first candidate entity for which an image reply can be built wins
    for emb in embeddings_ent:
      for ent in _nearest_entities(emb):
        ans = answer_image(ent)
        # only a string is a usable reply (guards against None-like results)
        if isinstance(ans, str):
          return ans, ''
    return 'Sorry, could not find the movie or actor you are asking for. ', ''

  entities = [entities_dict[ent_name][0] for ent_name in ne]
  for ent in entities:
    # check if can use embeddings!
    if pred in pred2id and ent in ent2id:
      triple = get_objects_embeddings(ent, pred)
      if len(triple) > 0 and len(triple[2]) > 0:
        embeddings_triples.append(triple)
    triple = get_objects(ent, pred)
    if len(triple) > 0 and len(triple[2]) > 0:
      triples.append(triple)

  return triples, embeddings_triples

def get_predicate(question):
  """Work out what the (normalized) ``question`` is asking for.

  Returns ``(predicate, matched_text)`` where ``predicate`` is
    * 'greet'      - the question is closer to the greeting than to any predicate,
    * ``image``    - the question contains 'look like' / 'looks like',
    * 'recommend'  - the question contains 'recommend' or 'similar',
    * otherwise the URI of the most probable predicate,
  and ``matched_text`` is the part of the question that triggered the decision
  ('' when the question contains no words at all).
  """
  # check if they just do small talk
  emb = sentence_model.encode([question])[0]
  candidates = np.concatenate([predicates_embeddings, greet_emb.reshape(1,-1)], axis=0)
  nearest, _ = k_neighbors(emb, candidates, k=1)
  if nearest[0] == len(predicates_embeddings):
    return 'greet', question

  # first check for look like or looks like
  words = ''.join(l for l in question if l in only_letters)
  if 'look like' in words:
    return image, 'look like'
  if 'looks like' in words:
    return image, 'looks like'

  # then check if recommendation question
  if 'recommend' in words:
    return 'recommend', 'recommend'
  if 'similar' in words:
    return 'recommend', 'similar'

  # get most probable predicate, word by word
  q = question.split()
  best = (None, 100000, None)  # (predicate index, distance, question word)
  included = 0                 # length of the longest predicate label found verbatim
  if len(q) > 0:
    for word, word_emb in zip(q, sentence_model.encode(q)):
      p, d = k_neighbors(word_emb, predicates_embeddings, 1)
      name = predicates_names[p[0]]
      if name in question:
        # a label that occurs verbatim wins; the longest such label is preferred
        if len(name) > included:
          included = len(name)
          best = (p[0], d[0], word)
      elif included <= 0 and d[0] < best[1]:
        # no verbatim label so far: keep the closest predicate
        best = (p[0], d[0], word)

  if best[0] is None:
    # the question has no words (e.g. only punctuation): use the predicate
    # closest to the question as a whole instead of indexing with None
    return predicates_dict[predicates_names[nearest[0]]][0], ''
  return predicates_dict[predicates_names[best[0]]][0], best[2]

def get_objects(entity, pred):
  """Look up the objects of ``(entity, pred, ?)`` in the graph.

  Returns ``[entity label, predicate label, objects]``. Objects are the
  title-cased labels of labelled entities or the title-cased text of literals;
  objects that are neither are skipped. The predicate label is taken from the
  graph, falling back to ``pred2lbl`` and finally to the predicate URI.
  """
  labels = list(graph.objects(pred, RDFS.label))
  if labels:
    predicate = str(labels[0])
  else:
    # hand-added predicates may have no label triple in the graph
    predicate = pred2lbl.get(pred, str(pred)).strip()

  objects = []
  for s, p, o in graph.triples((entity, pred, None)):
    if o in ent2lbl:
      objects.append(ent2lbl[o].title())
    elif isinstance(o, Literal):
      objects.append(str(o).title())
    # anything else (an entity without a label) cannot be put into words
  return [ent2lbl[entity].title(), predicate, objects]

def get_objects_embeddings(entity, pred):
  """Suggest objects for ``(entity, pred, ?)`` from the graph embeddings.

  The three entities whose embeddings lie closest to ``entity + pred`` are
  returned as title-cased labels, nearest first; neighbours without a label
  are skipped. Returns ``[entity label, predicate label, objects]``.
  Both ``entity`` and ``pred`` must have a graph embedding.
  """
  lhs = entities_graph_embeddings[ent2id[entity]] + predicates_graph_embeddings[pred2id[pred]]
  obj_ids, _ = k_neighbors(lhs, entities_graph_embeddings, k=3)

  labels = list(graph.objects(pred, RDFS.label))
  if labels:
    predicate = str(labels[0])
  else:
    # hand-added predicates may have no label triple in the graph
    predicate = pred2lbl.get(pred, str(pred)).strip()

  # not every embedded entity has a label in the graph
  objects = [ent2lbl[id2ent[i]].title() for i in obj_ids if id2ent[i] in ent2lbl]
  return [ent2lbl[entity].title(), predicate, objects]

def get_named_entities(q):
  """Extract named entities from ``q`` with the token-classification pipeline ``nlp``.

  ':' and '-' are replaced by spaces and a trailing '?' is added before
  tagging. Tokens tagged 'B-...' start a name, tokens tagged 'I-...' extend
  it; word-piece continuations (' ##') are glued back together.

  Returns ``(names, text)`` where ``text`` is the lower-cased,
  whitespace-collapsed text that was tagged.
  """
  txt = q.replace(':', ' ').replace('-', ' ')
  if not txt.endswith('?'):  # also safe for an empty question
    txt += '?'
  ner_results = nlp(txt)
  txt = ' '.join(txt.lower().split())

  ne = []
  name = None
  for entry in ner_results:
    tag = entry['entity'][0]
    if tag == 'B' or (tag == 'I' and name is None):
      # an 'I' tag with no open name starts a new one instead of failing
      if name is not None:
        ne.append(name.replace(' ##', ''))
      name = entry['word']
    elif tag == 'I':
      name += ' ' + entry['word']
  if name is not None:
    ne.append(name.replace(' ##', ''))
  return ne, txt

def get_named_entities_2(q):
  """Find known entity labels inside the question ``q``.

  The question is normalized, the text that identified the predicate is removed,
  and every entity label occurring in the remainder is collected. Labels that
  are contained in another collected label are discarded, and the rest is
  ordered by decreasing length.

  Returns ``(labels, predicate)``: all labels for 'recommend' questions, only
  the first one otherwise, and an empty list for greetings.
  """
  text = normalize_text(q)
  pred, pred_word = get_predicate(text)
  if pred == 'greet':
    return [], pred

  text = normalize_text(text.replace(pred_word, ''))
  matched = list(np.where([label in text for label in entities_names])[0])

  # drop every label that is a substring of another matched label
  kept = matched.copy()
  for outer in matched:
    longer = entities_names[outer]
    for inner in matched:
      shorter = entities_names[inner]
      if outer != inner and shorter in longer and inner in kept:
        kept.remove(inner)

  ne = [entities_names[idx] for idx in kept]
  by_length = np.array([len(label) for label in ne]).argsort()
  ne = np.array(ne)[by_length[::-1]]

  if pred == 'recommend':
    return ne, pred
  return ne[:1], pred

# similarity function
def k_neighbors(query, embeddings, k=1):
  """Return the ``k`` rows of ``embeddings`` closest to ``query`` (Euclidean distance).

  query      -- vector broadcastable against the rows of ``embeddings``
  embeddings -- 2-D array with one vector per row
  k          -- number of neighbours; clamped to the number of rows

  Returns ``[indices, distances]``, both ordered from nearest to farthest.
  Both are empty when ``k`` is not positive or ``embeddings`` has no rows.
  """
  # compute distances
  distances = np.linalg.norm(embeddings - query, axis=1)
  k = max(0, min(int(k), distances.shape[0]))
  if k == 0:
    nothing = np.array([], dtype=int)
    return [nothing, distances[nothing]]
  # partitioning on every position 0..k-1 leaves the k smallest distances sorted in front
  neighbors = np.argpartition(distances, np.arange(k))[:k]
  return [neighbors, distances[neighbors]]

def matchsubstring(m,n): 
  """Return the longest substring shared by ``m`` and ``n``; also prints 'm, n, substring'."""
  block = SequenceMatcher(None, m, n).find_longest_match(0, len(m), 0, len(n))
  common = m[block.a:block.a + block.size]
  print(', '.join((m, n, common)))
  return common

# word is the words in the entities_names, text is the question
def match_words(word, text, stop_words=None):
  """Count the non-stop-words of ``word`` if all of its words occur in ``text``.

  word       -- an entity label (one or more whitespace-separated words)
  text       -- the question
  stop_words -- words that must be present but do not count; defaults to a
                module-level ``stop_words`` if one is defined, otherwise none

  Returns 0 as soon as any word of ``word`` is missing from ``text``.
  """
  if stop_words is None:
    # no stop-word list is defined in this notebook; do not fail without one
    stop_words = globals().get('stop_words', ())
  text_words = set(text.split())
  count = 0
  for w in word.split():
    if w not in text_words:
      return 0
    if w not in stop_words:
      count += 1
  return count


In [42]:
# Smoke test: a plain greeting should come back as one of the 'greet' templates.
i = 7  # NOTE(review): not used by the lines below
q = 'hello!'
answer(q)

'Hi :)'