In [None]:
import html
import re
from collections import Counter

import nltk
import spacy
from nltk import word_tokenize
from spacy import displacy

# Fetch the NLTK resources needed later in the notebook: the 'punkt' tokenizer
# data and the 'averaged_perceptron_tagger' POS-tagger model.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# Sample conversation: an initial question that names several entities, and a
# follow-up question whose pronoun ("he") has to be resolved against it.
question1 = "did Bill Clinton or Kim Jong Un invade iraq?"
question2 = "when did he graduate from law school?"

## Method 1: Spacy POS

### Initial Question

In [None]:
# Load spaCy's small English pipeline; `nlp` is the shared pipeline used by the
# spaCy-based tagging helpers in this notebook.
nlp = spacy.load("en_core_web_sm")

In [None]:
def spacy_pos_tag(text):
  """Run the shared spaCy pipeline over `text` and return its POS tags.

  Returns a list with one `(token_text, tag)` tuple per token, where `tag`
  is the token's `tag_` attribute (e.g. 'NNP', 'VBD').
  """
  return [(tok.text, tok.tag_) for tok in nlp(text)]

In [None]:
def get_tags(pos_tagged, input_tags):
  """Group consecutive tokens whose tag is in `input_tags` into phrases.

  Args:
    pos_tagged: iterable of `(word, tag)` pairs, in sentence order.
    input_tags: container of tags that count as part of a phrase
      (e.g. `('NN', 'NNS', 'NNP')`).

  Returns:
    A list of strings; each maximal run of matching tokens is joined with a
    single space (a run of one token is returned as that token).
  """
  nouns = []
  current_noun = []

  for word, tag in pos_tagged:
    if tag in input_tags:
      # Either starts a new run or extends the one in progress.
      current_noun.append(word)
    elif current_noun:
      # A non-matching tag closes the run collected so far.
      nouns.append(' '.join(current_noun))
      current_noun = []

  # Flush a run that reaches the end of the input (e.g. a question typed
  # without trailing punctuation); otherwise the last phrase would be lost.
  if current_noun:
    nouns.append(' '.join(current_noun))
  return nouns

In [None]:
# POS-tag the initial question; the bare name on the last line displays the result.
question1_pos_tagged = spacy_pos_tag(question1)
question1_pos_tagged

[('did', 'VBD'),
 ('Bill', 'NNP'),
 ('Clinton', 'NNP'),
 ('or', 'CC'),
 ('Kim', 'NNP'),
 ('Jong', 'NNP'),
 ('Un', 'NNP'),
 ('invade', 'VB'),
 ('iraq', 'NNP'),
 ('?', '.')]

In [None]:
# Collect runs of NN/NNS/NNP tokens from the initial question as candidate nouns.
question1_nouns = get_tags(question1_pos_tagged, ('NN', 'NNS', 'NNP'))
question1_nouns

['Bill Clinton', 'Kim Jong Un', 'iraq']

### Follow-Up Question

In [None]:
def replace_pronouns(pos_tagged, prev_qn_nouns): # assumption: pronoun(s) in q2 refer to just one noun in q1
  """Build one rewritten question per candidate noun.

  Args:
    pos_tagged: list of `(word, tag)` pairs for the follow-up question.
    prev_qn_nouns: candidate noun strings taken from the previous question.

  Returns:
    A list with one string per candidate noun: the follow-up question's
    tokens joined by single spaces, with every token whose text was tagged
    'PRP' or 'PRP$' replaced by that noun.
  """
  tokens = []
  pronoun_texts = set()
  for token, tag in pos_tagged:
    tokens.append(token)
    if tag in ('PRP', 'PRP$'):
      pronoun_texts.add(token)

  return [
    ' '.join(candidate if token in pronoun_texts else token for token in tokens)
    for candidate in prev_qn_nouns
  ]

In [None]:
# POS-tag the follow-up question so its pronouns can be located.
question2_pos_tagged = spacy_pos_tag(question2)
question2_pos_tagged

[('when', 'WRB'),
 ('did', 'VBD'),
 ('he', 'PRP'),
 ('graduate', 'VB'),
 ('from', 'IN'),
 ('law', 'NN'),
 ('school', 'NN'),
 ('?', '.')]

In [None]:
# Show both questions together for reference before rewriting the follow-up.
print(question1)
print(question2)

did Bill Clinton or Kim Jong Un invade iraq?
when did he graduate from law school?


In [None]:
# Produce one candidate rewrite of the follow-up per noun found in the initial
# question, and print each candidate on its own line.
replaced = replace_pronouns(question2_pos_tagged, question1_nouns)
for i in replaced:
  print(i)

when did Bill Clinton graduate from law school ?
when did Kim Jong Un graduate from law school ?
when did iraq graduate from law school ?


## Method 2: NLTK POS

In [None]:
def nltk_pos_tag(text):
  """Tokenize `text` with NLTK's `word_tokenize` and POS-tag the tokens.

  Returns whatever `nltk.pos_tag` produces for the token list.
  """
  return nltk.pos_tag(word_tokenize(text))

In [None]:
# Tag the initial question with NLTK to compare against the spaCy tags.
nltk_pos_tag(question1)

[('did', 'VBD'),
 ('bill', 'NN'),
 ('clinton', 'NN'),
 ('or', 'CC'),
 ('kim', 'VB'),
 ('jung', 'NN'),
 ('un', 'JJ'),
 ('invade', 'NN'),
 ('iraq', 'NN'),
 ('?', '.')]

In [None]:
# Apply the same noun-grouping helper to the NLTK tags for comparison.
get_tags(nltk_pos_tag(question1), ('NN', 'NNS', 'NNP')) # wrong result for 'jung' and 'invade iraq'

['bill clinton', 'jung', 'invade iraq']

## Method 3: NeuralCoref (requires spacy 2.1.0)

In [None]:
!pip install spacy==2.1.0 
!pip install pip install neuralcoref
import neuralcoref
!python -m spacy download en_core_web_sm

nlp_neuralcoref = spacy.load('en_core_web_sm')
neuralcoref.add_to_pipe(nlp_neuralcoref)

<spacy.lang.en.English at 0x7ffa819b4890>


### Successful Example (1 person in Q1, simple sentence structure in Q2)

In [None]:
# Inputs for the successful case: NeuralCoref sees both questions as one text,
# so they are joined with a single space.
question1_neuralcoref = "did bill clinton invade iraq?"
question2_neuralcoref = "when did he get married to hillary clinton"
questions_concatenated_neuralcoref = ' '.join([question1_neuralcoref, question2_neuralcoref])
questions_concatenated_neuralcoref

'did bill clinton invade iraq? when did he get married to hillary clinton'

In [None]:
# Use the pipeline that has NeuralCoref attached (`nlp_neuralcoref`); the plain
# `nlp` pipeline was loaded without the coreference component.
doc_neuralcoref = nlp_neuralcoref(questions_concatenated_neuralcoref)
# Text with each coreferring mention replaced by the mention it resolves to.
resolved_text = doc_neuralcoref._.coref_resolved
resolved_text

did bill clinton invade iraq? when did bill clinton get married to hillary clinton


### Failed Example (2 people in Q1, more complex sentence structure in Q2)

In [None]:
# Inputs for the failing case: two people in the first question and a
# coordinated subject ("he and hillary clinton") in the follow-up.
question1_neuralcoref = "did bill clinton or kim jung un invade iraq?"
question2_neuralcoref = "when did he and hillary clinton get married"
questions_concatenated_neuralcoref = ' '.join([question1_neuralcoref, question2_neuralcoref])
questions_concatenated_neuralcoref

'did bill clinton or kim jung un invade iraq? when did he and hillary clinton get married'

In [None]:
# Use the pipeline that has NeuralCoref attached (`nlp_neuralcoref`); the plain
# `nlp` pipeline was loaded without the coreference component.
doc_neuralcoref = nlp_neuralcoref(questions_concatenated_neuralcoref)
# Text with each coreferring mention replaced by the mention it resolves to.
resolved_text = doc_neuralcoref._.coref_resolved
resolved_text

did bill clinton or kim jung un invade iraq? when did bill clinton or kim jung get married


## Method 4: NER

In [None]:
def spacy_ner_tag(text):
  """Run the shared spaCy pipeline over `text` and return its NER tags.

  Returns a list with one `(token_text, ent_type_, ent_iob_)` tuple per
  token, e.g. `('brian', 'PERSON', 'B')` or `('hello', '', 'O')`.
  """
  return [(tok.text, tok.ent_type_, tok.ent_iob_) for tok in nlp(text)]

In [None]:
# Quick check of the NER helper on a short sentence.
spacy_ner_tag('hello my name is brian')

[('hello', '', 'O'),
 ('my', '', 'O'),
 ('name', '', 'O'),
 ('is', '', 'O'),
 ('brian', 'PERSON', 'B')]

In [None]:
def get_per_ners(ner_tagged):
  """Extract full person names from per-token NER tags.

  Args:
    ner_tagged: iterable of `(word, ent_type, iob)` triples in sentence
      order, as produced by `spacy_ner_tag`.

  Returns:
    A list of strings, one per PERSON entity, with the entity's tokens
    joined by single spaces.
  """
  persons = []
  current_per = []

  for word, tag, bio in ner_tagged:
    if tag == 'PERSON':
      # A 'B' tag right after another person starts a new entity: close the
      # previous one first instead of overwriting it.
      if bio == 'B' and current_per:
        persons.append(' '.join(current_per))
        current_per = []
      current_per.append(word)
    elif current_per:
      # tag is not PERSON: the name collected so far is complete.
      persons.append(' '.join(current_per))
      current_per = []

  # A name that ends the input is joined like any other, not split into tokens.
  if current_per:
    persons.append(' '.join(current_per))
  return persons

In [None]:
# Extract the person names mentioned in the initial question.
get_per_ners(spacy_ner_tag(question1))

['Bill Clinton', 'Kim Jong Un']

In [None]:
# Inspect the raw per-token NER tags for the initial question.
spacy_ner_tag(question1)

[('did', '', 'O'),
 ('Bill', 'PERSON', 'B'),
 ('Clinton', 'PERSON', 'I'),
 ('or', '', 'O'),
 ('Kim', 'PERSON', 'B'),
 ('Jong', 'PERSON', 'I'),
 ('Un', 'PERSON', 'I'),
 ('invade', '', 'O'),
 ('iraq', 'GPE', 'B'),
 ('?', '', 'O')]

## Final Query Reformulation (QR) Code for Telegram Bot Integration (POS Tagging + NER)

In [1]:
def spacy_pos_tag(text):
    """Run the shared spaCy pipeline over `text` and return its POS tags.

    Returns a list with one `(token_text, tag)` tuple per token, where `tag`
    is the token's `tag_` attribute.
    """
    return [(tok.text, tok.tag_) for tok in nlp(text)]

def spacy_ner_tag(text):
    """Run the shared spaCy pipeline over `text` and return its NER tags.

    Returns a list with one `(token_text, ent_type_, ent_iob_)` tuple per token.
    """
    return [(tok.text, tok.ent_type_, tok.ent_iob_) for tok in nlp(text)]

def get_per_ners(ner_tagged):
  """Extract full person names from per-token NER tags.

  Args:
    ner_tagged: iterable of `(word, ent_type, iob)` triples in sentence
      order, as produced by `spacy_ner_tag`.

  Returns:
    A list of strings, one per PERSON entity, with the entity's tokens
    joined by single spaces.
  """
  persons = []
  current_per = []

  for word, tag, bio in ner_tagged:
    if tag == 'PERSON':
      # A 'B' tag right after another person starts a new entity: close the
      # previous one first instead of overwriting it.
      if bio == 'B' and current_per:
        persons.append(' '.join(current_per))
        current_per = []
      current_per.append(word)
    elif current_per:
      # tag is not PERSON: the name collected so far is complete.
      persons.append(' '.join(current_per))
      current_per = []

  # A name that ends the input is joined like any other, not split into tokens.
  if current_per:
    persons.append(' '.join(current_per))
  return persons

def replace_pronouns(pos_tagged, prev_persons):
  """Build one rewritten question per candidate person.

  Args:
    pos_tagged: list of `(word, tag)` pairs for the follow-up question.
    prev_persons: person-name strings taken from the previous question.

  Returns:
    A list with one string per candidate person: the follow-up question's
    tokens joined by single spaces, with every token whose text was tagged
    'PRP' or 'PRP$' replaced by that person, and any whitespace before a
    '?' removed.
  """
  this_pronoun_list = []
  words = []

  for word, tag in pos_tagged:
    if tag in ('PRP', 'PRP$'):
      this_pronoun_list.append(word)
    words.append(word)

  result = []
  for noun in prev_persons:
    this_token_list = [noun if word in this_pronoun_list else word for word in words]
    # Raw string: '\s' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    result.append(re.sub(r'\s+[?]', '?', ' '.join(this_token_list)))

  return result

# Following function is part of telebot.py, and requires Telegram bot handlers and functions to run

def handle_followup_question(update, context):
    """Answer a follow-up question after resolving its pronouns.

    The question text is taken from `context.args`. The user's previous
    question is read from `db`; person names found in it are substituted for
    the pronouns of the new question, and the first candidate rewrite (if any)
    is used. The rewritten question is echoed to the chat, the top BM25
    document is retrieved for it, and the answer returned by `qa` (or
    "I do not know") is sent back.

    Returns:
        `ConversationHandler.END` in every case.
    """
    question = ' '.join(context.args).strip()
    chat_id = update.effective_chat.id

    # perform query reformulation
    user_id = update.effective_user.id
    # NOTE(review): assumes db.get_question always returns something indexable;
    # confirm it cannot return None or an empty result for a new user.
    prev_question = db.get_question(user_id)[0]
    print(f"prev_qn: {prev_question}")
    if not prev_question:
        context.bot.send_message(chat_id=chat_id, text="You did not ask any previous questions")
        return ConversationHandler.END

    this_pos_tagged = spacy_pos_tag(question)
    prev_ner_tagged = spacy_ner_tag(prev_question)
    prev_persons = get_per_ners(prev_ner_tagged)
    candidate_questions = replace_pronouns(this_pos_tagged, prev_persons)
    if candidate_questions:
        question = candidate_questions[0]

    # The message is sent in HTML parse mode, so user-supplied text must have
    # '&', '<' and '>' escaped or it can be misread as markup.
    response = "<b>You asked me:</b> " + html.escape(question, quote=False)
    context.bot.send_message(chat_id=chat_id, text=response, parse_mode=ParseMode.HTML)

    # perform IR
    retrieved_document = bm25.get_top_n(question.lower().split(), bm25_corpus, n=1)[0]
    ans = qa(retrieved_document, question)
    if not ans:
        no_response = "I do not know"
        context.bot.send_message(chat_id=chat_id, text=no_response)
    else:
        # Keep only the first line of the first answer, without surrounding
        # whitespace or punctuation.
        answer = ans[0].split('\n')[0].strip().strip(punctuation)
        response = f"<b>Answer:</b> {html.escape(answer, quote=False)}"
        context.bot.send_message(chat_id=chat_id, text=response, parse_mode=ParseMode.HTML)
    return ConversationHandler.END