In [238]:
import pandas as pd
import numpy as np
import nltk
import re
import spacy
from nltk.corpus import stopwords  
from nltk.corpus import wordnet
import string
from nltk.stem import WordNetLemmatizer
from itertools import chain


# Algorithm to find semantic context

In [246]:
# Words that signal the start of a question (or an imperative prompt such as
# "name ..." / "list ...").
trigger_words = ["what","where","who","how","which","name",
                 "describe","list","identify","if","whats","why","did"]

def find_context(q):
    """Map each trigger word found in ``q`` to the question text it introduces.

    The input is lower-cased and every character other than an ASCII letter,
    a digit or ``?`` is replaced by a space. For every entry of
    ``trigger_words`` that occurs as a stand-alone, space-delimited token, the
    function collects each span that starts at that word and runs up to and
    including the next ``?``.

    Parameters
    ----------
    q : str
        One or more questions. If the text does not end with ``?`` one is
        appended, so an unterminated final sentence is still captured.

    Returns
    -------
    dict[str, list[str]]
        Trigger word -> list of captured question spans (lower-cased). Keys
        appear in ``trigger_words`` order. The list can be empty when the
        trigger word is present but nothing follows it before a ``?``.
    """
    # Terminate the last sentence even when earlier sentences already contain
    # a '?'; checking only "'?' in q" would leave it uncaptured.
    if not q.rstrip().endswith('?'):
        q = q + '?'
    q = re.sub(r'[^a-zA-Z0-9?]', ' ', q.lower())

    # Built once: membership is tested for every trigger word.
    tokens = set(q.split(' '))

    qdict = {}
    for word in trigger_words:
        if word in tokens:
            # \b keeps the trigger from matching inside a longer word
            # (e.g. "if" inside "identify").
            pattern = r"\b{}\b[^?]+\?".format(re.escape(word))
            qdict[word] = re.findall(pattern, q)
    return qdict


In [74]:
# Two questions in one string: each trigger word maps to the question it opens.
find_context("What is the capital of New York? Where is New York?")

{'what': ['what is the capital of new york?'], 'where': ['where is new york?']}

In [75]:
# Single question with a single trigger word ("how").
find_context("How many days does it take the moon to revolve around Earth?")

{'how': ['how many days does it take the moon to revolve around earth?']}

In [76]:
# Imperative prompt without a '?': find_context appends one before matching.
find_context("Name all presidents during the great depression")

{'name': ['name all presidents during the great depression?']}

In [77]:
# Sample inputs for the context table below. The third entry ends with a
# period, which find_context replaces with a space before matching.
questions = ["What is the capital of New York? Where is New York?",
             "How many days does it take the moon to revolve around Earth?",
             "Name all presidents during the great depression."
]

In [78]:
# For every input string keep two parallel lists: the trigger words that were
# found and the question spans captured for each of them.
qtable = [[list(found), list(found.values())]
          for found in map(find_context, questions)]


In [79]:
# One row per input string: trigger words in "Contexts", captured spans in "Questions".
questions_table = pd.DataFrame(qtable,columns = ["Contexts","Questions"])

In [80]:
questions_table

Unnamed: 0,Contexts,Questions
0,"[what, where]","[[what is the capital of new york?], [where is..."
1,[how],[[how many days does it take the moon to revol...
2,[name],[[name all presidents during the great depress...


In [166]:
# Sample answers; these appear to correspond positionally to `questions`
# (the first entry answers both questions of the first string).
answers = [
    "Albany.  United States",
    "27",
    "Herbert Hoover and Franklin Roosevelt"
]

In [221]:
# NLTK's English stop-word list; process_answer drops these words from answers.
stop_words = stopwords.words('english')
# spaCy pipeline that process_answer uses to extract per-token features.
nlp = spacy.load("en_core_web_sm")
# Take "no" off the stop list so it is not filtered out of an answer
# (presumably to keep yes/no answers intact - confirm intent).
stop_words.remove("no")
def get_bigrams(words):
    """Return the adjacent word pairs of ``words`` as lower-cased strings.

    Parameters
    ----------
    words : str
        Whitespace-separated text.

    Returns
    -------
    list[str]
        One ``"first second"`` string per pair of consecutive words, in
        order. Empty when ``words`` holds fewer than two words.
    """
    # split() with no argument ignores leading, trailing and repeated
    # whitespace; split(' ') would yield empty tokens and therefore bogus
    # bigrams such as " yes".
    tokens = words.lower().split()
    return [' '.join(pair) for pair in zip(tokens, tokens[1:])]

def get_synonyms(word):
    """Collect WordNet lemma names related to ``word``.

    For every synset of ``word``, in order, the result receives:

    1. the lemma names of the synset's hypernyms followed by its hyponyms,
       exactly as WordNet spells them (underscores and case preserved);
    2. the synset's own lemma names with ``_`` replaced by a space and
       lower-cased.

    Returns a list of strings (duplicates are kept); empty when ``word`` has
    no synsets.
    """
    collected = []
    for synset in wordnet.synsets(word):
        neighbours = synset.hypernyms() + synset.hyponyms()
        # Neighbour names are intentionally left un-normalised.
        collected.extend(
            lemma.name() for neighbour in neighbours for lemma in neighbour.lemmas()
        )
        collected.extend(
            lemma.name().replace("_", " ").lower() for lemma in synset.lemmas()
        )
    return collected

def process_answer(answer):
    """Normalise an answer and return spaCy token features for each word left.

    Steps
    -----
    1. Lower-case the text, replace every character other than an ASCII
       letter, a digit or ``?`` with a space, drop words in ``stop_words``
       and collapse runs of spaces.
    2. Look every bigram up via ``get_synonyms`` (as ``"first_second"``).
       When a synonym without a space, ``.`` or ``-`` exists, the first such
       synonym replaces the bigram's two words: the words are removed from
       the answer and the synonym is appended at the end.
    3. Run each remaining word through ``nlp`` on its own and collect the
       token attributes.

    Parameters
    ----------
    answer : str
        Free-text answer.

    Returns
    -------
    list[list]
        One ``[text, lemma_, pos_, tag_, ent_type_, dep_, shape_, is_alpha]``
        row per token.
    """
    a = re.sub(r'[^a-zA-Z0-9?]', ' ', answer.lower()).split(' ')
    a = ' '.join([w for w in a if w not in stop_words])
    a = re.sub(' +', ' ', a)

    # (words of the bigram, single-word synonym that replaces them)
    rel_syns = []
    for bigram in get_bigrams(a):
        synonyms = get_synonyms(bigram.replace(' ', '_'))
        single_word = next(
            (syn for syn in synonyms
             if ' ' not in syn and '.' not in syn and '-' not in syn),
            None,
        )
        if single_word is not None:
            rel_syns.append([bigram.split(' '), single_word])

    # Work on whole tokens: removing a word with re.sub(word, '', text) also
    # strips it from inside longer words ("new" -> "newark" becomes "ark")
    # and treats the word as a regular expression.
    tokens = a.split(' ')
    for words, sub in rel_syns:
        tokens = [tok for tok in tokens if tok not in words]
        tokens.append(sub)

    answers = []
    for word in tokens:
        if not word:
            # Left over from leading/trailing spaces; contributes no tokens.
            continue
        for token in nlp(word):
            answers.append([token.text, token.lemma_, token.pos_, token.tag_,
                            token.ent_type_, token.dep_, token.shape_,
                            token.is_alpha])
    return answers

# Smoke test on the purely numeric answer "27" (answers[1]).
process_answer(answers[1])

[['27', '27', 'NUM', 'CD', 'CARDINAL', 'ROOT', 'dd', False]]

In [251]:
# For now we'll focus on a naive approach: questions whose answer is a single
# word, number or named entity (e.g. "New York").

# Each question here is paired by position with the entry of
# one_word_answers at the same index.
one_word_answer_questions = [
    "Where is Albany?",
    "Who was the first president?",
    "Did America ever land on the moon?",
    "Which year did world war 2 begin?"
]

one_word_answers = [
    "New York",
    "George Washington",
    "Yes",
    "1939"
]

# Flatten the trigger words found in every question into one Series
# (a question with several trigger words contributes several entries).
contexts = pd.Series(
    [trigger
     for found in map(find_context, one_word_answer_questions)
     for trigger in found]
)

In [256]:
def build_answer_rows(questions, answers):
    """Return one feature row per answer token, tagged with its question's context.

    ``questions`` and ``answers`` are paired by position. Each row is the
    token feature list produced by ``process_answer`` with the first trigger
    word that ``find_context`` reports for the paired question appended
    (``None`` when the question has no trigger word).
    """
    rows = []
    for question, answer in zip(questions, answers):
        triggers = list(find_context(question))
        context = triggers[0] if triggers else None
        rows.extend(features + [context] for features in process_answer(answer))
    return rows

# Attach the context while the rows are built. Assigning a flat Series of
# contexts afterwards aligns by position only, so a question with two trigger
# words (e.g. "Which year did ...") or an answer with several tokens would
# silently shift every later label.
answer_df = pd.DataFrame(build_answer_rows(one_word_answer_questions, one_word_answers),
                        columns = ["Answer",
                                  "Lemma","POS",
                                  "Tag",
                                  "Entity",
                                  "DEP",
                                  "Shape",
                                  "Is_Alpha",
                                  "Context"]
                        )

In [257]:
answer_df

Unnamed: 0,Answer,Lemma,POS,Tag,Entity,DEP,Shape,Is_Alpha,Context
0,ny,ny,X,LS,,ROOT,xx,True,where
1,washington,washington,PROPN,NNP,GPE,ROOT,xxxx,True,who
2,yes,yes,INTJ,UH,,ROOT,xxx,True,did
3,1939,1939,NUM,CD,,ROOT,dddd,False,which


# Next Steps:

1. Can we successfully train a machine learning model to match each context of a question
with an answer?

To do: Develop a model to match contexts and answers   

Once we can associate certain trigger words with attributes of expected answers,
we can scale that approach to larger questions.
