In [47]:
import regex as re
from pathlib import Path
import spacy
from spacy import displacy
import pandas as pd
import numpy as np
import coreferee
from sentence_transformers import SentenceTransformer, util
import getopt
import sys
import pandas as pd
from datasets import load_dataset
import json 
import regex as re
from tqdm import tqdm
import nltk




This is the universal dependency for English: https://universaldependencies.org/en/dep/index.html

### Loading Model

In [48]:
nlp = spacy.load('en_core_web_md')
nlp.add_pipe("merge_entities")
nlp.add_pipe("merge_noun_chunks")
nlp.add_pipe('coreferee')
model = SentenceTransformer("all-MiniLM-L6-v2")

doc = """
Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi's Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the golden anniversary with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as Super Bowl L), so that the logo could prominently feature the Arabic numerals 50.
"""
# doc = "Farah took a trophy. Farah win a cake."



### resolve coreferences

In [49]:

def resolve_coreference(text):
    doc = nlp(text)
    doc_list = list(doc)
    doc._.coref_chains.print()
    resolving_indecies = []
    for _,item in enumerate(doc._.coref_chains):
        resolving_indecies.extend(item)
        
    for word in resolving_indecies:
        new_word = ""
        for index in word:
            if doc[index]._.coref_chains.resolve(doc[index]) is not None:
                temp = []
                for item in doc._.coref_chains.resolve(doc[index]):
                    temp.append(str(item))
                new_word = ", ".join(temp)
            
                doc_list[index] = new_word

    final_doc = []
    for item in doc_list:
        final_doc.append(str(item))
    return " ".join(final_doc)
        

### Preprocess the document

In [50]:
def preprocess_context(doc):
    text = doc.strip()
    text.replace(".", ",")
    resolved_text = resolve_coreference(text)
    resolved_text = resolved_text.strip()
    resolved_text = resolved_text.replace("  ", " ").replace(" ,", ",").replace(" .", ".").replace("\n", "")
    return resolved_text

In [51]:
resolved_doc = preprocess_context(doc)
cleaned_doc = nlp(resolved_doc)
sentences = [one_sentence.text.strip() for one_sentence in cleaned_doc.sents]




In [88]:
# get all verbs in the sentences
verbs = set()
for sentence in sentences:
    doc = nlp(sentence)
    for token in doc:
        if token.pos_ == "VERB" or token.pos_ == "AUX":
            verbs.add(token.lemma_)
            
verbs
# verbs2 = ["earn"]
# # loop on sentences and get the sentences that have verbs woth lemmas in those [earn, have, suspend]
# sentences_with_verbs = []
# for sentence in sentences:
#     doc = nlp(sentence)
#     for token in doc:
#         if token.lemma_ in verbs2:
#             sentences_with_verbs.append(sentence)
#             break
        
# sentences_with_verbs
# [earn, have, suspend]


{'be',
 'could',
 'defeat',
 'determine',
 'earn',
 'emphasize',
 'feature',
 'have',
 'know',
 'name',
 'play',
 'suspend',
 'would'}

In [53]:
excludesPerQuestionType = {
    "when": "Times",
    "where": "Locations",
    "who": "Subject",
    "what": "Objects",
    "how": "States"
}

In [253]:
question = "What color was used to emphasize the 50th anniversary of the Super Bowl?" 
# answer = "He talked to him to secure the account."
answer = "The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title."
answer = "As this was the 50th Super Bowl, the league emphasized the golden anniversary with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals ( under which the game would have been known as Super Bowl L ), so that the logo could prominently feature the Arabic numerals 50."
answer = "Aly is happy because Ziad is good."
ans_nlp = nlp(answer)
splitted_question = question.split(" ")
question_type = splitted_question[0].lower()
question_nlp = nlp(question)
if question_nlp[0].ent_type_ == "DATE":
    question_type = "when"

question_type

'what'

In [254]:
for token in question_nlp:
    print(token.text, '-',token.pos_,'-', token.dep_,'-', token.ent_type_)

    
displacy.render(question_nlp, style="dep")

What color - NOUN - nsubjpass - 
was - AUX - auxpass - 
used - VERB - ROOT - 
to - PART - aux - 
emphasize - VERB - xcomp - 
the 50th anniversary - NOUN - dobj - 
of - ADP - prep - 
the Super Bowl - PROPN - pobj - EVENT
? - PUNCT - punct - 


In [255]:
for token in ans_nlp:
    
    print(token.text, '-',token.pos_,'-', token.dep_,'-', token.ent_type_)
        

displacy.render(ans_nlp, style="dep")

Aly - PROPN - nsubj - PERSON
is - AUX - ROOT - 
happy - ADJ - acomp - 
because - SCONJ - mark - 
Ziad - PROPN - nsubj - PERSON
is - AUX - advcl - 
good - ADJ - acomp - 
. - PUNCT - punct - 


### Extracting the entities

In [258]:
def extract_subjects(sentence):
    subjects = {}
    
    for token in sentence:
        if token.pos_ == "VERB" or token.pos_ == "AUX":
            subjectFlag = False
            verb = token
            for child in token.children:
                if child.dep_ in ("nsubj", "csubj"):
                    subtree_tokens = [str(t) for t in child.subtree]
                    subjects[token] = " ".join(subtree_tokens)
                    subjectFlag = True
                elif child.dep_ == "nsubjpass":
                    for child in verb.children:
                        if child.dep_ == "agent" and len(list(child.children)) > 0:
                            subject = [str(t) for t in list(child.children)[0].subtree]
                            subject = " ".join(subject)
                            break
                        else:
                            subject = "Unknown"
                    subjects[verb] = subject
                    subjectFlag = True
            if not subjectFlag:  # didn't find a normal subject
                if token.dep_ == "relcl":
                    subject = str(token.head)
                    subjects[token] = subject  # should get the subtree of the subject
                elif token.dep_ in ("advcl", "conj"):
                    verb = token.head
                    subjects[token] = subjects[verb]
                elif token.dep_ == "xcomp":
                    verb = token.head
                    subjects[token] = subjects[verb]
                    for child in verb.subtree:
                        if child.dep_ in ("dobj", "dative", "pobj"):
                            subtree_tokens = [str(t) for t in child.subtree]
                            subjects[token] = " ".join(subtree_tokens)
                            break
                        
                    
                    
    return [(v, k) for k, v in subjects.items()]
                        
                            
# def extract_subjects(sentence):
#     subjects = []
#     for token in sentence:
#         if token.dep_ in ("nsubj","csubj", "nsubjpass","relcl"):
#             if token.dep_ == "relcl":
#                 # The head of the token with the "relcl" dependency is the noun that the relative clause is modifying
#                 subject = str(token.head)
#                 verb = token
#                 subjects.append((subject, verb))
#             elif token.dep_ == "nsubjpass":
#                 verb = token.head
#                 for child in verb.children:
#                     if child.dep_ == "agent" and len(list(child.children)) > 0:
#                         subject = [str(t) for t in list(child.children)[0].subtree]
#                         subject = " ".join(subject)
#                         break
#                     else:
#                         subject = "Unknown"
#                 subjects.append((subject, verb))
#             else:                       
#                 subtree_tokens = [str(t) for t in token.subtree]
#                 verb = token.head
#                 subjects.append((" ".join(subtree_tokens), verb))
#     return subjects

def extract_objects(sentence):
    objects = []
    for token in sentence:
        if token.dep_ in ("dobj", "dative", "attr", "oprd", "acomp","ccomp", "xcomp", "nsubjpass"):
            subtree_tokens = [str(t) for t in token.subtree]
            verb = token.head
            objects.append((" ".join(subtree_tokens), verb))
    return objects

def extract_state(sentence):
    states = []
    for token in sentence:
        if token.pos_ =="VERB" or token.pos_ == "AUX":
            for child in token.children:
                if child.dep_ == "prep":
                    subtree_tokens = [str(t) for t in child.subtree]
                    states.append(((" ".join(subtree_tokens), token)))
    return states

def extract_time(sentence):
    times = {}
    for token in sentence:
        if token.pos_ == "VERB" or token.pos_ == "AUX":
            for child in token.subtree:
                if child.ent_type_ == "DATE" or child.ent_type_ == "TIME":
                    times[child.text] = token
    return list(times.items())

def extract_location(sentence):
    locations = {}
    for token in sentence:
        if token.pos_ == "VERB" or token.pos_ == "AUX":
            for child in token.subtree:
                if child.ent_type_ in ("GPE", "LOC", "FAC"):
                    locations[child.text] = token
    return list(locations.items())
                    

def extract_facts(sentence):
    sentence = nlp(sentence)
    states = extract_state(sentence)
    subjects = extract_subjects(sentence)
    objects = extract_objects(sentence)
    times = extract_time(sentence)
    locations = extract_location(sentence)
    print("Subjects: ",subjects, objects)
    print("states: " ,states, "times: ", times,"locations: ", locations)
    
    facts = pd.DataFrame(columns=["Subject", "Relation", "Objects", "States", "Times", "Locations"])
    
    for subject in subjects: #(Ziad, is), (Aly, is)
        verb = subject[1].lemma_
        currentSubject = subject[0]
        if verb in facts["Relation"].values:
            facts.loc[facts["Relation"] == verb, "Subject"] = currentSubject
        else:
            new_row = pd.DataFrame([{"Subject": currentSubject, "Relation": verb, "Objects": [], "States": [], "Times": [], "Locations": []}])
            facts = pd.concat([facts, new_row], ignore_index=True)
    for obj in objects:
        verb = obj[1].lemma_
        currentObj = obj[0]
        if verb in facts["Relation"].values:
            oldObjects = list(facts.loc[facts["Relation"] == verb, "Objects"].values[0])
            oldObjects.append(currentObj)
            facts.loc[facts["Relation"] == verb, "Objects"] = [oldObjects] 
            

    for state in states:
        verb = state[1].lemma_
        currentState = state[0]
        if verb in facts["Relation"].values:
            oldStates = list(facts.loc[facts["Relation"] == verb, "States"].values[0])
            oldStates.append(currentState)
            facts.loc[facts["Relation"] == verb, "States"] = [oldStates]
            
    for time in times:
        verb = time[1].lemma_
        currentTime = time[0]
        if verb in facts["Relation"].values:
            oldTimes = list(facts.loc[facts["Relation"] == verb, "Times"].values[0])
            oldTimes.append(currentTime)
            facts.loc[facts["Relation"] == verb, "Times"] = [oldTimes]
            
    for location in locations:
        verb = location[1].lemma_
        currentLocation = location[0]
        if verb in facts["Relation"].values:
            oldLocations = list(facts.loc[facts["Relation"] == verb, "Locations"].values[0])
            oldLocations.append(currentLocation)
            facts.loc[facts["Relation"] == verb, "Locations"] = [oldLocations]
            
    return facts

In [259]:
extract_facts(answer)

Subjects:  [('Aly', is), ('Ziad', is)] [('happy', is), ('good', is)]
states:  [] times:  [] locations:  []


Unnamed: 0,Subject,Relation,Objects,States,Times,Locations
0,Ziad,be,"[happy, good]",[],[],[]


In [247]:
questionDF = extract_facts(question)

Subjects:  [('Unknown', used), ('the 50th anniversary of the Super Bowl', emphasize)] [('What color', used), ('to emphasize the 50th anniversary of the Super Bowl', used), ('the 50th anniversary of the Super Bowl', emphasize)]
states:  [] times:  [] locations:  []


In [248]:
def join_sentences_facts(sentences):
    all_facts = pd.DataFrame(columns=["Subject", "Relation", "Objects", "States", "Times", "Locations"])
    for sentence in sentences:
        facts = extract_facts(sentence)
        all_facts = pd.concat([all_facts, facts])
    all_facts = all_facts.groupby(["Subject", "Relation"], as_index=False).agg({
        "Objects": lambda x: [item for sublist in x for item in sublist],
        "States": lambda x: [item for sublist in x for item in sublist],
        "Times": lambda x: [item for sublist in x for item in sublist],
        "Locations": lambda x: [item for sublist in x for item in sublist]
    })
    return all_facts

In [249]:
factsDF = join_sentences_facts(sentences)

Subjects:  [('Super Bowl 50', was), ('an American football game', determine)] [('an American football game to determine the champion of the National Football League ( NFL ) for the 2015 season', was), ('the champion of the National Football League ( NFL )', determine)]
states:  [('for the 2015 season', determine)] times:  [('the 2015 season', determine)] locations:  []
Subjects:  [('The American Football Conference (AFC) champion Denver Broncos', defeated), ('The American Football Conference (AFC) champion Denver Broncos', earn)] [('the National Football Conference (NFC) champion Carolina Panthers 24–10', defeated), ('their third Super Bowl title', earn)]
states:  [] times:  [] locations:  []
Subjects:  [('Unknown', played)] [('The game', played)]
states:  [('on February 7, 2016', played), ("at Levi's Stadium in the San Francisco Bay Area at Santa Clara , California", played)] times:  [('February 7, 2016', played)] locations:  [('the San Francisco Bay Area', played), ('Santa Clara', pl

In [250]:
# iterate the dataframe and change the subject to [Subject] and the relation to [Relation]
def change_subject_relation(factsDF):
    for index, row in factsDF.iterrows():
        factsDF.loc[index, "Subject"] = [row['Subject']]
        factsDF.loc[index, "Relation"] = [row['Relation']]
    return factsDF

In [251]:
newFactsDF = change_subject_relation(factsDF)
#save in csv
newFactsDF.to_csv("facts2.csv")

In [252]:
newFactsDF

Unnamed: 0,Subject,Relation,Objects,States,Times,Locations
0,[Super Bowl 50],[be],[an American football game to determine the ch...,[],[],[]
1,[The American Football Conference (AFC) champi...,[defeat],[the National Football Conference (NFC) champi...,[],[],[]
2,[The American Football Conference (AFC) champi...,[earn],[their third Super Bowl title],[],[],[]
3,[Unknown],[know],[the game],"[under which, as Super Bowl L]",[],[]
4,[Unknown],[play],[The game],"[on February 7, 2016, at Levi's Stadium in the...","[February 7, 2016]","[the San Francisco Bay Area, Santa Clara, Cali..."
5,[an American football game],[determine],[the champion of the National Football League ...,[for the 2015 season],[the 2015 season],[]
6,[the league],[emphasize],[the golden anniversary with various gold-them...,[],[],[]
7,[the league],[suspend],[the tradition of naming each Super Bowl game ...,[],[],[]
8,[the logo],[feature],"[the Arabic numerals, 50]",[],[],[]
9,[this],[be],[the 50th Super Bowl],[],[],[]


In [65]:
newQuestionDF = change_subject_relation(questionDF)
newQuestionDF

Unnamed: 0,Subject,Relation,Objects,States,Times,Locations
0,[Unknown],[use],"[What color, to emphasize the 50th anniversary...",[],[],[]


In [66]:
def similarity(factRow, questionRow, column):
    if len(factRow[column]) == 0 or len(questionRow[column]) == 0:
        return 0
    columnString = " ".join(factRow[column])
    questionString = " ".join(questionRow[column])
    embeddingFact = model.encode(columnString)
    embeddingQuestion = model.encode(questionString)
    return util.cos_sim(embeddingFact, embeddingQuestion)

In [67]:
def cost_function(factsDf, questionFact, excludeColumns=[]):
    score = 0
    maxFactIdx = 0
    columnNames = ["Subject","Relation", "Objects", "States", "Times", "Locations"]
    for column in excludeColumns:
        columnNames.remove(column)
    for factIdx, factRow in factsDf.iterrows():
        currScore = 0
        for _, questionRow in questionFact.iterrows():
            if len(factRow[excludeColumns[0]]) == 0:
                continue
            for column in columnNames:
                currScore += similarity(factRow, questionRow, column)
        if currScore > score:
            score = currScore
            maxFactIdx = factIdx
    return maxFactIdx, score
        

In [68]:
correctIdx, _ = cost_function(factsDF, questionDF, excludeColumns=[excludesPerQuestionType[question_type]])
WhenAnswer = factsDF.loc[correctIdx, excludesPerQuestionType[question_type]]
if WhenAnswer == []:
    WhenAnswer = factsDF.loc[correctIdx, "States"]    
" ".join(WhenAnswer)

'The game'

In [69]:
model = SentenceTransformer("all-MiniLM-L6-v2")



In [70]:
emb1 = model.encode("the champion of the National Football League ( NFL )")
emb2 = model.encode("place")

cos_sim = util.cos_sim(emb1, emb2)
print("Cosine-Similarity:", cos_sim.item())

Cosine-Similarity: 0.014753999188542366


In [71]:
nlp.get_pipe("ner").labels

('CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART')