In [1]:
#%%capture
#!pip install wikipedia
#!pip install yake
#!pip install --upgrade ecommercetools
#!pip install pattern
#!pip install textacy

# Import packages
import wikipedia
import re
import yake
import nltk #For some reason, had to uninstall and reinstall
import numpy as np
import pandas as pd
from ecommercetools import seo
#from pattern.text.en import singularize, pluralize #issues here
#For Google Knowledge Graph API
import requests
import urllib
import json
import os
from pattern.text.en import singularize, pluralize
from requests_html import HTML
from requests_html import HTMLSession
#if scraping paragraphs from first few webpages
from bs4 import BeautifulSoup
#For question generation
import spacy
import textacy
#The commented lines below only need run the first time
# !pip install git+https://github.com/uob-vil/pattern.git
# !python -m spacy download en_core_web_sm
# nltk.download('averaged_perceptron_tagger')
# nltk.download('omw-1.4')
# nlp = spacy.load("en_core_web_sm")
pd.set_option('max_colwidth', 400)

In [3]:
def ending_pluralize(noun):
    '''Return most appropriate plural of the input word.'''
    if re.search('[sxz]$', noun):
        return re.sub('$', 'es', noun)
    elif re.search('[^aeioudgkprt]h$', noun):
        return re.sub('$', 'es', noun)
    elif re.search('y$', noun):
        return re.sub('y$', 'ies', noun)
    else:
        return noun

def add_s_pluralize(noun):
    '''Naively add s to end of input word to create plural'''
    return noun + 's'
  
def tidy_input(input):
    '''Take input word and tidy it up to create a list of options.
    
    We have a few different pluralize functions just to account for any
    misspellings online/words created when punctuation removed.
    '''
  
    input_words = input.split()
  
    #Add singular forms of plurals and plural forms of singles 
    singles = [singularize(plural) for plural in input_words]
    plurals1 = [pluralize(single) for single in singles]
    plurals2 = [ending_pluralize(single) for single in singles]
    plurals3 = [add_s_pluralize(single) for single in singles]
    input_words = input_words + singles + plurals1 + plurals2 + plurals3
  
    input_words = input_words + [word.lower() for word in input_words]
    #If you want capitalized words as well
    input_words = input_words + [word[0].upper() + word[1:] for word in input.split()]
    input_words = input_words + [word.upper() for word in input_words]
  
    input_words = list(set(input_words))
    
    return(input_words)
  
#Next few functions sourced from https://practicaldatascience.co.uk/data-science/how-to-access-the-google-knowledge-graph-search-api
  
def get_source(url):
    """Returns the source code for the provided URL. 
  
    Parameters
      ----------
    url (string): URL of the page to scrape.
  
    Returns
    -------
    response (object): HTTP response object from requests_html. 
    """
  
    try:
        session = HTMLSession()
        response = session.get(url)
        return response
  
    except requests.exceptions.RequestException as e:
        print(e)

def get_knowledge_graph(api_key, query):
    """Return a Google Knowledge Graph for a given query.

    Parameters
    ---------- 
    api_key (string): Google Knowledge Graph API key. 
    query (string): Term to search for.

    Returns
    -------
    response (object): Knowledge Graph response object in JSON format.
    """ 

    endpoint = 'https://kgsearch.googleapis.com/v1/entities:search'
    params = {
      'query': query,
      'limit': 10,
      'indent': True,
      'key': api_key,
    }

    url = endpoint + '?' + urllib.parse.urlencode(params)    
    response = get_source(url)

    return json.loads(response.text)
  
def get_knowledge_graph_df(input):
    """
    Uses Google's knowledge graph to generate Pandas DataFrame of entities 
    deemed most similar to input searched. DataFrame includes categorization
    of entity, title, short description and URL (usually to Wikipedia).
    You will need to have set up an API key in Google Cloud Console to get this
    to work (it's free to do and you can do 100k requests a day I believe.)
    https://console.cloud.google.com/apis 
    Parameters
    ----------
    input (string): Final Linkee answer
  
    Returns
    -------
    knowledge_graph_df (Pandas DataFrame): info on Knowledge Graph results
    """
    threshold=0.2
    api_key = os.environ['GOOGLE_LINKEE_KEY']
    knowledge_graph_json = get_knowledge_graph(api_key, input)
    knowledge_graph_df = pd.json_normalize(knowledge_graph_json, record_path='itemListElement')
    return knowledge_graph_df
    #Only using scores if knowledge graph actually returns something
    if len(knowledge_graph_df) > 0:
        max_score = max(knowledge_graph_df['resultScore'])
        knowledge_graph_df = knowledge_graph_df.loc[knowledge_graph_df['resultScore']>threshold*max_score]
        index_match = knowledge_graph_df.index[knowledge_graph_df['result.name'] == input]
        if len(index_match) == 1:
            n = index_match[0]
            knowledge_graph_df = pd.concat([knowledge_graph_df.iloc[[n],:], knowledge_graph_df.drop(n, axis=0)], axis=0)
            knowledge_graph_df.reset_index(inplace = True, drop = True )
    return knowledge_graph_df
  
def classify_input(knowledge_graph_df):
    """Classify the input word/phrase as a certain category 
    to improve search results. Acts as failsafe if initial search
    of input fails.
  
    Parameters
    ----------
    knowledge_graph_df: Return of get_knowledge_graph_df
  
    Returns
    -------
    category (string): Category of input
    
    """
    if "SportsTeam" in knowledge_graph_df['result.@type'][0]:
        entity_tags = knowledge_graph_df['result.@type'][1]
    else:
        entity_tags = knowledge_graph_df['result.@type'][0]
    
    if ("Movie" in entity_tags) or ("MovieSeries" in entity_tags):
        category = "Movie"
    elif ("TVEpisode" in entity_tags) or ("TVSeries" in entity_tags):
        category = "TV"
    elif ("VideoGame" in entity_tags) or ("VideoGameSeries" in entity_tags):
        category = "VideoGame"
    elif ("Book" in entity_tags) or ("BookSeries" in entity_tags):
        category = "Book"
    elif "Person" in entity_tags:
        category = "Person"
    elif ("MusicAlbum" in entity_tags) or ("MusicGroup" in entity_tags) or ("MusicRecording" in entity_tags):
        category = "Music"
    elif ("Place" in entity_tags) or ("AdministrativeArea" in entity_tags):
        category = "Place" 
    else:
        category = "Thing"
  
    return(category)
  
def tailored_search(category, input):
    """Change the search to get better keywords for input, based on its category
  
    Parameters
    ----------
    category (string): Category of input
    input (string): Final Linkee answer
  
    Returns
    -------
    search_input (string): Search term to use to find keywords
    
    """
    if category == "Movie" or category == "TV" or category == "Book":
        search_input = input + " " + category + " information"
    elif category == "Place":
        search_input = input + " location"
    else:
        search_input = input
    return(search_input)
  
def collect_urls(knowledge_graph_df):
    """Collect the urls from the knowledge graph to give more options to scrape 
    from.
  
    Parameters
    ----------
    knowledge_graph_df: Return of get_knowledge_graph_df
  
    Returns
    -------
    list of urls (string): urls found
    
    """
    if 'result.detailedDescription.url' in knowledge_graph_df.columns:
        knowledge_graph_df = knowledge_graph_df[knowledge_graph_df['result.detailedDescription.url'].notna()]
        urlList = knowledge_graph_df['result.detailedDescription.url'].tolist()
    else:
        urlList = []
    return urlList

def get_wiki_links(urlList):
    '''Extract the URLs linking to Wikipedia from a list of URLs'''
    url_wiki=[urlList[i] for i in range(len(urlList)) if urlList[i].find("wiki")!= -1]
    # if len(url_wiki) == 0:
    #   print('No Urls')
    # if url_wiki:
    return(url_wiki)
  
def get_wiki_text(url_wiki, keep_words=10000):
    '''
    Takes a list of urls and scrapes from Wikipedia links
    if present.
    Parameters
    ----------
    urlList (list) : a list of wikipedia urls
    keep_words (integer): the number of words to keep (approx up to paragraph)
  
    Returns
    -------
    text_comb (string): the text extracted from the paragraphs until word limit 
                          reached
    '''
    text_comb = ''
    total_words = 0
    key_url_terms  = []
    if len(url_wiki)> 1:
        url_terms = [url.split('/wiki/')[1] for url in url_wiki[1:]] 
        # limiting to 3 max by the fact the url should be limited like that anyway
        for s in url_terms:
            s = s.replace("_"," ")
            key_term = s.translate(str.maketrans('', '', string.punctuation))
            key_term = " ".join(key_term.split())
            key_url_terms.append(key_term)
            # print('wiki url', key_url_terms)
        # print('url_wiki',url_wiki)
    for url in url_wiki:
        wiki_term = url.split('/wiki/')[1]
        print(f"Looking at wiki page for: {wiki_term}")
        try:
            text_wiki = (wikipedia.page(wiki_term, auto_suggest = False)).content
        except KeyError: #fullurl errors can be caused by unicode or other symbols
            text_wiki = (wikipedia.page(wiki_term, auto_suggest = True)).content
        #This will drop headers surrounded by ==
        text_wiki = re.sub(r'==.*?==+', '', text_wiki)
        paras = text_wiki.split('\n\n')
        word_count = len(paras[0].split()) #number of words in 1st paragraph
        remaining_words = keep_words - total_words
        j = 0
        text = paras[0]
        while word_count < remaining_words and j<len(paras)-1:
            j += 1
            para_text = paras[j]
            word_count = word_count + len(para_text.split())
            text = text + ' ' + para_text
        #Drop new line /n clutter
        text = text.replace('\n', ' ')
        text = re.sub("\s\s+", " ", text)
        text_comb = text_comb + text # change if want more than one
        total_words = total_words + word_count
        if total_words >= keep_words:
            break  # break out of for loop when we have enough words

    return text_comb, key_url_terms
  
def wiki_autosuggest(input, keep_words = 10000, suggest = False):
    ''' 
    Gets text from Wikipedia using whichever page is found and cleans up the text
  
    Parameters
    ----------
    input (string): original input word
    keep_words (integer): number of words to keep
    suggest (Boolean): suggest = True means use wikipedia autosuggest function,
                       False takes the input as is to find the page
  
    Returns
    -------
    text (string): text that has been cleaned up
               
    '''
    # Get text from single wikipedia page
    try:
        text_wiki = (wikipedia.page(input, auto_suggest = suggest)).content
    except Exception as err:
        print(err.args)
        raise ValueError(f'No urls found for {input}') 
    #if no exception raised clean up text
    #This will drop headers surrounded by ==
    text_wiki = re.sub(r'==.*?==+', '', text_wiki)
    paras = text_wiki.split('\n\n')
    word_count=len(paras[0].split()) #number of words in 1st paragraph
    j=0
    text = paras[0]
    while word_count < keep_words and j<len(paras)-1:
        j += 1
        para_text = paras[j]
        word_count = word_count + len(para_text.split())
        text = text + ' ' + para_text
    #Drop new line /n clutter
    text = text.replace('\n', ' ')
    text = re.sub("\s\s+", " ", text)
    return text
  
def find_text(input, keep_words=10000, cleanup=True, multi_links = True): 
    '''  
    Finds text related to input that can be used for keyword extraction. 
    The text is found in the following order, using wikipedia directly without 
    autosuggest (as autosuggest can sometimes have a weird error, e.g Belfast),
    wikipedia using autosuggest, then tries using knowledge graph to find wiki links
    This function attempts to clean up the relevant text if cleanup is set to True.
  
    Parameters
    ----------
  
    input (string): input word (final answer in Linkee)
    keep_words (integer): the number of words (+to end of paragraph) to keep in text.
    multi_links (Boolean): flag for using more than one wiki page
    text (string): the block of text extracted from wiki
  
    Returns
    -------
  
    key_url_terms (list): the page names of any pages used to add to keywords
    '''
    key_url_terms = []
    try:
        text = wiki_autosuggest(input, keep_words = keep_words, suggest = False)
    except Exception as ex:
        print(f"failed with {ex}")
        try:
            text = wiki_autosuggest(input, keep_words = keep_words, suggest = True)
        except Exception as e:
            print(f"failed with {e}")
            knowledge_graph_df = get_knowledge_graph_df(input)
            if len(knowledge_graph_df) == 0:
                print("No valid keyword")
                # print("nothing found using knowledge graph, trying wiki")
                return "No valid keyword", []
                #keyword vs 
            else:
                # Try to get wiki pages from knowledge graph
                urlList = collect_urls(knowledge_graph_df)
                url_wiki = get_wiki_links(urlList)
                if len(url_wiki) >= 1:
                    keep = min(len(url_wiki), 3)
                    url_wiki = url_wiki[0:keep]
                    if multi_links == False:
                        url_wiki = url_wiki[0:1]
                        text, key_url_terms = get_wiki_text(url_wiki, keep_words)
                else: 
                    #Use the knowledge graph categories to find wikipedia url
                    print(" No wiki urls: 1st pass")
                    category = classify_input(knowledge_graph_df)
                    search_input = tailored_search(category, input)
                    print(f"Searching for urls with input {search_input}")
                    urlList = collect_urls(get_knowledge_graph_df(search_input))
                    url_wiki = get_wiki_links(urlList)
                    if len(url_wiki) >= 1:
                        keep = min(len(url_wiki), 3)
                        url_wiki = url_wiki[0:keep]
                        if multi_links == False:
                            url_wiki = url_wiki[0:1]
                        text, key_url_terms  = get_wiki_text(url_wiki, keep_words)
                    else:
                        print("no wiki pages found")
                        return "No valid keyword", []
                        # print(" No wiki urls: 2nd pass")
                        # text = wiki_autosuggest(input)
    
  
    #Text Cleaning
    text = re.sub(r"\'", '', text) #Get rid of \'
    text = re.sub(r"\\xa0...", '', text) #Get rid of \\xa0...
    text = re.sub(r"\\n", ' ', text) #Get rid of \\n
    text = re.sub(r"\\u200e", ' ', text) #Get rid of \\u200e
    text = re.sub(r"U S ", "US ", text)
    text = re.sub(r"logo", '', text)
    text = re.sub(r"[Vv]iew \d+ more rows", '', text) #Get rid of [Vv]iew \d+ more rows
    text = re.sub(r"\d+ hours ago", '', text)
    #Remove things like "2009.Power" - no space after full stop
    rx = r"\.(?=[A-Za-z])"
    text = re.sub(rx, ". ", text)
    if cleanup == True:
        text = re.sub(r"[\"\'\“\”\[\]\)\(\•\▽\❖\†]+", '', text)
        text = re.sub(r"[-·—,.;:@#?!$+-]+", ' ', text) 
    
    text = ' '.join(text.split()) #Single spacing
  
    return text, key_url_terms

def keyword_extract(text, ngram_size):
    '''Extract keywords/phrases of ngram_size using YAKE'''
    #Initialise extractor
    kw_extractor = yake.KeywordExtractor()
    language = "en"
    max_ngram_size = ngram_size
    deduplication_threshold = 0.3
    numOfKeywords = 100
    custom_kw_extractor = yake.KeywordExtractor(lan=language, 
                                                n=max_ngram_size, 
                                                dedupLim=deduplication_threshold, 
                                                top=numOfKeywords, features=None)
    
    #Run extractor on text and get out words/phrases
    yake_output = custom_kw_extractor.extract_keywords(text)
    words, scores = zip(*yake_output)
    words = list(words)
    scores = list(scores)
    words = [re.sub(r"[,.;@#?!$]+", ' ', i) for i in words]
    return(words,scores)
  
def answer_keyword_compare(keywords_list, input_words):
    '''Remove candidate keywords that contain input words'''
  
    keywords_list = [x for x in keywords_list if not any(i in input_words for i in x.split())]
    return keywords_list
  
def remove_non_noun_full_keywords(keywords_list):
    '''
    Only retain keywords/keyphrases that are proper nouns.
    '''
    pos = nltk.pos_tag(keywords_list)
    new_keyword_list = []
    for ii in np.arange(0,len(pos),1):
        if pos[ii][1]=='NNP':
            new_keyword_list.append(pos[ii][0])
        if pos[ii][1]=='NNPS':
            new_keyword_list.append(pos[ii][0])
    return new_keyword_list
  
def select_keywords(words2):
    '''
    Selects the keywords/phrases to use for question generation. Ensures that 
    keyword phrases do not overlap each other.
    '''
    words3 = []
    words3.append(words2[0])
    del words2[0]
    for i in range(len(words2)):
        #If at any point, we only have 4 candidate keywords left, use them all
        if len(words2) + len(words3) <= 4:
            words3 = words3 + words2
            break
        test_words = words2[0].lower().split()
        singles = [singularize(plural) for plural in test_words]
        plurals1 = [pluralize(single) for single in singles]
        plurals2 = [ending_pluralize(single) for single in singles]
        plurals3 = [add_s_pluralize(single) for single in singles]
        test_words = list(set(test_words + singles + plurals1 + plurals2 + plurals3))
        previous_words = words3.copy()
        previous_words = [word for phrase in previous_words for word in phrase.split()]
        previous_words = [x.lower() for x in previous_words]
        if len(test_words) + len(previous_words) == len(list(set(test_words + previous_words))):
            words3.append(words2[0])
        del words2[0]
    return(words3)

In [4]:
get_knowledge_graph_df("France")

Unnamed: 0,resultScore,@type,result.@id,result.image.url,result.image.contentUrl,result.name,result.url,result.description,result.detailedDescription.url,result.detailedDescription.license,result.detailedDescription.articleBody,result.@type
0,4592.87207,EntitySearchResult,kg:/m/01l3vx,https://commons.wikimedia.org/wiki/File:France_national_football_team_2018.jpg,https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcQAdbRZZPTHrhz24xj3Xdzf6RGl3MOdTQJFwoBfz8eI0mFJYhiJ,France national football team,http://www.fff.fr/bleus/,Football team,https://en.wikipedia.org/wiki/France_national_football_team,https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License,"The France national football team represents France in men's international football and is controlled by the French Football Federation, also known as FFF. The team's colours are blue, white, and red, and the coq gaulois its symbol. France are colloquially known as Les Bleus. They are the reigning world champions, having won the most recent World Cup final in 2018.\n","[Thing, SportsTeam]"
1,2954.216553,EntitySearchResult,kg:/m/044hxl,https://commons.wikimedia.org/wiki/File:Ligue1.svg,https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcSP_jf42o1P3eB4PidA5AV6ZurulDZyEUV5ES6xLOCqB3_1dArr,Ligue 1,http://www.ligue1.com/,Football league,https://en.wikipedia.org/wiki/Ligue_1,https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License,"Ligue 1, officially known as Ligue 1 Uber Eats for sponsorship reasons, is a French professional league for men's association football clubs. At the top of the French football league system, it is the country's primary football competition.","[SportsOrganization, Organization, Corporation, Thing]"
2,2823.790039,EntitySearchResult,kg:/m/04y8tkw,https://commons.wikimedia.org/wiki/File:Leroy_Merlin.svg,https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcTBzsi3jT4FGBK_CDRu9ysOk4Nfm7XETAE_cSPRsdmsITXcLkLL,Leroy Merlin,http://www.leroymerlin.com/,Retail company,https://en.wikipedia.org/wiki/Leroy_Merlin,https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License,"Leroy Merlin is a French headquartered home improvement and gardening retailer serving several countries in Europe, Asia, South America, and Africa.","[Organization, Thing, Corporation]"
3,2259.376709,EntitySearchResult,kg:/m/0jd05,https://fr.m.wikipedia.org/wiki/Fichier:Orange_logo.svg,https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcRkC7L1ooTL5B_MXf04rHuYIGcVJk_H2WzBCUS84PxC96qgSbpM,Orange S.A.,http://www.orange.com,Telecom company,https://en.wikipedia.org/wiki/Orange_S.A.,https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License,"Orange S.A., rebranded as Orange, formerly France Télécom S.A., stylized as france telecom, is a French multinational telecommunications corporation. It has 266 million customers worldwide and employs 89,000 people in France, and 59,000 elsewhere.","[Organization, Thing, Corporation]"
4,2098.62793,EntitySearchResult,kg:/m/0h7k5,https://commons.wikimedia.org/wiki/File:Air_France_logo_(1970%27s-2008).svg,https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcSkQIkHn6SO7PF9BTu-WG-YZK8Q8XsGbuyY0U3yN7waKPInlIbt,Air France,http://www.airfrance.com,Air carrier,https://en.wikipedia.org/wiki/Air_France,https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License,"Air France, stylised as AIRFRANCE, is the flag carrier of France headquartered in Tremblay-en-France. It is a subsidiary of the Air France–KLM Group and a founding member of the SkyTeam global airline alliance.","[Organization, Corporation, Thing, Airline]"
5,1986.733521,EntitySearchResult,kg:/m/028vbm,https://fr.m.wikipedia.org/wiki/Fichier:Caa-com_rvb.png,https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcRVAwirPPbMKnvDJTO-iSGkX9MDdTbWrW3vT5Exg2S9s0aMgW_l,Crédit Agricole,http://www.credit-agricole.com/,Bank,https://en.wikipedia.org/wiki/Cr%C3%A9dit_Agricole,https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License,"Crédit Agricole Group, sometimes called La banque verte due to its historical ties to farming, is a French international banking group and the world's largest cooperative financial institution.","[Organization, Thing, Corporation]"
6,1700.724854,EntitySearchResult,kg:/g/122044ns,https://commons.wikimedia.org/wiki/File:Logo_Maisons_du_Monde_FR.png,https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcTIfaTLFxMfJDJ2fQV6aHV8T9ntL7y2M9cPG2k2kGbaB6WvSNYM,Maisons du Monde,,Company,https://en.wikipedia.org/wiki/Maisons_du_Monde,https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License,"Maisons du Monde is a French furniture and home decor company founded in Brest in 1996 by Xavier Marie. At the end of 2015 it had nearly 250 stores across France, Italy, Spain, Luxembourg, Belgium, Germany and in Switzerland, of which more than 180 are in France.","[Organization, Thing, Corporation]"
7,1591.487793,EntitySearchResult,kg:/m/03y4ty,,,Électricité de France,http://www.edf.com,Nuclear electric power generation company,https://en.wikipedia.org/wiki/%C3%89lectricit%C3%A9_de_France,https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License,"Électricité de France S.A., commonly known as EDF, is a French multinational electric utility company, largely owned by the French state.","[Organization, Thing, Corporation]"
8,1367.850952,EntitySearchResult,kg:/m/054yn8,https://pl.wikipedia.org/wiki/Plik:G%C5%82og%C3%B3w-Castorama.jpg,https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRw4QkUYiO2XK6Ljq3jR2FQyeds78fiUsIkPhSBJtXhMV9gLb_d,Castorama,http://www.castorama.fr/store/,Retail company,https://en.wikipedia.org/wiki/Castorama,https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License,"Castorama is a French retailer of DIY and home improvement tools and supplies, headquartered in Templemars, France, and is part of the British group Kingfisher plc, which has 101 stores in France and 90 in Poland.","[Organization, Place, Thing, Corporation]"
9,1352.480469,EntitySearchResult,kg:/g/11nxplyjt9,https://en.wikipedia.org/wiki/Miss_France_2022,https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcTZsq_7Ew1owg3tGnlSyoieYAhqU1YDeI3awloPpuzZUWRoZyw8,Miss France 2022,,Competition,https://en.wikipedia.org/wiki/Miss_France_2022,https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License,"Miss France 2022 was the 92nd edition of the Miss France pageant. The competition was held on 11 December 2021 at the Zénith de Caen in Caen, Normandy.","[TouristAttraction, Event, Thing]"


In [7]:
def linkee_keywords(input):
    """
    Main pipeline function which takes input and generates list of keywords using 
    wikipedia scraping and NLP.
  
    Parameters
    ---------
    input (string): the input word which is the final Linkee answer
  
    Returns
    -------
    final_keywords (list): the list of possible keywords
    """
    answer_list = tidy_input(input)
  
    #Keyword extraction
    text, key_url_terms = find_text(input)
    
    #Potentially here combine 1-gram, 2-gram, 3-gram results
    words_df = pd.DataFrame()
    words = (keyword_extract(text, 2))[0] #+ (keyword_extract(text, 1))[0] + (keyword_extract(text, 3))[0]
    scores = (keyword_extract(text, 2))[1] #+ (keyword_extract(text, 1))[1] + (keyword_extract(text, 3))[1]
    words_df['words'] = words
    words_df['scores'] = scores
    words_df.sort_values(by=['scores'],ascending=False)
    words_df = words_df[:100]
  
    # Adding wikipedia page names to the words found at the top
    words = key_url_terms + words
  
    #Cleaning up returned keywords
    words2 = answer_keyword_compare(words, answer_list)
  
    words2 = remove_non_noun_full_keywords(words2)
  
    final_keywords = select_keywords(words2)
  
    return(final_keywords)

In [19]:
#Cleanup function to clean up the fact at the end
def cleanup(s):
    """ Cleans up string by removing certain characters """
    strip_refs = re.compile("\.?\[\d+\]?")
    s = strip_refs.sub("", s).strip()
    
    # Pretty ugly
    if s[-1] == ".":
        s = s[0:-1]
        
    return s

def fill_in_blank_q_generate(final_input, input, facts = 1):
    #Might break if only 1 fact possible and not 2 - needs fixed
    """
    Generate a fill in the blank style question using 1 or 2 
    facts about the input.

    Parameters
    ----------

    final_input (string): original input (final answer) for card
    input (string): answer/keyword which we want to generate question for
    facts (integer 1/2): allows to keep 1 or 2 facts for each answer

    Returns
    -------
    question ("string"): the fill in the blank question
    num_statements (integer): the number of statements found corresponding to the 
                            input
    """
    final_input = tidy_input(final_input)

    if (facts < 1) or (facts > 2):
        return("Invalid entry. Please specify if you want 1 or 2 facts.")
    page_name = input

    #Get Wikipedia page and text
    #using knowledge graph method
    text, key_url = find_text(page_name, keep_words= 100000, cleanup = False, multi_links = False)
    print(page_name)
    # text = (wikipedia.page(page_name,auto_suggest=False)).content 
    text = re.sub(r'==.*?==+', '', text)
    #text = text.replace('\n', '')
    text = text.replace('\n', ' ')
    text = re.sub("\s\s+", " ", text)

    #Get category of input to better search for entity
    category = classify_input(get_knowledge_graph_df(input))
    #If person, just look for surname when trying to find facts
    if category == "Person":
        page_entity = input.split()[-1]
    else:
        page_entity = input
    #Above is fine

    doc = nlp(text)
    #Set up empty array to hold facts
    uniqueStatements = []
    #cue: Verb lemma with which ``entity`` is associated (e.g. "be", "have", "say").
    #Potentially use pronouns for the entity as well and combine results
    for cue in ["be", "have", "say", "do", "win", "write", "talk", "talk about", "born", "receive", "make", "continue", "find"]:
        statements = textacy.extract.semistructured_statements(doclike = doc, entity = page_entity, cue = cue)
        for statement in statements:
            entity, verb, fact = statement
            factlist = [str(word) for word in fact]
            fact = cleanup(str(fact))

            #Removing statements where target word/phrase appears in fact
            # if re.search(page_name, fact):
            #   continue
      
            # if len(answer_keyword_compare(factlist, page_name)) != \
            #    len(factlist):
            #    print(f'removed: {factlist} due to page_name {page_name} ')
            #if fact has answer word in it ignore
            #  continue
            if len(answer_keyword_compare(factlist, final_input)) != len(factlist):
                #if fact has final answer in it ignore 
                print(f'removed: {factlist} due to final_input {final_input} ')
                continue
            #Remove statements that are too long - more than 35 words long
            if len(fact.split()) > 35:
                continue
            elif len(fact.split()) < 5:
                continue
            # statement = f"{page_name} {verb} {fact}"
            statement = f"{verb} {fact}"
            #More cleanup on fact
            statement = re.sub(r"[\[\]\•\▽\❖\†]+", '', statement)
            statement = statement.replace(', ', ' ')
            statement = statement.replace(' , ', ', ')
            statement = statement.replace(' ( ', ' (')
            statement = statement.replace(' )', ')')
            statement = statement.replace(" 's", "'s")
            statement = statement.replace(" - ", "-")
            statement = statement.replace("\'s", "'s")
            statement = statement.replace(page_name, '______')
            statement = f"{page_name} {statement}"

            uniqueStatements.append(statement)
    num_statements = len(uniqueStatements)

    #If it can't find any facts, should we try splitting up the input in
    #a different manner - at moment, just telling us this is happening
    if len(uniqueStatements) == 0:
        return("No facts found for input.", num_statements)

    #Ensure code doesn't break if 2 facts are asked for but not available
    if len(uniqueStatements) == 1 and facts == 2:
        print('Only one fact available for answer.')
        facts = 1

    #Good tags for finding facts are numbers, proper nouns,
    #foreign words and comparative/superlative adjectives/adverbs
    good_tags = ['CD', 'FW', 'JJR', 'JJS', 'NNP', 'NNPS', 'RBR', 'RBRS']
    tag_count = []
    for i in range(len(uniqueStatements)):
        tag_tuples = nltk.pos_tag(uniqueStatements[i].split())
        tags = [x[1] for x in tag_tuples]
        #Adding a small weight for statement length to prioritise longer facts 
        #which should have more info
        tag_count.append(sum(x in good_tags for x in tags) + 0.3*len(tags))

    #Returning a sorted DataFrame of all the questions to be able to view
    df = pd.DataFrame(list(zip(uniqueStatements, tag_count)),
                      columns =['Statement', 'Count'])
    df = df.sort_values(by = 'Count', ascending = False)
#     return(df)

    #Get sorted array of indexes containing facts in ascending order of good tags
    #In case of a tie, this arrays puts the lower numbered index first
    sorted_count = sorted(range(len(tag_count)), key=lambda k: tag_count[k])

    #Use 2 facts with most good tags in them

    fact1 = uniqueStatements[sorted_count[-1]]
    if facts == 2:
        fact2 = uniqueStatements[sorted_count[-2]]

    #Calculate how many letters are in the answer we are blanking
    page_name_words = page_name
    words = page_name_words.split()
    letters_per_word = [len(w) for w in words]

    fact1 = fact1.replace(page_name, '______ ' + str(letters_per_word))
    if category == "Person" and facts == 2:
        fact2 = fact2.replace(page_name, 'This person')
    elif category != "Person" and facts == 2:
        fact2 = fact2.replace(page_name, 'It')

    if facts == 1:
        question = str('Fill in the blank: ') + fact1 + str('.')
    elif facts == 2:
        question = str('Fill in the blank: ') + fact1 + str('. ') + fact2 + str('.')

    return(question, num_statements)

In [20]:
def generate_card(input):
    '''Takes input and generates 4 question answer pairs'''
    keywords = linkee_keywords(input)
    answers = [] #Empty list to add answers we have questions for
    questions = []
    for answer in keywords:
        try:
            question, num_statements = fill_in_blank_q_generate(input, answer, facts = 2)
        #except PageError: 
        except: 
            print(answer + ' does not have a Wikipedia page')
            continue

        if question == 'No facts found for input.':
            print(answer + ' does not have facts')
            continue
        else:
            print(answer + ' does have facts')
            answers.append(answer)
            questions.append(question)
  
    for i in range(len(answers)):
        questions = [re.sub(answers[i], f"[keyword {i+1}]", qus) for qus in questions]
    print(answers,"; ",questions)
    return(answers, questions)

In [21]:
answers, questions = generate_card('Tom Hanks')

Academy Award
Academy Award does not have facts
Golden Globe
Golden Globe does not have facts
('Saving Private',)
failed with No urls found for Saving Private
Saving Private
Saving Private does not have facts
American actor
American actor does not have facts
Forrest Gump
removed: ['six', 'Academy', 'Awards', ':', 'Best', 'Picture', ',', 'Best', 'Director', ',', 'Best', 'Actor', 'for', 'Hanks', ',', 'Best', 'Adapted', 'Screenplay', ',', 'Best', 'Visual', 'Effects', ',', 'and', 'Best', 'Film', 'Editing'] due to final_input ['TOM', 'HANK', 'tom', 'toms', 'TOMS', 'Tom', 'hanks', 'Hank', 'Hanks', 'Toms', 'hank', 'HANKS'] 
Forrest Gump does have facts
('Lucky Guy', ['Lucky Guy (play)', 'Lucky Guy (musical)', '"Lucky Guy" (Dieter Bohlen song)', '"Lucky Guy" (Kim Hyun-joong song)', 'The Lucky Guy'])
failed with No urls found for Lucky Guy
Lucky Guy
Only one fact available for answer.
Lucky Guy does have facts
North America
North America does have facts
Beautiful Day
Beautiful Day does have fac

Looking at wiki page for: Ring_(jewellery)
Band
Band does not have facts
Spielberg
Spielberg does have facts
('Keatons alcoholic',)
failed with No urls found for Keatons alcoholic
('keating alcohol',)
failed with No urls found for Keatons alcoholic
No valid keyword
Keatons alcoholic
Keatons alcoholic does not have a Wikipedia page
Toy
Toy does not have facts
Emmy
Emmy does not have facts
HBO
HBO does have facts
['Forrest Gump', 'Lucky Guy', 'North America', 'Beautiful Day', 'Robert Langdon', 'Sheriff Woody', 'Night Live', 'Cloud Atlas', 'Kennedy Center', 'Joe Wright', 'Premier League', 'Apollo program', 'Tennessee Williams', 'Prime Minister', 'Chabot College', 'War', 'Kathleen Quinlan', 'Frank Abagnale', 'Extremely Loud', 'Moon', 'July', 'Spielberg', 'HBO'] ;  ['Fill in the blank: ______ [7, 4] is a 1994 American comedy-drama film directed by Robert Zemeckis and written by Eric Roth. It won Best Picture, Best Actor.', 'Fill in the blank: ______ [5, 3] is a play by Nora Ephron that prem

In [22]:
answers, questions

(['Forrest Gump',
  'Lucky Guy',
  'North America',
  'Beautiful Day',
  'Robert Langdon',
  'Sheriff Woody',
  'Night Live',
  'Cloud Atlas',
  'Kennedy Center',
  'Joe Wright',
  'Premier League',
  'Apollo program',
  'Tennessee Williams',
  'Prime Minister',
  'Chabot College',
  'War',
  'Kathleen Quinlan',
  'Frank Abagnale',
  'Extremely Loud',
  'Moon',
  'July',
  'Spielberg',
  'HBO'],
 ['Fill in the blank: ______ [7, 4] is a 1994 American comedy-drama film directed by Robert Zemeckis and written by Eric Roth. It won Best Picture, Best Actor.',
  'Fill in the blank: ______ [5, 3] is a play by Nora Ephron that premiered in 2013, the year after her death.',
  'Fill in the blank: ______ [5, 7] is the fourth most populous continent after Asia, Africa, and Europe. It is a very large continent that surpasses the Arctic Circle ,.',
  'Fill in the blank: ______ [9, 3] was a different song called " Always ", which was later released as a B-side. It is a song by Irish rock band U2.',
 

In [16]:
linkee_keywords('Walt Disney')

['World War',
 'Kansas City',
 'Gram Studio',
 'York Times',
 'American Broadcasting',
 'Los Angeles',
 'Achievement Awards',
 'Silly Symphony',
 'Mary Poppins',
 'Animated Short',
 'Chicago Academy',
 'Tomorrow EPCOT',
 'Experimental Prototype',
 'Company ABC',
 'Winter Olympics',
 'Golden Globe',
 'Jungle Book',
 'Happiest Millionaire',
 'Park School',
 'Pinocchio Fantasia',
 'Museum records',
 'Iwerks',
 'Bambi Dumbo',
 'Roy started',
 'Neal Gabler',
 'South America',
 'year theyre',
 'Thailands Order',
 'Pato Donald',
 'Construction work',
 'Virginia Davis',
 'Red Cross',
 'Mickey',
 'Rudolf Ising',
 'Carolwood Pacific',
 'Der König',
 'Playwright Robert',
 'Floyd Norman',
 'Arts colloquially',
 'Tripp Avenue',
 'Aztec Eagle',
 'Alfonso Cuaron',
 'Kenneth Branagh',
 'Lucky Rabbit',
 'Annette Funicello',
 'Stephan Jungk',
 'Movie Database',
 'White',
 'Kimball argues',
 'Laugh',
 'July',
 'Elias']

In [24]:
get_knowledge_graph('Tom Hank)

{'error': {'code': 400,
  'message': 'API key not valid. Please pass a valid API key.',
  'status': 'INVALID_ARGUMENT',
  'details': [{'@type': 'type.googleapis.com/google.rpc.ErrorInfo',
    'reason': 'API_KEY_INVALID',
    'domain': 'googleapis.com',
    'metadata': {'service': 'kgsearch.googleapis.com'}}]}}

In [38]:
fill_in_blank_q_generate('Tom Hanks', 'Disney', facts = 2)

past here Disney


Unnamed: 0,Statement,Count
0,Disney continued to produce cartoons with Mickey Mouse and other characters,6.3
1,Disney continued to focus its talents on television throughout the 1950s,4.3


In [35]:
pd.set_option('max_colwidth', 400)

In [33]:
question

'Statement'

In [27]:
get_knowledge_graph_df('Walt Disney')

Unnamed: 0,resultScore,@type,result.url,result.detailedDescription.license,result.detailedDescription.url,result.detailedDescription.articleBody,result.image.contentUrl,result.image.url,result.@type,result.@id,result.name,result.description
0,31381.007812,EntitySearchResult,http://disney.com,https://en.wikipedia.org/wiki/Wikipedia:Text_o...,https://en.wikipedia.org/wiki/The_Walt_Disney_...,"The Walt Disney Company, commonly known as Dis...",https://encrypted-tbn2.gstatic.com/images?q=tb...,https://ar.m.wikipedia.org/wiki/%D9%85%D9%84%D...,"[Corporation, Organization, Thing, TheaterGroup]",kg:/m/09b3v,The Walt Disney Company,Entertainment company
1,15199.707031,EntitySearchResult,http://www.waltdisney.com,https://en.wikipedia.org/wiki/Wikipedia:Text_o...,https://en.wikipedia.org/wiki/Walt_Disney,Walter Elias Disney was an American entreprene...,https://encrypted-tbn1.gstatic.com/images?q=tb...,https://commons.wikimedia.org/wiki/File:Walt_D...,"[Thing, Person]",kg:/m/081nh,Walt Disney,American entrepreneur
2,9252.871094,EntitySearchResult,http://movies.disney.com/the-lion-king,https://en.wikipedia.org/wiki/Wikipedia:Text_o...,https://en.wikipedia.org/wiki/The_Lion_King,The Lion King is a 1994 American animated musi...,,,"[CreativeWork, Movie, Thing]",kg:/m/0m63c,The Lion King,1994 film
3,6824.843262,EntitySearchResult,http://movies.disney.com/the-little-mermaid,https://en.wikipedia.org/wiki/Wikipedia:Text_o...,https://en.wikipedia.org/wiki/The_Little_Merma...,The Little Mermaid is a 1989 American animated...,https://encrypted-tbn2.gstatic.com/images?q=tb...,https://commons.wikimedia.org/wiki/File:The_Li...,"[CreativeWork, Movie, Thing]",kg:/m/01ry_x,The Little Mermaid,1989 film
4,6777.649902,EntitySearchResult,,https://en.wikipedia.org/wiki/Wikipedia:Text_o...,https://en.wikipedia.org/wiki/Hayao_Miyazaki,"Hayao Miyazaki is a Japanese animator, directo...",https://encrypted-tbn0.gstatic.com/images?q=tb...,https://commons.wikimedia.org/wiki/File:Hayao_...,"[Thing, Person]",kg:/m/0534v,Hayao Miyazaki,Japanese animation director
5,5211.307617,EntitySearchResult,,https://en.wikipedia.org/wiki/Wikipedia:Text_o...,https://en.wikipedia.org/wiki/Lady_and_the_Tramp,Lady and the Tramp is a 1955 American animated...,,,"[CreativeWork, Movie, Thing]",kg:/m/01q3w7,Lady and the Tramp,1955 film
6,5121.74707,EntitySearchResult,http://movies.disney.com/cinderella,https://en.wikipedia.org/wiki/Wikipedia:Text_o...,https://en.wikipedia.org/wiki/Cinderella_(1950...,Cinderella is a 1950 American animated musical...,https://encrypted-tbn2.gstatic.com/images?q=tb...,https://es.wikipedia.org/wiki/Archivo:1950_is_...,"[CreativeWork, Movie, Thing]",kg:/m/023p33,Cinderella,1950 film
7,3643.347412,EntitySearchResult,http://movies.disney.com/aladdin,https://en.wikipedia.org/wiki/Wikipedia:Text_o...,https://en.wikipedia.org/wiki/Aladdin_(1992_Di...,Aladdin is a 1992 American animated musical fa...,,,"[CreativeWork, Movie, Thing]",kg:/m/0jnwx,Aladdin,1992 film
8,3165.55542,EntitySearchResult,http://www.disneyanimation.com/,https://en.wikipedia.org/wiki/Wikipedia:Text_o...,https://en.wikipedia.org/wiki/Walt_Disney_Anim...,"Walt Disney Animation Studios, sometimes short...",https://encrypted-tbn1.gstatic.com/images?q=tb...,https://commons.wikimedia.org/wiki/File:Walt_D...,"[Organization, Thing, Corporation]",kg:/m/04rcl7,Walt Disney Animation Studios,Animation company
9,3002.821777,EntitySearchResult,http://movies.disney.com/,https://en.wikipedia.org/wiki/Wikipedia:Text_o...,https://en.wikipedia.org/wiki/Walt_Disney_Pict...,Walt Disney Pictures is an American film produ...,https://encrypted-tbn3.gstatic.com/images?q=tb...,https://ar.m.wikipedia.org/wiki/%D9%85%D9%84%D...,"[Organization, Thing, Corporation]",kg:/m/01795t,Walt Disney Pictures,Film studio
