In [1]:
import re
import math
import spacy
import wikipedia

from collections import Counter
from spacy.matcher import Matcher

In [2]:
nlp = spacy.load('en', disable=['parser', 'ner', 'textcat'])
matched_phrases = []

In [3]:
def collect_sents(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    span = doc[start : end]
    matched_phrases.append(span.lemma_)

In [4]:
def extract_keywords_wikipedia(pagename, num_keywords):
    global matched_phrases
    page = wikipedia.page(pagename)
    pagenlp = nlp(page.content)
    matched_phrases = []
    matches = matcher(pagenlp)
    keywords = dict(Counter(matched_phrases).most_common(100))
    keywords_cvalues = {}
    for keyword in sorted(keywords.keys()):
        parent_terms = list(filter(lambda t: t != keyword and re.match('\\b%s\\b' % keyword, t), keywords.keys()))
        keywords_cvalues[keyword] = keywords[keyword]
        for pt in parent_terms:
            keywords_cvalues[keyword] -= float(keywords[pt])/float(len(parent_terms))
        keywords_cvalues[keyword] *= 1 + math.log(len(keyword.split()), 2)
    best_keywords = []
    for keyword in sorted(keywords_cvalues, key=keywords_cvalues.get, reverse=True)[:num_keywords]:
        best_keywords.append([keyword, keywords_cvalues[keyword]])
    return best_keywords

In [5]:
patterns = [[{'POS': 'NOUN', 'IS_ALPHA': True, 'IS_STOP': False, 'OP': '+'}]]
matcher = Matcher(nlp.vocab)
for pattern in patterns:
    matcher.add('keyword', collect_sents, pattern)

In [6]:
extract_keywords_wikipedia("New York City", 10)

[['city', 232.0],
 ['world', 79.0],
 ['population', 56.0],
 ['area', 52.0],
 ['system', 40.0],
 ['home', 37.0],
 ['park', 35.0],
 ['year', 28.0],
 ['center', 23.0],
 ['immigrant', 23.0]]

In [7]:
extract_keywords_wikipedia("Mad Max", 10)

[['film', 41.0],
 ['car', 17.0],
 ['gang', 10.0],
 ['time', 8.0],
 ['vehicle', 8.0],
 ['week', 8.0],
 ['member', 7.0],
 ['scene', 7.0],
 ['budget', 6.0],
 ['production', 6.0]]

In [10]:
extract_keywords_wikipedia("Asus", 10)

[['pc', 18.0],
 ['company', 17.0],
 ['computer', 17.0],
 ['series', 16.0],
 ['card', 12.0],
 ['monitor', 10.0],
 ['line', 9.0],
 ['product', 9.0],
 ['smartphone', 9.0],
 ['device', 8.0]]