# Article to Fill in the Blank

### Step 1. Setup
* imports 
* load english language library
* Build list of stopwords.  Stopwords = words that dont add much meaning to a sentence
* Load the text into spaCy NLP

In [1]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation

nlp = spacy.load("en_core_web_sm")

# Build a List of Stopwords
stopwords = list(STOP_WORDS)

text = "The Cleveland Cavaliers have hired University of California, Berkeley womens basketball head coach Lindsay Gottlieb to be an assistant on first-year head coach John Beileins staff, the team announced Wednesday.\n\nIts a historic hire for the NBA, as Gottlieb is the first womens collegiate head coach to be recruited to a teams staff, according to ESPN.\n\nShe is expected to have a role comparable to only two female coaching peers in the league, per ESPN: San Antonio Spurs assistantBecky Hammon and formerSacramento Kings assistant Nancy Lieberman.\n\nThe vision for the Cavs future is compelling and I look forward to helping make it a reality, Gottlieb said in a statement.At the same time, on a personal level, I am honored to hopefully impact young girls and women to be empowered to pursue their own visions and to be inspired to turn them into reality as well.\n\n\n\nGottliebtook the Golden Bearsto the programs first-ever Elite Eight and Final Four in 2013 andled Cal to a combined 179-89 record during hereight years as head coach, including seven NCAA tournament appearances.\n\nBeilein, who was hired in May after 12 seasons as head coach at the University of Michigan, said Gottlieb truly values and embraces player development and a culture of winning basketball habits.\n\nHer success at Cal Berkeley speaks for itself and her insight in our meetings, practices and games will hold tremendous value. After sitting with her, it was easy to see how she will connect quickly with our staff and our players, and we all benefit because of that connection. Im looking forward to merging all of her years of experience and vision for the game with our current and future coaching staff.\n\nThe Cavaliers are coming off one of the worst seasons in franchise history, but they have some young, talented players to build around, as well as the fifth overall pick in the June 20 draft."
text = text.rstrip()

doc = nlp(text)

### Step 2. Rank Sentences 
* rank sentences by word_frequencies (stop words not included)
* take the 5 highest ranked sentences

In [5]:
# Build Word Frequency
# word.text is tokenization in spacy
word_frequencies = {}
for word in doc:
    if word.text not in stopwords:
            if word.text not in word_frequencies.keys():
                word_frequencies[word.text] = 1
            else:
                word_frequencies[word.text] += 1
                
# Maximum Word Frequency
maximum_frequency = max(word_frequencies.values())

for word in word_frequencies.keys():  
        word_frequencies[word] = (word_frequencies[word]/maximum_frequency)

sentence_list = [ sentence for sentence in doc.sents ]

# Sentence Score via comparrng each word with sentence
sentence_scores = {}  
for sent in sentence_list:  
        for word in sent:
            if word.text.lower() in word_frequencies.keys():
                if len(sent.text.split(' ')) < 30:
                    if sent not in sentence_scores.keys():
                        sentence_scores[sent] = word_frequencies[word.text.lower()]
                    else:
                        sentence_scores[sent] += word_frequencies[word.text.lower()]

from heapq import nlargest

summarized_sentences = nlargest(5, sentence_scores, key=sentence_scores.get)

# Step 3. Build Questions
* group words into classes.  PEOPLE ORG etc.
* transform each sentence into its fill in the blank version

In [8]:
options = set()

optionsMap = {}

for entity in doc.ents:
    if(not options.__contains__(entity.text.lower())):
        options.add(entity.text.lower())
        if(entity.label_ in optionsMap):
            optionsMap[entity.label_].append(entity.text)
        else:
            optionsMap[entity.label_] = [entity.text]

final_sentences = [ w.text for w in summarized_sentences ]

outputjson = {}

count = 1
for sentence in final_sentences:
    count+=1
    sent_doc = nlp(sentence)
    isFull = True
    for entity in sent_doc.ents:  
        if(entity.label_ == 'PERSON') and isFull:
            blank = ''
            for count in range(len(entity.text)):
                blank += '_'
            
            fullOptions = optionsMap[entity.label_]
            outputjson[entity.text] = { 'question': sentence.replace(entity.text, blank, 1), 'options': fullOptions}
            isFull = False
    for entity in sent_doc.ents: 
        if(entity.label_ == 'ORG') and isFull:
            blank = ''
            for count in range(len(entity.text)):
                blank += '_'
            fullOptions = optionsMap[entity.label_]
            outputjson[entity.text] = { 'question': sentence.replace(entity.text, blank, 1), 'options': fullOptions}
            isFull = False

import pprint
pprint.pprint(outputjson)

{'Cal Berkeley': {'options': ['The Cleveland Cavaliers',
                              'University of California, Berkeley',
                              'NBA',
                              'ESPN',
                              'Cavs',
                              'NCAA',
                              'the University of Michigan',
                              'Cal Berkeley'],
                  'question': 'Her success at ____________ speaks for itself '
                              'and her insight in our meetings, practices and '
                              'games will hold tremendous value.'},
 'Gottlieb': {'options': ['Lindsay Gottlieb',
                          'John Beileins',
                          'Spurs',
                          'Hammon',
                          'Nancy Lieberman',
                          'Beilein'],
              'question': 'The vision for the Cavs future is compelling and I '
                          'look forward to helping make it a realit