In [1]:
import numpy as np
import pandas as pd

## Words

In [2]:
import spacy

nlp = spacy.load('en_core_web_sm')

def addWordsForNewParagraph(newWords, text):
    doc = nlp(text)

    neStarts = getNEStartIndexes(doc)
    sentenceStarts = getSentenceStartIndexes(doc)
    
    i = 0
    
    while(i<len(doc)):
        #If the token is a start of a Named Entity, add it and push to index to end of the NE
        if (i in neStarts):
            word = neStarts[i]
            currentSentence = getSentenceForWordPosition(word.start, sentenceStarts)
            wordLength = word.end - word.start
            shape = ''
            for wordIndex in range(word.start, word.end):
                shape += (' ' + doc[wordIndex].shape_)
                
            newWords.append([word.text, 0, 0, currentSentence, wordLength, word.label_, None, None, None, shape])
            
            i = neStarts[i].end - 1
        else:
            #If not a NE, add the word if it's not a stopword or a non-alpha (not regular letters)
            if (doc[i].is_stop == False and doc[i].is_alpha == True):
                word = doc[i]
                currentSentence = getSentenceForWordPosition(i, sentenceStarts)
                wordLength = 1
                newWords.append([word.text, 0, 0, currentSentence, wordLength, None, word.pos_, word.tag_, word.dep_, word.shape_])
        i += 1

def addWordsForParagraph(df, newWords, titleId, paragraphId):
    text = df['data'][titleId]['paragraphs'][paragraphId]['context']
    qas = df['data'][titleId]['paragraphs'][paragraphId]['qas']
    
    doc = nlp(text)
    
    answers = extractAnswers(qas, doc)
    neStarts = getNEStartIndexes(doc)
    sentenceStarts = getSentenceStartIndexes(doc)
    
    i = 0
    
    while(i<len(doc)):
        #If the token is a start of a Named Entity, add it and push to index to end of the NE
        if (i in neStarts):
            word = neStarts[i]
            currentSentence = getSentenceForWordPosition(word.start, sentenceStarts)
            wordLength = word.end - word.start
            shape = ''
            for wordIndex in range(word.start, word.end):
                shape += (' ' + doc[wordIndex].shape_)
                
            newWords.append([word.text, tokenIsAnswer(word.text, currentSentence, answers), titleId, paragraphId, currentSentence, wordLength, word.label_, None, None, None, shape])
            
            i = neStarts[i].end - 1
        else:
            #If not a NE, add the word if it's not a stopword or a non-alpha (not regular letters)
            if (doc[i].is_stop == False and doc[i].is_alpha == True):
                word = doc[i]
                currentSentence = getSentenceForWordPosition(i, sentenceStarts)
                wordLength = 1
                newWords.append([word.text, tokenIsAnswer(word.text, currentSentence, answers), titleId, paragraphId, currentSentence, wordLength, None, word.pos_, word.tag_, word.dep_, word.shape_])
        i += 1

def getSentenceForWordPosition(wordPos, sentenceStarts):
    for i in range(1, len(sentenceStarts)):
        if (wordPos < sentenceStarts[i]):
            return i - 1

def getNEStartIndexes(doc):
    neStarts = {}
    for ne in doc.ents:
        neStarts[ne.start] = ne
        
    return neStarts

def getSentenceStartIndexes(doc):
    sentenceStarts = []
    
    for sentence in doc.sents:
        sentenceStarts.append(sentence[0].i)
    
    return sentenceStarts

def extractAnswers(qas, doc):
    answers = []
    
    sentenceStart = 0
    sentenceId = 0
    
    for sentence in doc.sents:
        sentenceLength = len(sentence.text)
        
        for answer in qas:
            answerStart = answer['answers'][0]['answer_start']
            if (answerStart >= sentenceStart and answerStart < (sentenceStart + sentenceLength)):
                answers.append({'sentenceId': sentenceId, 'text': answer['answers'][0]['text']})
                
        sentenceStart += sentenceLength
        sentenceId += 1
                    
    return answers

def tokenIsAnswer(token, sentenceId, answers):
    for i in range(len(answers)):
        if (answers[i]['sentenceId'] == sentenceId):
            if (answers[i]['text'] == token):
                ret

## Preprocessing

In [3]:
# Predict whether a word is a keyword
def generateDf(text):
    newWords = []
    addWordsForNewParagraph(newWords, text)

    wordColums = ['text', 'titleId', 'paragraphId', 'sentenceId', 'wordCount', 'NER', 'POS', 'TAG', 'DEP','shape']
    df = pd.DataFrame(newWords, columns=wordColums)
    return df

def prepareDf(df):
    wordsDf = encodeAndDropColumns(df)

    #Add missing colums 
    predictorColumns = ['wordCount',
 'NER_CARDINAL',
 'NER_DATE',
 'NER_EVENT',
 'NER_FAC',
 'NER_GPE',
 'NER_LANGUAGE',
 'NER_LAW',
 'NER_LOC',
 'NER_MONEY',
 'NER_NORP',
 'NER_ORDINAL',
 'NER_ORG',
 'NER_PERCENT',
 'NER_PERSON',
 'NER_PRODUCT',
 'NER_QUANTITY',
 'NER_TIME',
 'NER_WORK_OF_ART',
 'POS_ADJ',
 'POS_ADP',
 'POS_ADV',
 'POS_CCONJ',
 'POS_DET',
 'POS_INTJ',
 'POS_NOUN',
 'POS_NUM',
 'POS_PART',
 'POS_PRON',
 'POS_PROPN',
 'POS_PUNCT',
 'POS_SCONJ',
 'POS_SYM',
 'POS_VERB',
 'POS_X',
 "TAG_''",
 'TAG_,',
 'TAG_.',
 'TAG_ADD',
 'TAG_AFX',
 'TAG_CC',
 'TAG_CD',
 'TAG_DT',
 'TAG_EX',
 'TAG_FW',
 'TAG_IN',
 'TAG_JJ',
 'TAG_JJR',
 'TAG_JJS',
 'TAG_LS',
 'TAG_MD',
 'TAG_NN',
 'TAG_NNP',
 'TAG_NNPS',
 'TAG_NNS',
 'TAG_PDT',
 'TAG_POS',
 'TAG_PRP',
 'TAG_PRP$',
 'TAG_RB',
 'TAG_RBR',
 'TAG_RBS',
 'TAG_RP',
 'TAG_SYM',
 'TAG_TO',
 'TAG_UH',
 'TAG_VB',
 'TAG_VBD',
 'TAG_VBG',
 'TAG_VBN',
 'TAG_VBP',
 'TAG_VBZ',
 'TAG_WDT',
 'TAG_WP',
 'TAG_WRB',
 'TAG_XX',
 'DEP_ROOT',
 'DEP_acl',
 'DEP_acomp',
 'DEP_advcl',
 'DEP_advmod',
 'DEP_agent',
 'DEP_amod',
 'DEP_appos',
 'DEP_attr',
 'DEP_aux',
 'DEP_auxpass',
 'DEP_cc',
 'DEP_ccomp',
 'DEP_compound',
 'DEP_conj',
 'DEP_csubj',
 'DEP_csubjpass',
 'DEP_dative',
 'DEP_dep',
 'DEP_det',
 'DEP_dobj',
 'DEP_intj',
 'DEP_mark',
 'DEP_meta',
 'DEP_neg',
 'DEP_nmod',
 'DEP_npadvmod',
 'DEP_nsubj',
 'DEP_nsubjpass',
 'DEP_nummod',
 'DEP_oprd',
 'DEP_parataxis',
 'DEP_pcomp',
 'DEP_pobj',
 'DEP_poss',
 'DEP_predet',
 'DEP_prep',
 'DEP_prt',
 'DEP_punct',
 'DEP_quantmod',
 'DEP_relcl',
 'DEP_xcomp']

    for feature in predictorColumns:
        if feature not in wordsDf.columns:
            wordsDf[feature] = 0

    return wordsDf

def oneHotEncodeColumns(df):
    columnsToEncode = ['NER', 'POS', "TAG", 'DEP']

    for column in columnsToEncode:
        one_hot = pd.get_dummies(df[column])
        one_hot = one_hot.add_prefix(column + '_')

        df = df.drop(column, axis = 1)
        df = df.join(one_hot)

    return df

def encodeAndDropColumns(df):
    # One hot encoding
    wordsDf = oneHotEncodeColumns(df)

    #Drop unused columns
    columnsToDrop = ['text', 'titleId', 'paragraphId', 'sentenceId', 'shape']
    wordsDf = wordsDf.drop(columnsToDrop, axis = 1)

    return wordsDf

## Predictor

In [4]:
import pickle
from pathlib import Path

def pickle_exists(filename):
    file = Path('../models/' + filename + '.pkl')
    if file.is_file():
        return True
    return False

def save_model(model, filename):
    pickle.dump(model, open('../models/' + filename + '.pkl', 'wb'))
    
def load_model(filename):
    return pickle.load(open('../models/' + filename + '.pkl', 'rb'))

def predictWords(wordsDf, df, model):
    predictor = load_model(model)

    y_pred = predictor.predict(wordsDf)
    
    labeledAnswers = []
    for i in range(len(y_pred)):
        labeledAnswers.append({'word': df.iloc[i]['text'], 'prob': y_pred[i]})
    
    return labeledAnswers

## Question Generator

In [5]:
def addQuestions(answers, text):
    doc = nlp(text)
    currentAnswerIndex = 0
    qaPair = []

    #Check wheter each token is the next answer
    for sent in doc.sents:
        for token in sent:
            
            #If all the answers have been found, stop looking
            if currentAnswerIndex >= len(answers):
                break
            
            #In the case where the answer is consisted of more than one token, check the following tokens as well.
            answerDoc = nlp(answers[currentAnswerIndex]['word'])
            answerIsFound = True
            
            for j in range(len(answerDoc)):
                if token.i + j >= len(doc) or doc[token.i + j].text != answerDoc[j].text:
                    answerIsFound = False
           
            #If the current token is corresponding with the answer, add it 
            if answerIsFound:
                question = blankAnswer(token.i, token.i + len(answerDoc) - 1, sent.start, sent.end, doc)
                
                qaPair.append({'question' : question, 'answer': answers[currentAnswerIndex]['word'], 'prob': answers[currentAnswerIndex]['prob']})
                
                currentAnswerIndex += 1
                
    return qaPair


def blankAnswer(firstTokenIndex, lastTokenIndex, sentStart, sentEnd, doc):
    leftPartStart = doc[sentStart].idx
    leftPartEnd = doc[firstTokenIndex].idx
    rightPartStart = doc[lastTokenIndex].idx + len(doc[lastTokenIndex])
    rightPartEnd = doc[sentEnd - 1].idx + len(doc[sentEnd - 1])
    
    question = doc.text[leftPartStart:leftPartEnd] + '_____' + doc.text[rightPartStart:rightPartEnd]
    
    return question

def sortAnswers(qaPairs):
    orderedQaPairs = sorted(qaPairs, key=lambda qaPair: qaPair['prob'])
    return orderedQaPairs

## Distractor Generator

In [6]:
import gensim
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors

glove_file = '../data/embeddings/glove.6B.300d.txt'
tmp_file = '../data/embeddings/word2vec-glove.6B.300d.txt'

from gensim.scripts.glove2word2vec import glove2word2vec
glove2word2vec(glove_file, tmp_file)
model = KeyedVectors.load_word2vec_format(tmp_file)

def generate_distractors(answer, count):
    answer = str.lower(answer)

    ##Extracting closest words for the answer. 
    try:
        closestWords = model.most_similar(positive=[answer], topn=count)
    except:
        #In case the word is not in the vocabulary, or other problem not loading embeddings
        return []

    #Return count many distractors
    distractors = list(map(lambda x: x[0], closestWords))[0:count]
    
    return distractors

def addDistractors(qaPairs, count):
    for qaPair in qaPairs:
        distractors = generate_distractors(qaPair['answer'], count)
        qaPair['distractors'] = distractors
    
    return qaPairs

## Sentence Ranker

In [7]:
import re
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
nltk.download('stopwords')

stop_words = stopwords.words('english')

def rearrangeByRank(text):
    sentences = sent_tokenize(text)
    ranked_sentences = getRankedSentences(sentences)
    rearrangedByRank = []
    for sentence in ranked_sentences[:10]:
        rearrangedByRank.append(sentence[1])
    return ' '.join(rearrangedByRank)

def cleanSentences(sentences):
    # remove punctuations, numbers and special characters
    clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ")

    # make alphabets lowercase
    clean_sentences = [s.lower() for s in clean_sentences]

    return [remove_stopwords(r.split()) for r in clean_sentences]

# function to remove stopwords
def remove_stopwords(sentence):
    sen_new = " ".join([i for i in sentence if i not in stop_words])
    return sen_new


def extractWordVectors():
    word_embeddings = {}
    f = open('../data/embeddings/glove.6B.300d.txt', encoding='utf-8')
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs
    f.close()
    return word_embeddings

def getRankedSentences(sentences):
    sim_mat = getSimilarityMatrix(sentences)
    nx_graph = nx.from_numpy_array(sim_mat)
    scores = nx.pagerank(nx_graph)
    return sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)

def getSimilarityMatrix(sentences):
    clean_sentences = cleanSentences(sentences)
    sentence_vectors = getSentenceVectors(clean_sentences)
    sim_mat = np.zeros([len(sentences), len(sentences)])
    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i != j:
                sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,100), sentence_vectors[j].reshape(1,100))[0,0]
    return sim_mat

def getSentenceVectors(clean_sentences):
    word_embeddings = extractWordVectors()
    sentence_vectors = []
    for i in clean_sentences:
        if len(i) != 0:
            v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()])/(len(i.split())+0.001)
        else:
            v = np.zeros((100,))
        sentence_vectors.append(v)
    return sentence_vectors

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kshitijbajracharya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kshitijbajracharya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Train

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

squadPath = '../data/squad-v1/'
datasetFile = '../data/squad-v1/squad.csv'

def getProcessedSquad():
    df = getDf(squadPath)

    file = Path(datasetFile)

    if file.is_file():
        print("File exists, loading from file")
        squad = pd.read_csv(datasetFile)
    else:
        print ("File doesn't exist. Creating...")
        
        words = []
        wordColumns = ['text', 'isAnswer', 'titleId', 'paragraphId', 'sentenceId', 'wordCount', 'NER', 'POS', 'TAG', 'DEP', 'shape']
        titlesCount = len(df['data'])
        
        count = 0
        for titleId in range(titlesCount):
            paragraphsCount = len(df['data'][titleId]['paragraphs'])
            for paragraphId in range(paragraphsCount):
                wd.addWordsForParagraph(df, words, titleId, paragraphId)
                count += 1
                if (count%1000 == 0):
                    print(count)
                    
        squad = pd.DataFrame(words, columns=wordColumns)
        squad.to_csv(datasetFile, index=False)
        print('100% done and written to file')

    return squad

def getDf(path):
    train = pd.read_json(squadPath + 'train-v1.1.json', orient='column')
    dev = pd.read_json(squadPath + 'dev-v1.1.json', orient='column')

    return pd.concat([train, dev], ignore_index=True)

def start(modelFile, classifier):
    if pickle_exists(modelFile):
        print('\nPickle already exists. Loading from file ' + modelFile + '.pkl')
        model = load_model(modelFile)
    else:
        print('\nNo pickle found. Training model and saving to file ' + modelFile + '.pkl')

        df = getProcessedSquad()
        df = pp.encodeAndDropColumns(df)

        x_data = df.drop(labels=['isAnswer'], axis=1)
        y_data = df['isAnswer']
        x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=4)

        model = classifier.fit(x_train, y_train)
        save_model(model, modelFile)

        y_pred = model.predict(x_test)
        correctCount = (y_test == y_pred).sum()
        print('Correctly guessed:', '{:.2f}%'.format((correctCount / len(y_test)) * 100))

start('gaussian_naive_bayes', GaussianNB())
start('logistic_regression', LogisticRegression())


Pickle already exists. Loading from file gaussian_naive_bayes.pkl

Pickle already exists. Loading from file logistic_regression.pkl


## Generate Questions

In [9]:
models = ['gaussian_naive_bayes', 'logistic_regression']

In [10]:
def generateQuestions(text, count):
    # Extract words
    text = rearrangeByRank(text)
    df = generateDf(text)
    wordsDf = prepareDf(df)

    for model in models:
        print("\n#####################################\n")
        print(model)
        print("\n#####################################\n")

        # Predict
        labeledAnswers = predictWords(wordsDf, df, model)

        # Transform questions
        qaPairs = addQuestions(labeledAnswers, text)

        # Pick the best questions
        orderedQaPairs = sortAnswers(qaPairs)

        # Generate distractors
        questions = addDistractors(orderedQaPairs[:count], 3)

        # Print
        print('Text:')
        print(text + '\n')
        for i in range(count):
            print('Question ' + str(i + 1) + ':')
            print(questions[i]['question'])

            print('Answer:')
            print(questions[i]['answer'])
            
            print('Incorrect answers:')
            for distractor in questions[i]['distractors']:
                print(distractor)
            
            print()

In [11]:
text = "Oxygen is a chemical element with symbol O and atomic number 8. It is a member of the chalcogen group on the periodic table, a highly reactive nonmetal, and an oxidizing agent that readily forms oxides with most elements as well as with other compounds. By mass, oxygen is the third-most abundant element in the universe, after hydrogen and helium. At standard temperature and pressure, two atoms of the element bind to form dioxygen, a colorless and odorless diatomic gas with the formula O2. Diatomic oxygen gas constitutes 20.8% of the Earth's atmosphere. As compounds including oxides, the element makes up almost half of the Earth's crust."

generateQuestions(text, 10)


#####################################

gaussian_naive_bayes

#####################################

Text:
At standard temperature and pressure, two atoms of the element bind to form dioxygen, a colorless and odorless diatomic gas with the formula O2. By mass, oxygen is the third-most abundant element in the universe, after hydrogen and helium. As compounds including oxides, the element makes up almost half of the Earth's crust. Oxygen is a chemical element with symbol O and atomic number 8. It is a member of the chalcogen group on the periodic table, a highly reactive nonmetal, and an oxidizing agent that readily forms oxides with most elements as well as with other compounds. Diatomic oxygen gas constitutes 20.8% of the Earth's atmosphere.

Question 1:
At _____ temperature and pressure, two atoms of the element bind to form dioxygen, a colorless and odorless diatomic gas with the formula O2.
Answer:
standard
Incorrect answers:
standards
basic
system

Question 2:
At standard _____ and

In [12]:
text2 = "Summary Changes of state are examples of phase changes, or phase transitions. All phase changes are accompanied by changes in the energy of a system. Changes from a more-ordered state to a less-ordered state (such as a liquid to a gas) areendothermic. Changes from a less-ordered state to a more-ordered state (such as a liquid to a solid) are always exothermic. The conversion of a solid to a liquid is called fusion (or melting). The energy required to melt 1 mol of a substance is its enthalpy of fusion (\u0394Hfus). The energy change required to vaporize 1 mol of a substance is the enthalpy of vaporization (\u0394Hvap). The direct conversion of a solid to a gas is sublimation. The amount of energy needed to sublime 1 mol of a substance is its enthalpy of sublimation (\u0394Hsub) and is the sum of the enthalpies of fusion and vaporization. Plots of the temperature of a substance versus heat added or versus heating time at a constant rate of heating are calledheating curves. Heating curves relate temperature changes to phase transitions. A superheated liquid, a liquid at a temperature and pressure at which it should be a gas, is not stable. A cooling curve is not exactly the reverse of the heating curve because many liquids do not freeze at the expected temperature. Instead, they form a supercooled liquid, a metastable liquid phase that exists below the normal melting point. Supercooled liquids usually crystallize on standing, or adding a seed crystal of the same or another substance can induce crystallization."

generateQuestions(text2, 10)


#####################################

gaussian_naive_bayes

#####################################

Text:
Instead, they form a supercooled liquid, a metastable liquid phase that exists below the normal melting point. A cooling curve is not exactly the reverse of the heating curve because many liquids do not freeze at the expected temperature. The conversion of a solid to a liquid is called fusion (or melting). The energy required to melt 1 mol of a substance is its enthalpy of fusion (ΔHfus). Plots of the temperature of a substance versus heat added or versus heating time at a constant rate of heating are calledheating curves. All phase changes are accompanied by changes in the energy of a system. A superheated liquid, a liquid at a temperature and pressure at which it should be a gas, is not stable. Heating curves relate temperature changes to phase transitions. Changes from a less-ordered state to a more-ordered state (such as a liquid to a solid) are always exothermic. The direct c

In [13]:
text3 = "One way to keep iron from corroding is to keep it painted. The layer of paint prevents the water and oxygen necessary for rust formation from coming into contact with the iron. As long as the paint remains intact, the iron is protected from corrosion. Other strategies include alloying the iron with other metals. For example, stainless steel is mostly iron with a bit of chromium. The chromium tends to collect near the surface, where it forms an oxide layer that protects the iron. Zinc-plated or galvanized iron uses a different strategy. Zinc is more easily oxidized than iron because zinc has a lower reduction potential. Since zinc has a lower reduction potential, it is a more active metal. Thus, even if the zinc coating is scratched, the zinc will still oxidize before the iron. This suggests that this approach should work with other active metals. Another important way to protect metal is to make it the cathode in a galvanic cell. This is cathodic protection and can be used for metals other than just iron. For example, the rusting of underground iron storage tanks and pipes can be prevented or greatly reduced by connecting them to a more active metal such as zinc or magnesium (Figure 17.18). This is also used to protect the metal parts in water heaters. The more active metals (lower reduction potential) are called sacrificial anodes because as they get used up as they corrode (oxidize) at the anode. The metal being protected serves as the cathode, and so does not oxidize (corrode). When the anodes are properly monitored and periodically replaced, the useful lifetime of the iron storage tank can be greatly extended."

generateQuestions(text3, 10)


#####################################

gaussian_naive_bayes

#####################################

Text:
For example, the rusting of underground iron storage tanks and pipes can be prevented or greatly reduced by connecting them to a more active metal such as zinc or magnesium (Figure 17.18). The more active metals (lower reduction potential) are called sacrificial anodes because as they get used up as they corrode (oxidize) at the anode. Thus, even if the zinc coating is scratched, the zinc will still oxidize before the iron. Since zinc has a lower reduction potential, it is a more active metal. Zinc-plated or galvanized iron uses a different strategy. This is also used to protect the metal parts in water heaters. For example, stainless steel is mostly iron with a bit of chromium. The layer of paint prevents the water and oxygen necessary for rust formation from coming into contact with the iron. The chromium tends to collect near the surface, where it forms an oxide layer that prot