# Paper Correction
This module is to create a model from a key answer and use it to score other answers

In [747]:
import warnings
warnings.filterwarnings("ignore")
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from gensim.models import KeyedVectors
import pytextrank
from rake_nltk import Rake
import re, string
import en_core_web_lg
import time
from math import *

# Initialising preprocessing metrics

In [748]:
lemmatizer = WordNetLemmatizer()
stopw = set(stopwords.words('english'))
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
sp = en_core_web_lg.load()
r = Rake()
textrank = pytextrank.TextRank()
sp.add_pipe(textrank.PipelineComponent, name="textrank", last=True)
model = KeyedVectors.load_word2vec_format("GoogleModel.bin", binary=True)

In [780]:
contraction = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot", 
    "can't've": "cannot have", 
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not", "don't": "do not", "hadn't": "had not", 
    "hadn't've": "had not have", 
    "hasn't": "has not",
    "haven't": "have not", 
    "he'd": "he would", "he'd've": "he would have", "he'll": "he will", 
    "he'll've": "he he will have", "he's": "he is", "how'd": "how did", 
    "how'd'y": "how do you", "how'll": "how will", "how's": "how is", 
    "I'd": "I would", "I'd've": "I would have", "I'll": "I will", 
    "I'll've": "I will have","I'm": "I am", "I've": "I have", 
    "i'd": "i would", "i'd've": "i would have", "i'll": "i will", 
    "i'll've": "i will have","i'm": "i am", "i've": "i have", 
    "isn't": "is not", "it'd": "it would", "it'd've": "it would have", 
    "it'll": "it will", "it'll've": "it will have","it's": "it is", 
    "let's": "let us", "ma'am": "madam", "mayn't": "may not", 
    "might've": "might have","mightn't": "might not","mightn't've": "might not have", 
    "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", 
    "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", 
    "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
    "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", 
    "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", 
    "she's": "she is", "should've": "should have", "shouldn't": "should not", 
    "shouldn't've": "should not have", "so've": "so have","so's": "so as", 
    "this's": "this is",
    "that'd": "that would", "that'd've": "that would have","that's": "that is", 
       "there'd": "there would", "there'd've": "there would have","there's": "there is", 
       "they'd": "they would", "they'd've": "they would have", "they'll": "they will", 
       "they'll've": "they will have", "they're": "they are", "they've": "they have", 
       "to've": "to have", "wasn't": "was not", "we'd": "we would", 
       "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", 
       "we're": "we are", "we've": "we have", "weren't": "were not", 
       "what'll": "what will", "what'll've": "what will have", "what're": "what are", 
       "what's": "what is", "what've": "what have", "when's": "when is", 
       "when've": "when have", "where'd": "where did", "where's": "where is", 
       "where've": "where have", "who'll": "who will", "who'll've": "who will have", 
       "who's": "who is", "who've": "who have", "why's": "why is", 
       "why've": "why have", "will've": "will have", "won't": "will not", 
       "won't've": "will not have", "would've": "would have", "wouldn't": "would not", 
       "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
       "y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
       "you'd": "you would", "you'd've": "you would have", "you'll": "you will", 
       "you'll've": "you will have", "you're": "you are", "you've": "you have"}

def clean(text):
    text = text.lower()
    temp = ""
    for i in text.split():
        try:
            temp+=contraction[i]+' '
        except:
            temp+= i+' '
    text = temp.strip()
    text = text.lower().translate(remove_punctuation_map)
    text = re.sub("[^a-zA-Z#]"," ",text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r",", "", text)
    text = re.sub(r"\.", "", text)
    text = re.sub(r"!", "!", text)
    text = re.sub(r"\/", "", text)
    text = re.sub(r"'", "", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", ":", text)
    text = re.sub(r' +',' ',text)
    return text.strip()

def stopwordremoval(text):
    text = word_tokenize(text)
    text = [i for i in text if i not in stopw]
    return " ".join(text)
        

### Question :
What is machine learning?
### Expected Answer : 
Machine learning (ML) is the scientific study of algorithms and statistical models that computer systems use to perform a specific task without using explicit instructions, relying on patterns and inference instead. It is seen as a subset of artificial intelligence. Machine learning algorithms build a mathematical model based on sample data, known as "training data", in order to make predictions or decisions without being explicitly programmed to perform the task. Machine learning algorithms are used in a wide variety of applications, such as email filtering and computer vision, where it is difficult or infeasible to develop a conventional algorithm for effectively performing the task.

### Alternate
Machine learning is defined as a computer system that performs a specific task without using explicit instructions. Artificial intelligence is a parent of machine learning. We use training data to build a mathematical model using machine learning algorithms. Machine learning algorithms are used in a variety of applications, such as, email filtering and computer vision.

In [849]:
with open("ans.txt", "r") as f:
    test_ans = f.read().strip()
key_ml = '''Machine learning (ML) is the scientific study of algorithms and statistical models that computer systems use to perform a specific task without using explicit instructions, relying on patterns and inference instead. It is seen as a subset of artificial intelligence. Machine learning algorithms build a mathematical model based on sample data, known as "training data", in order to make predictions or decisions without being explicitly programmed to perform the task. Machine learning algorithms are used in a wide variety of applications, such as email filtering and computer vision, where it is difficult or infeasible to develop a conventional algorithm for effectively performing the task.'''
key_mit = '''The mitochondrion is an organelle found in large numbers in most cells, in which the biochemical processes of respiration and energy production occur. It has a double membrane, the inner part being folded inwards to form cristae.'''

In [850]:
start_time = time.time()

# Tokenizing Answer

In [851]:
def pp_set(text, op):
    key_tokenized_sentences = sent_tokenize(text)
    key_tokenized_words = word_tokenize(text)
    if op == "token_sent":
        return key_tokenized_sentences
    elif op == "token_word":
        return key_tokenized_words
    elif op == "clean_sent":
        return [clean(i) for i in key_tokenized_sentences]
    elif op == "clean_word":
        return [clean(i) for i in key_tokenized_words]
    elif op == "lem_sent":
        key_clean_sentences = pp_set(text, "clean_sent")
        return [" ".join([lemmatizer.lemmatize(j) for j in i.split()]) for i in key_clean_sentences]
    elif op == "lem_word":
        key_clean_words = pp_set(text, "clean_word")
        return [lemmatizer.lemmatize(i) for i in key_clean_words]
    elif op == "prep_sent":
        key_clean_sentences = pp_set(text, "clean_sent")
        return [" ".join([i for i in j.split() if i not in stopw]) for j in key_clean_sentences]
    elif op == "prep_word":
        key_preprocessed_sentences = pp_set(text, "prep_sent")
        key_preprocessed_words = []
        for i in key_preprocessed_sentences:
            key_preprocessed_words.extend(word_tokenize(i))
        return key_preprocessed_words
    elif op == "pp_lem_word":
        return [lemmatizer.lemmatize(i) for i in pp_set(text, "prep_word")]

# Semantic Similarity

In [852]:
def avg_sentence_vector(words, model, num_features, index2word_set):
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0

    for word in words:
        if word in index2word_set:
            nwords = nwords+1
            featureVec = np.add(featureVec, model[word])

    if nwords>0:
        featureVec = np.divide(featureVec, nwords)
    return featureVec.reshape(1,-1)

In [853]:
def semantic_sim(key1, key2):
    vec1 = avg_sentence_vector(pp_set(key1,"pp_lem_word"), model, 300, model.index2word)
    vec2 = avg_sentence_vector(pp_set(key2,"pp_lem_word"), model, 300, model.index2word)
    sim1 = cosine_similarity(vec1, vec2)[0][0]
#     sim2 = 1/(1+model.wmdistance(key1, key2))
#     print(sim1, sim2)
    return sim1

# Extracting Keywords

In [854]:
def extract_keywords(text):
    r.extract_keywords_from_sentences(pp_set(text, "lem_sent")) # Effectiveness : tokenized > lemmatized > clean 
    rake_keywords = r.get_ranked_phrases()
    spdoc = sp(text)
    ner_keywords = []
    for ent in spdoc.ents:
        ner_keywords.append(ent.text)
    spdoc = sp(" ".join(pp_set(text, "clean_word")))
    pytr_keywords = []
    for p in spdoc._.phrases:
        for term in p.chunks:
            if term.text not in pytr_keywords and term.text not in stopw:
                x = term.text
                pytr_keywords.append(x)
                
    all_keywords = rake_keywords+pytr_keywords+ner_keywords
    all_keywords = list(set(all_keywords))
    sorted_keywords = list(all_keywords)
    sorted_keywords.sort()
    for i in range(len(sorted_keywords)):
        sorted_keywords[i] = re.sub(r' +',' ',sorted_keywords[i])
    
    return sorted_keywords

## Using spaCy Noun Chunks

In [855]:
# spdoc = sp(" ".join(key_lemmatized_sentences))
# nounchunk_keywords = list(set([i.text for i in set(spdoc.noun_chunks) if i.text not in stopw and len(min(i.text.split(),key = len))>1]))
#nounchunk_keywords = list(set([i.text for i in set(spdoc.noun_chunks) if i.text not in stopw]))
#nounchunk_keywords

## Using N-Grams and Tf-Idf

In [856]:
# bigrams = list(ngrams(key_clean_words,2))
# trigrams = list(ngrams(key_clean_words,3))
# quadgrams = list(ngrams(key_clean_words,4))
# pentagrams = list(ngrams(key_clean_words,5))

In [857]:
# quadgrams = list(set([" ".join([j for j in i if j not in stopw]).strip() for i in quadgrams]))
# quadgrams = [i for i in quadgrams if i not in stopw and i!='' and len(i.split())>1]
#print(quadgrams)

In [858]:
# ngram_vector_key = dict()
# for i in quadgrams:
#     ngram_vector_key[i] = [0 for i in range(len(key_clean_sentences))]
# for i in range(len(key_clean_sentences)):
#     for phrase in ngram_vector_key:
#         ngram_vector_key[phrase][i] = (key_clean_sentences[i].count(phrase)/len(word_tokenize(key_clean_sentences[i])))
#         df = 0
#         for j in key_clean_sentences:
#             if phrase in j:
#                 df+=1
#         ngram_vector_key[phrase][i]*=(1+log((len(key_clean_sentences)+1)/(df+1)))

In [859]:
# ngram_keywords = sorted(ngram_vector_key,key = lambda x:sum(ngram_vector_key[x]),reverse = True)
#ngram_keywords

# Grouping Keywords

In [860]:
def group(sorted_keywords):
    grouped_keys = []
    for i in sorted_keywords:
        if len(grouped_keys)==0:
            grouped_keys.append([i])
            continue
        else:
            flag = False
            for j in grouped_keys:
                if i in j:
                    flag = True
                    break
                temp1 = " ".join([lemmatizer.lemmatize(t) for t in stopwordremoval(i).split()])
                for k in j:
                    temp2 = " ".join([lemmatizer.lemmatize(t) for t in stopwordremoval(k).split()])
                    short = min(temp1,temp2)
                    long = max(temp1,temp2)
                    if short in long:
                        flag = True
                        j.append(i)
                        break
                if flag == True:
                    break            
            if flag==False:
                grouped_keys.append([i])
    temp = []
    for i in grouped_keys:
        k = sorted(i,key = len)
        temp.append(k)
    return temp

In [861]:
grouped_keys = group(extract_keywords(key_ml))
grouped_keys

[['algorithm',
  'conventional algorithm',
  'a conventional algorithm',
  'machine learning algorithm',
  'machine learning algorithms',
  'machine learning algorithm build',
  'algorithms and statistical models'],
 ['a mathematical model', 'mathematical model based'],
 ['a specific task',
  'specific task without using explicit instruction relying'],
 ['subset', 'a subset'],
 ['wide variety', 'a wide variety'],
 ['application', 'applications'],
 ['artificial intelligence'],
 ['computer system use'],
 ['computer vision'],
 ['decisions', 'decision without'],
 ['develop'],
 ['difficult'],
 ['effectively performing'],
 ['email filtering'],
 ['explicit instructions'],
 ['explicitly programmed'],
 ['infeasible'],
 ['inference', 'inference instead'],
 ['machine learning ml'],
 ['make prediction'],
 ['order'],
 ['pattern', 'patterns'],
 ['perform'],
 ['predictions'],
 ['sample data', 'sample data known'],
 ['scientific study', 'the scientific study'],
 ['seen'],
 ['statistical model'],
 ['ta

# Removing Duplicates - longer match

In [862]:
def remove_duplicates(grouped_keys):    
    for i in range(len(grouped_keys)):
        grouped_keys[i] = list(set(grouped_keys[i]))
        temp = list(grouped_keys[i])
        process_set = [" ".join([lemmatizer.lemmatize(l) for l in stopwordremoval(j).split()]) for j in grouped_keys[i]]
        process_set = list(set(process_set))
        for temp_key1 in grouped_keys[i]:
            x = " ".join([lemmatizer.lemmatize(k) for k in stopwordremoval(temp_key1).split()])
            if process_set.count(x)>1:
                temp.remove(temp_key1)   
        grouped_keys[i] = temp
        grouped_keys[i] = sorted(grouped_keys[i])
        
        
    for i in range(len(grouped_keys)):
        temp = list(grouped_keys[i])
        for j in range(len(grouped_keys[i])):
            word = grouped_keys[i][j]
            for k in temp:
                if word in k and word!=k:
                    temp.remove(word)
                    break
        grouped_keys[i] = sorted(temp,key = len, reverse = True)
    grouped_keys = [i for i in grouped_keys if len(i)>0]
    return grouped_keys

In [863]:
grouped_keys = remove_duplicates(grouped_keys)
#grouped_keys

# Flatten Keywords

In [864]:
def finalize(grouped_keys):
    temp_keywords = []
    final_keywords = []
    for i in grouped_keys:
        for j in i:
            temp_keywords.append(j)
    
    temp_keywords = remove_duplicates(group(temp_keywords))
    
    for i in temp_keywords:
        for j in i:
            final_keywords.append(j)
    return final_keywords

In [865]:
final_keywords = finalize(grouped_keys)

In [866]:
#final_keywords

# Rank Keywords??

# Constructing Dictionary Key

In [867]:
def dictionarize(final_keywords, text):
    answer_key = dict()
    sentences = pp_set(text, "token_sent") 
#     print(sentences)
    for i in sentences:
        answer_key[i] = list()
    temp = list(final_keywords)
    for i in range(len(temp)):
        key = " ".join(pp_set(temp[i], "token_word"))
#         print(key)
        for j in answer_key:
            x = j.strip().lower()
            if key in x:
                answer_key[j].append(key)
                final_keywords.remove(temp[i])
                break
    return answer_key

In [868]:
answer_key = dictionarize(final_keywords, key_ml)
#answer_key

# Vectorizer

In [869]:
def vectorize_text(answer_key):
    vector_keys = []
    vector_sent = []
    for i in list(answer_key.keys()):
        vector_sent.append(avg_sentence_vector(pp_set(i, "token_word"), model, 300, model.index2word))
        temp = []
        for j in list(answer_key[i]):
            temp.append(avg_sentence_vector(pp_set(j, "token_word"), model, 300, model.index2word))
        vector_keys.append(temp)
    
    return vector_sent, vector_keys

# Test Answer

In [870]:
# test_ans = """It is seen as a subset of artificial intelligence. Machine learning algorithms build a mathematical model based on sample data, known as "training data", in order to make predictions or decisions without being explicitly programmed to perform the task. Machine learning algorithms are used in a wide variety of applications, such as email filtering and computer vision, where it is difficult or infeasible to develop a conventional algorithm for effectively performing the task."""

In [871]:
kw = extract_keywords(test_ans)
final_kw = finalize(remove_duplicates(group(kw)))
answer_test = dictionarize(final_kw, test_ans)

# Answer dictionaries

In [872]:
group_final = remove_duplicates(group(final_keywords))
new_final = []
for i in group_final:
    if len(i)>1:
        for j in i:
            for k in i:
                if j!=k and j in k:
                    new_final.append(j)
    else:
        new_final.append(i[-1])
#new_final

In [873]:
marks = 0
for i in answer_key:
    marks+=len(answer_key[i])
print(marks)

31


In [890]:
def score(key, test):
    vec_key_sent, vec_key_keys = vectorize_text(key)
    vec_test_sent, vec_test_keys = vectorize_text(test)
    sum = 0
    sims = dict()
    for i in range(len(vec_test_sent)):
#         max_sim = -1
#         max_pair = tuple()
        sims[i] = []
        for j in range(len(vec_key_sent)):
            sim = cosine_similarity(vec_test_sent[i], vec_key_sent[j])
            if sim > 0.7:
#                 max_sim = sim
#                 max_pair = (i,j)
                sims[i].append(j)

    #print(sims)
    count = 0
    for keyidx in sims:
        ans_kw = vec_test_keys[keyidx]
        key_kw = []
        checked = []
        for i in sims[keyidx]:
            key_kw.extend(vec_key_keys[i])
            
        for akw in ans_kw:
            max_sim = -1
            max_kkw = []
            for kkw in key_kw:
                if kkw in checked:
                    continue
                sim = cosine_similarity(kkw, akw)[0][0]
                if sim > max_sim:
                    max_sim = sim
                    max_akw = kkw
            if sim > 0.9:
                sum += 1
#                 count += 1
            else:
                sum += max_sim
#                 count += 1
            checked.append(max_kkw)
    return sum,count                       

In [892]:
test_score, kw_match = score(answer_key, answer_test)
print(test_score)
test_score = test_score/marks*4.0
# print(kw_match)
if (test_score%1)> 0.5:
    rem = 1
else:
    rem = 0
final_score = int(test_score)+rem
print(final_score,"/",4,sep = '')

16.76636290550232
2/4


In [876]:
end_time = time.time()
end_time-start_time

5.662776231765747

In [877]:
answer_key

{'Machine learning (ML) is the scientific study of algorithms and statistical models that computer systems use to perform a specific task without using explicit instructions, relying on patterns and inference instead.': ['algorithms and statistical models',
  'explicit instructions',
  'a specific task',
  'inference instead',
  'patterns',
  'perform',
  'the scientific study',
  'statistical model'],
 'It is seen as a subset of artificial intelligence.': ['a subset',
  'artificial intelligence',
  'seen'],
 'Machine learning algorithms build a mathematical model based on sample data, known as "training data", in order to make predictions or decisions without being explicitly programmed to perform the task.': ['machine learning algorithms',
  'mathematical model based',
  'a mathematical model',
  'decisions',
  'explicitly programmed',
  'make prediction',
  'order',
  'predictions',
  'the task',
  'training data'],
 'Machine learning algorithms are used in a wide variety of applica

In [878]:
answer_test

{'Machine learning is the scientific study of .': ['the scientific study'],
 'F algorithms and statistical models that computer systems use to perform a specific task without using explicit instructions , relying on patterns and reference instead .': ['explicit instructions',
  'a specific task',
  'f algorithms and statistical models',
  'patterns',
  'perform',
  'reference instead',
  'statistical model'],
 'It is seen as a subset of astiffrcial intelligence .': ['a subset',
  'astiffrcial intelligence',
  'seen'],
 'Machine leading algorithms build a mathematical model lugt a sample data known as training .': ['a mathematical model lugt',
  'sample data known',
  'a sample data',
  'machine leading algorithms',
  'training'],
 'Machine learning algorithms ase used in a variety of applications .': ['a variety',
  'algorithms ase',
  'applications'],
 'such as email filtering and computes vision where it is difficult or infeasible to develop a .': ['infeasible',
  'computes vision',
