In [0]:
import json
data = json.load(open('/content/drive/My Drive/Information_Retrieval/dev-v1.1_full.json','r'))

In [2]:
from textblob import TextBlob
!python -m textblob.download_corpora

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
Finished.


# **WORD2VEC MODEL**

In [0]:
corpus = []

for l in data['data']:
  for para in l['paragraphs']:
    corpus.append(para['context'])

In [4]:
len(corpus)

2067

In [0]:
from gensim.test.utils import datapath
from gensim import utils

class MyCorpus(object):
    def __iter__(self): 
        preprocessed_text_column = corpus
        for line in preprocessed_text_column:
            # assume there's one document per line, tokens separated by whitespace
            yield utils.simple_preprocess(line)

In [0]:
import gensim.models
sentences = MyCorpus()
skipGramModel = gensim.models.Word2Vec(sentences=sentences,sg = 1,workers=4,iter = 70,batch_words=1,min_count=1)

# **COSINE SIMILARITY**

In [7]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
correctly_retrieved_answers = 0
total_retrieved_answers = 0
exact_matches = 0
total_number_of_answers = 0
for title in data['data']:
  for para in title['paragraphs']:

    #######################Paragraph Context#########################
    context = para['context']
    sentence_list = [item.raw for item in TextBlob(context).sentences]
    sentence_vectors = []
    for s in sentence_list:   
        words_in_s = utils.simple_preprocess(s)
        s_vector = np.zeros(100)
        #generating vector for the each sentence in the context
        for word in words_in_s:
          try:
            s_vector+=skipGramModel.wv.get_vector(word)
          except:
            pass
        sentence_vectors.append(s_vector)
    
    #######################Questions and Answers######################
    questions_and_answers = para['qas']
    #it is a list of dicts
    for qa in questions_and_answers:
      answers = qa['answers']
      answers_text = []
      for answer in answers:
        answers_text.append(answer['text'])
      actual_number_of_answers = len(answers_text)
      #####################Calculating Question Vector################
      question = qa['question']
      processed_question = utils.simple_preprocess(question)
      question_vector = np.zeros(100)
      for word in processed_question:
        try:
          question_vector+=skipGramModel.wv.get_vector(word)
        except:
          pass

      ######################Finding Top Matching Sentences###########
      similarity = []
      for i in range(len(sentence_vectors)):
        sentence_vector = sentence_vectors[i]
        similarity.append([cosine_similarity([sentence_vector],[question_vector])[0][0],i])

      similarity.sort(reverse=True)
      top_matching_sentences = [ sentence_list[i] for v,i in similarity[0:actual_number_of_answers] ]
      ###################Calculating F1-score and Exact Match###########
      total_number_of_answers+=actual_number_of_answers
      total_retrieved_answers+=len(top_matching_sentences)

      for ans in answers_text:
        for sentence in top_matching_sentences:
          if ans.strip() in sentence:
            correctly_retrieved_answers+=1
            break
      count = 0
      if correctly_retrieved_answers > 0:
        for i in range(len(answers_text)):
          for j in range(len(top_matching_sentences)):
            if answers_text[i] == top_matching_sentences[j]:
              count+=1
              break
      exact_matches+=count  


precision = correctly_retrieved_answers/total_retrieved_answers
recall = correctly_retrieved_answers/total_number_of_answers
f1_score = 0
if precision+recall != 0:
    f1_score = 2*precision*recall/(precision+recall)

print("F1-Score: {} Exact Match:{}".format(f1_score,exact_matches/total_number_of_answers))

F1-Score: 0.982026530853962 Exact Match:0.0005183436042158614


# **DEPENDENCY GRAPH**

In [30]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import spacy
en_nlp = spacy.load('en')

f1_scores = []
exact_matches = 0
total_number_of_answers = 0
for title in data['data']:
  for para in title['paragraphs']:

    #######################Paragraph Context#########################
    context = para['context']
    sentence_list = [item.raw for item in TextBlob(context).sentences]
    sentence_vectors = []
    sentence_root_words = []
    for s in sentence_list:   
        words_in_s = utils.simple_preprocess(s)
        s_vector = np.zeros(100)
        #generating vector for the each sentence in the context
        for word in words_in_s:
          s_vector+=skipGramModel.wv.get_vector(word)
        sentence_vectors.append(s_vector)
        #finding root word for each sentence in the context
        root_word = [str(sent.root)  for sent in en_nlp(s).sents][0]
        sentence_root_words.append(root_word)
    
    #######################Questions and Answers####################################
    questions_and_answers = para['qas']
    for qa in questions_and_answers:
      answers = qa['answers']
      answers_text = []
      for answer in answers:
        answers_text.append(answer['text'])
      actual_number_of_answers = len(answers_text)
      #####################Calculating Question Vector and Root Word################
      question = qa['question']
      processed_question = utils.simple_preprocess(TextBlob(question).sentences[0].raw)
      question_vector = np.zeros(100)
      for word in processed_question:
        try:
          question_vector+=skipGramModel.wv.get_vector(word)
        except:
          pass
      question_root_word = [str(sent.root)  for sent in en_nlp(question).sents][0]
      ######################Finding Top Matching Sentences###########################
      top_matching_sentences = [] 
      top_matching_sentence_vectors = []
      for i in range(len(sentence_root_words)):
        if question_root_word == sentence_root_words[i]:
          top_matching_sentences.append(sentence_list[i])
          top_matching_sentence_vectors.append(sentence_vectors[i])

      ############Breaking ties using cosine similarity##############################
      if len(top_matching_sentences) > actual_number_of_answers:
        sim = []
        for i in range(len(top_matching_sentence_vectors)):
          sim.append([cosine_similarity([question_vector],[top_matching_sentence_vectors[i]])[0][0],top_matching_sentences[i]])
        sim.sort(reverse = True)
        top_matching_sentences = [sent for simi,sent in sim]

      ###################Calculating F1-score and Exact Match########################
      total_number_of_answers+=actual_number_of_answers
      correctly_retrieved_answers = 0

      for ans in answers_text:
        for sentence in top_matching_sentences:
          if ans.strip() in sentence:
            correctly_retrieved_answers+=1
            break

      count = 0
      if correctly_retrieved_answers > 0:
        for i in range(len(answers_text)):
          for j in range(len(top_matching_sentences)):
            if answers_text[i] == top_matching_sentences[j]:
              count+=1
              break
      exact_matches+=count
   
      precision = correctly_retrieved_answers/len(top_matching_sentences) if len(top_matching_sentences) > 0 else 0
      recall = correctly_retrieved_answers/actual_number_of_answers
      f1_score = 0
      if precision+recall != 0:
        f1_score = 2*precision*recall/(precision+recall)
      f1_scores.append(f1_score)

print("F1-Score: {} Exact Match:{}".format(sum(f1_scores)/len(f1_scores),exact_matches/total_number_of_answers))

F1-Score: 0.22880083440159077 Exact Match:0.0001727812014052871


# **SUMMARY**

In [31]:
import pandas as pd

data = [['COSINE SIMILARITY',0.9820,0.000518],['DEPENDENCY PARSING',0.2288,0.000172]]
dataframe = pd.DataFrame(data=data,columns = ['METHOD','F1 SCORE','EXACT MATCH'])
dataframe

Unnamed: 0,METHOD,F1 SCORE,EXACT MATCH
0,COSINE SIMILARITY,0.982,0.000518
1,DEPENDENCY PARSING,0.2288,0.000172
