In [None]:
!pip install -U spacy==2.1.0
!python -m spacy download en

!pip install Cython --install-option="–no-cython-compile"

!pip install neuralcoref

!pip install benepar

!python -m spacy download en_core_web_md

!spacy download en

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy==2.1.0
  Downloading spacy-2.1.0-cp37-cp37m-manylinux1_x86_64.whl (27.7 MB)
[K     |████████████████████████████████| 27.7 MB 23.4 MB/s 
[?25hCollecting thinc<7.1.0,>=7.0.2
  Downloading thinc-7.0.8-cp37-cp37m-manylinux1_x86_64.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 44.3 MB/s 
Collecting preshed<2.1.0,>=2.0.1
  Downloading preshed-2.0.1-cp37-cp37m-manylinux1_x86_64.whl (82 kB)
[K     |████████████████████████████████| 82 kB 359 kB/s 
Collecting blis<0.3.0,>=0.2.2
  Downloading blis-0.2.4-cp37-cp37m-manylinux1_x86_64.whl (3.2 MB)
[K     |████████████████████████████████| 3.2 MB 42.5 MB/s 
[?25hCollecting plac<1.0.0,>=0.9.6
  Downloading plac-0.9.6-py2.py3-none-any.whl (20 kB)
Collecting jsonschema<3.0.0,>=2.6.0
  Downloading jsonschema-2.6.0-py2.py3-none-any.whl (39 kB)
Collecting srsly<1.1.0,>=0.0.5
  Downloading srsly-1.0.5-cp37-cp37m-manylin

In [None]:
# Fetch the NLTK resources this notebook relies on: 'punkt' for
# sent_tokenize / word_tokenize and 'averaged_perceptron_tagger' for pos_tag.
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

import pandas as pd

import spacy
import benepar
import neuralcoref

from nltk.tokenize import sent_tokenize

class MicroStatments:
  """
  Break a math word problem (MWP) into "micro-statements".

  Each sentence is parsed with benepar, reduced to a subject / verb / object
  triple, and split at conjunctions ("and" or ",") found in the subject or
  the object. Finally only the statements that mention every noun of the
  question (the last statement) are kept.

  The parse-string scan keeps only lowercase letters, digits, spaces and
  commas, so the input text should be lowercased before it is passed in
  (uppercase characters inside words would be dropped).
  """

  def __init__(self):
    """
    Load the two spaCy pipelines used by this class.

    * ``self.nlp_pronoun``: the 'en' model with neuralcoref added, used by
      :meth:`replace`.
    * ``self.nlp``: 'en_core_web_md' with the benepar component added, used
      to obtain the bracketed parse string of a sentence.
    """
    self.nlp_pronoun = spacy.load('en')
    neuralcoref.add_to_pipe(self.nlp_pronoun, greedyness=0.52)
    benepar.download('benepar_en3')
    self.nlp = spacy.load('en_core_web_md')
    # benepar is registered differently under spaCy 2.x and later versions.
    if spacy.__version__.startswith('2'):
      self.nlp.add_pipe(benepar.BeneparComponent("benepar_en3"))
    else:
      self.nlp.add_pipe("benepar", config={"model": "benepar_en3"})

  def replace(self, sentence):
    """
    Neural coref resolution function
    @input : sentence without coref resolution
    @output : coref resolved sentence
    """
    doc = self.nlp_pronoun(sentence)
    return doc._.coref_resolved

  @staticmethod
  def _parse_string_to_svo(parse_string):
    """
    Reduce a bracketed parse string to subject / verb / object text.

    The string is scanned character by character. Lowercase letters, digits,
    spaces and commas are copied into the current section; everything else
    (brackets, uppercase labels, other punctuation) is skipped. The scan
    starts in the subject section, moves to the verb section at the first
    uppercase "V", and from there to the object section at the first "NP".

    @input : parse string such as "(S (NP (NNP john)) (VP (VBD had) ...))"
    @output : {"subject": ..., "verb": ..., "object": ...}, each stripped and
              with runs of spaces collapsed to a single space
    """
    pieces = {"subject": "", "verb": "", "object": ""}
    section = "subject"

    for i, ch in enumerate(parse_string):
      if ch.islower() or ch.isdigit() or ch == " " or ch == ",":
        # Never write two spaces in a row.
        if ch != " " or not pieces[section].endswith(" "):
          pieces[section] += ch
      elif section == "subject" and ch == "V":
        section = "verb"
      elif section == "verb" and parse_string[i:i + 2] == "NP":
        # Slicing (rather than indexing i+1) is safe at the end of the string.
        section = "object"

    return {key: text.strip() for key, text in pieces.items()}

  def __extract_sub_verb_object(self, sentence):
    """
    @input : sentence with coref resolution
    @output : {subject: subject of sentence,
                verb: verb of sentence,
                object : object of sentence}

    Only the first sentence found by the pipeline is used. The triple is
    read from the bracketed constituency parse string produced by benepar
    (berkeley neural parser).
    """
    doc = self.nlp(sentence)
    sents = list(doc.sents)
    if not sents:
      # Nothing to parse (e.g. empty input).
      return {"subject": "", "verb": "", "object": ""}
    return self._parse_string_to_svo(sents[0]._.parse_string)

  def __split_conj(self, sent_list):
    """
    Split a token list at every "and" token.

    @input : list of tokens
    @output : list of strings, one per group of tokens between "and"s; each
              token is prefixed with a space, empty groups before an "and"
              are skipped, and the group after the last "and" is always
              appended (even when empty)
    """
    sub = ""
    groups = list()

    for token in sent_list:
      if token != "and":
        sub = sub + " " + token
      elif sub != "":
        groups.append(sub)
        sub = ""

    groups.append(sub)
    return groups

  @staticmethod
  def _join_parts(*parts):
    """Join text fragments with single spaces, dropping surplus whitespace."""
    return " ".join(" ".join(parts).split())

  def __handle_conjunction(self, sentence):
    """
    Split one sentence into several statements at a conjunction.

    Commas are treated as "and". If the subject contains "and", one statement
    per subject part is returned (the object is left untouched); otherwise,
    if the object contains "and", one statement per object part is returned;
    otherwise the sentence is returned as a single statement.

    @input : sentence
    @output : list of whitespace-normalised statements
    """
    parts = self.__extract_sub_verb_object(sentence)

    subject = parts["subject"].replace(",", "and")
    verb = parts["verb"].replace(",", "and")
    obj = parts["object"].replace(",", "and")

    subject_tokens = subject.split()
    obj_tokens = obj.split()

    if "and" in subject_tokens:
      return [self._join_parts(piece, verb, obj)
              for piece in self.__split_conj(subject_tokens)]
    if "and" in obj_tokens:
      return [self._join_parts(subject, verb, piece)
              for piece in self.__split_conj(obj_tokens)]
    return [self._join_parts(subject, verb, obj)]

  def __microstatements(self, mwp):
    """
    Shared implementation of :meth:`mwp_split` and :meth:`statements`.

    Pass 1 splits every sentence of the problem; pass 2 splits once more any
    resulting statement that still contains the word "and" (e.g. a compound
    object left over after a compound subject was split). The result is then
    filtered by :meth:`keep_relevant`.
    """
    first_pass = list()
    for sentence in sent_tokenize(mwp):
      for statement in self.__handle_conjunction(sentence):
        first_pass.append(statement.strip())

    res = []
    for sent in first_pass:
      # Token test: a plain substring test would also match e.g. "handed".
      if "and" in sent.split():
        res.extend(self.__handle_conjunction(sent))
      else:
        res.append(sent)

    return self.keep_relevant(res)

  def mwp_split(self, mwp):
    """
    @input : math word problem (question as the last sentence)
    @output : list of relevant micro-statements
    """
    return self.__microstatements(mwp)

  def statements(self, mwp):
    """
    Same as :meth:`mwp_split`.

    @input : math word problem (question as the last sentence)
    @output : list of relevant micro-statements
    """
    return self.__microstatements(mwp)

  def keep_relevant(self, microsents):
    """
    extracts all nouns from question -> store in a set (qnouns)
    compare nouns of question (qnouns) with set of nouns in each microstatement
    keep only those microstatements whose nouns include every noun in qnouns

    The question is taken to be the last element of `microsents`.
    An empty input list yields an empty list.
    """
    if not microsents:
      return []

    def nouns_of(text):
      # Noun tags all start with "NN".
      return {word for (word, pos) in nltk.pos_tag(nltk.word_tokenize(text))
              if pos[:2] == 'NN'}

    qnouns = nouns_of(microsents[-1])
    # Kept so that the notebook's recorded output stays the same.
    print(qnouns)
    return [sent for sent in microsents if qnouns.issubset(nouns_of(sent))]

  def __keep_relevant(self, microsents):
    """Private name kept for backward compatibility; see :meth:`keep_relevant`."""
    return self.keep_relevant(microsents)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
# NOTE: `ms` is only created by the `ms = MicroStatments()` cell further down,
# so that cell has to be executed before this one.
# NOTE(review): confirm the class version loaded in the kernel exposes a public `keep_relevant`.
ms.keep_relevant(['mary had a basket which has 12 peaches'])

{'peaches', 'mary', 'basket'}


['mary had a basket which has 12 peaches']

In [None]:
# Build the splitter once: the constructor loads both spaCy pipelines and
# downloads the benepar model, so it should not be re-created per problem.
ms = MicroStatments()

[nltk_data] Downloading package benepar_en3 to /root/nltk_data...
[nltk_data]   Package benepar_en3 is already up-to-date!


In [None]:
# Example: a compound subject and a compound object in the same problem.
ms.mwp_split("john and rebecca had 2 apples and 2 oranges. how many oranges do rebecca and john both have")

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


['john had  2 oranges',
 'rebecca had  2 oranges',
 'how  many  oranges do  rebecca',
 'how  many  oranges do  john both have']

In [None]:
# Example: the same splitter through `statements`, on a three-sentence problem.
ms.statements("john and rebecca had 2 apples and 2 oranges. john then handed rebecca 2 apples. how many apples does rebecca have?")

  'with `validate_args=False` to turn off validation.')


['rebecca had  2 apples',
 'john then         handed  rebecca      2     apples ',
 'how  many  apples does rebecca   have']

In [None]:
import spacy
import neuralcoref

# Stand-alone coreference pipeline used by the module-level `replace` helper:
# the spaCy 'en' model with neuralcoref attached.
nlp_pronoun = spacy.load('en')
neuralcoref.add_to_pipe(nlp_pronoun, greedyness=0.52)

def replace(sentence):
  """Return `sentence` with coreferences resolved by the module-level `nlp_pronoun` pipeline."""
  return nlp_pronoun(sentence)._.coref_resolved

In [None]:
import pandas as pd

# Load the SVAMP problems from a JSON file in the working directory.
df = pd.read_json('SVAMP.json')
# Disabled: would overwrite 'Body' with the coref-resolved body + question for every row.
#df['Body'] = [replace(df['Body'][i] + ' ' + df['Question'][i]) for i in range(len(df))]

In [None]:
# Inspect the loaded frame (columns: ID, Body, Question, Equation, Answer, Type).
df

Unnamed: 0,ID,Body,Question,Equation,Answer,Type
0,chal-1,Each pack of dvds costs 76 dollars. If there i...,How much do you have to pay to buy each pack?,( 76.0 - 25.0 ),51,Subtraction
1,chal-2,Dan had $ 3 left with him after he bought a ca...,How much did the candy bar cost?,( 4.0 - 3.0 ),1,Subtraction
2,chal-3,Paco had 26 salty cookies and 17 sweet cookies...,How many salty cookies did Paco have left?,( 26.0 - 9.0 ),17,Subtraction
3,chal-4,43 children were riding on the bus. At the bus...,How many children got off the bus at the bus s...,( 43.0 - 21.0 ),22,Subtraction
4,chal-5,28 children were riding on the bus. At the bus...,How many more children got on the bus than tho...,( 30.0 - 28.0 ),2,Subtraction
...,...,...,...,...,...,...
995,chal-996,Paige was helping her mom plant flowers and to...,How many flower beds did they have?,( 36.0 / 12.0 ),3,Common-Division
996,chal-997,"At the zoo, a cage had 3 snakes and 75 alligat...",How many alligators were not hiding?,( 75.0 - 19.0 ),56,Subtraction
997,chal-998,Paige was helping her mom plant flowers and to...,How many flowers did they grow?,( 60.0 * ( 55.0 / 15.0 ) ),220,Multiplication
998,chal-999,Mary is baking a cake. The recipe calls for 7 ...,How many more cups of sugar does she need to add?,( 7.0 - 4.0 ),3,Subtraction


In [None]:
# Example with a pronoun ("she") that is passed in without coref resolution.
mwp2 = 'mary had 38 skittles and 12 blueberries. she gave 10 skittles to anna. how many blueberries does mary have'
ms.mwp_split(mwp2)


In [None]:
import random

# Fixed row for a reproducible run; use random.randint(0, 999) to sample a row instead.
i = 760
print('question number: ', i)

# Put a full stop between body and question unless the body already ends with one.
if df['Body'][i].endswith('.'):
  split = ' '
else:
  split = '. '

# Resolve coreferences on the combined text, then lowercase it for the splitter.
mwp = replace(df['Body'][i] + split + df['Question'][i]).lower()
print(mwp)
print(ms.mwp_split(mwp))
ms.statements(mwp)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


question number:  760
each basket of peaches has 19 red peaches and 4 green peaches. if there are 15 such baskets. how many peaches are in 15 such baskets altogether?


  'with `validate_args=False` to turn off validation.')


['each  basket   of   peaches has  19 red peaches', 'each  basket   of   peaches has  4 green peaches', 'if    there are 15  such  baskets', 'how  many  peaches are   in 15  such  baskets   altogether']
{'baskets', 'peaches'}


['how  many  peaches are   in 15  such  baskets   altogether']

In [None]:
# Show the POS tags NLTK assigns to a sample question.
qtokens = nltk.word_tokenize('how much farther did the grasshopper jump than the mouse')
tags = [tag for _, tag in nltk.pos_tag(qtokens)]
tags

In [None]:
# Example where "and" joins whole clauses rather than noun phrases; the recorded
# output below shows such clauses are not separated cleanly.
mwp = "last week fred had 114 dollars and jason had 22 dollars. they washed cars over the weekend and now fred has 21 dollars and jason has 78 dollars. how much money did jason make over the weekend?"
ms.mwp_split(mwp)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'with `validate_args=False` to turn off validation.')


['last  week    fred had  114 dollars',
 'last  week    fred had  jason had 22 dollars',
 'they washed  cars over the weekend',
 'they washed  now fred has 21 dollars',
 'they washed  jason has 78 dollars',
 'how  much  money did jason   make   over   the  weekend']