In [18]:
import jsonrpc
from simplejson import loads
from pprint import pprint
import nltk
import pandas as pd

In [38]:
text = "Obama is the the president of US. Florida is a nice place. It is good. He lives in Florida. Trump is the current president. He owns Trump tower"

In [39]:
type(text)

str

## Using stanford -corenlp and extracting only the COREF part of the result

In [5]:
def coref_resolution(text):
    server = jsonrpc.ServerProxy(jsonrpc.JsonRpc20(),
                             jsonrpc.TransportTcpIp(addr=("127.0.0.1", 8080)))

    result = loads(server.parse(text))
    return result['coref']

In [6]:
coref_resolution(text)

[[[[u'the the president of US', 0, 4, 2, 7], [u'Obama', 0, 0, 0, 1]],
  [[u'He', 3, 0, 0, 1], [u'Obama', 0, 0, 0, 1]]],
 [[[u'a nice place', 1, 4, 2, 5], [u'Florida', 1, 0, 0, 1]],
  [[u'It', 2, 0, 0, 1], [u'Florida', 1, 0, 0, 1]],
  [[u'Florida', 3, 3, 3, 4], [u'Florida', 1, 0, 0, 1]]],
 [[[u'the current president', 4, 4, 2, 5], [u'Trump', 4, 0, 0, 1]],
  [[u'He', 5, 0, 0, 1], [u'Trump', 4, 0, 0, 1]],
  [[u'Trump', 5, 2, 2, 3], [u'Trump', 4, 0, 0, 1]]]]

## Sentence and word tokenization of the text

In [7]:
def tokenize_text(text):
    token_sen = nltk.sent_tokenize(text)
    word = []
    for i in range(len(token_sen)):
        word.append(nltk.word_tokenize(token_sen[i]))
    return word


In [10]:
tokenize_text(text)

[['Obama', 'is', 'the', 'the', 'president', 'of', 'US', '.'],
 ['Florida', 'is', 'a', 'nice', 'place', '.'],
 ['It', 'is', 'good', '.'],
 ['He', 'lives', 'in', 'Florida', '.'],
 ['Trump', 'is', 'the', 'current', 'president', '.'],
 ['He', 'owns', 'Trump', 'tower']]

## Function to rephrase the text with the coreference result

In [15]:
##list prps and nn 

pronouns = ['PRP', 'PRP$']
nouns = ['NNP', 'NN', 'NNS', 'NNPS']

In [16]:
#compressed function
def coref_rephrase(text):
    coref = coref_resolution(text)
    process_text = tokenize_text(text)
        
    for i in coref:
        for j in i:
       
            pos_tag_left = nltk.pos_tag([j[0][0]])
            pos_tag_right = nltk.pos_tag([j[1][0]])
            
            
            #print a[j[0][1]]
            #print pos_tag_left[0][0], " | ", pos_tag_left[0][1], " | ", pos_tag_right[0][0], " | ", pos_tag_right[0][1]
            #a being the word tokenizer
            if pos_tag_left[0][1] in pronouns and pos_tag_right[0][1] in nouns:
                if pos_tag_left[0][0] in process_text[j[0][1]]:
                
                   process_text[j[0][1]][process_text[j[0][1]].index(pos_tag_left[0][0])] = pos_tag_right[0][0]
                    
                
                           
    rephrase = [[' '.join(w) for w in process_text]]
    return rephrase

In [17]:
coref_rephrase(text)

[['Obama is the the president of US .',
  'Florida is a nice place .',
  u'Florida is good .',
  u'Obama lives in Florida .',
  'Trump is the current president .',
  u'Trump owns Trump tower']]

# Working on SQUAD dataset

In [19]:
squad_dev = pd.read_json('/data/squad_dataset/data/squad_dev_doc.json')
train = pd.read_json('/data/squad_dataset/data/squad_train_doc.json')


In [20]:
train.head()

Unnamed: 0,passages,title
0,[{u'questions': [u'The Basilica of the Sacred ...,University_of_Notre_Dame
1,[{u'questions': [u'What was the first album Be...,Beyoncé
2,[{u'questions': [u'What is its rank in popular...,Montana
3,[{u'questions': [u'Which phrase is especially ...,Genocide
4,[{u'questions': [u'What is resistance to antib...,Antibiotics


In [22]:
len(train['passages'])  #442 rows

442

In [29]:
len(train['passages'][0])
print(train['passages'][0][0]['context'])

Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.


In [32]:
#running coref on a single context from the 2nd passage- titled "Beyonce"
#need to clean the data and remove non ascii characters
result = train['passages'][1][0]['context'].encode('ascii', 'ignore')

print(result)
print('*******************************************************************************************************')
modified_result = ''.join( c for c in result if  c not in '/\'') 
print(modified_result)

Beyonc Giselle Knowles-Carter (/bijnse/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyonc's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".
*******************************************************************************************************
Beyonc Giselle Knowles-Carter (bijnse bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing

In [35]:
coref_resolution(modified_result)

[[[[u'an American singer', 0, 7, 15, 18],
   [u'Beyonc Giselle Knowles-Carter -LRB- bijnse bee-YON-say -RRB- -LRB- born September 4 , 1981 -RRB-',
    0,
    2,
    0,
    14]],
  [[u'she', 1, 8, 8, 9],
   [u'Beyonc Giselle Knowles-Carter -LRB- bijnse bee-YON-say -RRB- -LRB- born September 4 , 1981 -RRB-',
    0,
    2,
    0,
    14]],
  [[u'her', 2, 2, 2, 3],
   [u'Beyonc Giselle Knowles-Carter -LRB- bijnse bee-YON-say -RRB- -LRB- born September 4 , 1981 -RRB-',
    0,
    2,
    0,
    14]],
  [[u'her', 3, -1, 19, 20],
   [u'Beyonc Giselle Knowles-Carter -LRB- bijnse bee-YON-say -RRB- -LRB- born September 4 , 1981 -RRB-',
    0,
    2,
    0,
    14]]],
 [[[u'her father', 2, 3, 2, 4], [u'Mathew Knowles', 2, 6, 5, 7]]],
 [[[u'Their', 3, 0, 0, 1],
   [u'best-selling girl groups of all time', 2, 7, 15, 21]]],
 [[[u'Love -LRB- 2003 -RRB-', 3, 2, 12, 16], [u'Love', 3, 1, 41, 42]]]]

In [33]:
#performing coref and rephrasing the passage
coref_rephrase(modified_result)

[['Beyonc Giselle Knowles-Carter ( bijnse bee-YON-say ) ( born September 4 , 1981 ) is an American singer , songwriter , record producer and actress .',
  u'Born and raised in Houston , Texas , Beyonc Giselle Knowles-Carter -LRB- bijnse bee-YON-say -RRB- -LRB- born September 4 , 1981 -RRB- performed in various singing and dancing competitions as a child , and rose to fame in the late 1990s as lead singer of R & B girl-group Destinys Child .',
  u'Managed by Beyonc Giselle Knowles-Carter -LRB- bijnse bee-YON-say -RRB- -LRB- born September 4 , 1981 -RRB- father , Mathew Knowles , the group became one of the worlds best-selling girl groups of all time .',
  u"best-selling girl groups of all time hiatus saw the release of Beyoncs debut album , Dangerously in Love ( 2003 ) , which established Beyonc Giselle Knowles-Carter -LRB- bijnse bee-YON-say -RRB- -LRB- born September 4 , 1981 -RRB- as a solo artist worldwide , earned five Grammy Awards and featured the Billboard Hot 100 number-one sin

In [34]:
%timeit(coref_rephrase(modified_result))

1 loop, best of 3: 1.31 s per loop


In [None]:
##running the excerpt of a fiction through the coref_rephrase()

In [48]:
chesney = ' Chesney went downstairs to his locker and came back with the five chairs. Nudging open the apartment door, he was surprised to see a little blonde girl in pinafore and ankle socks standing beside the table. "Are you lost?" he said. "I just got one question," she said. Actually, the voice asking the question came not from the girl but from the fanged mouth of the rubyred snake that uncoiled itself where a tongue would have been if this had really been a little girl instead of another demon. He put down the chairs.'

In [49]:
chesney

' Chesney went downstairs to his locker and came back with the five chairs. Nudging open the apartment door, he was surprised to see a little blonde girl in pinafore and ankle socks standing beside the table. "Are you lost?" he said. "I just got one question," she said. Actually, the voice asking the question came not from the girl but from the fanged mouth of the rubyred snake that uncoiled itself where a tongue would have been if this had really been a little girl instead of another demon. He put down the chairs.'

In [50]:
coref_rephrase(chesney)

[[u'Chesney went downstairs to Chesney locker and came back with the five chairs .',
  u'Nudging open the apartment door , Chesney was surprised to see a little blonde girl in pinafore and ankle socks standing beside the table .',
  "`` Are you lost ? ''",
  'he said .',
  u"`` a little blonde girl just got one question , '' a little blonde girl said .",
  u'Actually , the voice asking the question came not from the girl but from the fanged mouth of the rubyred snake that uncoiled the voice asking the question where a tongue would have been if this had really been a little girl instead of another demon .',
  u'another demon put down the chairs .']]