# Neuralcoref for Gendered pronoun resolution


## Import libraries

In [54]:
import csv
import logging;
logging.basicConfig(level=logging.INFO)
import spacy
import neuralcoref
import pandas as pd
import gap_scorer

## Load datasets

### Development dataset

In [2]:
gap_development = pd.read_csv('gap-development.tsv', sep="\t")
for x in ["Text", "Pronoun", "A", "B"]:
    gap_development[x] = gap_development[x].str.lower()
gap_development.head()

Unnamed: 0,ID,Text,Pronoun,Pronoun-offset,A,A-offset,A-coref,B,B-offset,B-coref,URL
0,development-1,zoe telford -- played the police officer girlf...,her,274,cheryl cassidy,191,True,pauline,207,False,http://en.wikipedia.org/wiki/List_of_Teachers_...
1,development-2,"he grew up in evanston, illinois the second ol...",his,284,mackenzie,228,True,bernard leach,251,False,http://en.wikipedia.org/wiki/Warren_MacKenzie
2,development-3,"he had been reelected to congress, but resigne...",his,265,angeloz,173,False,de la sota,246,True,http://en.wikipedia.org/wiki/Jos%C3%A9_Manuel_...
3,development-4,the current members of crime have also perform...,his,321,hell,174,False,henry rosenthal,336,True,http://en.wikipedia.org/wiki/Crime_(band)
4,development-5,her santa fe opera debut in 2005 was as nuria ...,she,437,kitty oppenheimer,219,False,rivera,294,True,http://en.wikipedia.org/wiki/Jessica_Rivera


In [3]:
gap_development_A_coref = (gap_development["A-coref"] == True)
gap_development_B_coref = (gap_development["B-coref"] == True)

print("TOTAL ROW", "\t\t", len(gap_development[~gap_development_A_coref & ~gap_development_B_coref]))
print("A = False & B = False", "\t", len(gap_development[~gap_development_A_coref & ~gap_development_B_coref]))
print("A = False & B = True ", "\t", len(gap_development[~gap_development_A_coref &  gap_development_B_coref]))
print("A = True  & B = False", "\t", len(gap_development[ gap_development_A_coref & ~gap_development_B_coref]))
print("A = True  & B = True ", "\t", len(gap_development[ gap_development_A_coref &  gap_development_B_coref]))
print("------------------------------")
print("TOTAL ROW", "\t\t", len(gap_development))

TOTAL ROW 		 201
A = False & B = False 	 201
A = False & B = True  	 925
A = True  & B = False 	 874
A = True  & B = True  	 0
------------------------------
TOTAL ROW 		 2000


In [4]:
set(gap_development["Pronoun"])

{'he', 'her', 'him', 'his', 'she'}

### Validation dataset

In [5]:
gap_validation = pd.read_csv('gap-validation.tsv', sep="\t")
for x in ["Text", "Pronoun", "A", "B"]:
    gap_validation[x] = gap_validation[x].str.lower()
gap_validation.head()

Unnamed: 0,ID,Text,Pronoun,Pronoun-offset,A,A-offset,A-coref,B,B-offset,B-coref,URL
0,validation-1,he admitted making four trips to china and pla...,him,256,jose de venecia jr,208,False,abalos,241,False,http://en.wikipedia.org/wiki/Commission_on_Ele...
1,validation-2,"kathleen nott was born in camberwell, london. ...",she,185,ellen,110,False,kathleen,150,True,http://en.wikipedia.org/wiki/Kathleen_Nott
2,validation-3,"when she returns to her hotel room, a liberian...",his,435,jason scott lee,383,False,danny,406,True,http://en.wikipedia.org/wiki/Hawaii_Five-0_(20...
3,validation-4,"on 19 march 2007, during a campaign appearance...",he,333,reucassel,300,True,debnam,325,False,http://en.wikipedia.org/wiki/Craig_Reucassel
4,validation-5,"by this time, karen blixen had separated from ...",she,427,finch hatton,290,False,beryl markham,328,True,http://en.wikipedia.org/wiki/Denys_Finch_Hatton


In [6]:
gap_validation_A_coref = (gap_validation["A-coref"] == True)
gap_validation_B_coref = (gap_validation["B-coref"] == True)

print("A = False & B = False", "\t", len(gap_validation[~gap_validation_A_coref & ~gap_validation_B_coref]))
print("A = False & B = True ", "\t", len(gap_validation[~gap_validation_A_coref &  gap_validation_B_coref]))
print("A = True  & B = False", "\t", len(gap_validation[ gap_validation_A_coref & ~gap_validation_B_coref]))
print("A = True  & B = True ", "\t", len(gap_validation[ gap_validation_A_coref &  gap_validation_B_coref]))
print("------------------------------")
print("TOTAL ROW", "\t\t", len(gap_validation))

A = False & B = False 	 62
A = False & B = True  	 205
A = True  & B = False 	 187
A = True  & B = True  	 0
------------------------------
TOTAL ROW 		 454


In [7]:
set(gap_validation["Pronoun"])

{'he', 'her', 'him', 'his', 'she'}

### Stage 1 dataset

In [8]:
test_stage1 = pd.read_csv('test_stage_1.tsv', sep="\t")
for x in ["Text", "Pronoun", "A", "B"]:
    test_stage1[x] = test_stage1[x].str.lower()
test_stage1.head()

Unnamed: 0,ID,Text,Pronoun,Pronoun-offset,A,A-offset,B,B-offset,URL
0,development-1,zoe telford -- played the police officer girlf...,her,274,cheryl cassidy,191,pauline,207,http://en.wikipedia.org/wiki/List_of_Teachers_...
1,development-2,"he grew up in evanston, illinois the second ol...",his,284,mackenzie,228,bernard leach,251,http://en.wikipedia.org/wiki/Warren_MacKenzie
2,development-3,"he had been reelected to congress, but resigne...",his,265,angeloz,173,de la sota,246,http://en.wikipedia.org/wiki/Jos%C3%A9_Manuel_...
3,development-4,the current members of crime have also perform...,his,321,hell,174,henry rosenthal,336,http://en.wikipedia.org/wiki/Crime_(band)
4,development-5,her santa fe opera debut in 2005 was as nuria ...,she,437,kitty oppenheimer,219,rivera,294,http://en.wikipedia.org/wiki/Jessica_Rivera


In [9]:
set(test_stage1["Pronoun"])
min(test_stage1["Pronoun"])

'he'

### Stage 2 dataset

In [10]:
test_stage2 = pd.read_csv('test_stage_2.tsv', sep="\t")
for x in ["Text", "Pronoun", "A", "B"]:
    test_stage2[x] = test_stage2[x].str.lower()
test_stage2.head()

Unnamed: 0,ID,Text,Pronoun,Pronoun-offset,A,A-offset,B,B-offset,URL
0,000075809a8e6b062f5fb3c191a8ed52,"for the u.s. under secretary of state, see luc...",she,310,lucy benson,59,kerrie taylor,160,http://en.wikipedia.org/wiki/Lucy_Benson
1,0005d0f3b0a6c9ffbd31a48453029911,"after this match, she reached her new career h...",she,334,kudryavtseva,182,maria sharapova,259,http://en.wikipedia.org/wiki/Alla_Kudryavtseva
2,0007775c40bedd4147a0573d66dc28f8,in the same way in his preface of the books of...,his,298,ezra,191,jerome,323,http://en.wikipedia.org/wiki/Development_of_th...
3,001194e3fe1234d00198ef6bba4cc588,anita's so-called homeless mate machteld steen...,she,313,dian,205,anita,278,http://en.wikipedia.org/wiki/Dian_Alberts
4,0014bb7085278ef3f9b74f14771caca9,"by march, she was the king's mistress, install...",her,362,pompadour,262,jeanne antoinette,336,http://en.wikipedia.org/wiki/Madame_de_Pompadour


In [11]:
set(gap_validation["Pronoun"])

{'he', 'her', 'him', 'his', 'she'}

## Load Spacy and NeuralCoref

In [12]:
nlp = spacy.load('en')
nlp

<spacy.lang.en.English at 0x7f0e19888278>

In [13]:
neuralcoref.add_to_pipe(nlp)
neuralcoref

<module 'neuralcoref' from '/home/gaspare/git/Gendered-Pronoun-Resolution/neuralcoref/venv/lib/python3.7/site-packages/neuralcoref/__init__.py'>

In [21]:
## NeuralCoref handler function

In [48]:
def neuralCorefHandler(text):
    doc = nlp(text)

    idx = 0
    A_coref = False
    B_coref = False
    
    for k in doc:
        idx += text.index(str(k))

        if idx == example["Pronoun-offset"]:
            cluster = k._.coref_clusters
            for c in cluster:
                for d in c:
                    A_coref = A_coref or example["A"] in str(d)
                    B_coref = B_coref or example["B"] in str(d)
        idx += len(k)
        text = text[text.index(str(k))+len(k):]
    return A_coref, B_coref, doc
    

## CoReference example

In [27]:
example = gap_development.loc[1]
print("TEXT =", example["Text"])
print("PRONOUN =", example["Pronoun"], example["Pronoun-offset"])
print("A =", example["A"], example["A-offset"])
print("B =", example["B"], example["B-offset"])

TEXT = he grew up in evanston, illinois the second oldest of five children including his brothers, fred and gordon and sisters, marge (peppy) and marilyn. his high school days were spent at new trier high school in winnetka, illinois. mackenzie studied with bernard leach from 1949 to 1952. his simple, wheel-thrown functional pottery is heavily influenced by the oriental aesthetic of shoji hamada and kanjiro kawai.
PRONOUN = his 284
A = mackenzie 228
B = bernard leach 251


In [63]:
with open("gap-validation-submission.tsv", "w") as gap_validation_submission:
    for i in range(1): # len(gap_development)):
        line = gap_development.loc[i]
        A_coref, B_coref, doc = neuralCorefHandler(line["Text"])
        gap_validation_submission.write(line["ID"] + "," + str(0.3333) + "," + str(0.3333) + "," + str(0.3333) + "\n")
        

In [73]:
line = gap_development.loc[0]
A_coref, B_coref, doc = neuralCorefHandler(line["Text"])

for sent in doc.sents:
    for token in sent:
        print(token.text, token.i)

zoe 0
telford 1
-- 2
played 3
the 4
police 5
officer 6
girlfriend 7
of 8
simon 9
, 10
maggie 11
. 12
dumped 13
by 14
simon 15
in 16
the 17
final 18
episode 19
of 20
series 21
1 22
, 23
after 24
he 25
slept 26
with 27
jenny 28
, 29
and 30
is 31
not 32
seen 33
again 34
. 35
phoebe 36
thomas 37
played 38
cheryl 39
cassidy 40
, 41
pauline 42
's 43
friend 44
and 45
also 46
a 47
year 48
11 49
pupil 50
in 51
simon 52
's 53
class 54
. 55
dumped 56
her 57
boyfriend 58
following 59
simon 60
's 61
advice 62
after 63
he 64
would 65
n't 66
have 67
sex 68
with 69
her 70
but 71
later 72
realised 73
this 74
was 75
due 76
to 77
him 78
catching 79
crabs 80
off 81
her 82
friend 83
pauline 84
. 85


## Scorer

In [61]:
gap_scorer.run_scorer("gap-validation.tsv", "gap-validation-submission.tsv")

AttributeError: 'NoneType' object has no attribute 'lower'