# NL2SH dataset


In [1]:
from nl2sh import reader

In [2]:
# Load the dataset from a file in the working directory; later cells index it
# by position (dataset[0], dataset[42]) and iterate over it sentence by sentence.
dataset = reader.read_dataset("nl2sh_dataset.txt")

First sentence

In [3]:
# Bare expression: the notebook renders the first dataset entry as the cell output.
dataset[0]

Animals need water.

Sentence NL annotations

In [4]:
# Pick one example sentence and show it as plain text.
sent = dataset[42]
print("SENTENCE:", sent)
print()

# Token-level annotations as a tab-separated table. The header and the
# per-token rows are both driven by the same attribute names, so they
# cannot drift apart.
columns = ("sent_i", "i", "text", "space", "lemma", "pos", "tag", "dep", "head")
print("\t".join(columns))
print("-" * 80)
for tok in sent:
    print("\t".join(f"{getattr(tok, column)}" for column in columns))

# Sentence-level annotations exposed on the sentence object.
print()
print("NER:", sent.ner)
print("SRL:", sent.srl)
print("COREF:", sent.coref)


SENTENCE: Penny has a new ball, but she still prefers the old one.

sent_i	i	text	space	lemma	pos	tag	dep	head
--------------------------------------------------------------------------------
42	0	Penny	True	Penny	PROPN	NNP	nsubj	has
42	1	has	True	have	VERB	VBZ	ROOT	has
42	2	a	True	a	DET	DT	det	ball
42	3	new	True	new	ADJ	JJ	amod	ball
42	4	ball	False	ball	NOUN	NN	dobj	has
42	5	,	True	,	PUNCT	,	punct	has
42	6	but	True	but	CCONJ	CC	cc	has
42	7	she	True	she	PRON	PRP	nsubj	prefers
42	8	still	True	still	ADV	RB	advmod	prefers
42	9	prefers	True	prefer	VERB	VBZ	conj	has
42	10	the	True	the	DET	DT	det	one
42	11	old	True	old	ADJ	JJ	amod	one
42	12	one	False	one	NUM	CD	dobj	prefers
42	13	.	False	.	PUNCT	.	punct	prefers

NER: (PERSON Penny ,)
SRL: {V has : (ARG0 Penny , ARG1 a new ball), V prefers : (ARG0 she , ARGM-TMP still , ARG1 the old one)}
COREF: {REF she : MAIN Penny }


# Sentence hyperedge

In [5]:
edge = sent.hyperedge

# The full hyperedge, followed by a blank separator line.
print("HYPEREDGE:", edge, "", sep="\n")

# Every atom yielded by the edge, one per line.
print("ATOMS:")
for atom in edge.atoms():
    print(atom)

# Blank separator, then every subedge yielded by the edge, one per line
# (in the recorded output this listing starts with the edge itself and
# also contains the atoms).
print("", "SUBEDGES:", sep="\n")
for subedge in edge.subedges():
    print(subedge)
                    

HYPEREDGE:
(but/J (has/Pd.so:01:01.|f--3s:01 penny/Cp..s.p (a/Md.< (new/Ma.< ball/Cc..s))) (prefers/Pd.s?o:0t1:0t1.|f--3s:012 (+/Jc.rm.rp she/Ci penny/Cp..s.p) still/M (the/Md.< (old/Ma.< one/C#))))

ATOMS:
but/J
has/Pd.so:01:01.|f--3s:01
penny/Cp..s.p
a/Md.<
new/Ma.<
ball/Cc..s
prefers/Pd.s?o:0t1:0t1.|f--3s:012
+/Jc.rm.rp
she/Ci
penny/Cp..s.p
still/M
the/Md.<
old/Ma.<
one/C#

SUBEDGES:
(but/J (has/Pd.so:01:01.|f--3s:01 penny/Cp..s.p (a/Md.< (new/Ma.< ball/Cc..s))) (prefers/Pd.s?o:0t1:0t1.|f--3s:012 (+/Jc.rm.rp she/Ci penny/Cp..s.p) still/M (the/Md.< (old/Ma.< one/C#))))
but/J
(has/Pd.so:01:01.|f--3s:01 penny/Cp..s.p (a/Md.< (new/Ma.< ball/Cc..s)))
has/Pd.so:01:01.|f--3s:01
penny/Cp..s.p
(a/Md.< (new/Ma.< ball/Cc..s))
a/Md.<
(new/Ma.< ball/Cc..s)
new/Ma.<
ball/Cc..s
(prefers/Pd.s?o:0t1:0t1.|f--3s:012 (+/Jc.rm.rp she/Ci penny/Cp..s.p) still/M (the/Md.< (old/Ma.< one/C#)))
prefers/Pd.s?o:0t1:0t1.|f--3s:012
(+/Jc.rm.rp she/Ci penny/Cp..s.p)
+/Jc.rm.rp
she/Ci
penny/Cp..s.p
still/M
(the/Md.< (old/Ma.< one/C#))
the/Md.<
(old/Ma.< one/C#)
old/Ma.<
one/C#

# Statistics

In [6]:
import pandas as pd

# Per-sentence counters: each entry maps a statistic name to a callable that
# takes one sentence and returns how many items of that kind it contains.
config = {
    # All tokens, punctuation included.
    "token": len,
    # Tokens whose dependency label is neither "punct" nor "dep".
    "word": lambda sent: sum(1 for t in sent if t.dep not in ("punct", "dep")),
    # SRL entries that carry at least one argument.
    "clause": lambda sent: sum(1 for _, args in sent.srl.items() if len(args) > 0),
    "entity": lambda sent: len(sent.ner),
    "coref": lambda sent: len(sent.coref),
    # Tokens with a truthy roleset / synset attribute.
    "roleset": lambda sent: sum(1 for t in sent if t.roleset),
    "synset": lambda sent: sum(1 for t in sent if t.synset),
    "atom": lambda sent: sum(1 for _ in sent.hyperedge.atoms()),
    # Subedges of the sentence hyperedge that are not atoms.
    "hyperedge": lambda sent: sum(
        1 for se in sent.hyperedge.subedges() if not se.is_atom()
    ),
}

# Vocabulary: the distinct token texts across the whole dataset (case-sensitive).
V = set()
for sent in dataset:
    V.update(t.text for t in sent)

# One list of per-sentence counts for every statistic, in dataset order.
data = {name: [func(s) for s in dataset] for name, func in config.items()}

print("Sentences", len(dataset))
print("Vocabulary", len(V))

df = pd.DataFrame(data)

# Summary table with one row per statistic. describe()'s "count" (number of
# sentences) is replaced by the column total, and min/max/quartiles are cast
# to int for display (astype(int) truncates any fractional quartile).
xf = (
    df.describe()
    .T
    .assign(count=df.sum())
    .astype(dict.fromkeys(["min", "max", "25%", "50%", "75%"], int))
)
xf.round(3)






Sentences 664
Vocabulary 1567


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
token,6851,10.318,4.774,2,8,10,12,65
word,5968,8.988,4.13,1,7,8,10,48
clause,1267,1.908,0.89,0,1,2,2,12
entity,254,0.383,0.76,0,0,0,1,6
coref,201,0.303,0.609,0,0,0,0,5
roleset,1599,2.408,1.167,0,2,2,3,13
synset,3206,4.828,2.287,0,3,5,6,22
atom,7077,10.658,7.281,1,7,9,12,123
hyperedge,4311,6.492,4.48,0,4,6,8,68
