In [4]:
import pandas as pd
from matplotlib import pyplot as plt
%run ../script/webnlg.py
%matplotlib inline

# Raise the column width limit so long reference texts are not truncated in DataFrame output.
# The fully qualified option key is used: abbreviated keys are resolved by pattern matching
# and can become ambiguous when pandas adds options with similar names.
pd.set_option('display.max_colwidth', 1000)

# Dataset Stats

In [5]:
# Instantiate the corpus wrapper. WebNLGCorpus is not defined in this notebook;
# presumably it is provided by the %run of ../script/webnlg.py -- confirm.
corpus = WebNLGCorpus()

# Entry counts broken down by category (rows) and number of triples (columns).
corpus.datasets_size()

ntriples,1,2,3,4,5,6,7,All
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Airport,301,193,187,207,202,0,0,1090
Astronaut,72,46,64,82,86,90,90,530
Building,236,171,203,206,156,0,0,972
City,243,0,0,0,0,0,0,243
ComicsCharacter,98,77,64,35,11,0,0,285
Food,272,278,314,323,237,0,0,1424
Monument,38,32,42,48,45,36,26,267
SportsTeam,251,170,170,150,45,0,0,786
University,58,39,58,73,62,62,54,406
WrittenWork,219,202,248,170,98,0,0,937


In [45]:
# Same breakdown with normalize=True: in the rendered output each cell is a fraction,
# and the "All" column sums to ~1 across categories.
corpus.datasets_size(normalize=True)

ntriples,1,2,3,4,5,6,7,All
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Airport,0.043372,0.02781,0.026945,0.029827,0.029107,0.0,0.0,0.157061
Astronaut,0.010375,0.006628,0.009222,0.011816,0.012392,0.012968,0.012968,0.076369
Building,0.034006,0.02464,0.029251,0.029683,0.022478,0.0,0.0,0.140058
City,0.035014,0.0,0.0,0.0,0.0,0.0,0.0,0.035014
ComicsCharacter,0.014121,0.011095,0.009222,0.005043,0.001585,0.0,0.0,0.041066
Food,0.039193,0.040058,0.045245,0.046542,0.03415,0.0,0.0,0.205187
Monument,0.005476,0.004611,0.006052,0.006916,0.006484,0.005187,0.003746,0.038473
SportsTeam,0.036167,0.024496,0.024496,0.021614,0.006484,0.0,0.0,0.113256
University,0.008357,0.00562,0.008357,0.010519,0.008934,0.008934,0.007781,0.058501
WrittenWork,0.031556,0.029107,0.035735,0.024496,0.014121,0.0,0.0,0.135014


# Have a look at a sample

In [155]:
# Show one 'ComicsCharacter' entry: its modified triples followed by its reference texts.
# The second argument is presumably the number of triples in the entry -- confirm in webnlg.py.
corpus.sample('ComicsCharacter', 1).display()

Unnamed: 0,m_subject,m_predicate,m_object
0,Arion_(comicsCharacter),creator,Jan_Duursema


Unnamed: 0,ltext
0,"The comic character, Arion, was created by Jan Duursema."
1,The comic book character Arion was created by Jan Duursema.


In [156]:
# Same call run again to look at another entry; the draw appears to be random and
# no seed is set in this notebook, so the displayed entry changes between runs.
corpus.sample('ComicsCharacter', 1).display()

Unnamed: 0,m_subject,m_predicate,m_object
0,Amazing-Man_(comicsCharacter),alternativeName,"""John Aman"""


Unnamed: 0,ltext
0,"John Aman, is the alternative name for the comic character, Amazing-Man."
1,The comic book character Amazing-Man's alter ego is John Aman.
2,John Aman is also known as Amazing-Man in the comics genre.


# Are there any duplicates?

## Is any modified triple duplicated within an entry?

No

In [162]:
# Rows per (entry, triple text): a count above 1 would mean the same triple text
# appears more than once inside a single entry.
mtriple_count_per_entry = (
    corpus.mdf
    .groupby(['category', 'ntriples', 'eid', 'mtext'])
    .size()
)
# Number of (entry, triple text) pairs that occur more than once.
mtriple_count_per_entry.gt(1).sum()

0

## Is any modified triple duplicated across entries?

In [196]:
# Occurrences of each triple text, split by category and tripleset size.
mtriples_count = corpus.mdf.groupby(['mtext', 'category', 'ntriples']).size()

# Total occurrences per triple text, keeping the five most repeated ones.
# Series.sum(level=...) was deprecated in pandas 1.3 and removed in 2.0;
# grouping on the index level is the supported equivalent.
top_5 = mtriples_count.groupby(level=0).sum().nlargest(5)

# Breakdown of those triples: one row per (mtext, category), one column per ntriples,
# with 0 where a combination never occurs.
mtriples_count.loc[(top_5.index, slice(None), slice(None))].unstack(level=2, fill_value=0).sort_index(level=0)

Unnamed: 0_level_0,ntriples,1,2,3,4,5,6,7
mtext,category,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Acharya_Institute_of_Technology | affiliation | Visvesvaraya_Technological_University,University,1,3,8,15,15,21,17
English_language | spokenIn | Great_Britain,WrittenWork,1,6,8,39,57,0,0
"United_States | capital | Washington,_D.C.",Building,1,0,2,3,5,0,0
"United_States | capital | Washington,_D.C.",City,1,0,0,0,0,0,0
"United_States | capital | Washington,_D.C.",Food,1,1,5,10,16,0,0
"United_States | capital | Washington,_D.C.",WrittenWork,1,3,10,13,26,0,0
United_States | language | English_language,Building,1,1,2,3,8,0,0
United_States | language | English_language,City,1,0,0,0,0,0,0
United_States | language | English_language,Food,1,1,0,0,7,0,0
United_States | language | English_language,WrittenWork,0,4,16,21,22,0,0


## Are entries with shared triples lexicalized the same way?

There are some patterns...

In [218]:
# Display an entry containing a triple that recurs in many entries, to see how
# the shared triple is lexicalized alongside the entry's other triples.
corpus.sample_w_mtext(mtext='Acharya_Institute_of_Technology | affiliation | Visvesvaraya_Technological_University').display()

Unnamed: 0,m_subject,m_predicate,m_object
0,Acharya_Institute_of_Technology,affiliation,Visvesvaraya_Technological_University
1,Acharya_Institute_of_Technology,president,"""B.M. Reddy"""
2,Acharya_Institute_of_Technology,city,Bangalore
3,Acharya_Institute_of_Technology,established,2000
4,Acharya_Institute_of_Technology,country,"""India"""
5,Acharya_Institute_of_Technology,state,Karnataka


Unnamed: 0,ltext
0,"The Acharya Institute of Technology in Bangalore, Karnataka, India was established in 2000. Its president is B.M. Reddy and it is affiliated with the Visvesvaraya Technological University."
1,The city of Bangalore in the state of Karnataka in India is the location of the Acharya Institute of Technology. The President of the Institute which was established in 2000 and is affiliated with Visvesvaraya Technological University is B M Reddy .
2,"Established in 2000, the Acharya Institute of Technology is in Bangalore, Karnataka India. The institute's president is B.M. Reddy and it is affiliated with the Visvesvaraya Technological University."


In [219]:
# Same call run again to compare against another entry that contains the same triple
# (the selection appears to be random; no seed is set in this notebook).
corpus.sample_w_mtext(mtext='Acharya_Institute_of_Technology | affiliation | Visvesvaraya_Technological_University').display()

Unnamed: 0,m_subject,m_predicate,m_object
0,Acharya_Institute_of_Technology,affiliation,Visvesvaraya_Technological_University
1,Acharya_Institute_of_Technology,was given the 'Technical Campus' status by,All_India_Council_for_Technical_Education
2,All_India_Council_for_Technical_Education,location,Mumbai
3,Karnataka,has to its northeast,Telangana
4,Acharya_Institute_of_Technology,city,Bangalore
5,Acharya_Institute_of_Technology,state,Karnataka


Unnamed: 0,ltext
0,"The Acharya Institute of Technology in Bangalore, Karnataka is affiliated to the Visvesvaraya Technological University. It was given the ""Technical Campus"" status by the All India Council for Technical Education (located in Mumbai). Karnataka has Telangana to its northeast."
1,"The city of Bangalore in the state of Karnataka, India which lies southwest of Telangana is the location of the Acharya Institute of Technology ffiliated to the Visvesvaraya Technological University. The Institute was given Technical Campus status by the All India Council for Technical Education based in Mumbai."
2,The Acharya Institute of Technolgoy is located in the state of Karnataka which is positioned with Telangana to the northeast. The Institute is in the city of Bangalore and has strong connections with the Visvesvaraya Technological University. Along with these connections the Institute was also given the 'Technical Campus' status by the All India Council for Technical Education which is based in Mumbai.


# Modified triple analysis

In [229]:
# Report the number of rows in the modified-triple frame.
print("There are {} triples".format(len(corpus.mdf.index)))

There are 20458 triples


## Number of distinct subjects, predicates, objects

In [247]:
# Number of distinct values found in each position of the triple.
print("Subjects:\t{}\nPredicates:\t{}\nObjects:\t{}".format(
    *(len(corpus.mdf[column].unique()) for column in ('m_subject', 'm_predicate', 'm_object'))
))

Subjects:	430
Predicates:	246
Objects:	1619


In [286]:
# Row counts per distinct subject, predicate and object, in that order.
s_size, p_size, o_size = (
    corpus.mdf.groupby(column).size()
    for column in ('m_subject', 'm_predicate', 'm_object')
)

In [287]:
# Summary statistics of the per-value row counts, side by side.
# The unnamed columns 0, 1, 2 are subjects, predicates and objects respectively.
pd.concat(
    [sizes.describe() for sizes in (s_size, p_size, o_size)],
    axis=1,
)

Unnamed: 0,0,1,2
count,430.0,246.0,1619.0
mean,47.576744,83.162602,12.636195
std,70.587527,203.773158,27.2297
min,1.0,1.0,1.0
25%,6.0,7.0,3.0
50%,19.5,22.0,7.0
75%,68.0,72.5,14.0
max,692.0,2150.0,858.0


## Most frequent subjects, predicates, objects

In [297]:
# Five subjects with the highest row counts in corpus.mdf.
s_size.nlargest(5)

m_subject
United_States                       692
Acharya_Institute_of_Technology     452
Elliot_See                          416
Alan_Shepard                        399
Alan_Bean                           385
dtype: int64

In [298]:
# Five predicates with the highest row counts in corpus.mdf.
p_size.nlargest(5)

m_predicate
 country        2150
 location       1363
 leaderName     1227
 ingredient      652
 isPartOf        573
dtype: int64

In [299]:
# Five objects with the highest row counts in corpus.mdf.
o_size.nlargest(5)

m_object
 United_States       858
 English_language    264
 Indonesia           166
 Spain               150
 Italy               122
dtype: int64

# Is the tripleset connected?

## Let's check a sample

In [380]:
# Keep a 'University' sample in a variable so its triples can be reused for graph building.
# The displayed entry has three triples, consistent with the second argument being ntriples.
sample = corpus.sample("University", 3)
sample.display()

Unnamed: 0,m_subject,m_predicate,m_object
0,Accademia_di_Architettura_di_Mendrisio,dean,Mario_Botta
1,Accademia_di_Architettura_di_Mendrisio,city,Mendrisio
2,Accademia_di_Architettura_di_Mendrisio,established,1996


Unnamed: 0,ltext
0,The Accademia di Architettura di Mendrisio was established in Mendrisio in 1996. Its dean is Mario Botta.
1,Mario Botta is Dean of the Accademia di Architettura di Mendrisio which is located in the city of Mendrisio and was established in 1996.
2,The dean of the Accademia di Architettura di Mendrisio is Mario Botta and the university was established in 1996.


In [382]:
import networkx as nx

# Each triple becomes an undirected edge between its subject and object,
# with the predicate stored as an edge attribute.
g = nx.from_pandas_edgelist(
    sample.mtriples,
    source='m_subject',
    target='m_object',
    edge_attr='m_predicate',
)

# True when every entity of the sample can be reached from every other one.
nx.is_connected(g)

True

## Let's check all triplesets

In [419]:
# Build one directed graph per entry (subject -> object, predicate kept as an edge
# attribute) and record whether the tripleset is connected when edge direction is ignored.
is_connected = []  # rows of [category, ntriples, eid, connected?]

graphs = {}  # (category, ntriples, eid) -> nx.DiGraph of the entry's triples

for name, entry_group in corpus.mdf.groupby(['category', 'ntriples', 'eid']):

    # Work on a re-indexed copy: resetting the index in place would mutate the
    # sub-frame handed out by groupby, which pandas discourages.
    edges = entry_group.reset_index()

    d_g = nx.from_pandas_edgelist(edges, 'm_subject', 'm_object', 'm_predicate', create_using=nx.DiGraph())

    graphs[name] = d_g

    # Weak connectivity of the digraph equals connectivity of its undirected
    # version, without materialising a second graph per entry.
    is_connected.append(list(name) + [nx.is_weakly_connected(d_g)])

is_connected_df = pd.DataFrame(is_connected, columns=['category', 'ntriples', 'eid', 'is_connected'])

In [402]:
# Number of connected vs. disconnected triplesets.
is_connected_df['is_connected'].value_counts()

True     6930
False      10
Name: is_connected, dtype: int64

### Let's have a look at a disconnected tripleset

In [412]:
# Draw one row among the disconnected triplesets. The draw is unseeded,
# so the selected entry can differ between runs.
sample = is_connected_df.loc[~is_connected_df['is_connected']].sample()
sample

Unnamed: 0,category,ntriples,eid,is_connected
970,Airport,5,Id173,False


In [414]:
# Fetch one specific entry and show its triples and reference texts.
# Arguments presumably are (category, ntriples, eid), matching the groupby keys used on corpus.mdf -- confirm.
entry = corpus.get('Airport', 5, 'Id173')
entry.display()

Unnamed: 0,m_subject,m_predicate,m_object
0,Belgium,leaderName,Philippe_of_Belgium
1,Antwerp_International_Airport,cityServed,Antwerp
2,Flemish_Region,leaderName,Flemish_Government
3,Flemish_Region,country,Belgium
4,Flemish_Government,jurisdiction,Flemish_Region


Unnamed: 0,ltext
0,"Led by the Flemish government, the Flemish region is in the country of Belgium where Philippe of Belgium is the leader. Also in Belgium is Antwerp, which is served by Antwerp International airport."
1,The Flemish Government leads and has jurisdiction of the Flemish region. The Flemish region is in the country of Belgium where Philippe of Belgium is the leader and the city of Antwerp is served by Antwerp International Airport.
2,Antwerp International Airport serves the city of Antwerp in Belgium which is led by Philippe. The jurisdiction of the Flemish Government in Belgium is the Flemish Region.


# Is it possible to determine a 'root' entity?

In [420]:
from pprint import pprint

# Directed graph stored for this entry by the loop that filled `graphs`.
g = graphs[('Airport', 5, 'Id173')]

# Dump the adjacency as {source: {target: edge_attributes}} to inspect edge directions.
pprint(nx.to_dict_of_dicts(g))

{'Antwerp': {},
 'Antwerp_International_Airport': {'Antwerp': {'m_predicate': 'cityServed'}},
 'Belgium': {'Philippe_of_Belgium': {'m_predicate': 'leaderName'}},
 'Flemish_Government': {'Flemish_Region': {'m_predicate': 'jurisdiction'}},
 'Flemish_Region': {'Belgium': {'m_predicate': 'country'},
                    'Flemish_Government': {'m_predicate': 'leaderName'}},
 'Philippe_of_Belgium': {}}


In [421]:
# Rank the entities of the graph by degree, highest first; ties keep their original order.
sorted(
    g.degree,
    key=lambda node_and_degree: node_and_degree[1],
    reverse=True,
)

[('Flemish_Region', 3),
 ('Belgium', 2),
 ('Flemish_Government', 2),
 ('Philippe_of_Belgium', 1),
 ('Antwerp_International_Airport', 1),
 ('Antwerp', 1)]

# Max number of reference texts

In [422]:
# Entries with the most rows in corpus.ldf, i.e. the most reference texts.
(
    corpus.ldf
    .groupby(['category', 'ntriples', 'eid'])
    .size()
    .nlargest(5)
)

category   ntriples  eid 
Astronaut  1         Id37    8
Monument   1         Id23    8
Astronaut  1         Id14    7
                     Id20    7
                     Id34    7
dtype: int64

In [437]:
# Display the Astronaut entry that tops the reference-text count (8 texts for a single triple).
a = corpus.get('Astronaut', 1, 'Id37')
a.display()

Unnamed: 0,m_subject,m_predicate,m_object
0,Buzz_Aldrin,dateOfRetirement,"""1971-07-01"""


Unnamed: 0,ltext
0,Buzz Aldrin retired on the 7th of January 1971.
1,Buzz Aldrin date of retirement was 1971/07/01.
2,"Buzz Aldrin's retirement date is ""1971-07-01""."
3,"Buzz Aldrin retired on July 1st, 1971."
4,"Buzz Aldrin retired on July 1, 1971."
5,Buzz Aldrin retired on the 1st of July 1971.
6,Buzz Aldrin retired on 1971-07-01.
7,"Buzz Aldrin retired on Jul 1, 1971."


# Vocabulary size

In [435]:
# Distinct subjects, predicates and objects as sets, in that order.
s_set, p_set, o_set = (
    set(corpus.mdf[column].unique())
    for column in ('m_subject', 'm_predicate', 'm_object')
)

# Size of the combined vocabulary (a value used in several positions is counted once).
len(s_set | p_set | o_set)

2001

## Is there any predicate used as subject or object?

In [436]:
# Predicates that also occur as a subject or an object; an empty set means none do.
p_set & (s_set | o_set)

set()