In [1]:
import pandas as pd

# Show long text cells (triples, lexicalisations) in full instead of truncating
# them. The fully-qualified option name is used because abbreviated names are
# resolved by pattern matching and can become ambiguous if pandas adds options
# with similar names.
pd.set_option('display.max_colwidth', 1000)

In [11]:
# Category names substituted into the training-file path; one file is looked
# up per (category, number of triples) combination.
CATEGORIES = [
    'Airport',
    'Astronaut',
    'Building',
    'City',
    'ComicsCharacter',
    'Food',
    'Monument',
    'SportsTeam',
    'University',
    'WrittenWork',
]

In [47]:
import os
import xml.etree.ElementTree as ET
import pandas as pd

# Path of one training file, relative to the notebook's working directory.
# {0} is the number of triples per entry, {1} is the category name.
FILEPATH_TEMPLATE = (
    '../data/webnlg2017/challenge_data_train_dev/train/'
    '{0}triples/{0}triples_{1}_train_challenge.xml'
)

class WebNLGDataset(object):
    """Entries, triples and lexicalisations parsed from one WebNLG training file.

    A dataset is identified by the number of triples per entry and a category;
    both are substituted into ``FILEPATH_TEMPLATE`` to locate the XML file.

    Attributes:
        ntriples: the ``ntriples`` value passed to the constructor.
        category: the ``category`` value passed to the constructor.
        edf: DataFrame with one row per ``entry`` element.
        odf: DataFrame with one row per ``otriple`` element.
        mdf: DataFrame with one row per ``mtriple`` element.
        ldf: DataFrame with one row per ``lex`` element.
    """

    # Explicit column lists keep the schema stable even when a file yields no
    # records; a DataFrame built from an empty list would otherwise have no
    # columns and attribute access such as ``odf.eid`` would fail.
    _ENTRY_COLUMNS = ['category', 'eid', 'size', 'ntriples']
    _TRIPLE_COLUMNS = ['eid', 'text', 'ntriples', 'category']
    _LEX_COLUMNS = ['eid', 'text', 'comment', 'lid', 'ntriples', 'category']

    def __init__(self, ntriples, category):
        """Load the file for ``ntriples``/``category``.

        Raises:
            FileNotFoundError: if no file exists for this combination.
        """
        self.ntriples = ntriples
        self.category = category
        self.edf, self.odf, self.mdf, self.ldf = WebNLGDataset._read_files(ntriples, category)

    def sample(self, random_state=None):
        """Draw one random entry together with everything attached to it.

        Args:
            random_state: forwarded to ``DataFrame.sample`` for reproducibility.

        Returns:
            Tuple ``(entry, otriples, mtriples, lexes)``: the sampled one-row
            entry frame and the rows of ``odf``, ``mdf`` and ``ldf`` that share
            its ``eid``.
        """
        entry = self.edf.sample(random_state=random_state)
        # Look the id up once; the three filters below all key on it.
        eid = entry.eid.values[0]

        otriples = self.odf[self.odf.eid == eid]
        mtriples = self.mdf[self.mdf.eid == eid]
        lexes = self.ldf[self.ldf.eid == eid]

        return entry, otriples, mtriples, lexes

    @staticmethod
    def _read_files(ntriples, category):
        """Parse the XML file for ``ntriples``/``category`` into four DataFrames.

        Returns:
            Tuple ``(entries_df, otriples_df, mtriples_df, lexes_df)``.
        """
        tree = ET.parse(FILEPATH_TEMPLATE.format(ntriples, category))
        return WebNLGDataset._parse_entries(tree.getroot(), ntriples, category)

    @staticmethod
    def _parse_entries(root, ntriples, category):
        """Turn every ``entry`` element under ``root`` into DataFrame records.

        Kept separate from file access so the parsing can be exercised on an
        in-memory XML tree.

        Args:
            root: ElementTree element that contains the ``entry`` elements.
            ntriples: value stored in the ``ntriples`` column of every row.
            category: value stored in the ``category`` column of every row.

        Returns:
            Tuple ``(entries_df, otriples_df, mtriples_df, lexes_df)``.
        """
        entries, otriples, mtriples, lexes = [], [], [], []

        for entry in root.iter('entry'):
            eid = entry.attrib['eid']

            # The ``category`` argument is what ends up in the column. The
            # original dict literal also listed the entry's own ``category``
            # attribute under the same key, where it was silently overwritten;
            # that dead key is dropped. ``size`` is kept as the raw attribute
            # string.
            entries.append({
                "category": category,
                "eid": eid,
                "size": entry.attrib['size'],
                "ntriples": ntriples,
            })

            otriples.extend(
                {'eid': eid,
                 'text': triple.text,
                 "ntriples": ntriples,
                 "category": category}
                for triple in entry.find('originaltripleset').findall('otriple')
            )

            mtriples.extend(
                {'eid': eid,
                 'text': triple.text,
                 "ntriples": ntriples,
                 "category": category}
                for triple in entry.find('modifiedtripleset').findall('mtriple')
            )

            lexes.extend(
                {'eid': eid,
                 'text': lex.text,
                 'comment': lex.attrib['comment'],
                 'lid': lex.attrib['lid'],
                 "ntriples": ntriples,
                 "category": category}
                for lex in entry.findall('lex')
            )

        entries_df = pd.DataFrame(entries, columns=WebNLGDataset._ENTRY_COLUMNS)
        otriples_df = pd.DataFrame(otriples, columns=WebNLGDataset._TRIPLE_COLUMNS)
        mtriples_df = pd.DataFrame(mtriples, columns=WebNLGDataset._TRIPLE_COLUMNS)
        lexes_df = pd.DataFrame(lexes, columns=WebNLGDataset._LEX_COLUMNS)

        return entries_df, otriples_df, mtriples_df, lexes_df

In [62]:
class WebNLGCorpus(object):
    """All available WebNLG training datasets, plus their concatenated frames.

    Attributes:
        datasets: list of the ``WebNLGDataset`` objects that could be loaded.
        edf, odf, mdf, ldf: concatenation of the corresponding frame of every
            dataset. Indices are not reset, so index labels repeat across
            datasets.
    """

    def __init__(self):
        """Load one dataset per category and triple count (1 to 7).

        Raises:
            ValueError: if no dataset file could be found at all.
        """
        self.datasets = []
        for category in CATEGORIES:
            for ntriples in range(1, 8):
                try:
                    self.datasets.append(WebNLGDataset(ntriples, category))
                except FileNotFoundError:
                    # Deliberate best effort: combinations without a file on
                    # disk are skipped.
                    continue

        if not self.datasets:
            # pd.concat would raise a ValueError on an empty list anyway; say
            # what actually went wrong instead.
            raise ValueError('No WebNLG training file could be loaded; '
                             'check the data directory and the file path template.')

        self.edf = pd.concat([ds.edf for ds in self.datasets])
        self.odf = pd.concat([ds.odf for ds in self.datasets])
        self.mdf = pd.concat([ds.mdf for ds in self.datasets])
        self.ldf = pd.concat([ds.ldf for ds in self.datasets])

    def datasets_size(self):
        """Count entry rows per category (rows) and ntriples (columns).

        Returns:
            Pivot table of counts with an ``All`` margin; missing combinations
            are filled with 0.
        """
        return self.edf.pivot_table(index='category', columns='ntriples', aggfunc='count', margins=True).fillna(0)

    @property
    def statistics(self):
        """Number of entries per category (rows) and ntriples (columns).

        Combinations that have no entries are reported as 0.
        """
        return self.edf.groupby(['category', 'ntriples']).size().unstack().fillna(0)

    def dataset(self, category, ntriples):
        """Return the loaded dataset for ``category`` and ``ntriples``.

        ``self.datasets`` is a list, so it is searched rather than indexed by
        a ``(category, ntriples)`` tuple.

        Raises:
            KeyError: if no dataset was loaded for that combination.
        """
        for ds in self.datasets:
            if ds.category == category and ds.ntriples == ntriples:
                return ds
        raise KeyError((category, ntriples))

# Have a look at a sample

In [63]:
# Load every dataset file that can be found for the configured categories
# (1 to 7 triples each) and build the corpus-wide frames.
corpus = WebNLGCorpus()

In [64]:
# Entry counts per category and number of triples, with "All" margins;
# combinations without data show as 0.
corpus.datasets_size()

Unnamed: 0_level_0,eid,eid,eid,eid,eid,eid,eid,eid,size,size,size,size,size,size,size,size
ntriples,1,2,3,4,5,6,7,All,1,2,3,4,5,6,7,All
category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Airport,301.0,193.0,187.0,207.0,202.0,0.0,0.0,1090,301.0,193.0,187.0,207.0,202.0,0.0,0.0,1090
Astronaut,72.0,46.0,64.0,82.0,86.0,90.0,90.0,530,72.0,46.0,64.0,82.0,86.0,90.0,90.0,530
Building,236.0,171.0,203.0,206.0,156.0,0.0,0.0,972,236.0,171.0,203.0,206.0,156.0,0.0,0.0,972
City,243.0,0.0,0.0,0.0,0.0,0.0,0.0,243,243.0,0.0,0.0,0.0,0.0,0.0,0.0,243
ComicsCharacter,98.0,77.0,64.0,35.0,11.0,0.0,0.0,285,98.0,77.0,64.0,35.0,11.0,0.0,0.0,285
Food,272.0,278.0,314.0,323.0,237.0,0.0,0.0,1424,272.0,278.0,314.0,323.0,237.0,0.0,0.0,1424
Monument,38.0,32.0,42.0,48.0,45.0,36.0,26.0,267,38.0,32.0,42.0,48.0,45.0,36.0,26.0,267
SportsTeam,251.0,170.0,170.0,150.0,45.0,0.0,0.0,786,251.0,170.0,170.0,150.0,45.0,0.0,0.0,786
University,58.0,39.0,58.0,73.0,62.0,62.0,54.0,406,58.0,39.0,58.0,73.0,62.0,62.0,54.0,406
WrittenWork,219.0,202.0,248.0,170.0,98.0,0.0,0.0,937,219.0,202.0,248.0,170.0,98.0,0.0,0.0,937


In [69]:
# Average number of lexicalisations per entry, by category (rows) and number
# of triples (columns); combinations without data show as 0.
(
    corpus.ldf.groupby(['category', 'ntriples']).size()
    .div(corpus.edf.groupby(['category', 'ntriples']).size())
    .unstack()
    .fillna(0)
)

ntriples,1,2,3,4,5,6,7
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Airport,2.41196,2.544041,2.657754,2.816425,2.643564,0.0,0.0
Astronaut,4.027778,4.23913,3.203125,1.963415,2.627907,2.344444,2.688889
Building,2.152542,2.637427,2.527094,2.616505,2.352564,0.0,0.0
City,2.506173,0.0,0.0,0.0,0.0,0.0,0.0
ComicsCharacter,2.479592,2.766234,2.59375,2.771429,2.727273,0.0,0.0
Food,2.275735,2.733813,2.671975,2.582043,2.506329,0.0,0.0
Monument,3.947368,3.15625,2.5,2.8125,2.6,2.833333,2.807692
SportsTeam,2.123506,2.929412,2.770588,2.48,2.755556,0.0,0.0
University,3.051724,2.692308,2.603448,2.589041,2.564516,2.822581,2.888889
WrittenWork,2.237443,2.742574,2.78629,2.847059,2.520408,0.0,0.0


In [40]:
# Pick the ComicsCharacter dataset whose entries have 5 triples.
# NOTE(review): needs WebNLGCorpus to provide dataset(category, ntriples) --
# verify this cell runs on a fresh kernel.
comic5 = corpus.dataset('ComicsCharacter', 5)

# sample() returns the sampled entry plus its original triples, modified
# triples and lexicalisations; the fixed random_state makes the draw repeatable.
e, o, m, l = comic5.sample(random_state=10)
# Display the modified triples of the sampled entry.
m

Unnamed: 0,eid,text
10,Id3,Bananaman | broadcastedBy | BBC
11,Id3,Bananaman | creator | Steve_Bright
12,Id3,Bananaman | starring | Graeme_Garden
13,Id3,"Bananaman | firstAired | ""1983-10-03"""
14,Id3,"Bananaman | lastAired | ""1986-04-15"""


In [41]:
# Lexicalisations (reference texts) of the entry sampled in the previous cell.
l

Unnamed: 0,comment,eid,lid,text
6,good,Id3,Id1,Bananaman first aired on the BBC 10/03/1983 and its last broadcast was 15th April 1986. The creator of Bananaman is Steve Bright and the show stars Graeme Garden.
7,good,Id3,Id2,"Bananaman first aired on the BBC on October 3rd, 1983 and broadcast its last episode on April 15th, 1986. It was created by Steve Bright and stars Graeme Garden."


# Statistics

In [42]:
# Entry counts per category and number of triples.
# NOTE(review): needs WebNLGCorpus to provide a `statistics` attribute --
# verify this cell runs on a fresh kernel.
corpus.statistics

ntriplas,1,2,3,4,5,6,7
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Airport,301.0,193.0,187.0,207.0,202.0,0.0,0.0
Astronaut,72.0,46.0,64.0,82.0,86.0,90.0,90.0
Building,236.0,171.0,203.0,206.0,156.0,0.0,0.0
City,243.0,0.0,0.0,0.0,0.0,0.0,0.0
ComicsCharacter,98.0,77.0,64.0,35.0,11.0,0.0,0.0
Food,272.0,278.0,314.0,323.0,237.0,0.0,0.0
Monument,38.0,32.0,42.0,48.0,45.0,36.0,26.0
SportsTeam,251.0,170.0,170.0,150.0,45.0,0.0,0.0
University,58.0,39.0,58.0,73.0,62.0,62.0,54.0
WrittenWork,219.0,202.0,248.0,170.0,98.0,0.0,0.0


# Does it have duplicates?

In [9]:
# How often does each modified triple occur across the entries of this dataset?
comic5.mdf['text'].value_counts()

Bananaman | firstAired | "1983-10-03"               7
Bananaman | lastAired | "1986-04-15"                7
Bananaman | creator | Steve_Bright                  5
Bananaman | broadcastedBy | BBC                     5
Baymax | creator | Steven_T._Seagle                 4
Duncan_Rouleau | nationality | Americans            4
Baymax | creator | Duncan_Rouleau                   4
Baymax | series | Big_Hero_6_(film)                 4
Bananaman | starring | Graeme_Garden                3
Bananaman | creator | John_Geering                  2
Bananaman | broadcastedBy | "STV"                   2
Bananaman | starring | Jill_Shilling                2
Big_Hero_6_(film) | starring | Damon_Wayans,_Jr.    1
Big_Hero_6_(film) | starring | Ryan_Potter          1
Bananaman | starring | Bill_Oddie                   1
Bananaman | starring | Tim_Brooke-Taylor            1
Big_Hero_6_(film) | starring | Jamie_Chung          1
Big_Hero_6_(film) | starring | Maya_Rudolph         1
Name: text, dtype: int64

In [10]:
# List every row holding one of the most frequent triples, to see which
# entries (eid) it appears in.
comic5.mdf.loc[comic5.mdf['text'] == 'Bananaman | firstAired | "1983-10-03"']

Unnamed: 0,eid,text
3,Id1,"Bananaman | firstAired | ""1983-10-03"""
8,Id2,"Bananaman | firstAired | ""1983-10-03"""
13,Id3,"Bananaman | firstAired | ""1983-10-03"""
18,Id4,"Bananaman | firstAired | ""1983-10-03"""
22,Id5,"Bananaman | firstAired | ""1983-10-03"""
27,Id6,"Bananaman | firstAired | ""1983-10-03"""
33,Id7,"Bananaman | firstAired | ""1983-10-03"""
