In [3]:
import numpy as np
import pandas as pd
from rdflib import Graph, ConjunctiveGraph, URIRef, Namespace, Literal

INFO:rdflib:RDFLib Version: 4.2.1


In [7]:
class CSVWStats():
    
    def __init__(self, graph):
        self.g = graph
        self.sg = Graph()
        self.variables = []

        self._init_data_frame()
        self._init_stats_graph()
        
    def _init_data_frame(self):
        '''
        Initializes pandas dataframe with dict of dicts {p: {s: o}}
        '''
        self.data = {}
        self.index = []
        
        qres = self.g.query("""
            PREFIX qb: <http://purl.org/linked-data/cube#>
            SELECT ?s ?p ?o WHERE {GRAPH ?g {?s ?p ?o. ?p a qb:DimensionProperty . }}
            """")
        for row in qres:
            if row[1] not in self.variables:
                self.variables.append(row[1]) # column names
            if row[0] not in self.index:
                self.index.append(row[0]) # row names
        
        self.variables = self.variables
        self.index = self.index
        
        for i in self.variables:
            for j in self.index:
                self.data[i] = {j : np.nan}
            
        for row in qres:
            try:
                self.data[row[1]][row[0]] = float(row[2])
            except ValueError:
                self.data[row[1]][row[0]] = row[2]
        
        self.df = pd.DataFrame(self.data, index=self.index)
        
    def _init_stats_graph(self):
        '''
        Initializes the stats graph with data from pandas df methods
        '''
        stats_uri = URIRef('http://example.org/stats/')
        stats = Namespace(stats_uri)
        
        for var in self.variables:
            try:
                self.sg.add((var, stats['count'], Literal(self.df.describe()[var]['count'])))
                self.sg.add((var, stats['mean'], Literal(self.df.describe()[var]['mean'])))
                self.sg.add((var, stats['std'], Literal(self.df.describe()[var]['std'])))
                self.sg.add((var, stats['min'], Literal(self.df.describe()[var]['min'])))
                self.sg.add((var, stats['p25'], Literal(self.df.describe()[var]['25%'])))
                self.sg.add((var, stats['p50'], Literal(self.df.describe()[var]['50%'])))
                self.sg.add((var, stats['p75'], Literal(self.df.describe()[var]['75%'])))
                self.sg.add((var, stats['max'], Literal(self.df.describe()[var]['max'])))
            except KeyError:
                # The dictionary of describe() has no stats for factors, what to do?
                pass
            
        self.sg.bind('stats', stats)
        
    def serialize(self):
        '''
        Serializes the current self.gs statistics graph
        '''
        return self.sg.serialize(format='nt')

        

SyntaxError: EOL while scanning string literal (<ipython-input-7-b51413fd2025>, line 21)

In [5]:
g = ConjunctiveGraph()
# g = Graph()
g.parse('/home/amp/src/wp4-converters/src/tests/utrecht_1829_clean_01.csv.nq', format='nquads')
s = CSVWStats(g)
with open('stats.n3', 'w') as f:
    f.write(s.serialize())

In [6]:
s.df

Unnamed: 0,http://data.socialhistory.org/resource/,http://data.socialhistory.org/resource/achternaam,http://data.socialhistory.org/resource/achtervoegsel,http://data.socialhistory.org/resource/beroep,http://data.socialhistory.org/resource/beroep_hb,http://data.socialhistory.org/resource/geboorteland,http://data.socialhistory.org/resource/geboorteland_hb,http://data.socialhistory.org/resource/geboorteplaats,http://data.socialhistory.org/resource/geboorteplaats_hb,http://data.socialhistory.org/resource/geboorteregio,...,http://www.w3.org/ns/csvw#encoding,http://www.w3.org/ns/csvw#name,http://www.w3.org/ns/csvw#primaryKey,http://www.w3.org/ns/csvw#quoteChar,http://www.w3.org/ns/csvw#tableSchema,http://www.w3.org/ns/csvw#title,http://www.w3.org/ns/csvw#url,http://www.w3.org/ns/prov#generatedAtTime,http://www.w3.org/ns/prov#wasDerivedFrom,http://www.w3.org/ns/prov#wasGeneratedBy
http://data.socialhistory.org/resource/utrecht_1829_clean_01/provenance/d71c71af/2016-12-05T10:14,,,,,,,,,,,...,,,,,,,,,,
http://data.socialhistory.org/resource/utrecht_1829_clean_01/nanopublication/d71c71af/2016-12-05T10:14,,,,,,,,,,,...,,,,,,,,2016-12-05T10:14,,https://github.com/CLARIAH/wp4-converters
http://data.socialhistory.org/resource/d71c71afb5a2dc4701627124ef91f01d1dfeff3a,,,,,,,,,,,...,,,,,,,,,,
http://data.socialhistory.org/resource/utrecht_1829_clean_01/assertion/d71c71af/2016-12-05T10:14,,,,,,,,,,,...,,,,,,,,2016-12-05T10:14,http://data.socialhistory.org/resource/utrecht...,
http://data.socialhistory.org/resource/utrecht_1829_clean_01/pubinfo/d71c71af/2016-12-05T10:14,,,,,,,,,,,...,,,,,,,,,,
http://data.socialhistory.org/resource/301,302.0,walker,,zb,kruidenier,engeland,nederland,londen,utrecht,,...,,,,,,,,,,
http://data.socialhistory.org/resource/749,750.0,"erps, van",,kamerknegt,gouverneur dezer provincie,belgië,belgië,walhem,antwerpen,bij mechelen,...,,,,,,,,,,
http://data.socialhistory.org/resource/1022,1023.0,andel,,bestelder,letterzetter,duitsland,nederland,hanig,utrecht,bij crenerak in de midden palz,...,,,,,,,,,,
http://data.socialhistory.org/resource/106,107.0,lietze,,schoenmaker,schoenmaker,duitsland,duitsland,potsdam,potsdam,,...,,,,,,,,,,
http://data.socialhistory.org/resource/190,191.0,gödde,91,brouwersknecht,boekhouder in de bierbrouwerij de borg,duitsland,nederland,luddinghausen,utrecht,,...,,,,,,,,,,


In [52]:
s.df.describe(URIRef('http://data.socialhistory.org/resource/AGE'))


TypeError: ufunc 'divide' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [34]:
by_age = s.df.groupby(URIRef('http://data.socialhistory.org/resource/AGE')).describe()
by_age

Unnamed: 0_level_0,Unnamed: 1_level_0,http://data.socialhistory.org/resource/BPLCNTRY,http://data.socialhistory.org/resource/BPLCTYGB,http://data.socialhistory.org/resource/CNTRY,http://data.socialhistory.org/resource/COUNTYGB,http://data.socialhistory.org/resource/GQ,http://data.socialhistory.org/resource/LABFORCE,http://data.socialhistory.org/resource/MARST,http://data.socialhistory.org/resource/NCOUPLES,http://data.socialhistory.org/resource/NFAMS,http://data.socialhistory.org/resource/NFATHERS,...,http://data.socialhistory.org/resource/ORDERGB,http://data.socialhistory.org/resource/PERNUM,http://data.socialhistory.org/resource/PERWT,http://data.socialhistory.org/resource/RELATE,http://data.socialhistory.org/resource/SAMPLE,http://data.socialhistory.org/resource/SERIAL,http://data.socialhistory.org/resource/SERVANTS,http://data.socialhistory.org/resource/SEX,http://data.socialhistory.org/resource/URBAN,http://data.socialhistory.org/resource/YEAR
http://data.socialhistory.org/resource/AGE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0.0,count,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
0.0,mean,,,,,,,,,,,...,,,,,,,,,,1881.0
0.0,std,,,,,,,,,,,...,,,,,,,,,,
0.0,min,,,,,,,,,,,...,,,,,,,,,,1881.0
0.0,25%,,,,,,,,,,,...,,,,,,,,,,1881.0
0.0,50%,,,,,,,,,,,...,,,,,,,,,,1881.0
0.0,75%,,,,,,,,,,,...,,,,,,,,,,1881.0
0.0,max,,,,,,,,,,,...,,,,,,,,,,1881.0
1.0,count,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.0,mean,,,,,,,,,,,...,,,,,,,,,,


In [36]:
by_age.unstack(0)

Unnamed: 0_level_0,http://data.socialhistory.org/resource/BPLCNTRY,http://data.socialhistory.org/resource/BPLCNTRY,http://data.socialhistory.org/resource/BPLCNTRY,http://data.socialhistory.org/resource/BPLCNTRY,http://data.socialhistory.org/resource/BPLCNTRY,http://data.socialhistory.org/resource/BPLCNTRY,http://data.socialhistory.org/resource/BPLCNTRY,http://data.socialhistory.org/resource/BPLCNTRY,http://data.socialhistory.org/resource/BPLCNTRY,http://data.socialhistory.org/resource/BPLCNTRY,...,http://data.socialhistory.org/resource/YEAR,http://data.socialhistory.org/resource/YEAR,http://data.socialhistory.org/resource/YEAR,http://data.socialhistory.org/resource/YEAR,http://data.socialhistory.org/resource/YEAR,http://data.socialhistory.org/resource/YEAR,http://data.socialhistory.org/resource/YEAR,http://data.socialhistory.org/resource/YEAR,http://data.socialhistory.org/resource/YEAR,http://data.socialhistory.org/resource/YEAR
http://data.socialhistory.org/resource/AGE,0.0,1.0,2.0,3.0,4.0,5.0,8.0,12.0,13.0,16.0,...,35.0,41.0,43.0,46.0,49.0,50.0,52.0,60.0,66.0,68.0
count,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,,,,,,,,,,,...,,,,,,,,,,
std,,,,,,,,,,,...,,,,,,,,,,
min,,,,,,,,,,,...,,,,,,,,,,
25%,,,,,,,,,,,...,,,,,,,,,,
50%,,,,,,,,,,,...,,,,,,,,,,
75%,,,,,,,,,,,...,,,,,,,,,,
max,,,,,,,,,,,...,,,,,,,,,,


In [40]:
s.df.describe()[URIRef('http://data.socialhistory.org/resource/AGE')]['count']

33.0

In [22]:
# DataFrame init
d = {'p1' : pd.Series(['male', 'female', 'male'], index=['s1', 's2', 's3']),
     'p2' : pd.Series([4., 5., 6.], index=['s1', 's2', 's3'])}
df = pd.DataFrame(d)
df


Unnamed: 0,p1,p2
s1,male,4.0
s2,female,5.0
s3,male,6.0


In [25]:
# Stats
df.describe()['p1']
                

KeyError: 'p1'