# Overview of Downloaded Europeana Meta-Data

In [1]:
import numpy as np
import pandas as pd

import os

# Loading the Meta-Data

The meta-data used in this evaluation has been downloaded from Europeana using the Europeana-API.

In [2]:
# Location of the pickled metadata frame. Set the EU_SOUNDS_METADATA environment
# variable to point at a different copy; the default is the original local path.
METADATA_PATH = os.environ.get(
    "EU_SOUNDS_METADATA",
    "E:/Data/MIR/EU_SOUNDS_FEATURES/aggregated_metadata.p",
)

# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
metadata = pd.read_pickle(METADATA_PATH)

# Meta-Data Analysis

The following attributes have been extracted from the json-formatted API-response:

In [3]:
# One row per attribute name, in column order.
pd.DataFrame(list(metadata.columns))

Unnamed: 0,0
0,title
1,aboutaggregations.0.about
2,aggregations.0.edmDataProvider.def
3,aggregations.0.edmProvider.def
4,aggregations.0.edmRights.def
5,aggregations.0.webResources.0.about
6,aggregations.0.edmIsShownBy
7,proxies.0.dctermsCreated.def
8,proxies.0.dctermsExtent.def
9,proxies.0.dctermsMedium.def


Number of attributes used:

In [4]:
# Number of columns (attributes) in the metadata frame.
len(metadata.columns)

26

**Number of data instances available**

In [5]:
# Number of rows (data instances) in the metadata frame.
len(metadata.index)

389120

Example Entry:

In [6]:
# Show the record at position 110 as a Series (one value per attribute).
metadata.iloc[110]

title                                                                      SANTA GIOVANNA
aboutaggregations.0.about                                                            None
aggregations.0.edmDataProvider.def                              Regione Marche / SchedeBA
aggregations.0.edmProvider.def                                              CulturaItalia
aggregations.0.edmRights.def                         http://www.europeana.eu/rights/rr-r/
aggregations.0.webResources.0.about     http://www.culturaitalia.it/opencms/viewItem.j...
aggregations.0.edmIsShownBy             http://sirpac.cultura.marche.it/sirpacintraweb...
proxies.0.dctermsCreated.def                                                         None
proxies.0.dctermsExtent.def             45m - 60m (I), 15m - 30m (II), altro: 59 m (II...
proxies.0.dctermsMedium.def                                                          None
proxies.0.dctermsSpatial.def            Centro Studi Drammaturgici Internazionali 'Fra...
proxies.0.

## Collections

### Data Providers

**Number of Providers**

In [7]:
# Count the distinct values of the data-provider column.
len(metadata["aggregations.0.edmDataProvider.def"].unique())

1160

Data providers are institutions (e.g. ONB, BNF).

In [8]:
# Distinct data-provider names, in order of first appearance.
pd.DataFrame(pd.unique(metadata["aggregations.0.edmDataProvider.def"]))

Unnamed: 0,0
0,Regione Marche / SchedeBA
1,WebFolk Bulgaria
2,Bibliothèque Medem - Maison de la Culture Yiddish
3,Centre Français des Musiques Juives
4,Jewish Museum London
5,Bibliothek der Universität für Musik und darst...
6,"Békés Megyei Tudásház es Könyvtár, HU"
7,Biblioteca Pública Municipal do Porto
8,"ETI BioInformatics, Leiden, the Netherlands"
9,Zoological Research Museum Koenig


## Providers / Aggregators

Providers can be aggregators, projects, thematic aggregators, etc. They take data from data providers and try to convert it into a common format (EDM).

Number of data items per Aggregator:

In [9]:
# Count the records of each aggregator; the result is a one-column frame named "count".
items_per_provider = (
    metadata
    .groupby("aggregations.0.edmProvider.def")["aggregations.0.edmProvider.def"]
    .count()
    .to_frame("count")
)
items_per_provider

Unnamed: 0_level_0,count
aggregations.0.edmProvider.def,Unnamed: 1_level_1
Arts Council Norway,235
Athena,4633
AthenaPlus,7826
"Békés Megyei Tudásház es Könyvtár, HU",24
CulturaItalia,2809
DISMARC,325388
DK-National Aggregation Service,2
Deutsche Digitale Bibliothek,1661
Europeana 1914-1918,59
Europeana Inside Sweden,190


**Provider statistics**

average number of items per provider

In [10]:
# Column-wise mean of the per-aggregator item counts.
items_per_provider.mean(axis=0)

count    14492.576923
dtype: float64

standard deviation

In [11]:
# Column-wise standard deviation of the per-aggregator item counts.
items_per_provider.std(axis=0)

count    63519.269489
dtype: float64

Median - more robust to the imbalanced distribution (a single aggregator, DISMARC, dominates the item counts)

In [12]:
# Column-wise median of the per-aggregator item counts.
items_per_provider.median(axis=0)

count    238.5
dtype: float64

**Number of data providers per Aggregator**

In [13]:
# Group on the (aggregator, data provider) pair; every distinct pair forms one group.
group = metadata.groupby(
    ["aggregations.0.edmProvider.def", "aggregations.0.edmDataProvider.def"]
)

# Rebuild a frame with one row per distinct pair from the group keys.
d2 = pd.DataFrame(
    group.size().index.tolist(),
    columns=["aggregations.0.edmProvider.def", "aggregations.0.edmDataProvider.def"],
)

# Rows per aggregator == number of distinct data providers under that aggregator.
d2.groupby("aggregations.0.edmProvider.def").agg(['count'])

Unnamed: 0_level_0,aggregations.0.edmDataProvider.def
Unnamed: 0_level_1,count
aggregations.0.edmProvider.def,Unnamed: 1_level_2
Arts Council Norway,24
Athena,1
AthenaPlus,4
"Békés Megyei Tudásház es Könyvtár, HU",1
CulturaItalia,1
DISMARC,1081
DK-National Aggregation Service,1
Deutsche Digitale Bibliothek,1
Europeana 1914-1918,1
Europeana Inside Sweden,1


# Collections

Number of data items per collection

In [14]:
# Count the records of each collection; the result is a one-column frame named "count".
collections = (
    metadata
    .groupby("europeanaCollectionName")["europeanaCollectionName"]
    .count()
    .to_frame("count")
)

Number of Collections

In [15]:
# One row per collection, so the row count is the number of collections.
len(collections.index)

55

Collections per Aggregator

In [16]:
# Group on the (aggregator, collection) pair; every distinct pair forms one group.
group = metadata.groupby(
    ["aggregations.0.edmProvider.def", "europeanaCollectionName"]
)

# Rebuild a frame with one row per distinct pair from the group keys.
d2 = pd.DataFrame(
    group.size().index.tolist(),
    columns=["aggregations.0.edmProvider.def", "europeanaCollectionName"],
)

# Rows per aggregator == number of distinct collections under that aggregator.
d2.groupby("aggregations.0.edmProvider.def").agg(['count'])

Unnamed: 0_level_0,europeanaCollectionName
Unnamed: 0_level_1,count
aggregations.0.edmProvider.def,Unnamed: 1_level_2
Arts Council Norway,1
Athena,1
AthenaPlus,4
"Békés Megyei Tudásház es Könyvtár, HU",1
CulturaItalia,1
DISMARC,1
DK-National Aggregation Service,1
Deutsche Digitale Bibliothek,1
Europeana 1914-1918,1
Europeana Inside Sweden,1


Collection names per Aggregator

In [17]:
# Item count for every (aggregator, collection) pair, shown as a one-column frame.
group = metadata.groupby(
    ["aggregations.0.edmProvider.def", "europeanaCollectionName"]
)
group["aggregations.0.edmProvider.def"].count().to_frame("count")

Unnamed: 0_level_0,Unnamed: 1_level_0,count
aggregations.0.edmProvider.def,europeanaCollectionName,Unnamed: 2_level_1
Arts Council Norway,2022611_Ag_NO_ELocal_difo,235
Athena,08506_Ag_EU_ATHENA_Central_Library_of_the_Bulgaria,4633
AthenaPlus,2048008_Ag_EU_AthenaPlus_AjuntamentGirona,2859
AthenaPlus,2048024_Ag_EU_AthenaPlus_CentraleRoma,4937
AthenaPlus,2048048_Ag_EU_AthenaPlus_Merab,28
AthenaPlus,2048087_Ag_EU_AthenaPlus_CollectionsTrust,2
"Békés Megyei Tudásház es Könyvtár, HU",09408_Ag_HU_ELocal,24
CulturaItalia,07602_Ag_IT_Culturalitalia_RegioneMarche,2809
DISMARC,2023601_Ag_DE_DISMARC,325388
DK-National Aggregation Service,2020902_Ag_DK_Elocal,2


In [18]:
# Number of distinct collections per aggregator.
# The grouping is rebuilt here so the cell does not depend on `group` leaking
# from an earlier cell.
group = metadata.groupby(["aggregations.0.edmProvider.def", "europeanaCollectionName"])

# The second key of the grouping is the collection name, so label it as such
# (it was previously mislabelled as the data-provider column).
d2 = pd.DataFrame(group["aggregations.0.edmProvider.def"].agg(['count']).index.tolist(),
                  columns=["aggregations.0.edmProvider.def", "europeanaCollectionName"])

d2.groupby("aggregations.0.edmProvider.def").agg(['count'])

Unnamed: 0_level_0,aggregations.0.edmDataProvider.def
Unnamed: 0_level_1,count
aggregations.0.edmProvider.def,Unnamed: 1_level_2
Arts Council Norway,1
Athena,1
AthenaPlus,4
"Békés Megyei Tudásház es Könyvtár, HU",1
CulturaItalia,1
DISMARC,1
DK-National Aggregation Service,1
Deutsche Digitale Bibliothek,1
Europeana 1914-1918,1
Europeana Inside Sweden,1


In [4]:
# Keep only the records that belong to the collection analysed below.
collection = metadata.loc[
    metadata["europeanaCollectionName"].eq("2048401_Ag_DE_DDB_eseslub")
]

In [5]:
# Word frequencies in the subject field of the selected collection.
# Records without a subject are dropped before tokenising, because calling
# `.split()` on a missing value raises AttributeError.
subjects = collection["proxies.0.dcSubject.def"].dropna()

# Whitespace-tokenise every subject string and strip commas from each token.
words = [token.replace(",", "") for text in subjects for token in text.split()]

# Distinct tokens with their occurrence counts.
u, counts = np.unique(words, return_counts=True)

# `DataFrame.sort` no longer exists in pandas; `sort_values` is its replacement.
pd.DataFrame({"words": u, "counts": counts}).sort_values("counts", ascending=False)[:15]



Unnamed: 0,counts,words
102,1749,Schellackplatte
79,1019,Oper
60,397,Lied
5,71,Arie
80,54,Operette
19,24,Duett
13,24,Chor
99,22,Schallplatte
75,19,Musik
53,13,Konzert


In [9]:
# Word frequencies over description, subject and title of the selected collection.
# The columns are processed in this order, matching the original concatenation.
text_columns = ["proxies.0.dcDescription.def", "proxies.0.dcSubject.def", "title"]

# Missing values are dropped per column before tokenising, because calling
# `.split()` on a missing value raises AttributeError. Commas are stripped
# from each whitespace-separated token.
words = [
    token.replace(",", "")
    for column in text_columns
    for text in collection[column].dropna()
    for token in text.split()
]

# Distinct tokens with their occurrence counts.
u, counts = np.unique(words, return_counts=True)

# `DataFrame.sort` no longer exists in pandas; `sort_values` is its replacement.
pd.DataFrame({"words": u, "counts": counts}).sort_values("counts", ascending=False)[:15]





Unnamed: 0,counts,words
1264,3410,/
4356,1749,Schellackplatte
1719,1432,Auswahl
1493,1377,:
3899,1068,Oper
3387,491,Lied
7578,469,von
5572,449,aus
5862,365,der
2129,349,Die


In [67]:
# Frequencies of complete titles (not split into words) in the selected collection.
# Records without a title are dropped, because calling `.replace()` on a
# missing value raises AttributeError.
titles = collection["title"].dropna()
words = [title.replace(",", "") for title in titles]

# Distinct titles with their occurrence counts.
u, counts = np.unique(words, return_counts=True)

# `DataFrame.sort` no longer exists in pandas; `sort_values` is its replacement.
pd.DataFrame({"words": u, "counts": counts}).sort_values("counts", ascending=False)[:15]

Unnamed: 0,counts,words
78,29,Carmen / Auswahl
308,22,Faust / Auswahl
613,19,Lohengrin / Auswahl
718,18,Pagliacci / Auswahl
415,15,Il trovatore / Auswahl
209,14,Die Walküre / Auswahl
872,14,Tannhäuser / Auswahl
504,14,La bohème / Auswahl
870,13,Tannhäuser &lt;O du mein holder Abendstern&gt;
127,13,Der Freischütz / Auswahl


In [3]:
# Word frequencies in the description field of the selected collection.
# `collection` must already be defined by the collection-selection cell.
# Records without a description are dropped *before* tokenising: the former
# `v != None` filter ran after `x.split()` and therefore could never prevent
# the AttributeError raised for a missing value.
descriptions = collection["proxies.0.dcDescription.def"].dropna()

# Whitespace-tokenise every description and strip commas from each token.
words = [token.replace(",", "") for text in descriptions for token in text.split()]

# Distinct tokens with their occurrence counts.
u, counts = np.unique(words, return_counts=True)

# `DataFrame.sort` no longer exists in pandas; `sort_values` is its replacement.
pd.DataFrame({"words": u, "counts": counts}).sort_values("counts", ascending=False)[:35]

NameError: name 'collection' is not defined

In [73]:
# Show the first record of the selected collection as a Series.
collection.iloc[0, :]

title                                                                Don Carlos / Auswahl
aboutaggregations.0.about                                                            None
aggregations.0.edmDataProvider.def      Sächsische Landesbibliothek - Staats- und Univ...
aggregations.0.edmProvider.def                               Deutsche Digitale Bibliothek
aggregations.0.edmRights.def                         http://www.europeana.eu/rights/rr-f/
aggregations.0.webResources.0.about     http://mediathek.slub-dresden.de/ton70911629.html
aggregations.0.edmIsShownBy             http://media.slub-dresden.de/fon/snp/a/007899/...
proxies.0.dctermsCreated.def                                                         None
proxies.0.dctermsExtent.def                                                          None
proxies.0.dctermsMedium.def                                                          None
proxies.0.dctermsSpatial.def                               http://d-nb.info/gnd/4012995-0
proxies.0.