In [1]:
from collections import Counter

from pandas import DataFrame
from polymatheia.data.reader import LocalReader
from polymatheia.data.writer import CSVWriter, LocalWriter, PandasDFWriter
from polymatheia.filter import RecordsFilter
from polymatheia.transform import RecordsTransform
from scipy import stats

In [2]:
# Source records are read from the 'europeana_test' location.
reader = LocalReader('europeana_test')

# Reduce every record to the five fields used in the rest of the notebook.
mapping = (
    'parallel',
    ('copy', 'id', 'id'),
    ('copy', 'lang', 'dcLanguage[0]'),  # only the first dcLanguage entry is used
    # Token count of the first title entry (assumes title[0] is a string, so
    # that .split() breaks it on whitespace).
    ('custom', 'title_tokens', lambda record: len(record.title[0].split())),
    ('copy', 'completeness', 'europeanaCompleteness'),
    ('copy', 'type', 'type'),
)
transformed = RecordsTransform(reader, mapping)

# Filter expression: 'lang' exists, 'lang' neq 'mul', 'title_tokens' lt 30,
# all combined with 'and'.
fltr = (
    'and',
    ('exists', ['lang']),
    ('neq', ['lang'], 'mul'),
    ('lt', ['title_tokens'], 30),
)
filtered = RecordsFilter(transformed, fltr)

# Collect the filtered records into a pandas DataFrame.
df = PandasDFWriter().write(filtered)

In [3]:
# Display the DataFrame built in the previous cell.
df

Unnamed: 0,id,lang,title_tokens,completeness,type
0,/9200386/BibliographicResource_3000044752173,de,13,9,TEXT
1,/0940434/_nnbZnb3,la,15,0,TEXT
2,/9200332/ABO__2BZ22634790X,ger,6,5,TEXT
3,/2059210/data_sounds_http___imslp_org_wiki_5_L...,deu,9,0,IMAGE
4,/2048609/data_item_uber_dingler_article_pj199_...,de,12,5,TEXT
...,...,...,...,...,...
759,/9200332/ABO__2BZ226348307,ger,6,5,TEXT
760,/9200386/BibliographicResource_3000045010262,de,4,7,TEXT
761,/9200386/BibliographicResource_3000045248460,de,10,8,TEXT
762,/9200386/BibliographicResource_3000135531152,de,5,10,TEXT


In [4]:
# Tally how often each raw 'lang' value occurs, most frequent first.
Counter(df['lang']).most_common()

[('de', 569),
 ('ger', 94),
 ('hu', 17),
 ('und', 12),
 ('deu', 9),
 ('fre', 9),
 ('pl', 8),
 ('Deutsch', 5),
 ('sv', 5),
 ('la', 4),
 ('cat', 4),
 ('pol', 4),
 ('da', 4),
 ('nl', 3),
 ('swe', 3),
 ('lat', 3),
 ('es', 2),
 ('en', 2),
 ('hun', 2),
 ('ita', 1),
 ('et', 1),
 ('fr', 1),
 ('it', 1),
 ('zxx', 1)]

In [5]:
def map_language(record):
    """Return a normalised language code for ``record.lang``.

    Three-letter codes and the spelled-out label ``'Deutsch'`` are replaced by
    a two-letter code; every other value is handed back unchanged.

    :param record: Object exposing the language value as the attribute ``lang``.
    :return: The replacement code if ``record.lang`` equals one of the listed
             variants, otherwise ``record.lang`` itself.
    """
    # (variant, replacement) pairs, checked in order with plain equality.
    replacements = (
        ('ger', 'de'),
        ('deu', 'de'),
        ('Deutsch', 'de'),
        ('hun', 'hu'),
        ('swe', 'sv'),
        ('pol', 'pl'),
        ('fre', 'fr'),
        ('cat', 'ca'),
        ('lat', 'la'),
        ('ita', 'it'),
    )
    lang = record.lang
    for variant, replacement in replacements:
        if lang == variant:
            return replacement
    return lang

# Source records are read from the 'europeana_test' location.
reader = LocalReader('europeana_test')

mapping = (
    'parallel',
    ('copy', 'id', 'id'),
    # 'lang' is first copied from dcLanguage[0] and then rewritten by
    # map_language within the same sequence.
    (
        'sequence',
        ('copy', 'lang', 'dcLanguage[0]'),
        ('custom', 'lang', map_language),
    ),
    # Token count of the first title entry (assumes title[0] is a string).
    ('custom', 'title_tokens', lambda record: len(record.title[0].split())),
    ('copy', 'completeness', 'europeanaCompleteness'),
    ('copy', 'type', 'type'),
)
transformed = RecordsTransform(reader, mapping)

# Filter expression: 'lang' exists, 'lang' neq each listed code, and
# 'title_tokens' lt 30, all combined with 'and'.
fltr = (
    'and',
    ('exists', ['lang']),
    *(('neq', ['lang'], code) for code in ('mul', 'zxx', 'und')),
    ('lt', ['title_tokens'], 30),
)
filtered = RecordsFilter(transformed, fltr)

# Collect the filtered records into a pandas DataFrame.
df = PandasDFWriter().write(filtered)

In [6]:
# Display the DataFrame rebuilt with the language codes passed through map_language.
df

Unnamed: 0,id,lang,title_tokens,completeness,type
0,/9200386/BibliographicResource_3000044752173,de,13,9,TEXT
1,/0940434/_nnbZnb3,la,15,0,TEXT
2,/9200332/ABO__2BZ22634790X,de,6,5,TEXT
3,/2059210/data_sounds_http___imslp_org_wiki_5_L...,de,9,0,IMAGE
4,/2048609/data_item_uber_dingler_article_pj199_...,de,12,5,TEXT
...,...,...,...,...,...
746,/9200332/ABO__2BZ226348307,de,6,5,TEXT
747,/9200386/BibliographicResource_3000045010262,de,4,7,TEXT
748,/9200386/BibliographicResource_3000045248460,de,10,8,TEXT
749,/9200386/BibliographicResource_3000135531152,de,5,10,TEXT


In [8]:
def map_language(record):
    """Collapse known spelling variants of a language code onto one code.

    NOTE(review): this cell re-defines ``map_language`` with the same behaviour
    as the definition in cell In [5]; the later definition shadows the earlier
    one.

    :param record: Object whose ``lang`` attribute holds the language value.
    :return: The two-letter code when ``record.lang`` equals one of the listed
             variants, otherwise ``record.lang`` unchanged.
    """
    # Each target code with the variants that are rewritten to it.
    canonical_codes = (
        ('de', ('ger', 'deu', 'Deutsch')),
        ('hu', ('hun',)),
        ('sv', ('swe',)),
        ('pl', ('pol',)),
        ('fr', ('fre',)),
        ('ca', ('cat',)),
        ('la', ('lat',)),
        ('it', ('ita',)),
    )
    lang = record.lang
    for code, variants in canonical_codes:
        if any(lang == variant for variant in variants):
            return code
    return lang

# Source records are read from the 'europeana_test' location.
reader = LocalReader('europeana_test')

mapping = (
    'parallel',
    ('copy', 'id', 'id'),
    # 'lang' is first copied from dcLanguage[0] and then rewritten by
    # map_language within the same sequence.
    (
        'sequence',
        ('copy', 'lang', 'dcLanguage[0]'),
        ('custom', 'lang', map_language),
    ),
    # Token count of the first title entry (assumes title[0] is a string).
    ('custom', 'title_tokens', lambda record: len(record.title[0].split())),
    ('copy', 'completeness', 'europeanaCompleteness'),
    ('copy', 'type', 'type'),
)
transformed = RecordsTransform(reader, mapping)

# Filter expression: 'lang' exists, 'lang' neq each listed code, and
# 'title_tokens' lt 30, all combined with 'and'. The neq clauses are generated
# in the listed order, one per code.
fltr = (
    'and',
    ('exists', ['lang']),
    *(
        ('neq', ['lang'], code)
        for code in ('mul', 'zxx', 'und', 'la', 'sv', 'es',
                     'da', 'nl', 'ca', 'it', 'et', 'en')
    ),
    ('lt', ['title_tokens'], 30),
)
filtered = RecordsFilter(transformed, fltr)

# Collect the filtered records into a pandas DataFrame.
df = PandasDFWriter().write(filtered)

In [9]:
# Write the filtered records out via LocalWriter to 'europeana_clean'.
# LocalWriter is imported in the notebook's import cell.
# NOTE(review): the second argument 'id' is presumably the record field used to
# identify each stored record — confirm against the LocalWriter documentation.
writer = LocalWriter('europeana_clean', 'id')
writer.write(filtered)


In [11]:
# Write the filtered records out via LocalWriter to 'europeana_clean'.
# LocalWriter and CSVWriter are imported in the notebook's import cell.
# NOTE(review): this LocalWriter step repeats what cell In [9] already does.
writer = LocalWriter('europeana_clean', 'id')
writer.write(filtered)

# Additionally write the same filtered records to a CSV file.
writer = CSVWriter('europeana_clean.csv')
writer.write(filtered)

In [12]:
# Count the rows per 'lang' value in the current DataFrame.
Counter(df['lang'])

Counter({'de': 677, 'hu': 19, 'pl': 12, 'fr': 10})

In [13]:
# One Series of title token counts per language code; the targets are
# unpacked in the same order as the codes are listed.
de_title_lengths, hu_title_lengths, pl_title_lengths, fr_title_lengths = (
    df.loc[df['lang'] == code, 'title_tokens']
    for code in ('de', 'hu', 'pl', 'fr')
)

In [14]:
# Two-sided Mann-Whitney U test comparing the 'de' and 'hu' title token counts.
stats.mannwhitneyu(de_title_lengths, hu_title_lengths, alternative='two-sided')

MannwhitneyuResult(statistic=4533.0, pvalue=0.027652701983659003)

In [17]:
# Summary statistics of the title token counts for 'de' rows.
de_title_lengths.describe()

count    677.000000
mean       8.918759
std        5.544672
min        1.000000
25%        5.000000
50%        8.000000
75%       11.000000
max       29.000000
Name: title_tokens, dtype: float64

In [18]:
# Summary statistics of the title token counts for 'hu' rows.
hu_title_lengths.describe()

count    19.000000
mean     11.263158
std       6.279084
min       1.000000
25%       8.500000
50%      13.000000
75%      15.000000
max      26.000000
Name: title_tokens, dtype: float64

In [19]:
# Hand-entered contingency table: one row per language, one column per type.
# NOTE(review): rows presumably correspond to de, hu, pl, fr (the row totals
# 19/12/10 match the hu/pl/fr tallies shown earlier), but the first row sums to
# 680 while the tally lists 677 'de' records — confirm these counts.
df2 = DataFrame({
    'text': [663, 19, 12, 10],
    'image': [12, 0, 0, 0],
    'sound': [5, 0, 0, 0],
})

In [20]:
# Display the hand-entered contingency table.
df2

Unnamed: 0,text,image,sound
0,663,12,5
1,19,0,0
2,12,0,0
3,10,0,0


In [21]:
# Chi-square test of independence on the hand-entered table.
# NOTE(review): most expected frequencies in the recorded output are far below
# 5, where the chi-square approximation is generally considered unreliable —
# consider an exact or permutation-based test instead.
stats.chi2_contingency(df2)

(1.049751420454546,
 0.983655785107175,
 6,
 array([[6.63966713e+02, 1.13176144e+01, 4.71567268e+00],
        [1.85520111e+01, 3.16227462e-01, 1.31761442e-01],
        [1.17170596e+01, 1.99722607e-01, 8.32177531e-02],
        [9.76421637e+00, 1.66435506e-01, 6.93481276e-02]]))