In [1]:
from os import listdir, makedirs
from os.path import isfile, isdir, join
from constants import DATA_BASE, ETL_PATH, NLP_PATH, SMPL_PATH, HASH
import pandas as pd
import gc
import re
import numpy as np
import json
# Let pandas render up to 2001 rows before it truncates a displayed frame.
pd.set_option('display.max_rows', 2001)

In [2]:
# File names containing any of these substrings are excluded by the
# statistics cells (they test `ignore_pattern.search(name)`).
ignore = [
    'good',
    'categories',
    'hashmap',
    'links',
    'meta',
    'phrases',
    'sample',
    'cache',
]
re_ignore = r'.*?({}).*?'.format('|'.join(ignore))
ignore_pattern = re.compile(re_ignore)

# Only file names that start with one of these corpus names are considered
# (the statistics cells test `include_pattern.match(name)`).
datasets = [
    'Europarl',
    'FAZ_combined',
    'FOCUS_cleansed',
    'OnlineParticipation',
    'PoliticalSpeeches',
    'dewiki',
    'dewac',
]
re_include = r'^({}).*'.format('|'.join(datasets))
include_pattern = re.compile(re_include)

In [None]:
# Statistics for the ETL pipeline.
#
# Every matching pickle in ETL_PATH contributes one record
#   (corpus, path, number of rows, None, None, first index value, first title)
# which the aggregation cell later labels as
#   file, path, nb_docs, nb_sents, nb_words, doc_hash, first_token.
# The two None slots are the sentence/word counts, which are not computed here.
# The records are written to ETL_PATH/stats_etl.json.
stats = []
path = ETL_PATH

# Only regular files pass this filter, so the loop below needs no second
# isfile() check.
files = sorted(
    (
        f for f in listdir(path)
        if isfile(join(path, f))
        and include_pattern.match(f)
        and not ignore_pattern.search(f)
    ),
    key=str.lower,
)

for name in files:
    gc.collect()

    print('reading', name)
    # Split on '_', '.' or '/' (the same separators the aggregation cell uses)
    # so that names without an underscore do not keep their file extension.
    corpus = re.split(r'[_./]', name)[0]
    df = pd.read_pickle(join(path, name))
    if len(df) == 0:
        # No first row to report; skip instead of aborting the whole run
        # with an IndexError before the JSON file is written.
        print('skipping empty file', name)
        continue

    first_index = int(df.index[0])
    first_title = df.title.iloc[0]
    stats.append((corpus, path, len(df), None, None, first_index, first_title))

with open(join(ETL_PATH, 'stats_etl.json'), 'w') as fp:
    json.dump(stats, fp, ensure_ascii=False)

In [None]:
# Statistics for the NLP pipeline.
#
# Every matching pickle in NLP_PATH contributes one record
#   (corpus, path, distinct `hash` values, distinct `sent_idx` values,
#    number of rows, first `hash` value, first `token` value)
# which the aggregation cell later labels as
#   file, path, nb_docs, nb_sents, nb_words, doc_hash, first_token.
# The records are written to ETL_PATH/stats_nlp.json, where the aggregation
# cell looks for all stats*.json files.
stats = []
path = NLP_PATH

# Only regular files pass this filter, so the loop below needs no second
# isfile() check.
files = sorted(
    (
        f for f in listdir(path)
        if isfile(join(path, f))
        and include_pattern.match(f)
        and not ignore_pattern.search(f)
    ),
    key=str.lower,
)

for name in files:
    gc.collect()

    print('reading', name)
    # Split on '_', '.' or '/' (the same separators the aggregation cell uses)
    # so that names without an underscore do not keep their file extension.
    corpus = re.split(r'[_./]', name)[0]
    df = pd.read_pickle(join(path, name))
    if len(df) == 0:
        # No first row to report; skip instead of aborting the whole run
        # with an IndexError before the JSON file is written.
        print('skipping empty file', name)
        continue

    nb_docs = len(np.unique(df.hash.values))
    nb_sents = len(np.unique(df.sent_idx.values))
    stats.append((
        corpus,
        path,
        nb_docs,
        nb_sents,
        len(df),
        int(df.hash.iloc[0]),
        df.token.iloc[0],
    ))
    gc.collect()

with open(join(ETL_PATH, 'stats_nlp.json'), 'w') as fp:
    json.dump(stats, fp, ensure_ascii=False)

In [None]:
# Statistics for the phrase extraction pipeline.
#
# Every matching pickle below SMPL_PATH contributes one record
#   (corpus, path, distinct `hash` values, distinct `sent_idx` values,
#    number of rows, first `hash` value, first `token` value)
# which the aggregation cell later labels as
#   file, path, nb_docs, nb_sents, nb_words, doc_hash, first_token.
# The records are written to ETL_PATH/stats_smpl.json.
stats = []
path = SMPL_PATH
files = [
    f for f in listdir(path)
    if include_pattern.match(f) and not ignore_pattern.search(f)
]

# include dewiki subdir
# Matching directories are expanded level by level. Each pass iterates over a
# separate list, so `files` is never modified while it is being looped over.
# The resulting order is the same as appending to `files` during iteration.
level = list(files)
while level:
    children = []
    for name in level:
        full_path = join(path, name)
        if isdir(full_path):
            children.extend(
                join(name, f) for f in listdir(full_path)
                if include_pattern.match(f) and not ignore_pattern.search(f)
            )
    files.extend(children)
    level = children

for name in sorted(files, key=str.lower):
    gc.collect()
    full_path = join(path, name)
    if not isfile(full_path):
        # directories (already expanded above) and anything else that is not
        # a regular file
        continue

    print('reading', name)
    df = pd.read_pickle(full_path)
    if len(df) == 0:
        # No first row to report; skip instead of aborting the whole run
        # with an IndexError before the JSON file is written.
        print('skipping empty file', name)
        continue

    # Split on '_', '.' or '/' (the same separators the aggregation cell uses)
    # so neither a file extension nor a 'subdir/file' remainder ends up in
    # the corpus field.
    corpus = re.split(r'[_./]', name)[0]
    doc_stats = (
        corpus,
        path,
        len(np.unique(df.hash.values)),
        len(np.unique(df.sent_idx.values)),
        len(df),
        int(df.hash.iloc[0]),
        df.token.iloc[0],
    )
    print(doc_stats)
    stats.append(doc_stats)
    gc.collect()

with open(join(ETL_PATH, 'stats_smpl.json'), 'w') as fp:
    json.dump(stats, fp, ensure_ascii=False)

In [None]:
# Statistics for the phrase extraction pipeline pt2. (including Wikipedia phrases)
#
# Every regular file in SMPL_PATH/wiki_phrases contributes one record
#   (corpus, path, distinct `hash` values, distinct `sent_idx` values,
#    number of rows, first `hash` value, first `token` value)
# which the aggregation cell later labels as
#   file, path, nb_docs, nb_sents, nb_words, doc_hash, first_token.
# Unlike the other statistics cells, no include/ignore name filter is applied.
# The records are written to ETL_PATH/stats_wiki_phrases.json.
stats = []
path = join(SMPL_PATH, 'wiki_phrases')
files = listdir(path)

for name in sorted(files, key=str.lower):
    gc.collect()
    full_path = join(path, name)
    if not isfile(full_path):
        continue

    print('reading', name)
    df = pd.read_pickle(full_path)
    if len(df) == 0:
        # No first row to report; skip instead of aborting the whole run
        # with an IndexError before the JSON file is written.
        print('skipping empty file', name)
        continue

    # Split on '_', '.' or '/' (the same separators the aggregation cell uses)
    # so that names without an underscore do not keep their file extension.
    corpus = re.split(r'[_./]', name)[0]
    doc_stats = (
        corpus,
        path,
        len(np.unique(df.hash.values)),
        len(np.unique(df.sent_idx.values)),
        len(df),
        int(df.hash.iloc[0]),
        df.token.iloc[0],
    )
    print(doc_stats)
    stats.append(doc_stats)
    gc.collect()

with open(join(ETL_PATH, 'stats_wiki_phrases.json'), 'w') as fp:
    json.dump(stats, fp, ensure_ascii=False)

-----

In [3]:
# Aggregating statistics.
#
# Loads every stats*json file from ETL_PATH, concatenates their records and
# sums the three count columns per (corpus, path). The result is written to
# ETL_PATH/stats.csv and displayed as the cell output.
path = ETL_PATH
files = sorted(f for f in listdir(path) if re.match(r'stats.*?json$', f))
stats = []
for name in files:
    with open(join(path, name), 'r') as fp:
        stats += json.load(fp)

count_cols = ['nb_docs', 'nb_sents', 'nb_words']
df = pd.DataFrame.from_records(
    stats,
    columns=['file', 'path', 'nb_docs', 'nb_sents', 'nb_words', 'doc_hash', 'first_token'],
)
# Normalise the recorded file/corpus field to the bare corpus name.
df['corpus'] = df['file'].map(lambda x: re.split(r'[_./]', x)[0])

# Sum only the count columns. A bare groupby(...).sum() depends on pandas
# silently dropping the string columns ('file', 'first_token'); newer pandas
# versions concatenate them instead, and summing 'doc_hash' has no meaning.
# Missing counts (None in the ETL records) are skipped by sum() and yield 0.
dfx = df.groupby(['corpus', 'path'])[count_cols].sum().astype('int64')
dfx.to_csv(join(ETL_PATH, 'stats.csv'))
dfx

Unnamed: 0_level_0,Unnamed: 1_level_0,nb_docs,nb_sents,nb_words
corpus,path,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Europarl,../data/preprocessed,12788,0,0
Europarl,../data/preprocessed/nlp,12788,2575653,56247967
Europarl,../data/preprocessed/simple,12788,2575653,54883666
Europarl,../data/preprocessed/simple/wiki_phrases,12788,2571547,54747950
FAZ,../data/preprocessed,49758,0,0
FAZ,../data/preprocessed/nlp,49758,1552118,27029604
FAZ,../data/preprocessed/simple,49758,1552118,26386078
FAZ,../data/preprocessed/simple/wiki_phrases,49758,1551960,26248649
FOCUS,../data/preprocessed,86158,0,0
FOCUS,../data/preprocessed/nlp,86158,1667720,25488267
