# Statistics

In [19]:
import json
import os

from collections import Counter, defaultdict, namedtuple
from itertools import groupby
from multiprocessing import cpu_count
from operator import itemgetter
from string import punctuation

import matplotlib.pyplot as plt

from conllu.parser import parse as conllu_parse
from tqdm import tqdm

%matplotlib inline

## Number of sentences (after filtering)

In [2]:
sents = 0
path = 'aligned_sents/'
for sub in tqdm(os.listdir(path)):
    with open(os.path.join(path, sub), encoding='utf-8') as f:
        sents += len(json.load(f))
print(sents)

100%|██████████| 30883/30883 [00:31<00:00, 979.78it/s]

23062193





## Number of tokens

In [3]:
en_tokens = 0
he_tokens = 0
errors = 0
path = 'fastalign/'
for sub in tqdm(os.listdir(path)):
    with open(os.path.join(path, sub), encoding='utf-8') as f:
        for line in f:
            try:
                en, he = line.strip().split(' ||| ')
                en_tokens += len(en.strip().split())
                he_tokens += len(he.strip().split())
            except:
                errors += 1
print('Errors:', errors)
print('English tokens:', en_tokens)
print('Hebrew tokens:', he_tokens)

100%|██████████| 30883/30883 [01:59<00:00, 259.23it/s]

Errors: 1945
English tokens: 194217249
Hebrew tokens: 188375525





## Average length of English sentences

In [4]:
sents = 23062193
lengths = 0
path = 'aligned_sents/'
for sub in tqdm(os.listdir(path)):
    with open(os.path.join(path, sub), encoding='utf-8') as f:
        for sent in json.load(f):
            lengths += len(sent['en'].strip().split())
print(lengths / sents)

100%|██████████| 30883/30883 [01:02<00:00, 491.94it/s]

6.452778233188838





## Average length of Hebrew sentences - before segmentation

In [5]:
sents = 23062193
lengths = 0
path = 'aligned_sents/'
for sub in tqdm(os.listdir(path)):
    with open(os.path.join(path, sub), encoding='utf-8') as f:
        for sent in json.load(f):
            lengths += len(sent['he'].strip().split())
print(lengths / sents)

100%|██████████| 30883/30883 [01:03<00:00, 486.29it/s]

5.126847477167501





## Average length of Hebrew sentences - after segmentation

In [6]:
sents = 23062193
lengths = 0
path = 'fastalign/'
for sub in tqdm(os.listdir(path)):
    with open(os.path.join(path, sub), encoding='utf-8') as f:
        for line in f:
            _, he = line.split(' ||| ')
            lengths += len(he.strip().split())
print(lengths / sents)

100%|██████████| 30883/30883 [01:20<00:00, 381.54it/s]

8.168153176066127





## Number of English tokens/types

In [7]:
tokens = []
path = 'aligned_sents/'
for sub in tqdm(os.listdir(path)):
    with open(os.path.join(path, sub), encoding='utf-8') as f:
        for sent in json.load(f):
            tokens.extend(sent['en'].strip().split())
print('Tokens:', len(tokens))
print('Types:', len(set(tokens)))

100%|██████████| 30883/30883 [01:10<00:00, 439.73it/s]


Tokens: 148815217
Types: 1540672


## Number of Hebrew tokens/types - before segmentation

In [8]:
tokens = []
path = 'aligned_sents/'
for sub in tqdm(os.listdir(path)):
    with open(os.path.join(path, sub), encoding='utf-8') as f:
        for sent in json.load(f):
            tokens.extend(sent['he'].strip().split())
print('Tokens:', len(tokens))
print('Types:', len(set(tokens)))

100%|██████████| 30883/30883 [01:06<00:00, 467.75it/s]


Tokens: 118236346
Types: 2468583


## Number of Hebrew tokens/types - after segmentation

In [9]:
tokens = []
path = 'fastalign/'
for sub in tqdm(os.listdir(path)):
    with open(os.path.join(path, sub), encoding='utf-8') as f:
        for line in f:
            _, he = line.split(' ||| ')
            tokens.extend(he.strip().split())
print('Tokens:', len(tokens))
print('Types:', len(set(tokens)))

100%|██████████| 30883/30883 [01:25<00:00, 363.08it/s]


Tokens: 188375525
Types: 894759


In [10]:
c = Counter(tokens)
len([(word, count) for word, count in c.items() if count == 1])

570142

In [11]:
Node = namedtuple('Node', ['token', 'distance'])


def parse_tree_depth(sentence):
    root = [t for t in sentence if t['head'] == 0][0]
    root = Node(root, 0)
    visited, stack = [], [root]
    while stack:
        node = stack.pop()
        if node not in visited:
            visited.append(node)
            stack.extend([Node(t, node.distance + 1) for t in sentence if t['head'] == node.token['id']])
    return max(visited, key=itemgetter(1)).distance

## Average Hebrew parse tree depth

In [12]:
path = 'hebrew_parsed/'
sents_count = 23062193
depths_count = 0
for h in tqdm(os.listdir(path)):
    with open(os.path.join(path, h), encoding='utf-8') as f:
        sents = conllu_parse(f.read())
        for sent in sents:
            depths_count += parse_tree_depth(sent)
print(depths_count / sents_count)

100%|██████████| 30789/30789 [4:58:36<00:00,  1.72it/s]  

2.956006308680185





## Average English parse tree depth

In [13]:
path = 'english_parsed/'
sents_count = 23062193
depths_count = 0
for h in tqdm(os.listdir(path)):
    with open(os.path.join(path, h), encoding='utf-8') as f:
        sents = conllu_parse(f.read())
        for sent in sents:
            depths_count += parse_tree_depth(sent)
print(depths_count / sents_count)

100%|██████████| 30789/30789 [3:48:33<00:00,  2.25it/s]  

2.441202621103726





## English-Hebrew ratio - before segmentation

In [14]:
path = 'aligned_sents/'
ratios = 0
sents = 23062193
for sub in tqdm(os.listdir(path)):
    with open(os.path.join(path, sub), encoding='utf-8') as f:
        for sent in json.load(f):
            en = len(sent['en'].strip().split())
            he = len(sent['he'].strip().split())
            if en and he:
                ratios += en / he
print(ratios / sents)

100%|██████████| 30883/30883 [01:43<00:00, 297.93it/s]

1.328704602746318





## English-Hebrew ratio - after segmentation

In [15]:
path = 'fastalign/'
ratios = 0
errors = 0
sents = 23062193
for sub in tqdm(os.listdir(path)):
    with open(os.path.join(path, sub), encoding='utf-8') as f:
        for line in f:
            try:
                en, he = line.strip().split(' ||| ')
                en = len(en.strip().split())
                he = len(he.strip().split())
                if en and he:
                    ratios += en / he
            except:
                errors += 1
print('Errors:', errors)
print(ratios / sents)

100%|██████████| 30883/30883 [02:07<00:00, 242.66it/s]

Errors: 1945
1.1205079794412722





## Average number of Hebrew words aligned to English

In [16]:
path = 'fastalign_outputs/'
en_words = 0
he_words = 0
for f_alignment in tqdm(os.listdir(path)):
    with open(os.path.join(path, f_alignment), encoding='utf-8') as f:
        for line in f:
            pairs = line.strip().split()
            pairs = sorted([tuple(map(int, p.split('-'))) for p in pairs])
            for _, g in groupby(pairs, key=itemgetter(0)):
                en_words += 1
                he_words += len(list(g))
print(he_words / en_words)

100%|██████████| 30789/30789 [13:07<00:00, 39.08it/s]

1.2506758527707749





## Number of English-Hebrew word pairs

In [17]:
path = 'fastalign_outputs/'
pairs = 0
for f_alignment in tqdm(os.listdir(path)):
    with open(os.path.join(path, f_alignment), encoding='utf-8') as f:
        for line in f:
            pairs += len(line.strip().split())
            
print(pairs)

100%|██████████| 30789/30789 [00:36<00:00, 840.08it/s]

181765009





## fast_align Pair Statistics

In [18]:
path = 'fastalign_outputs/'
one_to_zero = 0
one_to_one = 0
one_to_many = 0
for f_alignment in tqdm(os.listdir(path)):
    with open(os.path.join(path, f_alignment), encoding='utf-8') as f:
        for line in f:
            pairs = line.strip().split()
            pairs = sorted([tuple(map(int, p.split('-'))) for p in pairs])
            for _, g in groupby(pairs, key=itemgetter(0)):
                length = len(list(g))
                if length == 0:
                    one_to_zero += 1
                elif length == 1:
                    one_to_one += 1
                else:
                    one_to_many += 1
print('1-0:', one_to_zero)
print('1-1:', one_to_one)
print('1-n:', one_to_many)

100%|██████████| 30789/30789 [13:30<00:00, 38.00it/s]

1-0: 0
1-1: 115554782
1-n: 29778646





In [28]:
path1 = 'fastalign_outputs/'  # has less files
path2 = 'fastalign/'
errors = 0
word_pairs = Counter()
for sub in tqdm(os.listdir(path1)):
    with open(os.path.join(path1, sub), encoding='utf-8') as f1, open(os.path.join(path2, sub), encoding='utf-8') as f2:
        for line1, line2 in zip(f1, f2):
            pairs = line1.strip().split()
            pairs = sorted([tuple(map(int, p.split('-'))) for p in pairs])
            try:
                en_sent, he_sent = line2.strip().split(' ||| ')
                en_words = en_sent.split()
                he_words = he_sent.split()
                for i, js in groupby(pairs, key=itemgetter(0)):
                    js = list(js)
                    if len(js) == 1:
                        j = js[0][1]
                        word_pairs.update([(en_words[i].lower(), he_words[j].lower())])
            except:
                errors += 1

100%|██████████| 30789/30789 [35:38<00:00, 14.39it/s]


In [29]:
len(word_pairs)

7530172

In [30]:
def ispunct(s):
    return all([c in punctuation for c in s])

keys = list(word_pairs)
for k in keys:
    if ispunct(k[0]):
        word_pairs.pop(k)

In [31]:
len(word_pairs)

7443214

In [23]:
word_pairs.most_common(100)

[(('i', 'אני'), 2946217),
 (('the', 'ה'), 2148227),
 (('you', 'אתה'), 2086108),
 (('and', 'ו'), 1565287),
 (("n't", 'לא'), 1212902),
 (('it', 'זה'), 1119071),
 (('what', 'מה'), 1013666),
 (('me', 'אני'), 790374),
 (('he', 'הוא'), 787565),
 (('you', 'את'), 695642),
 (('not', 'לא'), 687369),
 (('to', 'ל'), 680021),
 (('that', 'זה'), 640604),
 (('in', 'ב'), 622867),
 (('but', 'אבל'), 550264),
 (('no', 'לא'), 535962),
 (('we', 'אנחנו'), 516574),
 (('this', 'זה'), 450298),
 (('she', 'היא'), 417537),
 (('if', 'אם'), 413350),
 (('yeah', 'כן'), 382135),
 (('that', 'ש'), 341983),
 (('they', 'הם'), 335973),
 (('know', 'יודע'), 331036),
 (('with', 'עם'), 312677),
 (('him', 'הוא'), 278949),
 (('here', 'כאן'), 275220),
 (('so', 'אז'), 274284),
 (('to', 'את'), 272781),
 (('of', 'של'), 260677),
 (("'s", 'ה'), 260029),
 (('was', 'היה'), 257920),
 (('now', 'עכשיו'), 251554),
 (("'m", 'אני'), 250630),
 (('want', 'רוצה'), 237890),
 (('for', 'ל'), 222461),
 (('her', 'היא'), 214870),
 (('just', 'רק'), 2138

## Hebrew POS-tag distribution

In [24]:
he_postags = Counter()
for file in tqdm(os.listdir('hebrew_parsed/')):
    with open(os.path.join('hebrew_parsed/', file), encoding='utf-8') as f:
        he_postags.update([token['upostag'] for sent in conllu_parse(f.read()) for token in sent])
he_postags.most_common()

100%|██████████| 30789/30789 [5:02:40<00:00,  1.70it/s]  


[('NN', 22547152),
 ('yyDOT', 16887215),
 ('PRP', 13941874),
 ('VB', 13597524),
 ('RB', 12583894),
 ('DEF', 11563647),
 ('yyCM', 8760079),
 ('IN', 8416983),
 ('PREPOSITION', 8367178),
 ('S_PRN', 7772158),
 ('BN', 7747940),
 ('NNP', 6063453),
 ('JJ', 5936312),
 ('AT', 5147419),
 ('yyQM', 4641169),
 ('REL', 4569239),
 ('NNT', 3707301),
 ('QW', 2993666),
 ('CONJ', 2512398),
 ('CC', 2496745),
 ('POS', 2427657),
 ('COP', 2281502),
 ('MD', 2076368),
 ('DTT', 1722460),
 ('yyEXCL', 1630825),
 ('yyELPS', 1576157),
 ('CD', 1409611),
 ('EX', 1102323),
 ('INTJ', 999082),
 ('yyQUOT', 937301),
 ('TEMP', 256881),
 ('DT', 239988),
 ('TTL', 156193),
 ('CDT', 125870),
 ('yyCLN', 114823),
 ('P', 104586),
 ('JJT', 95604),
 ('yyRRB', 68302),
 ('NCD', 66348),
 ('yyLRB', 65762),
 ('BNT', 61648),
 ('ADVERB', 17573),
 ('yySCLN', 9912),
 ('NEG', 4606),
 ('NNPT', 85)]

## English POS-tag distribution

In [25]:
en_postags = Counter()
for file in tqdm(os.listdir('english_parsed/')):
    with open(os.path.join('english_parsed/', file), encoding='utf-8') as f:
        en_postags.update([token['upostag'] for sent in conllu_parse(f.read()) for token in sent])
en_postags.most_common()

100%|██████████| 30789/30789 [3:44:38<00:00,  2.28it/s]  


[('.', 22516525),
 ('PRP', 22277895),
 ('NN', 18757022),
 ('RB', 12305224),
 ('DT', 12266974),
 ('IN', 11678617),
 ('NNP', 10205478),
 ('VB', 10181014),
 (',', 9574190),
 ('VBP', 9174820),
 ('JJ', 6778558),
 ('VBZ', 5525697),
 ('VBD', 5140162),
 ('TO', 4127560),
 ('NNS', 4049763),
 ('VBG', 3274947),
 ('CC', 3248620),
 ('MD', 3129102),
 ('PRP$', 3091388),
 ('VBN', 2164050),
 ('UH', 2074193),
 ('WP', 1901387),
 (':', 1635039),
 ('WRB', 1419453),
 ('CD', 1340267),
 ('RP', 840481),
 ('POS', 667730),
 ('WDT', 574816),
 ('``', 453748),
 ('EX', 363680),
 ("''", 345117),
 ('JJR', 297976),
 ('JJS', 185180),
 ('PDT', 159843),
 ('RBR', 134683),
 ('-RRB-', 106620),
 ('-LRB-', 106609),
 ('NNPS', 101716),
 ('FW', 54365),
 ('#', 53198),
 ('$', 35715),
 ('RBS', 29242),
 ('WP$', 6547),
 ('LS', 6258),
 ('SYM', 50)]

In [27]:
targets = Counter()
sents_count = 0
for f in tqdm(os.listdir('english_srl/')):
    with open(os.path.join('english_srl/', f), encoding='utf-8') as j:
        for line in j:
            frames = json.loads(line)['frames']
            sents_count += 1
            for frame in frames:
                targets.update([frame['target']['name']])

100%|██████████| 30789/30789 [20:37<00:00, 24.87it/s]


In [28]:
print('Average #frames/sentence:', sum(targets.values()) / sents_count)
print('Number of frames (targets):', sum(targets.values()))
print('Number of unique targets:', len(targets.keys()))
print('20 most common targets:')
targets.most_common(20)

Average #frames/sentence: 2.3994585422700814
Number of frames (targets): 55246362
Number of unique targets: 784
20 most common targets:


[('Intentionally_act', 2922780),
 ('Locative_relation', 2124853),
 ('Quantity', 1706189),
 ('Arriving', 1696957),
 ('Being_obligated', 1198687),
 ('Desirability', 1190368),
 ('Statement', 1161159),
 ('Capability', 1156961),
 ('Temporal_collocation', 1116567),
 ('Cardinal_numbers', 1080994),
 ('People', 1054872),
 ('Calendric_unit', 862375),
 ('Causation', 837578),
 ('Kinship', 803670),
 ('Certainty', 796327),
 ('Desiring', 713872),
 ('Relational_quantity', 665392),
 ('Awareness', 660211),
 ('Observable_body_parts', 627344),
 ('Becoming', 606179)]

In [29]:
fes = Counter()
sents_count = 0
for f in tqdm(os.listdir('english_srl/')):
    with open(os.path.join('english_srl/', f), encoding='utf-8') as j:
        for line in j:
            frames = json.loads(line)['frames']
            sents_count += 1
            for frame in frames:
                for fe in frame['annotationSets'][0]['frameElements']:
                    fes.update([fe['name']])

100%|██████████| 30789/30789 [21:54<00:00, 23.42it/s]


In [30]:
print('Average #FEs/sentence:', sum(fes.values()) / sents_count)
print('Number of FEs:', sum(fes.values()))
print('Number of unique FEs:', len(fes.keys()))
print('20 most common FEs:')
fes.most_common(20)

Average #FEs/sentence: 2.8115148325402077
Number of FEs: 64733757
Number of unique FEs: 661
20 most common FEs:


[('Entity', 3891862),
 ('Agent', 3299307),
 ('Theme', 3048768),
 ('Cognizer', 2920899),
 ('Event', 2618780),
 ('Act', 1967112),
 ('Quantity', 1699100),
 ('Experiencer', 1677843),
 ('Goal', 1668385),
 ('Speaker', 1557612),
 ('Content', 1537237),
 ('Person', 1406459),
 ('Evaluee', 1404153),
 ('Message', 1270952),
 ('Ground', 1121609),
 ('Unit', 1094743),
 ('Responsible_party', 1079516),
 ('Number', 1069190),
 ('Phenomenon', 1009228),
 ('Locale', 816558)]

In [31]:
en_words = Counter()
en_postags = Counter()
for file in tqdm(os.listdir('english_parsed/')):
    with open(os.path.join('english_parsed/', file), encoding='utf-8') as f:
        sents = conllu_parse(f.read())
        for sent in sents:
            for token in sent:
                en_words.update([token['form']])
                en_postags.update([token['upostag']])

100%|██████████| 30789/30789 [4:23:21<00:00,  1.95it/s]  


In [32]:
print('Number of English words:', sum(en_words.values()))
print('Number of unique English words:', len(en_words.keys()))
print('20 most common English words:')
en_words.most_common(20)

Number of English words: 192361519
Number of unique English words: 713666
20 most common English words:


[('.', 15987895),
 (',', 9525720),
 ('I', 6030592),
 ('you', 4771733),
 ('?', 4675250),
 ('the', 4092004),
 ('to', 3576869),
 ("'s", 3249261),
 ('a', 2923023),
 ("n't", 2076183),
 ('it', 1988100),
 ('that', 1742936),
 ('!', 1734606),
 ('of', 1718093),
 ('You', 1588832),
 ('and', 1534830),
 ('do', 1510007),
 ('...', 1423630),
 ('is', 1418116),
 ('in', 1396204)]

In [33]:
print('Most common English POS tags:')
en_postags.most_common()

Most common English POS tags:


[('.', 22516525),
 ('PRP', 22277895),
 ('NN', 18757022),
 ('RB', 12305224),
 ('DT', 12266974),
 ('IN', 11678617),
 ('NNP', 10205478),
 ('VB', 10181014),
 (',', 9574190),
 ('VBP', 9174820),
 ('JJ', 6778558),
 ('VBZ', 5525697),
 ('VBD', 5140162),
 ('TO', 4127560),
 ('NNS', 4049763),
 ('VBG', 3274947),
 ('CC', 3248620),
 ('MD', 3129102),
 ('PRP$', 3091388),
 ('VBN', 2164050),
 ('UH', 2074193),
 ('WP', 1901387),
 (':', 1635039),
 ('WRB', 1419453),
 ('CD', 1340267),
 ('RP', 840481),
 ('POS', 667730),
 ('WDT', 574816),
 ('``', 453748),
 ('EX', 363680),
 ("''", 345117),
 ('JJR', 297976),
 ('JJS', 185180),
 ('PDT', 159843),
 ('RBR', 134683),
 ('-RRB-', 106620),
 ('-LRB-', 106609),
 ('NNPS', 101716),
 ('FW', 54365),
 ('#', 53198),
 ('$', 35715),
 ('RBS', 29242),
 ('WP$', 6547),
 ('LS', 6258),
 ('SYM', 50)]