# Data Statistics

In [1]:
import pathlib
import json
from itertools import chain

from hazm import stopwords_list, Normalizer, word_tokenize


In [2]:
# Directory that is searched recursively for the dataset's ``*.json`` records.
DATA_PATH = './outputs/'

## Utils

In [3]:
def _handle_dict(dictionary: dict, text_list: list):
    for key, value in dictionary.items():
        if key in {
            'images',
            'book_source',
            'wikipedia_source',
        }:
            continue

        if isinstance(value, str):
            text_list.append(value)
        elif isinstance(value, dict):
            _handle_dict(value, text_list)
        elif isinstance(value, list):
            _handle_list(value, text_list)
        else:
            raise NotImplementedError()

def _handle_list(input_list: list, text_list: list):
    for x in input_list:
        if isinstance(x, str):
            text_list.append(x)
        elif isinstance(x, dict):
            _handle_dict(x, text_list)
        elif isinstance(x, list):
            _handle_list(x, text_list)
        else:
            raise NotImplementedError()

def extract_text(dictionary: dict):
    """Return a new list holding every string found inside ``dictionary``.

    The traversal is delegated to ``_handle_dict``, which fills the list in
    place.
    """
    collected = []
    _handle_dict(dictionary, collected)
    return collected

## Calculating Statistics

In [4]:
# hazm's stop-word list, and the dataset root as a Path object.
stopwords = stopwords_list()
data_path = pathlib.Path(DATA_PATH)

In [5]:
# Load every JSON record below ``data_path`` and flatten the strings extracted
# from each one into a single list.
# ``Path.read_text`` opens and closes the file itself, so no handle is leaked,
# and the encoding is pinned to UTF-8 instead of the platform default.
# ``chain.from_iterable`` avoids unpacking the whole list as call arguments.
sentences = list(chain.from_iterable(
    extract_text(json.loads(path.read_text(encoding='utf-8')))
    for path in data_path.rglob('*.json')
))
raw_text = '\n'.join(sentences)

In [6]:
# Normalise the joined text, tokenise it, then drop stop words.
normalizer = Normalizer()
normalized_text = normalizer.normalize(raw_text)
tokens = word_tokenize(normalized_text)
# Build the stop-word lookup once: calling stopwords_list() inside the
# comprehension ran it for every token and searched the result as a list.
stopword_set = set(stopwords_list())
filtered_tokens = [w for w in tokens if w not in stopword_set]

In [7]:
# One record per ``*.json`` file found below ``data_path``.
record_count = sum(1 for _ in data_path.rglob('*.json'))
print('number of records in dataset: {}'.format(record_count))

number of records in dataset: 113


In [8]:
# Number of tokens left after stop-word filtering.
word_count = len(filtered_tokens)
print('words count: {}'.format(word_count))

words count: 17623


In [9]:
# Distinct tokens left after stop-word filtering.
unique_words = set(filtered_tokens)
print('unique words: {}'.format(len(unique_words)))

unique words: 2626


In [10]:
# Mean number of whitespace-separated words per extracted string.
total_words = sum(len(sentence.split()) for sentence in sentences)
# Guard the division so an empty dataset reports 0.0 instead of raising
# ZeroDivisionError.
average_length = total_words / len(sentences) if sentences else 0.0
print('average length of sentences: {}'.format(average_length))

average length of sentences: 2.9677971375233354
