In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from tqdm import tqdm
import glob

from entities.mng import ManagingCompany
from entities.founder import Founder
from entities.succession import Succession
from entities.orgchief import OrgChief

from csv_processing.data import load_data_sources, load_ref_tables

import csv_processing.utils as utils

from tqdm import tqdm

import gzip
import json 

In [3]:
# Set PREFIX to '_2022-04' to load only a small subset of the data (<1%).
# Don't use any prefix if you want the full data (it may take a long time to load).
# NOTE(review): "no prefix" presumably means an empty string — confirm against load_data_sources.
FOLDER = '../data1/'
PREFIX = "_2022-04"

In [4]:
# Load the source tables from FOLDER (limited by PREFIX) and the reference tables.
# SOURCE_DICT is indexed by string keys in the cells below (e.g. 'companies', 'persons').
SOURCE_DICT = load_data_sources(FOLDER, PREFIX)
REF_DICT = load_ref_tables(FOLDER)

In [6]:
# Preprocess the company and person tables; the results are turned into entities further down.
companies = utils.preprocess_company(SOURCE_DICT['companies'])
persons = utils.preprocess_person(SOURCE_DICT['persons'])

In [7]:
# Preprocess the four relationship tables. Each result is later passed to
# utils.create_connection together with its connection type
# (ManagingCompany, Founder, OrgChief, Succession).
mng = utils.preprocess_mng(SOURCE_DICT['mng'])
founder = utils.preprocess_founder(SOURCE_DICT['founder'])
org_chief = utils.preprocess_org_chief(SOURCE_DICT['org_chief'])
predecessor = utils.preprocess_sucession(SOURCE_DICT['predecessor'])

In [8]:

def generate_enitites(df, entity_func, **kwargs):
    """Apply ``entity_func`` to every row of ``df`` and flatten the results.

    Args:
        df: Table exposing ``iterrows()`` and ``len()`` (e.g. a pandas DataFrame).
        entity_func: Callable invoked as ``entity_func(row, **kwargs)``; it must
            return an iterable of entities for that row.
        **kwargs: Extra keyword arguments forwarded unchanged to ``entity_func``.

    Returns:
        A single list holding the entities of all rows, in row order.
    """
    collected = []
    # Wrap the row iterator in tqdm so a progress bar is shown while iterating.
    progress = tqdm(df.iterrows(), total=len(df))
    for _, record in progress:
        collected.extend(entity_func(record, **kwargs))
    return collected

In [9]:
# Build the entities for every preprocessed company row via utils.create_org.
orgs = generate_enitites(companies, utils.create_org)

100%|███████████████████████████████████████████████████████████████████████████| 19435/19435 [00:31<00:00, 621.13it/s]


In [10]:
# Build the entities for every preprocessed person row via utils.create_person.
people = generate_enitites(persons, utils.create_person)

100%|██████████████████████████████████████████████████████████████████████████| 24529/24529 [00:14<00:00, 1651.83it/s]


In [11]:
# Build entities from the 'mng' rows; connType=ManagingCompany is forwarded to utils.create_connection.
mngrs = generate_enitites(mng, utils.create_connection, connType=ManagingCompany)

100%|██████████████████████████████████████████████████████████████████████████████| 179/179 [00:00<00:00, 5096.81it/s]


In [12]:
# Build entities from the 'org_chief' rows; connType=OrgChief is forwarded to utils.create_connection.
chiefs = generate_enitites(org_chief, utils.create_connection, connType=OrgChief)

100%|██████████████████████████████████████████████████████████████████████████| 19389/19389 [00:02<00:00, 7110.51it/s]


In [13]:
# Build entities from the 'founder' rows; connType=Founder is forwarded to utils.create_connection.
founders = generate_enitites(founder, utils.create_connection, connType=Founder)

100%|██████████████████████████████████████████████████████████████████████████| 27605/27605 [00:04<00:00, 6406.30it/s]


In [14]:
# Build entities from the 'predecessor' rows; connType=Succession is forwarded to utils.create_connection.
successions = generate_enitites(predecessor, utils.create_connection, connType=Succession)

100%|██████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 3961.38it/s]


In [25]:
# Gather every generated entity list into one flat list, keeping the order:
# organisations, people, then the four connection kinds.
data = [*orgs, *people, *mngrs, *chiefs, *founders, *successions]

In [26]:
# Convert each entity through its to_dict() method; the result is serialised with json.dumps below.
items = [entity.to_dict() for entity in data]

In [30]:
# Path of the gzip-compressed JSON export that is written and then re-read below.
jsonfilename = 'ru-egrul-small.json.gzip'

In [31]:
# WRITE
# Serialise all items as one JSON document with a trailing newline, encode it
# as UTF-8 and store it gzip-compressed at jsonfilename.
json_str = "".join((json.dumps(items), "\n"))
json_bytes = bytes(json_str, 'utf-8')

with gzip.open(jsonfilename, 'wb') as fout:
    fout.write(json_bytes)

In [32]:
# READ
# Load the gzip-compressed export back from disk and parse it.
# NOTE(review): this rebinds `data`, which previously held the entity objects;
# re-running the `items = ...` cell after this one would call to_dict() on the
# parsed JSON values. Consider a distinct name if nothing else depends on `data`.
with gzip.open(jsonfilename, 'rb') as fin:
    json_bytes = fin.read()

json_str = str(json_bytes, 'utf-8')
data = json.loads(json_str)