In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from tqdm import tqdm
import glob

from entities.organisation import Organisation
from entities.address import Address

from entities.individual_enterpreneur import IndividualEnterpereneur
from entities.person import Person

from entities.mng import ManagingCompany
from entities.founder import Founder
from entities.succession import Succession
from entities.orgchief import OrgChief

from utils.column_mapping import COMPANY_MAPPING, PERSONS_MAPPING, IE_MAPPING, LOCATION_MAPPING

from csv_processing.utils import bytes_to_date, fix_inn
from csv_processing.data import load_data_sources, load_ref_tables

In [3]:
# Data-loading configuration.
# PREFIX '_2022-04' is meant for loading only a small subset of the data (<1%).
# NOTE(review): PREFIX is not referenced by any other cell shown in this
# notebook - confirm whether the loaders are expected to receive it.

PREFIX = '_2022-04'
FOLDER = '../data/'

In [4]:
# Load the source tables; the returned mapping is indexed by table name
# ('mng', 'founder', 'org_chief', 'predecessor', 'companies', 'persons').
SOURCE_DICT = load_data_sources(FOLDER)

In [5]:
# Load the reference lookups; the pre-processing cells use the 'OPF', 'KVED'
# and 'COUNTRY' entries through their `.get` method.
REF_DICT = load_ref_tables(FOLDER)

In [6]:
# Unpack the source tables into short module-level names for easy access
(
    mng,
    founder,
    org_chief,
    predecessor,
    companies,
    persons,
) = (
    SOURCE_DICT[table_name]
    for table_name in ('mng', 'founder', 'org_chief', 'predecessor', 'companies', 'persons')
)

In [7]:
# Keep only latest updated value:
# sort ascending on the update counter, then retain the last row per key.
companies = (
    companies
    .sort_values(by='max_num')
    .drop_duplicates(subset='ogrn', keep='last')
)
persons = (
    persons
    .sort_values(by='updated_at_num')
    .drop_duplicates(subset='inn', keep='last')
)

In [8]:
# Companies pre-processing
# Each reference code is replaced by whatever the matching REF_DICT table's
# `.get` returns for it.
# NOTE: the columns are overwritten in place, so running this cell a second
# time pushes the already-translated values through the lookup again.
for column, table in (('opf_id', 'OPF'), ('okved_id', 'KVED')):
    companies[column] = companies[column].apply(REF_DICT[table].get)
companies['jurisdiction'] = 'russia'

# Preview only the columns listed in COMPANY_MAPPING
companies[list(COMPANY_MAPPING.keys())].head()

Unnamed: 0,ogrn,reg_date,end_date,opf_id,okved_id,inn,kpp,full_name,email,pfr,fss,capital,jurisdiction
2601,1032600320103,2003-01-05,0000-00-00,20107.0,47.1.0 Торговля розничная в неспециализированн...,2619008320,261901001,РАЙОННОЕ ПОТРЕБИТЕЛЬСКОЕ ОБЩЕСТВО,,36019000110,261100054326041,0,russia
9092,1145749008404,2014-09-18,0000-00-00,12300.0,38.1.0 Сбор отходов,5753203739,572001001,ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ &quot...,,67022105738,570001967357001,16160000,russia
12719,1192651006041,2019-03-27,0000-00-00,12300.0,47.25.0 Торговля розничная напитками в специал...,2630050389,263001001,ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ &quot...,,36030111876,261900636026071,200000,russia
14791,1213700001020,2021-01-27,0000-00-00,12300.0,46.90.0 Торговля оптовая неспециализированная,3702254266,370201001,ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ &quot...,BAZOV.SHURA@INBOX.RU,47024086717,370000461837001,10000,russia
11843,1182375081525,2018-09-27,0000-00-00,12300.0,49.41.0 Деятельность автомобильного грузового ...,2311276693,231101001,ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ &quot...,,33016133970,230103608823011,20000,russia


In [9]:
# Persons pre-processing: translate country codes through the COUNTRY reference table
country_lookup = REF_DICT['COUNTRY'].get
persons['country_code'] = persons['country_code'].apply(country_lookup)

# Split persons into Individual Entrepreneurs (non-zero `ogrnip`)
# and physical persons (`ogrnip` equal to 0)
has_no_ogrnip = persons['ogrnip'] == 0
has_ogrnip = persons['ogrnip'] != 0
people = persons[has_no_ogrnip]
individ_ent = persons[has_ogrnip]

# Generate Entities

In [10]:
def create_org(row):
    """Build the entities for one company row.

    An Organisation and an Address are both constructed from the same row;
    the address's `to_ftm()` result is stored on the organisation under the
    'address' property.

    Returns a two-element list: the organisation's `to_ftm()` result followed
    by the address's `to_ftm()` result (`to_ftm()` is invoked on the address
    twice: once for the property, once for the returned list).
    """
    organisation, location = Organisation(row), Address(row)
    organisation.set_property('address', location.to_ftm())
    return [organisation.to_ftm(), location.to_ftm()]

def create_person(row):
    """Wrap one row in a Person and return its `to_ftm()` result as a one-element list."""
    return [Person(row).to_ftm()]

def create_ie(row):
    """Wrap one row in an IndividualEnterpereneur and return its `to_ftm()` result as a one-element list."""
    return [IndividualEnterpereneur(row).to_ftm()]

def create_connection(row, connType=ManagingCompany):
    """Build one connection entity from a row.

    `connType` is the class instantiated with the row; it defaults to
    ManagingCompany. The `to_ftm()` result of that instance is returned
    as a one-element list.
    """
    return [connType(row).to_ftm()]

In [17]:
from tqdm import tqdm
def generate_enitites(df, entity_func, **kwargs):
    """Run `entity_func` over every row of `df` and flatten the results.

    The rows come from `df.iterrows()` and are wrapped in a tqdm progress bar
    sized with `len(df)`. `entity_func(row, **kwargs)` is called for each row
    and must return an iterable; all returned items are collected, in row
    order, into one flat list that is returned.
    """
    collected = []
    progress = tqdm(df.iterrows(), total=len(df))
    for _, record in progress:
        collected.extend(entity_func(record, **kwargs))
    return collected

In [14]:
# Organisation and Address entities (two per row) for the de-duplicated companies
orgs = generate_enitites(companies, create_org)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 19435/19435 [00:56<00:00, 341.14it/s]


In [18]:
# Person entities for the rows whose `ogrnip` equals 0
ppeople = generate_enitites(people, create_person)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1025/1025 [00:01<00:00, 790.67it/s]


In [16]:
# IndividualEnterpereneur entities for the rows with a non-zero `ogrnip`
enterpreneurs = generate_enitites(individ_ent, create_ie)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 23504/23504 [00:24<00:00, 941.27it/s]


In [22]:
# ManagingCompany connections, one per row of the `mng` table
mngrs = generate_enitites(mng, create_connection, connType=ManagingCompany)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 179/179 [00:00<00:00, 2449.38it/s]


In [29]:
# Founder connections; the whole `founder` table is cast to int before iterating.
# NOTE(review): `astype(int)` raises on missing values - confirm the table never contains NaN.
founders = generate_enitites(founder.astype(int), create_connection, connType=Founder)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 27605/27605 [00:09<00:00, 3006.85it/s]


In [32]:
# OrgChief connections, one per row of the `org_chief` table
chiefs = generate_enitites(org_chief, create_connection, connType=OrgChief)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 19389/19389 [00:03<00:00, 5466.98it/s]


In [34]:
# Succession connections, one per row of the `predecessor` table
successions = generate_enitites(predecessor, create_connection, connType=Succession)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 2191.15it/s]


# Upload to Aleph

In [36]:
# Gather every generated entity list into one flat upload batch
data = [
    *orgs,
    *ppeople,
    *enterpreneurs,
    *mngrs,
    *founders,
    *chiefs,
    *successions,
]

In [None]:
# Set credentials
# `os` is imported here because no earlier cell shown in this notebook imports
# it; without the import this cell raises NameError on a fresh kernel.
import os

# Both variables must be present in the environment; a missing one raises KeyError.
ALEPHHOST = os.environ['ALEPHHOST']
APIKEY = os.environ['APIKEY']
# NOTE(review): left empty here - presumably the target collection's ID has to
# be filled in before the upload cell is run.
COLLECTION_ID = ""

In [None]:
from alephclient.api import AlephAPI
# API client built from the host and API key read from the environment (ALEPHHOST, APIKEY)
api = AlephAPI(host=ALEPHHOST, api_key=APIKEY)

In [None]:
# Format to JSON (Dict): convert each entity through its `to_dict()` method
json_ent = [entity.to_dict() for entity in data]

In [None]:
# Send the dict-formatted entities to the collection identified by COLLECTION_ID.
# NOTE(review): COLLECTION_ID is an empty string in this notebook - presumably it
# must be set to a real collection ID before running this cell.
api.write_entities(COLLECTION_ID, json_ent)