In [1]:
from entities.other_connection import OtherConnection
from entities.transactions import SupportTransaction
import followthemoney as ftm
import pandas as pd
from csv_processing.utils import add_id_prefix

In [3]:
# Load the support records; every later cell reads or extends this frame.
df = pd.read_csv(
    filepath_or_buffer='../data1/support2021.csv',
)

In [4]:
# Reference tables used below to turn `type_id` / `form_id` into readable labels.
ref_type, ref_form = (
    pd.read_csv('../data1/support_type_ref.csv'),
    pd.read_csv('../data1/support_form_ref.csv'),
)

In [5]:
# Readable labels for support forms and types (translated by google translate!),
# keyed by the `id` column of each reference table.
support_forms = ref_form.set_index('id')['english_version'].to_dict()
support_types = ref_type.set_index('id')['english_version'].to_dict()

# Attach the labels to every record; an id missing from the lookup yields None.
df['form'] = df['form_id'].map(support_forms.get)
df['type'] = df['type_id'].map(support_types.get)

In [7]:
# Sanity check: number of records per support form label.
df.loc[:, 'form'].value_counts()

Financial support      3796037
Consulting support     1516093
Educational support     493066
Information support     211449
Property support         54223
Innovative support        5547
Name: form, dtype: int64

In [8]:
# Support records are keyed by INN only, so we check whether an INN belongs
# to a known company; if it does, we can use our primary key, OGRN, instead.
#
# Both organisation snapshots are stacked in the order listed. Because the
# lookup is a plain dict, an INN that occurs more than once keeps the OGRN of
# its last row in that stacked frame.
companies = pd.concat([
    pd.read_csv(snapshot_path)
    for snapshot_path in (
        '../data1/org_2022-04-27_08:31:04.csv',
        '../data1/org_2022-03-17_20:33:51.csv',
    )
])
inn2ogrn = companies.set_index('inn')['ogrn'].to_dict()


def make_id(inn):
    """Build the prefixed entity id for a support record's INN.

    The INN is looked up in the module-level ``inn2ogrn`` mapping. When an
    OGRN is found, the id is ``add_id_prefix(ogrn, 'ogrn')``; otherwise the
    id falls back to ``add_id_prefix(fix_inn(inn), 'inn')``.

    NOTE(review): ``fix_inn`` is neither imported nor defined in the visible
    cells, so the fallback branch relies on a name that must already exist in
    the kernel -- confirm where it comes from before a Restart & Run All.
    """
    known_ogrn = inn2ogrn.get(inn)
    if known_ogrn is not None:
        return add_id_prefix(known_ogrn, 'ogrn')
    # No company matched this INN: keep the (normalised) INN as the key.
    return add_id_prefix(fix_inn(inn), 'inn')

# Replace the raw INN values of both parties with prefixed entity ids.
# The columns are overwritten in place, so re-running this cell would pass
# already-prefixed ids back into make_id; reload `df` before running it again.
for id_column in ('inn', 'from_inn'):
    df[id_column] = df[id_column].map(make_id)

In [20]:
# Split the records into financial support (form_id 100) and everything else;
# the two groups are converted to different entity types in the next cell.
is_financial = df['form_id'] == 100
financial_support = df[is_financial]
other_support = df[~is_financial]

In [22]:
from tqdm import tqdm
# Convert every record to a followthemoney dict: financial support rows go
# through SupportTransaction, all other rows through OtherConnection.
# Financial entries come first, followed by the remaining ones.
data = [
    SupportTransaction(record).to_ftm().to_dict()
    for _, record in tqdm(financial_support.iterrows(), total=len(financial_support))
]
data.extend(
    OtherConnection(record).to_ftm().to_dict()
    for _, record in tqdm(other_support.iterrows(), total=len(other_support))
)

100%|█████████████████████████████████████████████████████████████████████| 3796037/3796037 [10:12<00:00, 6201.77it/s]
100%|█████████████████████████████████████████████████████████████████████| 2280378/2280378 [05:51<00:00, 6482.47it/s]


In [31]:
import json
import gzip

import os

jsonfilename = 'output/ru-egrul-transactions.json.gzip'

# Make sure the target directory exists so the export cannot fail with
# FileNotFoundError after the long conversion step has already finished.
os.makedirs(os.path.dirname(jsonfilename), exist_ok=True)

# WRITE
# Stream the records into the gzip file one at a time instead of building the
# whole JSON document as one str plus a second bytes copy in memory.
# The decompressed content is the same as `json.dumps(data) + "\n"`:
# a single JSON array with ", " between items and a trailing newline.
with gzip.open(jsonfilename, 'wt', encoding='utf-8') as fout:
    fout.write('[')
    for position, record in enumerate(data):
        if position:
            fout.write(', ')
        fout.write(json.dumps(record))
    fout.write(']\n')