In [1]:
import pandas as pd
import random
import os

from datetime import date
from tqdm import tqdm_notebook
from shutil import copy

In [2]:
'''
Import data, previously filtered in the script 01_addcontr_explore (contract value > 1 000 000 UAH).

The three date columns are converted to plain datetime.date objects, because
str(date) ("YYYY-MM-DD") is later used as a folder name when contract documents are copied.
'''
columns = ['addcontr_date','rationale_type','rationale','contract_id','date_signed','contract_end','tender_id','procuring_entity','contract_value','region','tenderURL']

def _date_or_placeholder(value, placeholder):
    '''Return value.date(), or the value unchanged when it equals the placeholder string.'''
    return value if value == placeholder else value.date()

# The columns are REPLACED through assign() rather than written with
# `.loc[:, col] = ...`: since pandas 2.0 the `.loc[:, col]` form sets values in place,
# so a datetime64 column may silently turn the date objects back into timestamps.
# assign() also returns a new frame, so no SettingWithCopyWarning is triggered.
data_addcontr_gas_short = pd.read_excel('02_data/addcontr_gas_1mln_v1.xlsx')[columns].assign(
    date_signed = lambda df: df['date_signed'].map(lambda x: x.date()),
    # 'unknown' / 'not specified' are the placeholder strings used for missing dates
    addcontr_date = lambda df: df['addcontr_date'].map(lambda x: _date_or_placeholder(x, 'unknown')),
    contract_end = lambda df: df['contract_end'].map(lambda x: _date_or_placeholder(x, 'not specified')),
    # Empty columns created here and left blank by this notebook
    price_start = "",
    price_addcontr = "",
    row_added_manually = False,
)

print(data_addcontr_gas_short.shape)
data_addcontr_gas_short.head(3)

(7127, 14)


Unnamed: 0,addcontr_date,rationale_type,rationale,contract_id,date_signed,contract_end,tender_id,procuring_entity,contract_value,region,tenderURL,price_start,price_addcontr,row_added_manually
0,2017-10-10,"volumeCuts, itemPriceVariation",Згідно довідки Торгово-промислової палати Укра...,244af7a6ee364ed3a8b3cff3cedabc3f,2017-07-31,2017-12-30,UA-2017-05-15-002292-b,"Державний вищий навчальний заклад ""Університет...",1093519.824,Київська область,https://prozorro.gov.ua/tender/UA-2017-05-15-0...,,,False
1,2017-08-29,volumeCuts,Зменшення обсягів закупівлі з метою приведення...,28cf7ff64b564839b13c8c91c98676dc,2017-08-02,2017-12-31,UA-2017-06-21-000460-b,Квартирно - експлуатаційний відділ м. Херсона,1124468.37,Херсонська область,https://prozorro.gov.ua/tender/UA-2017-06-21-0...,,,False
2,2017-09-20,itemPriceVariation,Підвищення вартості природного газу на ринку У...,28cf7ff64b564839b13c8c91c98676dc,2017-08-02,2017-12-31,UA-2017-06-21-000460-b,Квартирно - експлуатаційний відділ м. Херсона,1124468.37,Херсонська область,https://prozorro.gov.ua/tender/UA-2017-06-21-0...,,,False


In [3]:
'''
Function to create an extract based on randomly selected contract ids.
It also makes a copy of the docs for the respective contract ids,
which will later be sent to the person (student or volunteer) who is responsible for the manual work with the data.

Please take into account that only a tiny sample of the docs is uploaded to GitHub,
so this function will probably not work out of the box.
You can fix this by downloading all the docs from ProZorro with the script 00_addcontr_get_data.
That is more than 10 GB of data, and it might take several hours to download all of it.
'''

def create_xlsx_and_filecopies(name, random_contracts, data=None,
                               output_dir='03_for_students/01_distribution_data',
                               base_folder='00_contracts_pdf'):
    '''
    Write a task workbook for the given contracts and copy their documents next to it.

    The workbook is saved as "<name><number>.xlsx" in output_dir, where <number> is the
    first positive integer not yet used by a file with that name. It has three sheets:
    'contracts_short' (the rows of `data` for the chosen contracts) and two empty
    templates, 'schedule' and 'imply_supply'. Documents are copied from
    <base_folder>/<date_signed>/<contract_id> to
    <output_dir>/<name><number>/<date_signed>/<contract_id>.

    Parameters:
        name: filename prefix of the workbook (without number and extension).
        random_contracts: iterable of contract ids (strings) to include.
        data: DataFrame with at least 'contract_id' and 'date_signed' columns.
            Defaults to the notebook-level frame data_addcontr_gas_short_fordistr.
        output_dir: existing folder that receives the workbook and the document copies.
        base_folder: root folder of the downloaded contract documents.

    Returns:
        The workbook filename (not the full path).

    Raises:
        IndexError: if a contract id is not present in `data`.
        FileNotFoundError: if output_dir or the document folder of a contract does not
            exist. Note that the workbook has already been written at that point.
    '''
    if data is None:
        # Backward-compatible fallback to the frame prepared by the notebook cells
        data = data_addcontr_gas_short_fordistr

    # List the output folder once instead of on every numbering attempt
    existing_files = set(os.listdir(output_dir))
    number = 1
    filename = '%s%i.xlsx' % (name, number)
    while filename in existing_files:
        number += 1
        filename = '%s%i.xlsx' % (name, number)

    selected = data.loc[data['contract_id'].isin(random_contracts)]

    # The context manager saves and closes the workbook (ExcelWriter.save() no longer
    # exists in pandas 2.x) and releases the file handle even if writing fails.
    with pd.ExcelWriter(os.path.join(output_dir, filename)) as writer:
        selected.to_excel(writer, sheet_name = 'contracts_short', index = False)
        pd.DataFrame(columns = ['contract_id','month','year','amount_1000m3','supply_implied']).to_excel(writer, sheet_name = 'schedule', index = False)
        pd.DataFrame(columns = ['contract_id','contract_value','price_start','total_supply','months_of_supply','monthly_supply']).to_excel(writer, sheet_name = 'imply_supply', index = False)
    print("Збережено тестове завдання " + filename)

    # splitext strips only the extension, so a prefix containing dots is kept intact
    destination_folder = os.path.join(output_dir, os.path.splitext(filename)[0])

    for random_contract in random_contracts:
        # Documents are stored under <date_signed>/<contract_id>; the first matching row is used
        date_signed = selected.loc[selected['contract_id'] == random_contract, 'date_signed'].iloc[0]
        source = os.path.join(base_folder, str(date_signed), random_contract)
        destination = os.path.join(destination_folder, str(date_signed), random_contract)

        files = os.listdir(source)
        if files:
            # The destination is only created when there is something to copy
            os.makedirs(destination, exist_ok = True)

        for file in files:
            try:
                copy(os.path.join(source, file), destination)
            except FileNotFoundError:
                # Best effort: report the file that could not be copied and carry on
                print("Не вдалося скопіювати файл %s (шлях %s)" % (file, destination))

    return filename

In [5]:
'''
Randomly select 5 contracts for creating a test task for a student or volunteer.
Only contracts that have not been distributed already are considered.
'''
TEST_SAMPLE_SIZE = 5  # number of contracts in one test task

used_contract_ids = pd.read_csv('03_for_students/distributed_ids.csv')
print("Розподілено %i договорів" %(len(used_contract_ids)))

data_addcontr_gas_short_fordistr = data_addcontr_gas_short.loc[~data_addcontr_gas_short['contract_id'].isin(used_contract_ids['contract_id'])]
print("Для розподілу доступно %i рядків" %len(data_addcontr_gas_short_fordistr))

available_ids = data_addcontr_gas_short_fordistr['contract_id'].unique().tolist()
if not available_ids:
    raise ValueError('No undistributed contracts are left to sample from')

# random.sample draws without replacement; capping the sample size avoids the endless
# loop that rejection sampling falls into when fewer than 5 contracts remain.
random_contracts = random.sample(available_ids, min(TEST_SAMPLE_SIZE, len(available_ids)))
print(random_contracts)

filename = create_xlsx_and_filecopies('addcontr_gas_test', random_contracts)
filenames = [filename] * len(random_contracts)

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
newly_distributed = pd.DataFrame({'contract_id':random_contracts,'document_name':filenames})
used_contract_ids = pd.concat([used_contract_ids, newly_distributed], ignore_index = True)
used_contract_ids.to_csv('03_for_students/distributed_ids.csv', index = False)

print("Оновлено: розподілено %i договорів" %(len(used_contract_ids)))

Розподілено 354 договорів
Для розподілу доступно 5081 рядків
['64c5d17bccad44229c99432280fdca81', '4ebb5b5ca700461da9d3987a3ddda5fd', '0ba29c5123b941b0a01a1b1a62389a68', '2961792da4814646bda187a5aba1cf10', '41f206eb5c1d4e35b38c3d628cc419cc']
Збережено тестове завдання addcontr_gas_test23.xlsx
Оновлено: розподілено 359 договорів


In [4]:
'''
Randomly select approximately 200 rows of data for creating a standard task for a student or volunteer.
Only contracts that have not been distributed already are considered.
Whole contracts are picked, so the extract may contain slightly more than 200 rows.
'''
TARGET_ROWS = 200  # approximate number of rows in one standard task

used_contract_ids = pd.read_csv('03_for_students/distributed_ids.csv')
print("Розподілено %i договорів" %(len(used_contract_ids)))

data_addcontr_gas_short_fordistr = data_addcontr_gas_short.loc[~data_addcontr_gas_short['contract_id'].isin(used_contract_ids['contract_id'])]
print("Для розподілу доступно %i рядків" %len(data_addcontr_gas_short_fordistr))

available_ids = data_addcontr_gas_short_fordistr['contract_id'].unique().tolist()

if len(data_addcontr_gas_short_fordistr) >= TARGET_ROWS:
    # Count the rows of every contract once, then walk a random permutation of the ids
    # until enough rows are collected (instead of re-filtering the frame on every draw).
    rows_per_contract = data_addcontr_gas_short_fordistr['contract_id'].value_counts().to_dict()
    shuffled_ids = list(available_ids)
    random.shuffle(shuffled_ids)

    random_contracts = []
    rows_selected = 0
    for contract_id in shuffled_ids:
        if rows_selected >= TARGET_ROWS:
            break
        random_contracts.append(contract_id)
        rows_selected += rows_per_contract.get(contract_id, 0)
else:
    # Fewer rows than the target are left: hand out everything that remains
    random_contracts = available_ids

print(random_contracts)

filename = create_xlsx_and_filecopies('addcontr_gas_extract', random_contracts)
filenames = [filename] * len(random_contracts)

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
newly_distributed = pd.DataFrame({'contract_id':random_contracts,'document_name':filenames})
used_contract_ids = pd.concat([used_contract_ids, newly_distributed], ignore_index = True)
used_contract_ids.to_csv('03_for_students/distributed_ids.csv', index = False)

print("Оновлено: розподілено %i договорів" %(len(used_contract_ids)))

Розподілено 1195 договорів
Для розподілу доступно 182 рядків
['852ee7757e8248f8a127dde4d964d9ab', 'fb80d6eb1c534698b463b2428f51d2b4', '969afe72136d4a97a678f4e52d941993', 'c1bac663a35f4c4ba54df791a85c7560', 'e879b98bbebd471286c315182346998f', '700359ac7ee34f88a3afb1df0f2585eb', '5ea45bf8db4b4b07a773c98439e47b4c', 'ec20f3f8f7794c10b2dd5181fa7a97de', '6b8b4c3d08404a07ba59e4bb6b6a407e', '1c4f45261fd24389a91a32f0654fd927', '6f049658df4b460592f6a04f10fbdd1e', '157ad459e9f342fbbfd7c5274be2a7f0', '3c547fc341f847759c71eaadcf9939fc', 'b37544508b0a467ab4ab76106295b8a3', 'b53af271b58a4aa9a37818f24fdaf44b', 'a1611ddfea254ea3956b80d4bfe840ae', '9574c968a51746f3b3cad7359a31b5c5', 'e2eb526a02aa4299a6abd4b3554c35d9', '3217b2b3dad345acbe5d43fdc58b8ebe', '91b52380228947bebff552cf660685e0', '9e21b15d979d400fb2d95aed775695c6', '27b49557cb314088ba2923cd62874235', 'b65c5a0d03e243e5bd9695abf4f1c30b', '50eeec3c096f4367be088248d60f0caa', 'b3eeeebb5c314b4dab1b37e6c97dc285', '9dd81ae8442b4fcbbc5eece6bad8e1d0', 'd

In [5]:
'''
Sanity check: re-read the list of distributed contract ids and compare
the total number of entries with the number of distinct ids.
The two numbers match when no contract id was distributed twice.
Is used in case of baseless paranoia
'''
used_contract_ids_test = pd.read_csv('03_for_students/distributed_ids.csv')

total_distributed = len(used_contract_ids_test)
# dropna = False counts a missing id as a value of its own, like len(Series.unique()) does
unique_distributed = used_contract_ids_test['contract_id'].nunique(dropna = False)

print("Кількість contract_id у списку розподілених: %i" % total_distributed)
print("Кількість унікальних contract_id у списку розподілених: %i" %unique_distributed)

Кількість contract_id у списку розподілених: 1225
Кількість унікальних contract_id у списку розподілених: 1225
