In [1]:
import pandas as pd
import json
from tqdm.notebook import trange, tqdm
from datetime import date
import requests
import time

In [2]:
observations_path = 'data/observations-285432.csv'
radiuses_dataset_path = 'data/radiuses_dataset.csv'
# start_date = 'min'
start_date = date(2022,8,31)
# finish_date = 'max'
finish_date = date(2022,11,30)
radiuses = [20, 200]

In [3]:
def prepare_raritets(observations_path, radiuses, radiuses_dataset_path, start_date, finish_date):
    # готовит датасет со столбцами: taxon_id, taxon_rang, taxon_name, r20, r200, r2000, allinat
    
    # получаем датасет со столбцами всех таксонов и датами, а также start_date и finish_date
    df_taxons, start_date, finish_date = prepare_df(observations_path, start_date, finish_date)

    # получаем на start_date и finish_date: 
    # (1) словари уникальных таксонов по рангам, (2) датасет с taxon_id и используемыми рангами,
    # (3) список рангов для которых нет названий, (4) ранги, для которых есть названия но не являющиеся листьями
    taxons_dict_start, df_taxons_clear_start, empty_taxons_start, notleafs_taxons_start = get_taxons_dict_to_date(df_taxons, start_date)
    taxons_dict_finish, df_taxons_clear_finish, empty_taxons_finish, notleafs_taxons_finish = get_taxons_dict_to_date(df_taxons, finish_date)
    
    # получаем на start_date и finish_date датасет со столбцами: taxon_id, taxon_rang, taxon_name
    taxons_df_start = taxon_dict_to_df(taxons_dict_start)
    taxons_df_finish = taxon_dict_to_df(taxons_dict_finish)

    # обновляем с inat датасет через udpate_radius, если необходимо,
    # создаём датасет со столбцами: taxon_id, taxon_rang, taxon_name, r20, r20pos, r200, r200pos, r2000, r2000pos, ai, ai_pos
    if update_radius(taxons_df_start, radiuses, radiuses_dataset_path, start_date) and \
    update_radius(taxons_df_finish, radiuses, radiuses_dataset_path, finish_date) == True:
        radiuses_ds_start = get_radius(taxons_df_start, radiuses_dataset_path,start_date)
        radiuses_ds_finish = get_radius(taxons_df_finish, radiuses_dataset_path, finish_date)
    else:
        print('Can not update radiuses-quantity information')

    # получаем таблицу со столбцами: r20shiftpos, r20rang, r20count (с плюсом), r20common_name, r20science_name, 
    #                               r200shiftpos, r200rang, r200count (с плюсом), r200common_name, r200science_name, 
    #                               r2000shiftpos, r2000rang, r2000count (с плюсом), r2000common_name, r2000science_name, 
    #                               ai_shiftpos, ai_rang, ai_count (с плюсом), ai_common_name, ai_science_name, 
    # отсортированную по убыванию кол-ва в каждом столбце, только для рангов species, hybrid и form
    raritets_high = raritets_changes(obs_path=observations_path, radiuses_ds_start=radiuses_ds_start, radiuses_ds_finish=radiuses_ds_finish, how='high')

    # то же, но отсортированную по возрастанию в каждом столбце, для всех рангов
    raritets_micro = raritets_changes(obs_path=observations_path, radiuses_ds_start=radiuses_ds_start, radiuses_ds_finish=radiuses_ds_finish, how='micro')

    return raritets_high, raritets_micro

In [4]:
def prepare_df(observations_path, start_date, finish_date):
    
    df_full = pd.read_csv(observations_path)
    df = df_full.loc[:,'taxon_id':'taxon_form_name'].copy()
    df.insert(0,'created_at','')
    df['created_at'] = pd.to_datetime(df_full['created_at']).dt.date
    del(df_full)
    start_date = min(df['created_at']) if start_date == 'min' else start_date
    finish_date = max(df['created_at']) if finish_date == 'max' else finish_date
    return df, start_date, finish_date

In [5]:
df_taxons, start_date, finish_date = prepare_df(observations_path, start_date, finish_date)
df_taxons.head(3)

Unnamed: 0,created_at,taxon_id,taxon_kingdom_name,taxon_phylum_name,taxon_subphylum_name,taxon_superclass_name,taxon_class_name,taxon_subclass_name,taxon_superorder_name,taxon_order_name,...,taxon_supertribe_name,taxon_tribe_name,taxon_subtribe_name,taxon_genus_name,taxon_genushybrid_name,taxon_species_name,taxon_hybrid_name,taxon_subspecies_name,taxon_variety_name,taxon_form_name
0,2018-09-24,1062676,Fungi,Ascomycota,Pezizomycotina,,Pezizomycetes,,,Pezizales,...,,,,Morchella,,,,,,
1,2018-09-24,1070919,Animalia,Arthropoda,Chelicerata,,Arachnida,,,Araneae,...,,Sitticini,,Attulus,,Attulus floricola,,,,
2,2019-05-26,373470,Animalia,Arthropoda,Hexapoda,,Insecta,Pterygota,,Coleoptera,...,,,,,,,,,,


In [6]:
def get_taxons_dict_to_date(df, date_to):

    df = df_taxons[df_taxons['created_at'] <= date_to].copy()
    df.drop('created_at', axis=1, inplace=True)
    
    taxons_full = df.columns
    df.dropna(axis=1, how='all', inplace=True)
    not_empty_taxons = df.columns

    last_levels = df.apply(lambda x: x.last_valid_index(), axis=1)
    levels = df.columns[df.columns.isin(last_levels.unique())].to_list()
    df = df[['taxon_id']+levels].copy()

    empty_taxons = set(taxons_full) - set(not_empty_taxons)
    notleafs_taxons = set(not_empty_taxons) - set(df.columns)

    col_qty = len(df.columns)
    last_col = df.columns[col_qty-1]

    taxons_dict = dict()

    for i in range(1, col_qty):
        level_name = df.columns[i]
        taxons_dict[level_name] = dict()

    for i in range(1, col_qty):
        level_name = df.columns[i]
        level_taxons = df[last_levels == level_name][level_name].to_list()
        level_ids = df[last_levels == level_name].iloc[:,0].to_list()
        taxons_dict[level_name].update(zip(level_ids, level_taxons))
        i += 1

    return taxons_dict, df, empty_taxons, notleafs_taxons

In [7]:
taxons_dict_start, df_taxons_clear_start, empty_taxons_start, notleafs_taxons_start = get_taxons_dict_to_date(df_taxons, start_date)
taxons_dict_finish, df_taxons_clear_finish, empty_taxons_finish, notleafs_taxons_finish = get_taxons_dict_to_date(df_taxons, finish_date)
print(taxons_dict_start.keys())
print(taxons_dict_finish.keys())

dict_keys(['taxon_class_name', 'taxon_family_name', 'taxon_subfamily_name', 'taxon_genus_name', 'taxon_species_name', 'taxon_hybrid_name', 'taxon_subspecies_name'])
dict_keys(['taxon_kingdom_name', 'taxon_class_name', 'taxon_subclass_name', 'taxon_order_name', 'taxon_superfamily_name', 'taxon_family_name', 'taxon_subfamily_name', 'taxon_genus_name', 'taxon_species_name', 'taxon_hybrid_name', 'taxon_subspecies_name'])


In [8]:
def taxon_dict_to_df(taxons_dict):
    df = pd.DataFrame(columns=['taxon_id','taxon_rang','taxon_name'])
    i = 0
    for taxon in taxons_dict.keys():
        for taxon_id in taxons_dict[taxon].keys():
            df.loc[i] = ([taxon_id, taxon, taxons_dict[taxon][taxon_id]])
            i += 1
    return df

In [9]:
taxons_df_start = taxon_dict_to_df(taxons_dict_start)
taxons_df_finish = taxon_dict_to_df(taxons_dict_finish)
taxons_df_start.head(3)

Unnamed: 0,taxon_id,taxon_rang,taxon_name
0,50814,taxon_class_name,Agaricomycetes
1,83797,taxon_family_name,Viviparidae
2,47416,taxon_family_name,Lycosidae


In [10]:
df_tax_tocheck = taxons_df_start.loc[0:10,:]
df_tax_tocheck

Unnamed: 0,taxon_id,taxon_rang,taxon_name
0,50814,taxon_class_name,Agaricomycetes
1,83797,taxon_family_name,Viviparidae
2,47416,taxon_family_name,Lycosidae
3,7823,taxon_family_name,Corvidae
4,373470,taxon_subfamily_name,Malachiinae
5,1062676,taxon_genus_name,Morchella
6,53490,taxon_genus_name,Suillus
7,83951,taxon_genus_name,Ammophila
8,52855,taxon_genus_name,Artemisia
9,52693,taxon_genus_name,Agrostis


In [27]:
def update_radius(df_tax_tocheck, radiuses, radiuses_dataset_path, date_to):
    # получает датасет со столбцом 'taxon_id'
    # проверяет, какие радиусы есть в csv, каких нет - запрашивает у update_radius, какие есть - берёт из csv, отдаёт  
    # возвращает датафрейм с тремя столбцами по данным от iNat или из файла

    df_tax_asked = pd.DataFrame(columns=['taxon_id','radius','date'])
    for radius in radiuses:
        df_tax_asked_r = pd.DataFrame()
        df_tax_asked_r['taxon_id'] = df_tax_tocheck['taxon_id']
        df_tax_asked_r.insert(1, 'radius', radius)
        df_tax_asked_r.insert(2, 'date', date_to)
        df_tax_asked = pd.concat([df_tax_asked, df_tax_asked_r])
    df_tax_asked.reset_index(drop=True)
    print(f'Going to check in csv: {df_tax_asked.shape[0]} values')

    df_tax_csv = pd.read_csv(index_col=False, filepath_or_buffer=radiuses_dataset_path)
    print(f'Total in csv: {df_tax_csv.shape[0]} values ')

    check_radiuses = pd.merge(df_tax_asked, df_tax_csv, how='left', left_on=['taxon_id','radius','date'], right_on=['taxon_id','radius','date'])
    already_in_csv_sum = check_radiuses['count'].notnull().sum()
    print(f'Already in csv: {already_in_csv_sum} values')

    havenoradiuses = check_radiuses[check_radiuses['count'].isnull()].copy()

    if havenoradiuses.shape[0] > 0:
        print(f'Ask for {havenoradiuses.shape[0]} values from iNat')
        havenoradiuses.drop('count', axis=1, inplace=True)
        fetched = fetch_radius(havenoradiuses, radiuses_dataset_path)
        fetched_sum = fetched['count'].notnull().sum()
        print(f'Have fetched {fetched_sum} values from iNat')
        df_tax_csv = pd.concat([df_tax_csv, fetched])
        df_tax_csv.to_csv(path_or_buf=radiuses_dataset_path,index=False)
        print(f'Now have {df_tax_csv.shape[0]} values in csv')

    df_tax_csv = pd.read_csv(index_col=False, filepath_or_buffer=radiuses_dataset_path)
    check_radiuses = pd.merge(df_tax_asked, df_tax_csv, how='left', left_on=['taxon_id','radius','date'], right_on=['taxon_id','radius','date'])

    if check_radiuses['count'].notnull().all():
        return True
    else:
        return False

In [32]:
df_tax_tocheck = taxons_df_start.loc[0:1,:]
date_to = start_date

In [35]:
df_tax_asked = pd.DataFrame(columns=['taxon_id','radius','date'])
for radius in radiuses:
    df_tax_asked_r = pd.DataFrame()
    df_tax_asked_r['taxon_id'] = df_tax_tocheck['taxon_id']
    df_tax_asked_r.insert(1, 'radius', radius)
    df_tax_asked_r.insert(2, 'date', date_to)
    df_tax_asked = pd.concat([df_tax_asked, df_tax_asked_r])
df_tax_asked.reset_index(drop=True)
print(f'Going to check in csv: {df_tax_asked.shape[0]} values')

df_tax_csv = pd.read_csv(index_col=False, filepath_or_buffer=radiuses_dataset_path)
print(f'Total in csv: {df_tax_csv.shape[0]} values ')

check_radiuses = pd.merge(df_tax_asked, df_tax_csv, how='left', left_on=['taxon_id','radius','date'], right_on=['taxon_id','radius','date'])
# already_in_csv_sum = check_radiuses['count'].notnull().sum()
# print(f'Already in csv: {already_in_csv_sum} values')

# havenoradiuses = check_radiuses[check_radiuses['count'].isnull()].copy()

# if havenoradiuses.shape[0] > 0:
#     print(f'Ask for {havenoradiuses.shape[0]} values from iNat')
#     havenoradiuses.drop('count', axis=1, inplace=True)
#     fetched = fetch_radius(havenoradiuses, radiuses_dataset_path)
#     fetched_sum = fetched['count'].notnull().sum()
#     print(f'Have fetched {fetched_sum} values from iNat')
#     df_tax_csv = pd.concat([df_tax_csv, fetched])
#     df_tax_csv.to_csv(path_or_buf=radiuses_dataset_path,index=False)
#     print(f'Now have {df_tax_csv.shape[0]} values in csv')

# df_tax_csv = pd.read_csv(index_col=False, filepath_or_buffer=radiuses_dataset_path)
# check_radiuses = pd.merge(df_tax_asked, df_tax_csv, how='left', left_on=['taxon_id','radius','date'], right_on=['taxon_id','radius','date'])



Going to check in csv: 4 values
Total in csv: 52 values 


In [42]:
df_tax_csv['date']

0     2022-08-31
1     2022-08-31
2     2022-08-31
3     2022-08-31
4     2022-08-31
5     2022-08-31
6     2022-08-31
7     2022-08-31
8     2022-08-31
9     2022-08-31
10    2022-08-31
11    2022-08-31
12    2022-08-31
13    2022-08-31
14    2022-08-31
15    2022-08-31
16    2022-08-31
17    2022-08-31
18    2022-08-31
19    2022-08-31
20    2022-08-31
21    2022-08-31
22    2022-08-31
23    2022-08-31
24    2022-08-31
25    2022-08-31
26    2022-08-31
27    2022-08-31
28    2022-08-31
29    2022-08-31
30    2022-08-31
31    2022-08-31
32    2022-08-31
33    2022-08-31
34    2022-08-31
35    2022-08-31
36    2022-08-31
37    2022-08-31
38    2022-08-31
39    2022-08-31
40    2022-08-31
41    2022-08-31
42    2022-08-31
43    2022-08-31
44    2022-08-31
45    2022-08-31
46    2022-08-31
47    2022-08-31
48    2022-08-31
49    2022-08-31
50    2022-08-31
51    2022-08-31
Name: date, dtype: object

In [45]:
df_tax_asked['date']

0    2022-08-31
1    2022-08-31
0    2022-08-31
1    2022-08-31
Name: date, dtype: object

In [43]:
check_radiuses.iloc[0,2]

datetime.date(2022, 8, 31)

In [40]:
check_radiuses = pd.merge(df_tax_asked, df_tax_csv, how='left', left_on=['taxon_id','radius','date'], right_on=['taxon_id','radius','date'])
check_radiuses

Unnamed: 0,taxon_id,radius,date,count
0,50814,20,2022-08-31,
1,83797,20,2022-08-31,
2,50814,200,2022-08-31,
3,83797,200,2022-08-31,


In [12]:
def fetch_radius(havenoradiuses, radiuses_dataset_path):
    
    # получает датасет со столбцами taxon_id, radius, date, 
    # запрашивает у айнат, 
    # сохраняет в csv,
    # возвращает датафрейм со столбцами 'taxon_id','radius','date','count'
    current_date = str.replace(str(date.today()), '-', '_')
    current_time = time.strftime("%H_%M_%S", time.localtime())
    temporal_df_path = 'data/temp_df_' + current_date + '_' + current_time + '.csv'
    df = pd.DataFrame(columns=['taxon_id','radius','date','count'])
    url = 'https://api.inaturalist.org/v1/observations'
    i = 0
    for i in trange(havenoradiuses.shape[0]):
        taxon_id = havenoradiuses.iloc[i,0]
        radius = havenoradiuses.iloc[i,1]
        date_to = havenoradiuses.iloc[i,2]
        params = {
        'verifiable':'true',
        'taxon_id':taxon_id,
        'd2':date_to,
        'lat':'55.494403',
        'lng':'38.644662',
        'radius':radius,
        'order':'desc',
        'order_by':'created_at',
        'only_id':'true'
        }
        response = requests.get(url=url, params=params)
        count = response.json()['total_results']
        df.loc[i] = [taxon_id, radius, date_to, count]
        df.to_csv(path_or_buf=temporal_df_path, index=False)
        if response.status_code != 200:
            raise Exception('Oh response is not 200, it is ',response.status_code) 
        i += 1
        time.sleep(1)
        print(f'Done loop {i}: id {taxon_id}, response {response.status_code}, count {count}')
        
    return df

In [13]:
update_radius(df_tax_tocheck=df_tax_tocheck, radiuses=radiuses, radiuses_dataset_path=radiuses_dataset_path, date_to=start_date)

Going to check in csv: 22 values
Total in csv: 10 values 
Already in csv: 0 values
Ask for 22 values from iNat


  0%|          | 0/22 [00:00<?, ?it/s]

Done loop 1: id 50814, response 200, count 397
Done loop 2: id 83797, response 200, count 12
Done loop 3: id 47416, response 200, count 21
Done loop 4: id 7823, response 200, count 124
Done loop 5: id 373470, response 200, count 2
Done loop 6: id 1062676, response 200, count 2
Done loop 7: id 53490, response 200, count 4
Done loop 8: id 83951, response 200, count 3
Done loop 9: id 52855, response 200, count 107
Done loop 10: id 52693, response 200, count 19
Done loop 11: id 50829, response 200, count 174
Done loop 12: id 50814, response 200, count 39641
Done loop 13: id 83797, response 200, count 102
Done loop 14: id 47416, response 200, count 983
Done loop 15: id 7823, response 200, count 9863
Done loop 16: id 373470, response 200, count 212
Done loop 17: id 1062676, response 200, count 116
Done loop 18: id 53490, response 200, count 301
Done loop 19: id 83951, response 200, count 86
Done loop 20: id 52855, response 200, count 6376
Done loop 21: id 52693, response 200, count 712
Done 

False

In [14]:
taxon_list = taxons_df_finish['taxon_id'][0:10]
taxon_list

0      47170
1      50814
2      48372
3    1094814
4      48427
5      55484
6      52380
7      48195
8      83797
9      47416
Name: taxon_id, dtype: int64

In [15]:
# taxon_list = taxons_df_finish['taxon_id'][0:10]
# date_to = date(2022,8,31)
# first_radiuses = update_radius(taxon_list=taxon_list, radius=200, date_to=date_to)
# first_radiuses.to_csv(path_or_buf=radiuses_dataset_path,index=False)

In [16]:
# df = pd.read_csv(index_col=False, filepath_or_buffer=radiuses_dataset_path)
# saved_radiuses = df.copy()
# ask_radiuses = df.copy()
# ask_radiuses.drop('quantity', axis=1, inplace=True)
# # ask_radiuses.insert(3, 'quantity', '')
# saved_radiuses.iloc[0] = ['500','200','2023-01-25', '100']
# saved_radiuses.iloc[2] = ['48372','200','2023-01-25', '100']
# saved_radiuses.iloc[3] = ['48372','555','2023-01-25', '100']
# saved_radiuses.iloc[5] = ['848317','200','2023-12-31', '100']
# ask_radiuses