In [1]:
import pandas as pd
import json
from tqdm.notebook import trange, tqdm
from datetime import date
import requests
import time
import numpy as np
pd.set_option('display.max_columns', None) #настройка, чтобы видеть все колонки


In [2]:
observations_path = 'data/observations-285432.csv'
radiuses_dataset_path = 'data/radiuses_dataset.csv'
# start_date = 'min'
start_date = date(2022, 8, 31)
# finish_date = 'max'
finish_date = date(2022, 11, 30)
radiuses = (20, 200, 2000, 0)


In [3]:
def prepare_raritets(observations_path, radiuses, radiuses_dataset_path, start_date, finish_date):
    # готовит датасет со столбцами: taxon_id, taxon_rang, taxon_name, r20, r200, r2000, allinat

    # получаем датасет со столбцами всех таксонов и датами, а также start_date и finish_date
    df_taxons, start_date, finish_date = prepare_df(
        observations_path, start_date, finish_date)

    # получаем на start_date и finish_date:
    # (1) словари уникальных таксонов по рангам, (2) датасет с taxon_id и используемыми рангами,
    # (3) список рангов для которых нет названий, (4) ранги, для которых есть названия но не являющиеся листьями
    taxons_dict_start, df_taxons_clear_start, empty_taxons_start, notleafs_taxons_start = get_taxons_dict_to_date(
        df_taxons, start_date)
    taxons_dict_finish, df_taxons_clear_finish, empty_taxons_finish, notleafs_taxons_finish = get_taxons_dict_to_date(
        df_taxons, finish_date)

    # получаем на start_date и finish_date датасет со столбцами: taxon_id, taxon_rang, taxon_name
    taxons_df_start = taxon_dict_to_df(taxons_dict_start)
    taxons_df_finish = taxon_dict_to_df(taxons_dict_finish)

    # обновляем с inat датасет через udpate_radius, если необходимо,
    # создаём датасет со столбцами: taxon_id, taxon_rang, taxon_name, r20pos, r20, r200pos, r200, r2000pos, r2000, r0pos, r0
    if (update_radius(taxons_df_start, radiuses, radiuses_dataset_path, start_date) and \
            update_radius(taxons_df_finish, radiuses, radiuses_dataset_path, finish_date)) == True:
        radiuses_ds_start = get_radius(
            taxons_df_start, radiuses_dataset_path, start_date)
        radiuses_ds_finish = get_radius(
            taxons_df_finish, radiuses_dataset_path, finish_date)
    else:
        print('Can not update radiuses-quantity information')

    # получаем таблицу со столбцами: r20shiftpos, r20rang, r20count (с плюсом), r20common_name, r20science_name,
    #                               r200shiftpos, r200rang, r200count (с плюсом), r200common_name, r200science_name,
    #                               r2000shiftpos, r2000rang, r2000count (с плюсом), r2000common_name, r2000science_name,
    #                               r0_shiftpos, r0_rang, r0_count (с плюсом), r0_common_name, r0_science_name,
    # отсортированную по убыванию кол-ва в каждом столбце, только для рангов species, hybrid и form
    raritets_high = raritets_changes(
        obs_path=observations_path, radiuses_ds_start=radiuses_ds_start, radiuses_ds_finish=radiuses_ds_finish, how='high')

    # то же, но отсортированную по возрастанию в каждом столбце, для всех рангов
    raritets_micro = raritets_changes(
        obs_path=observations_path, radiuses_ds_start=radiuses_ds_start, radiuses_ds_finish=radiuses_ds_finish, how='micro')

    return raritets_high, raritets_micro


In [4]:
def prepare_df(observations_path, start_date, finish_date):

    df_full = pd.read_csv(observations_path)
    df = df_full.loc[:, 'taxon_id':'taxon_form_name'].copy()
    df.insert(0, 'created_at', '')
    df['created_at'] = pd.to_datetime(df_full['created_at']).dt.date
    del (df_full)
    start_date = min(df['created_at']) if start_date == 'min' else start_date
    finish_date = max(
        df['created_at']) if finish_date == 'max' else finish_date
    return df, start_date, finish_date


In [5]:
df_taxons, start_date, finish_date = prepare_df(
    observations_path, start_date, finish_date)
df_taxons.head(3)

df_taxons = df_taxons[df_taxons['taxon_id'] != 349797]

In [6]:
def get_taxons_dict_to_date(df, date_to):

    df = df_taxons[df_taxons['created_at'] <= date_to].copy()
    df.drop('created_at', axis=1, inplace=True)

    taxons_full = df.columns
    df.dropna(axis=1, how='all', inplace=True)
    not_empty_taxons = df.columns

    last_levels = df.apply(lambda x: x.last_valid_index(), axis=1)
    levels = df.columns[df.columns.isin(last_levels.unique())].to_list()
    df = df[['taxon_id']+levels].copy()

    empty_taxons = set(taxons_full) - set(not_empty_taxons)
    notleafs_taxons = set(not_empty_taxons) - set(df.columns)

    col_qty = len(df.columns)
    last_col = df.columns[col_qty-1]

    taxons_dict = dict()

    for i in range(1, col_qty):
        level_name = df.columns[i]
        taxons_dict[level_name] = dict()

    for i in range(1, col_qty):
        level_name = df.columns[i]
        level_taxons = df[last_levels == level_name][level_name].to_list()
        level_ids = df[last_levels == level_name].iloc[:, 0].to_list()
        taxons_dict[level_name].update(zip(level_ids, level_taxons))
        i += 1

    return taxons_dict, df, empty_taxons, notleafs_taxons


In [7]:
taxons_dict_start, df_taxons_clear_start, empty_taxons_start, notleafs_taxons_start = get_taxons_dict_to_date(
    df_taxons, start_date)
taxons_dict_finish, df_taxons_clear_finish, empty_taxons_finish, notleafs_taxons_finish = get_taxons_dict_to_date(
    df_taxons, finish_date)
print(taxons_dict_start.keys())
print(taxons_dict_finish.keys())


dict_keys(['taxon_class_name', 'taxon_family_name', 'taxon_subfamily_name', 'taxon_genus_name', 'taxon_species_name', 'taxon_hybrid_name', 'taxon_subspecies_name'])
dict_keys(['taxon_kingdom_name', 'taxon_class_name', 'taxon_subclass_name', 'taxon_order_name', 'taxon_superfamily_name', 'taxon_family_name', 'taxon_subfamily_name', 'taxon_genus_name', 'taxon_species_name', 'taxon_hybrid_name', 'taxon_subspecies_name'])


In [8]:
def taxon_dict_to_df(taxons_dict):
    df = pd.DataFrame(columns=['taxon_id', 'taxon_rang', 'taxon_name'])
    i = 0
    for taxon in taxons_dict.keys():
        for taxon_id in taxons_dict[taxon].keys():
            df.loc[i] = ([taxon_id, taxon, taxons_dict[taxon][taxon_id]])
            i += 1
    return df


In [9]:
taxons_df_start = taxon_dict_to_df(taxons_dict_start)
taxons_df_finish = taxon_dict_to_df(taxons_dict_finish)


In [10]:
def update_radius(df_tax_tocheck, radiuses, radiuses_dataset_path, date_to):
    # получает датасет со столбцом 'taxon_id'
    # проверяет, какие радиусы есть в csv, каких нет - запрашивает у update_radius, какие есть - берёт из csv, отдаёт
    # возвращает датафрейм с тремя столбцами по данным от iNat или из файла
    csb_col_set = ['taxon_id', 'radius', 'date']
    df_tax_asked = pd.DataFrame(columns=csb_col_set)
    date_to = str(date_to)
    for radius in radiuses:
        df_tax_asked_r = pd.DataFrame()
        df_tax_asked_r['taxon_id'] = df_tax_tocheck['taxon_id']
        df_tax_asked_r.insert(1, 'radius', radius)
        df_tax_asked_r.insert(2, 'date', date_to)
        df_tax_asked = pd.concat([df_tax_asked, df_tax_asked_r])
    df_tax_asked.reset_index(drop=True, inplace=True)
    print(f'Going to check in csv: {df_tax_asked.shape[0]} values')

    df_tax_csv = pd.read_csv(
        index_col=False, filepath_or_buffer=radiuses_dataset_path)
    print(f'Total in csv: {df_tax_csv.shape[0]} values ')

    check_radiuses = pd.merge(
        df_tax_asked, df_tax_csv, how='left', left_on=csb_col_set, right_on=csb_col_set)
    already_in_csv_sum = check_radiuses['count'].notnull().sum()
    print(f'Already in csv: {already_in_csv_sum} values')

    havenoradiuses = check_radiuses[check_radiuses['count'].isnull()].copy()

    if havenoradiuses.shape[0] > 0:
        print(f'Ask for {havenoradiuses.shape[0]} values from iNat')
        havenoradiuses.drop('count', axis=1, inplace=True)
        fetched = fetch_radius(havenoradiuses, radiuses_dataset_path)
        fetched_sum = fetched['count'].notnull().sum()
        print(f'Have fetched {fetched_sum} values from iNat')
        df_tax_csv = pd.concat([df_tax_csv, fetched])
        df_tax_csv.to_csv(path_or_buf=radiuses_dataset_path, index=False)
        del (df_tax_csv)
    else:
        print('No need to fetch from iNat')

    df_tax_csv = pd.read_csv(
        index_col=False, filepath_or_buffer=radiuses_dataset_path)
    print(f'Total in csv: {df_tax_csv.shape[0]} values ')

    if not (df_tax_csv.value_counts(subset=csb_col_set) > 1).any():
        print('No duplicates in csv')
    else:
        print('! Somehow duplicates in CSV !')

    check_radiuses = pd.merge(
        df_tax_asked, df_tax_csv, how='left', left_on=csb_col_set, right_on=csb_col_set)
    if check_radiuses['count'].notnull().all():
        print(True)
    else:
        print(False)

In [11]:
def fetch_radius(havenoradiuses, radiuses_dataset_path):

    # получает датасет со столбцами taxon_id, radius, date,
    # запрашивает у айнат,
    # сохраняет в csv,
    # возвращает датафрейм со столбцами 'taxon_id','radius','date','count'
    current_date = str.replace(str(date.today()), '-', '_')
    current_time = time.strftime("%H_%M_%S", time.localtime())
    temporal_txt_path = 'data/temp_file_' + current_date + '_' + current_time + '.csv'
    url = 'https://api.inaturalist.org/v1/observations'

    df = pd.DataFrame(columns=['taxon_id', 'radius', 'date', 'count'])

    with open(temporal_txt_path, 'a') as temp_file:
            temp_file.write('taxon_id,radius,date,count\n')

    i = 0
    for i in trange(havenoradiuses.shape[0]):
        taxon_id = havenoradiuses.iloc[i, 0]
        radius = havenoradiuses.iloc[i, 1]
        date_to = havenoradiuses.iloc[i, 2]
        if radius == 0:
            lat, lng, radius_param = '', '', ''
        else:
            lat = '55.494403'
            lng = '38.644662'
            radius_param = radius
        params = {
            'verifiable': 'true',
            'taxon_id': taxon_id,
            'd2': date_to,
            'lat': lat,
            'lng': lng,
            'radius': radius_param,
            'order': 'desc',
            'order_by': 'created_at',
            'only_id': 'true'
        }
        response = requests.get(url=url, params=params)
        count = response.json()['total_results']
        df.loc[i] = [taxon_id, radius, date_to, count]

        with open(temporal_txt_path, 'a') as temp_file:
            temp_file.write(','.join([str(taxon_id), str(radius), date_to, str(count)])+'\n')

        if response.status_code != 200:
            raise Exception('Oh response is not 200, it is ',
                            response.status_code)
        i += 1
        time.sleep(1)
        print(
            f'Done loop {i}: r{radius}, date {date_to}, response {response.status_code}, count {count}, id {taxon_id}')

    return df


In [12]:
update_radius(df_tax_tocheck=taxons_df_finish, radiuses=radiuses,
              radiuses_dataset_path=radiuses_dataset_path, date_to=start_date)
update_radius(df_tax_tocheck=taxons_df_finish, radiuses=radiuses,
              radiuses_dataset_path=radiuses_dataset_path, date_to=finish_date)

# old
# update_radius(df_tax_tocheck=taxons_df_start, radiuses=radiuses,
#               radiuses_dataset_path=radiuses_dataset_path, date_to=start_date)
# update_radius(df_tax_tocheck=taxons_df_finish, radiuses=radiuses,
#               radiuses_dataset_path=radiuses_dataset_path, date_to=finish_date)


Going to check in csv: 1996 values
Total in csv: 4002 values 
Already in csv: 1996 values
No need to fetch from iNat
Total in csv: 4002 values 
No duplicates in csv
True
Going to check in csv: 1996 values
Total in csv: 4002 values 
Already in csv: 1996 values
No need to fetch from iNat
Total in csv: 4002 values 
No duplicates in csv
True


In [62]:
def get_radius_info(taxons_df_start, taxons_df_finish, radiuses_dataset_path, start_date, finish_date):

        start_date = str(start_date)
        finish_date = str(finish_date)
        df_tax_csv = pd.read_csv(
                index_col=False, filepath_or_buffer=radiuses_dataset_path)
                
        sort_dict = {radiuses[i]:i+2 for i in range(0,len(radiuses))}
        sort_dict[start_date] = 0
        sort_dict[finish_date] = 1
        sort_dict['count_diff'] = 2
        sort_dict['pos_start'] = 3
        sort_dict['pos_finish'] = 4
        sort_dict['pos_diff'] = 5
        sort_list = [('pos_finish',radiuses[i]) for i in range(0, len(radiuses))]

        def get_radius(taxons_list, date_to):
                df = pd.DataFrame()
                df['taxon_id'] = taxons_list['taxon_id']
                df_tax_csv_todate = df_tax_csv[(df_tax_csv['date'] == date_to) & (df_tax_csv['taxon_id'].isin(taxons_df_finish['taxon_id']))]
                df = df.merge(df_tax_csv_todate, how='left')
                return df

        df_start = get_radius(taxons_df_finish, start_date)
        # df_start = get_radius(taxons_df_start, start_date)
        df_finish = get_radius(taxons_df_finish, finish_date)

        df_compact = pd.concat([df_start, df_finish]).pivot(index=['taxon_id'], columns=['date', 'radius'], values='count').copy()
        
        def get_cool_indexes(column):
                series_sorted = column.sort_values()
                positions = series_sorted.ne(series_sorted.shift()).cumsum()
                positions = positions.align(column)[0]
                return positions 


        df_diff = df_compact.loc[:,finish_date] - df_compact.loc[:,start_date]
        df_diff = pd.concat([df_diff], keys=['count_diff'], axis=1)
        
        df_pos_start = df_compact[df_compact[(start_date, radiuses[-1])].notnull()][start_date].apply(get_cool_indexes, axis=0)
        df_pos_start = pd.concat([df_pos_start], keys=['pos_start'], axis=1)

        df_pos_finish = df_compact[finish_date].apply(get_cool_indexes, axis=0)
        df_pos_finish = pd.concat([df_pos_finish], keys=['pos_finish'], axis=1)
        df_pos_finish = df_pos_finish.sort_values(by=sort_list)

        df_pos_diff = df_pos_start['pos_start'] - df_pos_finish['pos_finish']
        df_pos_diff = pd.concat([df_pos_diff], keys=['pos_diff'], axis=1)

        # df_compact = df_compact.drop(start_date, axis=1)

        df_pos = pd.concat([df_compact,  df_diff, df_pos_start, df_pos_finish, df_pos_diff], axis=1)
        df_pos = df_pos.reindex(index=df_pos_finish.index)
        df_pos.sort_index(axis=1, level=[0,1], key=lambda x: x.map(sort_dict),  inplace=True)

        # old_idx = df_pos.index.to_frame()
        # old_idx.insert(0, 'taxon_rang', taxons_df_finish['taxon_rang'].to_list())
        # old_idx.insert(2, 'taxon_name', taxons_df_finish['taxon_name'].to_list())
        # df_pos.index = pd.MultiIndex.from_frame(old_idx)

        return df_pos

In [90]:
df_pos = get_radius_info(taxons_df_start, taxons_df_finish, radiuses_dataset_path, start_date, finish_date)
df_pos = df_pos.astype('Int64')    
df_pos.insert(0,('ifnew','new'), '') 
df_pos.dtypes 
# df_pos.loc[~df_pos.index.isin(taxons_df_start['taxon_id']),:] = 'new'
df_pos[('ifnew','new')].apply(lambda x: 'new' if x.isin(taxons_df_start['taxon_id']) else '')

# df_pos

AttributeError: 'str' object has no attribute 'isin'

In [66]:
df_pos[('ifnew','new')]

taxon_id
900207     full
1070919    full
556219     full
227391     full
210241     full
           ... 
1094814    full
790553     full
47434      full
50814      full
47170      full
Name: (ifnew, new), Length: 499, dtype: object

In [58]:
df_pos

date,ifnew,2022-08-31,2022-08-31,2022-08-31,2022-08-31,2022-11-30,2022-11-30,2022-11-30,2022-11-30,count_diff,count_diff,count_diff,count_diff,pos_start,pos_start,pos_start,pos_start,pos_finish,pos_finish,pos_finish,pos_finish,pos_diff,pos_diff,pos_diff,pos_diff
radius,new,20,200,2000,0,20,200,2000,0,20,200,2000,0,20,200,2000,0,20,200,2000,0,20,200,2000,0
taxon_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2
900207,new,0,2,26,27,1,3,27,28,1,1,1,1,1,2,1,1,1,1,1,1,0,1,0,0
1070919,,1,3,67,244,1,3,67,251,0,0,0,7,2,3,3,8,1,1,3,5,1,2,0,3
556219,,1,3,135,225,1,4,145,258,0,1,10,33,2,3,13,5,1,2,12,6,1,1,1,-1
227391,new,0,6,76,279,1,7,91,311,1,1,15,32,1,4,4,10,1,4,4,8,0,0,0,2
210241,new,0,8,42,784,1,10,66,1115,1,2,24,331,1,5,2,38,1,5,2,46,0,0,0,-8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1094814,new,232,22023,211917,2609734,355,28651,279291,2996359,123,6628,67374,386625,79,414,486,492,81,426,487,495,-2,-12,-1,-3
790553,new,334,21911,129176,634198,363,22340,132621,653432,29,429,3445,19234,80,413,484,489,82,424,485,492,-2,-11,-1,-3
47434,new,456,26457,184905,1435642,507,27114,191714,1553788,51,657,6809,118146,82,415,485,491,83,425,486,494,-1,-10,-1,-3
50814,,397,39634,382226,4472685,605,49155,477589,5068911,208,9521,95363,596226,81,416,487,493,84,427,488,496,-3,-11,-1,-3


In [15]:
old_idx = df_pos.index.to_frame()
# old_idx.insert(0, 'taxon_rang', taxons_df_finish['taxon_rang'].to_list())
old_idx.insert(0, 'new_taxon, taxons_df_finish['taxon_rang'].to_list())
old_idx.insert(2, 'taxon_name', taxons_df_finish['taxon_name'].to_list())
df_pos.index = pd.MultiIndex.from_frame(old_idx)


SyntaxError: invalid syntax (1289554512.py, line 3)

In [None]:
df_pos.loc[~df_pos.index.isin(taxons_df_start['taxon_id']),:] = 'new'

In [31]:
df_pos.insert(0,('ifnew','new'), '')
df_pos

ValueError: cannot insert ('ifnew', 'new'), already exists

In [32]:
df_pos

date,ifnew,2022-08-31,2022-08-31,2022-08-31,2022-08-31,2022-11-30,2022-11-30,2022-11-30,2022-11-30,count_diff,count_diff,count_diff,count_diff,pos_start,pos_start,pos_start,pos_start,pos_finish,pos_finish,pos_finish,pos_finish,pos_diff,pos_diff,pos_diff,pos_diff
radius,new,20,200,2000,0,20,200,2000,0,20,200,2000,0,20,200,2000,0,20,200,2000,0,20,200,2000,0
taxon_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2
900207,,0,2,26,27,1,3,27,28,1,1,1,1,1,2,1,1,1,1,1,1,0,1,0,0
1070919,,1,3,67,244,1,3,67,251,0,0,0,7,2,3,3,8,1,1,3,5,1,2,0,3
556219,,1,3,135,225,1,4,145,258,0,1,10,33,2,3,13,5,1,2,12,6,1,1,1,-1
227391,,0,6,76,279,1,7,91,311,1,1,15,32,1,4,4,10,1,4,4,8,0,0,0,2
210241,,0,8,42,784,1,10,66,1115,1,2,24,331,1,5,2,38,1,5,2,46,0,0,0,-8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1094814,,232,22023,211917,2609734,355,28651,279291,2996359,123,6628,67374,386625,79,414,486,492,81,426,487,495,-2,-12,-1,-3
790553,,334,21911,129176,634198,363,22340,132621,653432,29,429,3445,19234,80,413,484,489,82,424,485,492,-2,-11,-1,-3
47434,,456,26457,184905,1435642,507,27114,191714,1553788,51,657,6809,118146,82,415,485,491,83,425,486,494,-1,-10,-1,-3
50814,,397,39634,382226,4472685,605,49155,477589,5068911,208,9521,95363,596226,81,416,487,493,84,427,488,496,-3,-11,-1,-3


date,ifnew,2022-08-31,2022-08-31,2022-08-31,2022-08-31,2022-11-30,2022-11-30,2022-11-30,2022-11-30,count_diff,count_diff,count_diff,count_diff,pos_start,pos_start,pos_start,pos_start,pos_finish,pos_finish,pos_finish,pos_finish,pos_diff,pos_diff,pos_diff,pos_diff
radius,new,20,200,2000,0,20,200,2000,0,20,200,2000,0,20,200,2000,0,20,200,2000,0,20,200,2000,0
taxon_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2
900207,new,0,2,26,27,1,3,27,28,1,1,1,1,1,2,1,1,1,1,1,1,0,1,0,0
227391,new,0,6,76,279,1,7,91,311,1,1,15,32,1,4,4,10,1,4,4,8,0,0,0,2
210241,new,0,8,42,784,1,10,66,1115,1,2,24,331,1,5,2,38,1,5,2,46,0,0,0,-8
77216,new,0,12,93,1042,1,13,96,1076,1,1,3,34,1,8,7,48,1,6,5,44,0,2,2,4
341438,new,0,10,151,521,1,14,243,775,1,4,92,254,1,6,14,20,1,7,19,31,0,-1,-5,-11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48670,new,228,5637,124660,1229460,246,5922,132563,1321776,18,285,7903,92316,78,406,483,490,80,416,484,493,-2,-10,-1,-3
1094814,new,232,22023,211917,2609734,355,28651,279291,2996359,123,6628,67374,386625,79,414,486,492,81,426,487,495,-2,-12,-1,-3
790553,new,334,21911,129176,634198,363,22340,132621,653432,29,429,3445,19234,80,413,484,489,82,424,485,492,-2,-11,-1,-3
47434,new,456,26457,184905,1435642,507,27114,191714,1553788,51,657,6809,118146,82,415,485,491,83,425,486,494,-1,-10,-1,-3


In [38]:
df_pos

date,ifnew,2022-08-31,2022-08-31,2022-08-31,2022-08-31,2022-11-30,2022-11-30,2022-11-30,2022-11-30,count_diff,count_diff,count_diff,count_diff,pos_start,pos_start,pos_start,pos_start,pos_finish,pos_finish,pos_finish,pos_finish,pos_diff,pos_diff,pos_diff,pos_diff
radius,new,20,200,2000,0,20,200,2000,0,20,200,2000,0,20,200,2000,0,20,200,2000,0,20,200,2000,0
taxon_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2
900207,,0,2,26,27,1,3,27,28,1,1,1,1,1,2,1,1,1,1,1,1,0,1,0,0
1070919,,1,3,67,244,1,3,67,251,0,0,0,7,2,3,3,8,1,1,3,5,1,2,0,3
556219,,1,3,135,225,1,4,145,258,0,1,10,33,2,3,13,5,1,2,12,6,1,1,1,-1
227391,,0,6,76,279,1,7,91,311,1,1,15,32,1,4,4,10,1,4,4,8,0,0,0,2
210241,,0,8,42,784,1,10,66,1115,1,2,24,331,1,5,2,38,1,5,2,46,0,0,0,-8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1094814,,232,22023,211917,2609734,355,28651,279291,2996359,123,6628,67374,386625,79,414,486,492,81,426,487,495,-2,-12,-1,-3
790553,,334,21911,129176,634198,363,22340,132621,653432,29,429,3445,19234,80,413,484,489,82,424,485,492,-2,-11,-1,-3
47434,,456,26457,184905,1435642,507,27114,191714,1553788,51,657,6809,118146,82,415,485,491,83,425,486,494,-1,-10,-1,-3
50814,,397,39634,382226,4472685,605,49155,477589,5068911,208,9521,95363,596226,81,416,487,493,84,427,488,496,-3,-11,-1,-3


In [25]:
x=900207
x.isin(taxons_df_start['taxon_id'])

AttributeError: 'int' object has no attribute 'isin'

taxon_id
900207      900207
227391      227391
210241      210241
77216        77216
341438      341438
            ...   
48670        48670
1094814    1094814
790553      790553
47434        47434
47170        47170
Name: taxon_id, Length: 255, dtype: int64

In [None]:
len(sorted(list(set(taxons_df_finish['taxon_id'].to_list()) - set(taxons_df_start['taxon_id'].to_list()))))

255

In [None]:
df_taxons[df_taxons['created_at']<=start_date][['taxon_id']].drop_duplicates()
df_taxons[df_taxons['created_at']<=finish_date][['taxon_id','']].drop_duplicates().sort_values(by='taxon_id')

KeyError: "[''] not in index"

In [None]:

# df_compact.apply((lambda x: x+x.name), axis=1)
df_compact

date,2022-08-31,2022-08-31,2022-08-31,2022-08-31,2022-11-30,2022-11-30,2022-11-30,2022-11-30
radius,20,200,2000,0,20,200,2000,0
taxon_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
4514,12.0,62.0,589.0,1610.0,12.0,62.0,600.0,1681.0
7823,124.0,9863.0,87637.0,548251.0,140.0,10525.0,94960.0,587604.0
17871,,,,,19.0,3300.0,23902.0,40607.0
25488,,,,,14.0,398.0,2380.0,3097.0
25591,28.0,1680.0,10508.0,28382.0,39.0,1736.0,11192.0,29781.0
...,...,...,...,...,...,...,...,...
1123509,1.0,97.0,503.0,643.0,2.0,122.0,578.0,747.0
1290817,,,,,6.0,556.0,5713.0,18857.0
1321226,,,,,14.0,259.0,5963.0,10249.0
1366821,,,,,13.0,402.0,2041.0,22093.0


In [None]:
def get_taxproperty(taxon_id, taxproperty):
    info = taxons_df_finish[taxons_df_finish['taxon_id'] == taxon_id][taxproperty].values[0]
    return info

taxon_id = 349797
taxproperty = 'taxon_rang'

a = get_taxproperty(taxon_id, taxproperty)
a

'taxon_species_name'

In [None]:
column_to_sort_by_1 = (str(finish_date),radiuses[0])
column_to_sort_by_2 = (str(finish_date),radiuses[1])
df_to_sort = df_compact.sort_values(by=[column_to_sort_by_1,column_to_sort_by_2])
df_to_sort



In [None]:
# обновляем с inat датасет через udpate_radius, если необходимо,
# создаём датасет со столбцами: taxon_id, taxon_rang, taxon_name, r20pos, r20, r200pos, r200, r2000pos, r2000, r0pos, r0
if (update_radius(taxons_df_start, radiuses, radiuses_dataset_path, start_date) and \
        update_radius(taxons_df_finish, radiuses, radiuses_dataset_path, finish_date)) == True:
    radiuses_ds_start = get_radius(
        taxons_df_start, radiuses_dataset_path, start_date)
    radiuses_ds_finish = get_radius(
        taxons_df_finish, radiuses_dataset_path, finish_date)
else:
    print('Can not update radiuses-quantity information')

# получаем таблицу со столбцами: r20shiftpos, r20rang, r20count (с плюсом), r20common_name, r20science_name,
#                               r200shiftpos, r200rang, r200count (с плюсом), r200common_name, r200science_name,
#                               r2000shiftpos, r2000rang, r2000count (с плюсом), r2000common_name, r2000science_name,
#                               r0_shiftpos, r0_rang, r0_count (с плюсом), r0_common_name, r0_science_name,
# отсортированную по убыванию кол-ва в каждом столбце, только для рангов species, hybrid и form

Going to check in csv: 980 values
Total in csv: 3090 values 
Already in csv: 980 values
No need to fetch from iNat
Total in csv: 3090 values 
No duplicates in csv
True
Can not update radiuses-quantity information
