In [369]:
from urllib.request import urlopen
import json, re
import numpy as np
import pandas as pd
import time

In [370]:
# access data from PBDB
start = time.time()
url = 'https://paleobiodb.org/data1.2/occs/strata.json?interval=Phanerozoic&textresult'
r = urlopen(url)
result= r.read().decode('utf-8')
data_strata = json.loads(result)

url = 'https://paleobiodb.org/data1.2/intervals/list.json?all_records'
r = urlopen(url)
# This should put the response from API in a Dict
result= r.read().decode('utf-8')
data_intvl = json.loads(result)

#print(res.keys())

In [371]:
# create a list of reference ids for PBDB URL
def get_vals(nested, key):
    result = []
    if isinstance(nested, list) and nested != []:   #non-empty list
        for lis in nested:
            result.extend(get_vals(lis, key))
    elif isinstance(nested, dict) and nested != {}:   #non-empty dict
        for val in nested.values():
            if isinstance(val, (list, dict)):   #(list or dict) in dict
                result.extend(get_vals(val, key))
        if key in nested.keys():   #key found in dict
            result.append(nested[key])
    return result

refids=get_vals(data_intvl, 'rid') # list in form 
refids = sum(refids, [])# list in form "ref:12345"
new_refids=[]
#print(refids)
for string in refids:
    new_string = string.replace("ref:", "")
    new_refids.append(new_string)
new_refids = [int(i) for i in new_refids] # list of reference id's as integers
#print(new_refids)

In [372]:
# add reference ids to URL string
url_raw="https://paleobiodb.org/data1.2/refs/list.json?ref_id="
refids_string = ' '.join([str(elem) for elem in new_refids])
refids_string = refids_string.replace(" ", ",")
url1=url_raw+refids_string
#print(url)

In [373]:
# access PBDB references
r = urlopen(url1)
# This should put the response from API in a Dict
result= r.read().decode('utf-8')
data_PB_refs = json.loads(result)

In [374]:
# access RNames references
url = 'http://rnames-staging.it.helsinki.fi/api/references/'
r = urlopen(url)
result= r.read().decode('utf-8')
data_RN_refs = json.loads(result)
ende = time.time()

print("download took:",(ende - start)/60, "mins")

download took: 0.3041657328605652 mins


In [390]:
# map PBDB references into RNames refs

# transfer to pandas dataframes
res_refs_RN = pd.DataFrame(data_RN_refs['results'])
res_refs_PB = pd.DataFrame(data_PB_refs['records'])
print(res_refs_PB)

#combine PBDB and RNames references
res_refs_RN = res_refs_RN[['id', 'first_author', 'year', 'title']]
res_refs_PB['first_author']=res_refs_PB['al1'].astype(str)+', '+res_refs_PB['ai1']
res_refs_PB = res_refs_PB[['oid', 'first_author', 'pby', 'tit']]
res_refs_PB.rename(columns={"oid": "id", "pby": "year", "tit": "title"})
res_refs = pd.concat([res_refs_RN.reset_index(drop=True), res_refs_PB], axis=0)

          oid                pty  \
0    ref:8850    journal article   
1   ref:33153    journal article   
2   ref:18155   serial monograph   
3   ref:42596    journal article   
4   ref:10010        unpublished   
5    ref:9569  book/book chapter   
6   ref:19210        unpublished   
7   ref:42499    journal article   
8   ref:47900               book   
9   ref:15443  book/book chapter   
10   ref:9098    journal article   
11   ref:5954               book   
12  ref:23788    journal article   
13  ref:42599    journal article   
14   ref:9184    journal article   
15  ref:30675    journal article   
16   ref:9006  book/book chapter   
17   ref:2237    journal article   
18  ref:18322    journal article   
19   ref:8933    journal article   
20  ref:40570    journal article   
21  ref:23736    journal article   
22  ref:33105    journal article   
23  ref:29103    journal article   
24  ref:42679    journal article   
25   ref:8851  book/book chapter   
26   ref:8960    journal art

In [377]:
# prepare Structured Names of PBDB
# transfer to pandas dataframes
res_strat_PB = pd.DataFrame(data_strata['records'])
res_intvl_PB = pd.DataFrame(data_intvl['records'])

res_strat_PB = res_strat_PB[['sgr', 'sfm', 'smb', 'eag', "lag"]]
res_intvl_PB = res_intvl_PB[['lvl','nam', 'eag', 'lag', 'rid']]

strat_gr = res_strat_PB[['sgr', 'eag', "lag"]]
strat_gr['Qualifier'] = "Group"
strat_fm = res_strat_PB[['sfm', 'eag', "lag"]]
strat_fm['Qualifier'] = "Formation" 
strat_mbr = res_strat_PB[['smb', 'eag', "lag"]]
strat_mbr['Qualifier'] = "Member" 
strat_gr.rename(columns={"sgr": "Name"},inplace = True)
strat_fm.rename(columns={"sfm": "Name"},inplace = True)
strat_mbr.rename(columns={"smb": "Name"},inplace = True)
res_strat_PB = pd.concat([strat_gr, strat_fm, strat_mbr], axis=0)
res_strat_PB = res_strat_PB.dropna()

intvl_eon = res_intvl_PB[res_intvl_PB['lvl']==1.0]
intvl_era = res_intvl_PB[res_intvl_PB['lvl']==2.0]
intvl_period = res_intvl_PB[res_intvl_PB['lvl']==3.0]
intvl_epoch = res_intvl_PB[res_intvl_PB['lvl']==4.0]
intvl_age = res_intvl_PB[res_intvl_PB['lvl']==5.0]
intvl_age_r = res_intvl_PB[res_intvl_PB['lvl'].isnull()]

intvl_eon['lvl'] = "Eon" 
intvl_era['lvl'] = "Era"
intvl_period['lvl'] = "Period"
intvl_epoch['lvl'] = "Epoch"
intvl_age['lvl'] = "Stage"
intvl_age_r['lvl'] = "Regio_Stage" 

res_intvl_PB = pd.concat([intvl_eon, intvl_era, intvl_period,
                         intvl_epoch, intvl_age, intvl_age_r], axis=0)
res_intvl_PB = res_intvl_PB.dropna()

res_intvl_PB.rename(columns={"nam": "Name", "lvl":"Qualifier", "rid":"Reference"},inplace = True)


        Qualifier              Name     eag     lag    Reference
1071          Eon       Phanerozoic   541.0     0.0  [ref:47900]
1106          Eon       Proterozoic  2500.0   541.0  [ref:47900]
1114          Eon           Archean  4000.0  2500.0  [ref:15443]
1118          Eon            Hadean  4600.0  4000.0   [ref:5954]
238           Era          Cenozoic    66.0     0.0  [ref:47900]
...           ...               ...     ...     ...          ...
1102  Regio_Stage          Huronian  1650.0  1650.0   [ref:5954]
1112  Regio_Stage     Early Imbrian  3850.0  1650.0   [ref:5954]
1113  Regio_Stage         Nectarian  3950.0  3850.0   [ref:5954]
1116  Regio_Stage  Basin Groups 1-9  4150.0  3950.0   [ref:5954]
1117  Regio_Stage           Cryptic  4560.0  4150.0   [ref:5954]

[1119 rows x 5 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org

In [386]:
# binning rule P1
# three time slice options period 3/ stage 5 / selected
time_slice_s = res_intvl_PB[res_intvl_PB.Qualifier == "Stage"]
time_slice_p = res_intvl_PB[res_intvl_PB.Qualifier == "Period"]
time_slice_a = res_intvl_PB.iloc[130:131]
time_slice=time_slice_a
#print(time_slice_a)

    Qualifier      Name    eag    lag   Reference
568     Stage  Asselian  298.9  295.5  [ref:5954]


In [387]:
start = time.time()
time_slice=time_slice_a
time_slice=time_slice.sort_values('eag')
# name | oldest | youngest | time slices | reference
for i in range(0,len(time_slice)):
    eab = time_slice['eag'].iloc[i]
    lab = time_slice['lag'].iloc[i]
    ts = time_slice['Name'].iloc[i]
    x_strat = res_strat_PB[(res_strat_PB['eag']<=eab) & (res_strat_PB['lag']>=lab)]
    x_intvl = res_intvl_PB[(res_intvl_PB['eag']<=eab) & (res_intvl_PB['lag']>=lab)]
    x_strat['Oldest'] = ts
    x_strat['Youngest'] = ts
    x_intvl['Oldest'] = ts
    x_intvl['Youngest'] = ts
    if i==0:
        strat_bin_raw = x_strat
        intvl_bin_raw = x_intvl
    else:
        strat_bin_raw = strat_bin_raw.append(x_strat,ignore_index=True)
        intvl_bin_raw = intvl_bin_raw.append(x_intvl,ignore_index=True)
        
strat_bin_raw =  strat_bin_raw[~(strat_bin_raw['Name']==strat_bin_raw['Oldest'])]
intvl_bin_raw =  intvl_bin_raw[~(intvl_bin_raw['Name']==intvl_bin_raw['Oldest'])]
strat_bin_raw['Reference'] = "Paleobiology Database"
PBDB_bin_raw = intvl_bin_raw.append(strat_bin_raw,ignore_index=True)
PBDB_bin_raw =  PBDB_bin_raw.drop_duplicates(subset=['Qualifier','Name','Oldest'], keep="first")
#print(PBDB_bin_raw)

ende = time.time()
print("first loop took:",(ende - start)/60, "mins")

#
PBDB_bin_raw_dup = PBDB_bin_raw[PBDB_bin_raw.duplicated(['Name'])]
PBDB_bin_raw_nondup = PBDB_bin_raw[~(PBDB_bin_raw.duplicated(['Name']))]
namelist = PBDB_bin_raw_dup.Name.unique()
#print(PBDB_bin_raw_nondup)

start = time.time()
for i in range(0,len(namelist)):
    x_PBDB_bin_dup = PBDB_bin_raw_dup[PBDB_bin_raw_dup['Name']== namelist[i]]
    x_bin = x_PBDB_bin_dup.Oldest.unique()
    youngest = x_bin[0]
    oldest = x_bin[len(x_bin)-1]
    x_PBDB_bin_dup['Oldest'] = oldest
    x_PBDB_bin_dup['Youngest'] = youngest
    if i==0:
        PBDB_bin_dup_c = x_PBDB_bin_dup
    else:
        PBDB_bin_dup_c = PBDB_bin_dup_c.append(x_PBDB_bin_dup,ignore_index=True)

ende = time.time()
print("second loop took:",(ende - start)/60, "mins")
        
PBDB_bin_dup_c =  PBDB_bin_dup_c.drop_duplicates(subset=['Youngest','Name','Oldest'], keep="first")
PBDB_binned_raw = PBDB_bin_dup_c.append(PBDB_bin_raw_nondup,ignore_index=True)
PBDB_binned = PBDB_binned_raw[['Name', 'Qualifier','Oldest', 'Youngest','Reference']]
print(PBDB_binned)

first loop took: 0.0003879268964131673 mins
second loop took: 0.00033446947733561195 mins
                         Name  Qualifier    Oldest  Youngest  \
0                       Dorud  Formation  Asselian  Asselian   
1                     Itararé  Formation  Asselian  Asselian   
2                 Slavyanskoy  Formation  Asselian  Asselian   
3                  Cottonwood     Member  Asselian  Asselian   
4                     Florena     Member  Asselian  Asselian   
..                        ...        ...       ...       ...   
163                      Howe     Member  Asselian  Asselian   
164  Upper Marietta Sandstone     Member  Asselian  Asselian   
165             Lontras Shale     Member  Asselian  Asselian   
166        1. Flöz (1st seam)     Member  Asselian  Asselian   
167                        M6     Member  Asselian  Asselian   

                 Reference  
0    Paleobiology Database  
1    Paleobiology Database  
2    Paleobiology Database  
3    Paleobiology Databas

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = va

In [389]:
# access RNames references
url = 'https://rnames-staging.it.helsinki.fi/api/relations/'
r = urlopen(url)
result= r.read().decode('utf-8')
data_RN_rels = json.loads(result)
res_rels_RN = pd.DataFrame(data_RN_rels['results'])
print(res_rels_RN)

      id  belongs_to                                           name_one  \
0  22437           0  http://rnames-staging.it.helsinki.fi/api/struc...   
1  22436           0  http://rnames-staging.it.helsinki.fi/api/struc...   
2  22432           0  http://rnames-staging.it.helsinki.fi/api/struc...   
3  22439           0  http://rnames-staging.it.helsinki.fi/api/struc...   
4  22438           0  http://rnames-staging.it.helsinki.fi/api/struc...   
5  22430           0  http://rnames-staging.it.helsinki.fi/api/struc...   
6  22444           0  http://rnames-staging.it.helsinki.fi/api/struc...   
7  22446           0  http://rnames-staging.it.helsinki.fi/api/struc...   
8  22445           0  http://rnames-staging.it.helsinki.fi/api/struc...   
9  22443           0  http://rnames-staging.it.helsinki.fi/api/struc...   

                                            name_two  \
0  http://rnames-staging.it.helsinki.fi/api/struc...   
1  http://rnames-staging.it.helsinki.fi/api/struc...   
2  htt