In [1]:
#basic package
import json
import tqdm
import os
import numpy as np
import glob
import pandas as pd
import sys
import shutil
import pickle
import collections
from collections import Counter
import re

In [2]:
# Root directory of the project
ROOT_DIR = os.path.abspath("../")
path_data = os.path.join(ROOT_DIR,'datasets')

# Andrew list of species and synonymes

### download data

In [3]:
#to know which encoding it is:
#with open(os.path.join(path_data, 'synonyms/trd18-splitted-synonyms_simpler.csv')) as f:
#    print(f)
df = pd.read_csv(os.path.join(path_data, 'synonyms/trd18-splitted-synonyms_simpler.csv'), 
                 encoding='latin-1' )
print(df.shape)
df.head(3)

(3730, 2)


Unnamed: 0,binomial,split2
0,Hydrophis peronii,"c(""Acalyptus Peronii"", ""? Acalyptus"", ""Acalypt..."
1,Acanthophis antarcticus,"c(""Boa antarctica"", ""Acanthophis cerastinus"", ..."
2,Acanthophis hawkei,"c(""Acanthophis hawkei"", ""Acanthophis cummingi""..."


### cleaning

In [4]:
#small example
#[x.strip().strip('\"').strip('?').strip() for x in df.iloc[1]['split2'].strip('c(').strip(')').split(',')]

In [5]:
def clean_syn(x):
    
    #small cleaning
    li = [i.strip().strip('\"').strip('?').strip() for i in x.strip('c(').strip(')').split(',')]
    
    #enlver ce quil y a entre parenthèse
    li = [re.sub(r'\([^)]*\)', '', i) for i in li ]
    
    #remove NA or empty syn
    li = [i for i in li if i not in [np.nan, 'NA', '', None]]
    return(list(set(li)))

In [6]:
df['split2_clean'] = df['split2'].map(lambda x: clean_syn(x))

In [7]:
df.head(3)

Unnamed: 0,binomial,split2,split2_clean
0,Hydrophis peronii,"c(""Acalyptus Peronii"", ""? Acalyptus"", ""Acalypt...","[Pseudodisteira horrida, Hydrophis peronii, Ac..."
1,Acanthophis antarcticus,"c(""Boa antarctica"", ""Acanthophis cerastinus"", ...","[Vipera sorda, Acanthophis brownii, Ophryas ac..."
2,Acanthophis hawkei,"c(""Acanthophis hawkei"", ""Acanthophis cummingi""...","[Acanthophis hawkei, Acanthophis cummingi]"


### remove syn that equal its species name

In [8]:
df['li_synonyms'] = df.apply(lambda x: [i for i in x['split2_clean'] if i!=x['binomial']], axis=1)

### remove synonyms that is already a name of a species (keeping list)

In [9]:
li_species = df['binomial'].tolist()
df['li_synonyms_clean'] = df['li_synonyms'].map(lambda x: [i for i in x if i not in li_species])

In [10]:
df['syn_equal_other_species'] = df.apply(lambda x: [i for i in x['li_synonyms'] if i not in x['li_synonyms_clean']],
                                              axis=1)

In [11]:
#save list of species which was once used as synonymes for another species
li_species_was_syn_of_other_species = df['syn_equal_other_species'].tolist()
li_species_was_syn_of_other_species = list(set([i for x in li_species_was_syn_of_other_species for i in x if len(x)>0]))
print('There is %d species which names was use as a synonyme for another species at least once'%len(li_species_was_syn_of_other_species))
li_species_was_syn_of_other_species
pickle.dump(li_species_was_syn_of_other_species, open(os.path.join(path_data,'li_species_was_syn_of_other_species.pkl'), 'wb'))

There is 491 species which names was use as a synonyme for another species at least once


In [12]:
#for example one can see that a synonyme of Acanthophis pyrrhus is Acanthophis antarcticus, which is a name of 
#another species.

### remove synonymes that are in used for multiple species

In [13]:
li_syn = df['li_synonyms_clean'].tolist()
li_syn = [i for x in li_syn for i in x if len(x)>0]
len(li_syn)

10976

In [14]:
#create a list of synonymes that appeared in several species, then remove it 
c = Counter(li_syn)
li_syn_removed = [k for k,v in c.items() if v>1]
len(li_syn_removed)

816

In [15]:
df['li_synonyms_final'] = df['li_synonyms_clean'].map(lambda x: [i for i in x if i not in li_syn_removed])

In [16]:
df

Unnamed: 0,binomial,split2,split2_clean,li_synonyms,li_synonyms_clean,syn_equal_other_species,li_synonyms_final
0,Hydrophis peronii,"c(""Acalyptus Peronii"", ""? Acalyptus"", ""Acalypt...","[Pseudodisteira horrida, Hydrophis peronii, Ac...","[Pseudodisteira horrida, Acalyptus Peronii, Ac...","[Pseudodisteira horrida, Acalyptus Peronii, Ac...",[],"[Pseudodisteira horrida, Acalyptus Peronii, Ac..."
1,Acanthophis antarcticus,"c(""Boa antarctica"", ""Acanthophis cerastinus"", ...","[Vipera sorda, Acanthophis brownii, Ophryas ac...","[Vipera sorda, Acanthophis brownii, Ophryas ac...","[Vipera sorda, Acanthophis brownii, Ophryas ac...",[],"[Vipera sorda, Acanthophis brownii, Ophryas ac..."
2,Acanthophis hawkei,"c(""Acanthophis hawkei"", ""Acanthophis cummingi""...","[Acanthophis hawkei, Acanthophis cummingi]",[Acanthophis cummingi],[Acanthophis cummingi],[],[Acanthophis cummingi]
3,Acanthophis praelongus,"c(""Acanthophis praelongus"", ""Acanthophis prael...",[Acanthophis praelongus],[],[],[],[]
4,Acanthophis pyrrhus,"c(""Acanthophis pyrrhus"", ""Acanthophis antarcti...","[Acanthophis pyrrhus, Acanthophis antarcticus,...","[Acanthophis antarcticus, Aggressiserpens arms...","[Aggressiserpens armstrongi, Aggressiserpens p...",[Acanthophis antarcticus],"[Aggressiserpens armstrongi, Aggressiserpens p..."
5,Acanthophis wellsi,"c(""Acanthophis wellsei"", ""Acanthophis wellsi"",...","[Aggressiserpens wellsi, Acanthophis wellsi, A...","[Aggressiserpens wellsi, Acanthophis wellsei]","[Aggressiserpens wellsi, Acanthophis wellsei]",[],"[Aggressiserpens wellsi, Acanthophis wellsei]"
6,Achalinus ater,"c(""Achalinus niger"", ""Achalinus ater"", ""Achali...","[Achalinus ater, Achalinus niger]",[Achalinus niger],[],[Achalinus niger],[]
7,Achalinus formosanus,"c(""Achalinus formosanus"", ""Achalinopsis sauter...","[Achalinus formosanus, Achalinopsis sauteri]",[Achalinopsis sauteri],[Achalinopsis sauteri],[],[Achalinopsis sauteri]
8,Achalinus hainanus,"c(""Achalinus hainanus"", ""Achalinus hainanus"", ...",[Achalinus hainanus],[],[],[],[]
9,Achalinus jinggangensis,"c(""Achalinopsis jinggangensis"", ""Achalinus jin...","[Achalinopsis jinggangensis, Achalinus jinggan...",[Achalinopsis jinggangensis],[Achalinopsis jinggangensis],[],[Achalinopsis jinggangensis]


In [17]:
#save it
df.to_csv(os.path.join(path_data, 'synonyms/df_species_syn_andrew.csv'),index=False,sep=',')

# language translation as synonymes (from wikipedia)

### download data

In [18]:
f = glob.glob(os.path.join(path_data,'wikipedia','df_species_language.csv'))
if len(f)>0:
    df_wiki_syn = pd.read_csv(f[0],sep=';')
else:
    print('NOE LANGUAGE TRANSLATION YET, CAN NOT RUN BELOW')
    sys.exit()

In [19]:
print(df_wiki_syn.shape)
df_wiki_syn.head(3)

(1757, 144)


Unnamed: 0,Afrikaans,Albanian,Alemannisch,Arabic,Aragonese,Armenian,Arpitan,Assamese,Asturian,Aymara,...,Waray,Welsh,Western Frisian,Western Mari,Western Punjabi,Xhosa,Zazaki,pageid,species,title
0,,,,,,,,,,,...,Acalyptophis peronii,,,,اکالیپٹوفس پیرونی,,,26760494,Hydrophis peronii,Hydrophis peronii
1,,,,,,,,,,,...,Acanthophis antarcticus,,,,موتوالا سپ,,,6166502,Acanthophis antarcticus,Common death adder
2,,,,,,,,,,,...,Acanthophis hawkei,,,,,,,49485486,Acanthophis hawkei,Acanthophis hawkei


### preprocessing

#### add variable: list of unique language

In [20]:
li_language = [l for l in df_wiki_syn.columns if l not in ['species','title','pageid']]
print('%d language was found in the wikipedia research'%len(li_language))
df_wiki_syn['li_lang_syn'] = df_wiki_syn.apply(lambda x: [l for l in set([x[i] for i in li_language]) if str(l)!='nan'],
                                               axis=1)

141 language was found in the wikipedia research


In [21]:
df_wiki_syn.head(3)

Unnamed: 0,Afrikaans,Albanian,Alemannisch,Arabic,Aragonese,Armenian,Arpitan,Assamese,Asturian,Aymara,...,Welsh,Western Frisian,Western Mari,Western Punjabi,Xhosa,Zazaki,pageid,species,title,li_lang_syn
0,,,,,,,,,,,...,,,,اکالیپٹوفس پیرونی,,,26760494,Hydrophis peronii,Hydrophis peronii,"[Hydrophis peronii, اکالیپٹوفس پیرونی, Acalypt..."
1,,,,,,,,,,,...,,,,موتوالا سپ,,,6166502,Acanthophis antarcticus,Common death adder,"[Гадюкообразная смертельная змея, Vipère de la..."
2,,,,,,,,,,,...,,,,,,,49485486,Acanthophis hawkei,Acanthophis hawkei,[Acanthophis hawkei]


In [22]:
#remove from this list the syn beign the species itself
df_wiki_syn['li_lang_syn'] = df_wiki_syn.apply(lambda x: [i for i in x['li_lang_syn'] if i!=x['species']], axis=1)

In [23]:
li_lang_syn = [j for i in df_wiki_syn['li_lang_syn'].tolist() for j in i]
len(li_lang_syn)

5731

In [24]:
c = Counter(li_lang_syn)
li_multiple_times = [k for k,v in c.items() if v>1]
print(len(li_multiple_times))

44


In [27]:
#remove from this list the syn appearing in other species as well
df_wiki_syn['li_lang_syn'] = df_wiki_syn['li_lang_syn'].map(lambda x: [i for i in x if i not in li_multiple_times])

In [28]:
df_wiki_syn.head(3)

Unnamed: 0,Afrikaans,Albanian,Alemannisch,Arabic,Aragonese,Armenian,Arpitan,Assamese,Asturian,Aymara,...,Welsh,Western Frisian,Western Mari,Western Punjabi,Xhosa,Zazaki,pageid,species,title,li_lang_syn
0,,,,,,,,,,,...,,,,اکالیپٹوفس پیرونی,,,26760494,Hydrophis peronii,Hydrophis peronii,"[اکالیپٹوفس پیرونی, Acalyptophis peronii, مار ..."
1,,,,,,,,,,,...,,,,موتوالا سپ,,,6166502,Acanthophis antarcticus,Common death adder,"[Гадюкообразная смертельная змея, Vipère de la..."
2,,,,,,,,,,,...,,,,,,,49485486,Acanthophis hawkei,Acanthophis hawkei,[]


# synonyms from pdf book

In [22]:
#TODO

# dico of all synonymes (keys=species, values=list of syn)

### from andrew list

In [29]:
dico_species_lisyn = dict(zip(df['binomial'], df['li_synonyms_final']))

### from wiki languages

In [31]:
dico_species_lisyn_ = dico_species_lisyn.copy()
for k,v in dico_species_lisyn.items():
    d = df_wiki_syn[df_wiki_syn['species']==k]
    #if it has one line in teh wiki dataframe 
    if d.shape[0]==1:
        dico_species_lisyn_[k] = list(set(v+d['li_lang_syn'].values[0]))
        #if len(dico_species_lisyn_[k])>len(v):
        #    x1 = len(dico_species_lisyn_[k])-len(v)
        #    print('We improved of %d synonymes for species %s thanks to wiki language'%(x1,k))
    if d.shape[0]>1:
        print('species %s have more than one lin ein the wiki df, check why'%k)

In [35]:
pickle.dump(dico_species_lisyn_, open(os.path.join(path_data,'synonyms','dico_species_lisyn_.pkl'), 'wb'))

In [38]:
#load to inspect
#dico_species_lisyn__ = pickle.load(open(os.path.join(path_data,'synonyms','dico_species_lisyn_.pkl'), 'rb'))

In [None]:
#TODO: 
#flickr,herpmapper,inaturalist in other language as well, and see how much new data it does.
#Andrew meeting
#reunited_all_datasource_for_dl
#README

#pdf
#wiki automatically add image?
#algo (look at what left in NIH first etc)