The final purpose of this file is to obtain for each plant species the regions in which it is native and in which it is non-native.

To do so, first we manually download from https://powo.science.kew.org the dataset containing metadata related to all the species.
Then we extract the ids of the plants we are interested in and from those we can obtain the native and non-native regions.

In [1]:
import pandas as pd
import os

In [2]:
#---- inputs ----

#WCVP tables: one links each species name to its id, the other holds the location distribution per id.
#for more information refer to the README file in the support files inside wcvp folder
wcvp_names_location = './support_files/wcvp/wcvp_names.csv'
wcvp_distribution_location = './support_files/wcvp/wcvp_distribution.csv'

#species names plus initial suggestions for their regions; only the species list is needed here
species_list_location = './support_files/species_list_lythrum.csv'


#---- outputs ----

#where the cleaned name list and the final distribution table are saved
plant_distribution_path = './support_files/plant_distribution.csv'
save_name_list_path = './support_files/invasive_plants_name_list.csv'

In [3]:
#Since the file containing the species list has a particular format, this function takes care of the preprocessing

def extract_names(species_list_location, save_name_list_path):
    """
    Read the species list, reduce every entry to its lowercase binomial name and save the result.

    Parameters
    ----------
    species_list_location : path of the comma-separated species file. Only its first column is read,
        and that column must be headed 'Species'.
    save_name_list_path : path where the cleaned, single-column CSV is written.

    Returns
    -------
    DataFrame with one 'Species' column holding names such as 'lythrum borysthenicum'.
    Rows whose name is missing or blank are left out.
    """

    species = pd.read_csv(species_list_location, sep=',', on_bad_lines='warn', usecols=[0])

    #a row with an empty first column is read as NaN, which has no string methods: drop it before cleaning
    species = species.dropna(subset=['Species'])

    #only keep the first two parts of the taxonomy, joined by one space and lowercased
    #e.g. from "Lythrum borysthenicum (M.Bieb. ex Schrank) Litv." only keep "lythrum borysthenicum"
    binomial = species['Species'].astype(str).str.split().str[:2].str.join(' ').str.lower()
    species = species.assign(Species=binomial)

    #a whitespace-only entry collapses to an empty string, which is not a usable name
    species = species[species['Species'] != ''].reset_index(drop=True)

    #save it to csv to the desired path
    species.to_csv(save_name_list_path, index=False)

    return species

In [4]:
#build the cleaned species list (extract_names also writes it to save_name_list_path) and display it
species = extract_names(species_list_location, save_name_list_path)

species

Unnamed: 0,Species
0,lythrum acutangulum
1,lythrum alatum
2,lythrum album
3,lythrum americanum
4,lythrum anatolicum
5,lythrum baeticum
6,lythrum borysthenicum
7,lythrum bryantii
8,lythrum californicum
9,lythrum curtissii


Now I have the list of names in lowercase

From this list of names I will extract the ids of the plants from the Kew database (https://powo.science.kew.org), "Plants of the World Online", and specifically from its WCVP ("World Checklist of Vascular Plants") dataset, through the file "wcvp_names".

With the ids of the plants, I can check whether they are native or non-native in the different locations through the file "wcvp_distribution".

In [5]:
def extract_ids(wcvp_names_location, name_list_df):
    """
    From the file containing the metadata regarding the species name, extract the id related to each species.
    Merge it to the dataframe with the names and return a dataframe containing the name and the id (e.g. 'lythrum salicaria', 2354139)

    Parameters
    ----------
    wcvp_names_location : path of the '|'-separated WCVP names file; it must provide the columns
        'plant_name_id', 'taxon_status' and 'taxon_name'.
    name_list_df : DataFrame with a 'Species' column of lowercase names.

    Returns
    -------
    DataFrame with the columns 'Species' and 'plant_name_id'. A species without an accepted name in
    the WCVP file is kept, with a missing id (left merge).
    """

    #load only the three columns used below: the table is large and nothing else in it is needed here.
    #NOTE(review): reading it unrestricted echoed a pandas warning in the notebook output, presumably a
    #DtypeWarning caused by mixed-type columns that are no longer parsed - confirm
    wcvp_species_df = pd.read_csv(
        wcvp_names_location,
        sep='|',
        usecols=['plant_name_id', 'taxon_status', 'taxon_name'],
        dtype={'taxon_status': str, 'taxon_name': str},
    )

    #only keep species with accepted taxonomy, others might be "unofficial" names and therefore not reliable
    is_accepted = wcvp_species_df['taxon_status'] == 'Accepted'
    accepted_df = wcvp_species_df.loc[is_accepted, ['plant_name_id', 'taxon_name']].copy()

    #the names in name_list_df are lowercase, so lowercase the WCVP names before matching
    accepted_df['taxon_name'] = accepted_df['taxon_name'].str.lower()

    #merge the dataframe with the list of names with the one containing the id related to the name
    name_list_id = name_list_df.merge(accepted_df, how='left', left_on='Species', right_on='taxon_name')

    return name_list_id[['Species', 'plant_name_id']]

In [6]:
def extract_locations(name_list_id, wcvp_distribution_location):

    """
    From the file containing the location for which each species is native or introduced, group them all together.
    Return a dataframe containing the species taxon (e.g. lythrum salicaria), the id, the list of regions for which
    it is native and for which it is introduced.

    Parameters
    ----------
    name_list_id : DataFrame with a 'plant_name_id' column (plus the species name); ids may be missing.
    wcvp_distribution_location : path of the '|'-separated WCVP distribution file; it must provide the
        columns 'plant_name_id', 'area' and 'introduced' (0 = native, 1 = introduced).

    Returns
    -------
    A copy of name_list_id with two extra columns, 'introduced' and 'native', each holding a list of
    areas. A plant with no matching rows (or no id) gets an empty list in both columns.
    """

    #the distribution dataframe is quite big, so only load the columns that are used
    wcvp_distribution_df = pd.read_csv(
        wcvp_distribution_location,
        sep='|',
        usecols=['plant_name_id', 'area', 'introduced'],
    )

    #and only keep the rows of the plants we are interested in before doing any grouping
    wanted_ids = name_list_id['plant_name_id'].dropna().unique()
    relevant_df = wcvp_distribution_df[wcvp_distribution_df['plant_name_id'].isin(wanted_ids)]

    final_plant_distribution = name_list_id.reset_index(drop=True)

    #for each status we start from rows like
    #     plant_name_id | introduced | area
    #     1             | 0          | North America
    #     1             | 0          | South America
    #     1             | 1          | Europe
    #
    #groupby('plant_name_id')['area'].agg(list) collects the areas of one status into one list per plant, e.g. for native
    #     plant_name_id | area
    #     1             | ['North America', 'South America']
    #
    #mapping the ids of our plants through that series gives one column per status
    #     plant_name_id | introduced       | native
    #     1             | ['Europe']       | ['North America', 'South America']
    for status, flag in (('introduced', 1), ('native', 0)):
        status_rows = relevant_df.loc[relevant_df['introduced'] == flag]
        areas_by_id = status_rows.groupby('plant_name_id')['area'].agg(list)
        looked_up = final_plant_distribution['plant_name_id'].map(areas_by_id)

        #ids without any row for this status come back as NaN: give each of them its own empty list,
        #so every cell is a list and no list object is shared between rows
        final_plant_distribution[status] = pd.Series(
            [areas if isinstance(areas, list) else [] for areas in looked_up],
            index=final_plant_distribution.index,
            dtype=object,
        )

    return final_plant_distribution

In [7]:
def extract_plant_zones(species_list_location, wcvp_distribution_location, wcvp_names_location, save_name_list_path, plant_distribution_path):

    """
    Run the whole pipeline: species names -> WCVP ids -> native / introduced regions.

    The cleaned name list is loaded from save_name_list_path when that file already exists,
    otherwise it is built from species_list_location (and written to save_name_list_path).
    The resulting table is saved to plant_distribution_path and also returned: it holds the
    species taxon (e.g. lythrum salicaria), the id, the list of regions for which it is native
    and the list for which it is introduced.
    """

    #reuse the saved species list when there is one, build it otherwise
    names_already_saved = os.path.isfile(save_name_list_path)
    species_names = (
        pd.read_csv(save_name_list_path)
        if names_already_saved
        else extract_names(species_list_location, save_name_list_path)
    )

    #attach the WCVP id to every name, then the regions to every id
    species_with_ids = extract_ids(wcvp_names_location, species_names)
    plant_zones = extract_locations(species_with_ids, wcvp_distribution_location)

    #persist the result before handing it back
    plant_zones.to_csv(plant_distribution_path, index=False)

    return plant_zones

In [8]:
#run the whole pipeline with the paths configured at the top of the notebook, then display the resulting table

final_plant_distribution = extract_plant_zones(species_list_location, wcvp_distribution_location, wcvp_names_location, save_name_list_path, plant_distribution_path)

final_plant_distribution

  wcvp_species_df = pd.read_csv(wcvp_names_location, sep='|')


Unnamed: 0,Species,plant_name_id,introduced,native
0,lythrum acutangulum,2353976,[],"[France, Spain, Algeria, Morocco]"
1,lythrum alatum,2353973,[],"[Ontario, Colorado, Wyoming, Illinois, Iowa, K..."
2,lythrum album,2353969,[],"[Mexico Central, Mexico Northeast, Mexico Sout..."
3,lythrum americanum,2353966,[],[Mexico Gulf]
4,lythrum anatolicum,2353965,[],[Türkey]
5,lythrum baeticum,2353968,[],"[Spain, Morocco]"
6,lythrum borysthenicum,2898190,[Azores],"[Hungary, Corse, France, Portugal, Sardegna, S..."
7,lythrum bryantii,2353990,[],[Mexico Northwest]
8,lythrum californicum,2353987,[],"[Kansas, Oklahoma, Arizona, California, Nevada..."
9,lythrum curtissii,2353922,[],"[Florida, Georgia]"
