## Jupyter notebook source
https://colab.research.google.com/drive/1KDppE8ccqMBgdz5HjP_wi7DO4C4qAOAe?usp=sharing

## Install libraries

In [24]:
%%capture
!pip install wikidataintegrator
!pip install ipyplot

## Import libraries

In [25]:
from google.colab import files
import io
import pandas as pd
from wikidataintegrator import wdi_core
import requests
import json
import ipyplot

## import csv from nederlandsesoorten.nl
When browsing the species registry there is a download option to get the csv export of the currently. This notebook uses this export. As an example: You can download the list of [invasive insects (flies & musquitos](https://www.nederlandsesoorten.nl/linnaeus_ng/app/views/search/nsr_search_extended.php?group_id=145015&author_id=&group=Diptera++-+Vliegen+en+muggen+%5Borde%5D&author=&presence%5B2%5D=on&presence%5B6%5D=on&presence%5B3%5D=on&presence%5B5%5D=on&presence%5B4%5D=on&sort=name-valid). Click on "Exporteer als CSV" and upload the download file through upload form below


In [27]:
uploaded = files.upload()
for file in uploaded.keys():
  df2 = pd.read_csv(io.BytesIO(uploaded[file]),sep="\t",skiprows=1)
# Dataset is now stored in a Pandas Dataframe

Saving Export-20220329-125627.csv to Export-20220329-125627.csv


## remove the authornames from the taxon names

In [23]:
values = []
for species in df2["wetenschappelijke naam"].tolist():
  values.append("\""+" ".join(species.split()[:2])+"\"")

## Split the list of taxon name in managable chunks for further processing

In [17]:
chunks = [values[x:x+30] for x in range(0, len(values), 30)]

## Identify missing Wikipedia articles for specieslist uploaded

In [18]:

missing_list = []

for values in chunks: 
  query = """SELECT * WHERE {
              VALUES ?taxonLabel { 
                """
  query += " ".join(values) 
  query += """     }
    ?taxon wdt:P225 ?taxonLabel .
    OPTIONAL {?commons schema:about ?taxon ;
                schema:isPartOf <https://commons.wikimedia.org/> .}
    OPTIONAL {?taxon wdt:P3151 ?inaturalist .}
    OPTIONAL {?taxon wdt:P846 ?gbif .}
    OPTIONAL {?taxon wdt:P3405 ?nlsoortid .}
    OPTIONAL {?taxon wdt:P18 ?image .}
    FILTER NOT EXISTS {?article schema:about ?taxon ;
                      schema:isPartOf <https://en.wikipedia.org/>}
    BIND (URI(CONCAT("https://andrawaag.github.io/tarsier/jupyter_link.html?name=", ?taxonLabel)) as ?tarsier)
  }
  """
  missing_list.append(wdi_core.WDFunctionsEngine.execute_sparql_query(query, as_dataframe=True))

## Identify missing taxa in Wikidata

In [19]:
missing_wikidata_list = []
for values in chunks:
  query = """SELECT * WHERE {
              VALUES ?taxonLabel { 
                """
  query += " ".join(values) 
  query += """     }
    FILTER NOT EXISTS {?taxon wdt:P225 ?taxonLabel .}
  }
  """
  missing_wikidata_list.append(wdi_core.WDFunctionsEngine.execute_sparql_query(query, as_dataframe=True))
missing_articles = pd.concat(missing_list)

## Find references for the species under scrutiny

In [20]:
import json
references_dict = dict()
for index, row in missing_articles.iterrows():
  if not pd.isnull(row["gbif"]):
    references = json.loads(requests.get(f"https://api.gbif.org/v1/species/{row['gbif']}/references").text)
    for reference in references["results"]:
      if row['gbif'] not in references_dict.keys():
        references_dict[row['gbif']] = []
      references_dict[row['gbif']].append(reference["citation"])
    if row['gbif'] in references_dict.keys():
      missing_articles.at[index,"gbif_references"] = "# ".join(references_dict[row['gbif']])

## Find reusable images through GBIF


In [21]:
images = dict()
for index, row in missing_articles.iterrows():
  if not pd.isnull(row["gbif"]):
    gbifid = row['gbif']

    if gbifid not in images.keys():
      images[gbifid] = dict()
      
    datasets = json.loads(requests.get("https://api.gbif.org/v1/occurrence/counts/datasets?taxonKey="+gbifid).text)
    for datasetKey in datasets.keys():
      gbifdatasets = json.loads(requests.get("https://api.gbif.org/v1/occurrence/search?taxonKey="+gbifid+"&license=CC0_1_0&mediaType=StillImage&datasetKey="+datasetKey).text)
      if gbifdatasets["count"] >0:
        for image_collection in gbifdatasets["results"]:
          for media in image_collection["media"]:
            hostingorg = json.loads(requests.get("https://api.gbif.org/v1/organization/"+image_collection["hostingOrganizationKey"]).text)['title']
            if "license" in media.keys():
              if hostingorg not in images[gbifid].keys():
                images[gbifid][hostingorg] = dict()
              images[gbifid][hostingorg][media["identifier"]] = media["license"]
      gbifdatasets = json.loads(requests.get("https://api.gbif.org/v1/occurrence/search?taxonKey="+gbifid+"&license=CC_BY_4.0&mediaType=StillImage&datasetKey="+datasetKey).text)
      if gbifdatasets["count"] >0:
        for image_collection in gbifdatasets["results"]:
          for media in image_collection["media"]:
            hostingorg = json.loads(requests.get("https://api.gbif.org/v1/organization/"+image_collection["hostingOrganizationKey"]).text)['title']
            if "license" in media.keys():
              if hostingorg not in images[gbifid].keys():
                images[gbifid][hostingorg] = dict()
              images[gbifid][hostingorg][media["identifier"]] = media["license"]

for gbifid in images.keys():
  for collection in images[gbifid].keys():
    temp_images = []
    temp_labels = []
    for url in images[gbifid][collection].keys():
      temp_images.append(url)
      temp_labels.append(images[gbifid][collection][url])
    images_df = pd.DataFrame()
    images_df['images'] = temp_images
    images_df['labels'] = temp_labels
    print(str(missing_articles[missing_articles["gbif"]==gbifid]["taxonLabel"]))
    ipyplot.plot_images(images_df['images'], labels=images_df['labels'], max_images=10, )

0    Dohrniphora cornuta
Name: taxonLabel, dtype: object


0    Dohrniphora cornuta
Name: taxonLabel, dtype: object


1    Ceratitis cosyra
Name: taxonLabel, dtype: object


2    Atypophthalmus umbratus
Name: taxonLabel, dtype: object


3    Horidiplosis ficifolii
Name: taxonLabel, dtype: object


6    Aedes flavopictus
Name: taxonLabel, dtype: object


7    Dacus siliqualactis
Name: taxonLabel, dtype: object


0    Merodon avidus
Name: taxonLabel, dtype: object


0    Merodon avidus
Name: taxonLabel, dtype: object


0    Merodon avidus
Name: taxonLabel, dtype: object


1    Pogonosoma maroccanum
Name: taxonLabel, dtype: object


1    Pogonosoma maroccanum
Name: taxonLabel, dtype: object


2    Obolodiplosis robiniae
Name: taxonLabel, dtype: object


2    Obolodiplosis robiniae
Name: taxonLabel, dtype: object


2    Obolodiplosis robiniae
Name: taxonLabel, dtype: object


2    Obolodiplosis robiniae
Name: taxonLabel, dtype: object


3    Lamprolonchaea smaragdi
Name: taxonLabel, dtype: object


3    Lamprolonchaea smaragdi
Name: taxonLabel, dtype: object


3    Lamprolonchaea smaragdi
Name: taxonLabel, dtype: object


4    Leia arsona
Name: taxonLabel, dtype: object


5    Telostylinus lineolatus
6    Telostylinus lineolatus
Name: taxonLabel, dtype: object


5    Telostylinus lineolatus
6    Telostylinus lineolatus
Name: taxonLabel, dtype: object


5    Telostylinus lineolatus
6    Telostylinus lineolatus
Name: taxonLabel, dtype: object


7    Merodon cinereus
Name: taxonLabel, dtype: object


10    Megaselia gregaria
Name: taxonLabel, dtype: object


15    Proceroplatus trinidadensis
Name: taxonLabel, dtype: object


16    Sciophila fractinervis
Name: taxonLabel, dtype: object


## Download results

In [22]:
pd.DataFrame.from_dict(images).to_csv('gbif_images.csv')
files.download('gbif_images.csv')
result = pd.concat(missing_wikidata_list)
result.to_csv('missing_in_wikidata.csv')
files.download('missing_in_wikidata.csv')
missing_articles.to_csv('missing_in_wikipedia.csv')
files.download('missing_in_wikipedia.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>