# Toy Image Training
**Objective**

Demonstrate a training in small scale. In particular:
- 100 observations.
- 10 species.
- Images, only.

## Imports

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from pathlib import Path
import tqdm
import pandas as pd

from src import load_names_csv, load_observations_csv, load_images_csv, load_image_observations_csv, get_names_pref_df, fetch_and_save_images

## Get Data

The data consists of a set of tables and images. The tables contain the image IDs, mappings to observations, and labels.

### Tables

In [3]:
names_df = load_names_csv()
names_pref_df = get_names_pref_df(names_df=names_df)

In [4]:
images_df = load_images_csv()
images_observations_df = load_image_observations_csv()

In [5]:
observations_df = load_observations_csv()

In [6]:
display(images_df.head())
display(images_observations_df.head())
display(observations_df.head())
names_pref_df.head()

Unnamed: 0,id,content_type,copyright_holder,license,ok_for_export,ok_for_ml
0,1,image/jpeg,Nathan Wilson,Creative Commons Wikipedia Compatible v3.0,1,1
1,2,image/jpeg,Nathan Wilson,Creative Commons Wikipedia Compatible v3.0,1,1
2,3,image/jpeg,Nathan Wilson,Creative Commons Wikipedia Compatible v3.0,1,1
3,4,image/jpeg,Nathan Wilson,Creative Commons Wikipedia Compatible v3.0,1,1
4,5,image/jpeg,Nathan Wilson,Creative Commons Wikipedia Compatible v3.0,1,1


Unnamed: 0,image_id,observation_id
0,1,1
1,2,2
2,3,3
3,4,4
4,5,5


Unnamed: 0,id,name_id,when,location_id,lat,long,alt,vote_cache,is_collection_location,thumb_image_id
0,1,2,2004-07-13,214.0,,,,1.92335,1,1.0
1,2,3,2004-07-17,53.0,,,,2.70604,1,2.0
2,3,4,2002-01-08,60.0,,,,2.49991,1,3.0
3,4,4,1996-01-15,5.0,,,,2.49991,1,4.0
4,5,5,2002-12-28,36.0,,,,1.66661,1,5.0


Unnamed: 0,id,text_name,author,deprecated,correct_spelling_id,synonym_id,rank,preferred_id
0,1,Fungi,Bartl.,0,,9996.0,14,1
1,2,Xylaria polymorpha group,J.D. Rogers,0,,8975.0,16,2
2,3,Xylaria magnoliae,J.D. Rogers,0,,,4,3
3,4,Xylaria hypoxylon group,J.D. Rogers,0,,3692.0,16,4
4,5,Xeromphalina,Kühner & Maire,0,,6577.0,9,5


In [7]:
obs_mask = observations_df.is_collection_location == 1

names_pref_observations = pd.merge(
    left=observations_df[obs_mask][["id", "name_id", "vote_cache"]],
    right=names_pref_df[["id", "rank", "preferred_id"]],
    how="inner",
    left_on="name_id",
    right_on="id",
    suffixes=("_obs", "_names")
)

In [8]:
n = 10
min_vote_cache = 1.5
mask = names_pref_observations.vote_cache >= min_vote_cache
preferred_ids_df = names_pref_observations[mask].groupby("preferred_id").size().reset_index(name="Count").sort_values("Count", ascending=False).iloc[:10]
display(preferred_ids_df)
preferred_ids = preferred_ids_df["preferred_id"].values

Unnamed: 0,preferred_id,Count
6181,22603,4394
5782,20650,2998
8512,31029,2452
104,147,2160
613,864,2082
8513,31030,1875
195,273,1686
280,388,1665
10058,44467,1534
573,815,1531


In [9]:
# Test sampling
n_samples = 10
test_min_vote_cache = 2.5

mask = (names_pref_observations.preferred_id.isin(preferred_ids)) & (names_pref_observations.vote_cache >= test_min_vote_cache)
test_names_pref_observations = names_pref_observations[mask].sort_values("vote_cache", ascending=False).groupby("preferred_id").head(n_samples)
test_names_pref_observations

Unnamed: 0,id_obs,name_id,vote_cache,id_names,rank,preferred_id
50056,9886,20650,2.88427,20650,9,20650
170292,26831,44467,2.86955,44467,10,44467
45355,110578,147,2.86818,147,9,147
78462,12364,273,2.85358,273,9,273
24000,30458,22603,2.84792,22603,9,22603
...,...,...,...,...,...,...
215070,341108,31030,2.71382,31030,11,31030
9938,337451,31029,2.71281,31029,11,31029
9888,332629,31029,2.71011,31029,11,31029
214441,301731,31030,2.69521,31030,11,31030


In [10]:
test_names_pref_observations.name_id.unique()

array([20650, 44467,   147,   273, 22603, 31030,   388,   815,   864,
         209, 31029])

In [11]:
# Training sampling
n_samples = 100
train_min_vote_cache = 1.5

mask = (names_pref_observations.preferred_id.isin(preferred_ids)) & (names_pref_observations.vote_cache >= train_min_vote_cache) & (~names_pref_observations.id_obs.isin(test_names_pref_observations.id_obs))
train_names_pref_observations = names_pref_observations[mask].sort_values("vote_cache", ascending=False).groupby("preferred_id").head(n=n_samples)
train_names_pref_observations

Unnamed: 0,id_obs,name_id,vote_cache,id_names,rank,preferred_id
50446,49303,20650,2.78995,20650,9,20650
50321,28869,20650,2.78144,20650,9,20650
53885,425105,20650,2.76536,20650,9,20650
53048,330151,20650,2.76093,20650,9,20650
52287,238160,20650,2.75860,20650,9,20650
...,...,...,...,...,...,...
212135,131836,31030,2.54112,31030,11,31030
212928,203602,31030,2.54076,31030,11,31030
216147,400894,31030,2.54052,31030,11,31030
213765,264669,31030,2.54029,31030,11,31030


In [12]:
train_names_pref_observations.preferred_id.unique()

array([20650,   864, 22603,   273, 44467,   147,   388,   815, 31030,
       31029])

In [13]:
mask = (images_observations_df.observation_id.isin(train_names_pref_observations.id_obs)) | (images_observations_df.observation_id.isin(test_names_pref_observations.id_obs))
image_ids = images_observations_df[mask].image_id.values
img_mask = (images_df["id"].isin(image_ids)) & (images_df.ok_for_export == 1)
image_ids = images_df[img_mask]["id"].values

In [14]:
len(image_ids)

3873

### Images

In [15]:
DATA_FOLDER = Path().absolute() / "data"

In [16]:
errors = await fetch_and_save_images(image_ids=image_ids, size=320, image_folder=DATA_FOLDER)

[autoreload of src.utils failed: Traceback (most recent call last):
  File "/home/alan/miniconda3/envs/mo/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 245, in check
    superreload(m, reload, self.old_objects)
  File "/home/alan/miniconda3/envs/mo/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 394, in superreload
    module = reload(module)
  File "/home/alan/miniconda3/envs/mo/lib/python3.7/imp.py", line 314, in reload
    return importlib.reload(module)
  File "/home/alan/miniconda3/envs/mo/lib/python3.7/importlib/__init__.py", line 169, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 630, in _exec
  File "<frozen importlib._bootstrap_external>", line 728, in exec_module
  File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
  File "/home/alan/repos/mushroom_observer/notebooks/src/utils.py", line 6, in <module>
    from typing import List, Tuple, Dict, Union, Literal
ImportEr

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,