# Making training sample
## Author(s): Brian Nhan Thien Chung (UCI NATURE research technician)
### Created on: Wednesday September 9, 2020 by Brian Nhan Thien Chung
### Last edited on: Sunday September 13, 2020 by Brian Nhan Thien Chung
The purpose of this notebook is to try to make a training sample based on the species distribution of each camera. A proof of concept is conducted using animal photos from Anteater Trail. After this proof of concept is shown to be successful, a function is written to automate this process. This function exports csv files of animal photos that are part of the training sample of a camera.

In [14]:
import pandas as pd

In [1]:
%autosave 15

Autosaving every 15 seconds


In [16]:
relabeledPhotos = pd.read_csv("re-labeled animal photos.csv")
relabeledPhotos.head()

Unnamed: 0,Location,LocCode,ImageNumber,ImageQuality,Species,NumIndividuals,Date,Time,datetime,Notes,Counter
0,Anteater_1,AT,IMG_0025-AT-2019-10-11-human,1.0,human,1.0,10/11/2019,14:25:00,2019-10-11 14:25:00,Orientation one:close to the road,1
1,Anteater_1,AT,IMG_0028-AT-2019-10-11-human,1.0,human,1.0,10/11/2019,14:55:00,2019-10-11 14:55:00,,2
2,Anteater_1,AT,IMG_0031-AT-2019-10-11-human,1.0,human,1.0,10/11/2019,18:24:00,2019-10-11 18:24:00,,3
3,Anteater_1,AT,IMG_0034-AT-2019-10-11-human,1.0,human,1.0,10/11/2019,18:28:00,2019-10-11 18:28:00,,4
4,Anteater_1,AT,IMG_0039-AT-2019-10-11-human,1.0,human,1.0,10/11/2019,18:37:00,2019-10-11 18:37:00,,5


In [17]:
species = relabeledPhotos.groupby("Species")["Species"].count()
species.index

Index(['bird', 'bird and human', 'bird and raccoon', 'coyote', 'dog and human',
       'domestic dog', 'horse', 'horse and human', 'human',
       'human + dog + rabbit', 'insect', 'lizard', 'mouse', 'opossum',
       'rabbit', 'rabbit and bird', 'rabbit and coyote', 'rabbit and human',
       'rabbit and unknown animal(s)', 'raccoon', 'rat', 'snake', 'squirrel',
       'unknown', 'vehicle', 'vehicle and human'],
      dtype='object', name='Species')

In [18]:
wildAnimalBool = relabeledPhotos["Species"].isin(["bird and raccoon", "rabbit and bird", "rabbit and coyote", "rabbit and unknown animal(s)", "human + dog + rabbit"])
relabeledPhotos.loc[wildAnimalBool, "Species"] = "wild animals"

animalAndHumanBool = relabeledPhotos["Species"].isin(["human + dog + rabbit", "rabbit and human", "bird and human"])
relabeledPhotos.loc[animalAndHumanBool, "Species"] = "animal and human"

relabeledPhotos = relabeledPhotos[relabeledPhotos["Species"] != "unknown"]

species = relabeledPhotos.groupby("Species")["Species"].count()
species.index

Index(['animal and human', 'bird', 'coyote', 'dog and human', 'domestic dog',
       'horse', 'horse and human', 'human', 'insect', 'lizard', 'mouse',
       'opossum', 'rabbit', 'raccoon', 'rat', 'snake', 'squirrel', 'vehicle',
       'vehicle and human', 'wild animals'],
      dtype='object', name='Species')

In [19]:
locCodeCounts = relabeledPhotos.groupby("LocCode")["LocCode"].count()
locCodeCounts

LocCode
AT    2914
BC      53
CB     138
CC       4
CH     265
CT    1642
EP      36
HB     693
MB    1433
MT    1410
RP     436
Name: LocCode, dtype: int64

## Section 1: proof of concept

In [20]:
anteater = relabeledPhotos[relabeledPhotos["LocCode"] == "AT"]

anteater.head()

Unnamed: 0,Location,LocCode,ImageNumber,ImageQuality,Species,NumIndividuals,Date,Time,datetime,Notes,Counter
0,Anteater_1,AT,IMG_0025-AT-2019-10-11-human,1.0,human,1.0,10/11/2019,14:25:00,2019-10-11 14:25:00,Orientation one:close to the road,1
1,Anteater_1,AT,IMG_0028-AT-2019-10-11-human,1.0,human,1.0,10/11/2019,14:55:00,2019-10-11 14:55:00,,2
2,Anteater_1,AT,IMG_0031-AT-2019-10-11-human,1.0,human,1.0,10/11/2019,18:24:00,2019-10-11 18:24:00,,3
3,Anteater_1,AT,IMG_0034-AT-2019-10-11-human,1.0,human,1.0,10/11/2019,18:28:00,2019-10-11 18:28:00,,4
4,Anteater_1,AT,IMG_0039-AT-2019-10-11-human,1.0,human,1.0,10/11/2019,18:37:00,2019-10-11 18:37:00,,5


In [21]:
anteaterSpeciesCounts = anteater.groupby("Species")["Species"].count()
anteaterSpeciesIndex = anteaterSpeciesCounts.index
print(anteaterSpeciesIndex)
for index in anteaterSpeciesIndex:
    if "human" in index:
        print(index)

Index(['animal and human', 'bird', 'coyote', 'dog and human', 'domestic dog',
       'human', 'insect', 'rabbit', 'raccoon', 'vehicle', 'wild animals'],
      dtype='object', name='Species')
animal and human
dog and human
human


In [22]:
print(anteaterSpeciesCounts)
print(anteaterSpeciesCounts["bird"])
anteaterTotal = anteater["Species"].count()
print(anteaterTotal)
print('\n')
anteaterSpeciesProportions = 100*anteaterSpeciesCounts/anteaterTotal
print(anteaterSpeciesProportions)

Species
animal and human      11
bird                  41
coyote               154
dog and human        110
domestic dog          16
human               1172
insect                 4
rabbit              1397
raccoon                2
vehicle                3
wild animals           4
Name: Species, dtype: int64
41
2914


Species
animal and human     0.377488
bird                 1.407001
coyote               5.284832
dog and human        3.774880
domestic dog         0.549073
human               40.219629
insect               0.137268
rabbit              47.940975
raccoon              0.068634
vehicle              0.102951
wild animals         0.137268
Name: Species, dtype: float64


In [23]:
sampleSize = int(0.10*anteaterTotal)
anteaterSample = anteater.sample(sampleSize)
print(anteaterSample)

        Location LocCode                           ImageNumber  ImageQuality  \
2167  Anteater_2      AT  IMG_0362-AT-2020-02-11-dog and human           1.0   
1591  Anteater_2      AT         IMG_1537-AT-2020-01-26-rabbit           1.0   
1953  Anteater_2      AT         IMG_2887-AT-2020-02-05-rabbit           1.0   
157   Anteater_2      AT         IMG_0915-AT-2019-10-21-coyote           2.0   
96    Anteater_1      AT          IMG_0522-AT-2019-10-16-human           1.0   
...          ...     ...                                   ...           ...   
2133  Anteater_2      AT         IMG_0249-AT-2020-02-10-rabbit           1.0   
108   Anteater_1      AT          IMG_0628-AT-2019-10-17-human           1.0   
803   Anteater_2      AT         IMG_1148-AT-2019-12-17-rabbit           3.0   
2767  Anteater_2      AT          IMG_2797-AT-2020-03-26-human           1.0   
195   Anteater_2      AT          IMG_1072-AT-2019-10-24-human           1.0   

            Species  NumIndividuals    

In [24]:
anteaterSampleSpeciesCounts = anteaterSample.groupby("Species")["Species"].count()
print(anteaterSampleSpeciesCounts)
anteaterSampleTotal = anteaterSample["Species"].count()
print(anteaterSampleTotal)
print('\n')
anteaterSampleSpeciesProportions = 100*anteaterSampleSpeciesCounts/anteaterSampleTotal
print(anteaterSpeciesProportions)

Species
coyote            19
dog and human     10
domestic dog       2
human            111
insect             1
rabbit           146
raccoon            1
vehicle            1
Name: Species, dtype: int64
291


Species
animal and human     0.377488
bird                 1.407001
coyote               5.284832
dog and human        3.774880
domestic dog         0.549073
human               40.219629
insect               0.137268
rabbit              47.940975
raccoon              0.068634
vehicle              0.102951
wild animals         0.137268
Name: Species, dtype: float64


## Section 2: Making training samples

In [32]:
def animalTrainingSample(cameraCode: "2 letter camera code", cameraName: str, trainingFraction: float) -> None:
    """Makes a csv file of photos from a camera to be included in a training set"""
    cameraPhotos = relabeledPhotos[relabeledPhotos["LocCode"] == cameraCode]
    cameraSpeciesCounts = cameraPhotos.groupby("Species")["Species"].count()
    cameraTotal = cameraPhotos["Species"].count()
    print("Species proportions for {0} out of {1} photos:".format(cameraName, cameraTotal))
    print(100*cameraSpeciesCounts/cameraTotal)
    print('\n')
    
    trainingSet = cameraPhotos.sample(frac = trainingFraction)
    trainingSetSpeciesCounts = trainingSet.groupby("Species")["Species"].count()
    trainingSetTotal = trainingSet["Species"].count()
    print("Species counts for the training set of {0} out of {1} training photos:".format(cameraName, trainingSetTotal))
    print(100*trainingSetSpeciesCounts/trainingSetTotal)
    print('\n')
    print('\n')
    title = "{} training photos - animals only.csv".format(cameraName)
    trainingSet.to_csv(title, index = False)
    return

In [33]:
# This is the fraction of animal photos that will be included in the training sample.
# Please change this fraction as necessary
trainingFractionPerCamera = 0.10

animalTrainingSample("RP", "Research Park", trainingFractionPerCamera)
animalTrainingSample("MT", "Historical Marsh Trail", trainingFractionPerCamera)
animalTrainingSample("MB", "MacArthur Bridge", trainingFractionPerCamera)
animalTrainingSample("CB", "Culver culvert", trainingFractionPerCamera)
animalTrainingSample("CH", "Concordia culvert", trainingFractionPerCamera)
animalTrainingSample("BC", "Bonita Canyon Bridge", trainingFractionPerCamera)

Species proportions for Research Park out of 436 photos:
Species
bird        12.155963
coyote      72.247706
human        3.669725
insect       0.458716
lizard       3.669725
mouse        0.458716
opossum      1.376147
rabbit       2.752294
raccoon      0.458716
rat          1.376147
snake        0.688073
squirrel     0.688073
Name: Species, dtype: float64


Species counts for the training set of Research Park out of 43 training photos:
Species
bird        11.627907
coyote      62.790698
human        4.651163
opossum      2.325581
rabbit       6.976744
raccoon      2.325581
rat          2.325581
snake        2.325581
squirrel     4.651163
Name: Species, dtype: float64




Species proportions for Historical Marsh Trail out of 1410 photos:
Species
bird             1.560284
coyote          15.815603
domestic dog     0.141844
human            1.205674
mouse            0.070922
opossum          0.283688
rabbit          74.255319
raccoon          5.390071
rat              0.496454
squirrel  