# Bird species exploration

In [1]:
import pandas as pd
import csv

## Importing and formatting birds directory

In [2]:
# Update the file path as needed
filepath_1 = '/Users/tristanspilker/code/reklips/Birds/bird_classes.txt'

# Read CSV without a header
birds_df = pd.read_csv(filepath_1, delimiter=',', header=None)

# Add a new column 'species_no' equal to the original index, starting at 1
birds_df['species_no'] = birds_df.index + 1

# Ensure that the columns are treated as strings before using .str accessor
birds_df[0] = birds_df[0].astype(str).str.strip()  # Assuming '0' is the scientific_name column
birds_df[1] = birds_df[1].astype(str).str.strip()  # Assuming '1' is the species_id column

birds_df.columns = ['scientific_name', 'species_id', 'species_no']

# Set the index explicitly starting at 1
birds_df.index = birds_df['species_no']

birds_df

Unnamed: 0_level_0,scientific_name,species_id,species_no
species_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,lithoptila abdounensis,/m/0100hjcf,1
2,kurrartapu johnnguyeni,/m/010hmtwz,2
3,zosterops kirki,/m/011l7vvz,3
4,emberiza goslingi,/m/011py2_8,4
5,bambolinetta lignitifila,/m/01264yxh,5
...,...,...,...
10978,aquila chrysaetos,/m/0m41v,10978
10979,mallard,/m/01hjj1,10979
10980,bald eagle,/m/01dx8,10980
10981,struthio camelus,/m/05n4y,10981


In [3]:
# birds_df.to_csv('birds_df.csv', index=True)

In [4]:
#! pwd

## Importing and formatting image links directories

### 1st file

In [5]:
%%time

# Specify the file path
filepath_2 = '/Users/tristanspilker/code/reklips/birds/bird_urls1.txt'

# Create a list to store the transformed data
transformed_data = []

# Open the file and read it line by line
with open(filepath_2, 'r') as file:
    reader = csv.reader(file, delimiter=',')
    
    # Iterate through each row in the file
    for index, row in enumerate(reader, start=1):  # Use enumerate to get both the row and its index
        # Skip rows with fewer than 2 fields (ID and at least one URL)
        if len(row) < 2:
            continue
        
        # Extract the ID
        bird_id = row[0]
        
        # Iterate through the URLs (starting from index 1)
        for url in row[1:]:
            # Append the transformed data to the list
            transformed_data.append({
                'species_id': bird_id,
                'image_url': url,
                'image_id': index  # New column tracking the original index
            })

# Create a DataFrame from the transformed data with index starting from 1
bird_images_1_df = pd.DataFrame(transformed_data, index=range(1, len(transformed_data) + 1))

# Strip whitespaces from the 'image_url' column
bird_images_1_df['image_url'] = bird_images_1_df['image_url'].str.strip()

# Set the name of the index column to 'index'
bird_images_1_df.index.name = 'index'

# Display the transformed DataFrame
bird_images_1_df.head(10)

CPU times: user 11.5 s, sys: 1.24 s, total: 12.8 s
Wall time: 13 s


Unnamed: 0_level_0,species_id,image_url,image_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,/m/02w6csr,http://www.taenos.com/img/ITIS/Dacnis-lineata-...,1
2,/m/02w4t3s,http://i.ytimg.com/vi/58c2xiuei-k/hqdefault.jpg,2
3,/m/02w4t3s,http://i4.ytimg.com/vi/58c2xiuei-k/hqdefault.jpg,2
4,/m/02w4t3s,https://i.ytimg.com/vi/58c2xiuei-k/hqdefault.jpg,2
5,/m/02w6w6c,http://farm9.static.flickr.com/8508/8585961853...,3
6,/m/0108t9qd,http://farm4.static.flickr.com/3246/2338256799...,4
7,/m/0108t9qd,https://c2.staticflickr.com/4/3246/2338256799_...,4
8,/m/0108t9qd,https://c4.staticflickr.com/4/3246/2338256799_...,4
9,/m/0108t9qd,https://c4.staticflickr.com/4/3246/2338256799_...,4
10,/m/0202dn,http://www.zooeco.com/Im3/Spinus%20spinus3.jpg,5


In [6]:
bird_images_1_df.shape

(5896931, 3)

In [7]:
bird_images_1_df['image_id'].max() + 1

2775486

### 2nd file

In [8]:
%%time

# Specify the file path
filepath_3 = '/Users/tristanspilker/code/reklips/birds/bird_urls2.txt'

# Create a list to store the transformed data
transformed_data_2 = []

# Find the starting image_id
start_image_id = bird_images_1_df['image_id'].max() + 1 if 'image_id' in bird_images_1_df else 1

# Open the file and read it line by line
with open(filepath_3, 'r') as file:
    reader = csv.reader(file, delimiter=',')
    
    # Iterate through each row in the file
    for index, row in enumerate(reader, start=start_image_id):  # Start image_id tracking at the specified value
        # Skip rows with fewer than 2 fields (ID and at least one URL)
        if len(row) < 2:
            continue
        
        # Extract the ID
        bird_id = row[0]
        
        # Iterate through the URLs (starting from index 1)
        for url in row[1:]:
            # Append the transformed data to the list
            transformed_data_2.append({
                'species_id': bird_id,
                'image_url': url,
                'image_id': index  # New column tracking the original index
            })

# Create a DataFrame from the transformed data with index starting from 1
bird_images_2_df = pd.DataFrame(transformed_data_2, index=range(1, len(transformed_data_2) + 1))

# Strip whitespaces from the 'image_url' column
bird_images_2_df['image_url'] = bird_images_2_df['image_url'].str.strip()

# Set the name of the index column to 'index'
bird_images_2_df.index.name = 'index'

# Display the transformed DataFrame
bird_images_2_df.head(10)

CPU times: user 8.98 s, sys: 1.47 s, total: 10.5 s
Wall time: 10.7 s


Unnamed: 0_level_0,species_id,image_url,image_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,farm4.staticflickr.com/3170/3026178962_d7d94df...,https://farm4.staticflickr.com/3170/3026178962...,2775486
2,/m/02w50qs,http://farm4.staticflickr.com/3170/3026178962_...,2775487
3,/m/02w50qs,https://farm4.staticflickr.com/3170/3026178962...,2775487
4,/m/0c8k9l,http://www.birds-of-north-america.net/images/M...,2775488
5,/m/0279n9b,http://ibc.lynxeds.com/files/pictures/DSCF4623...,2775489
6,/m/02txxw,http://ibc.lynxeds.com/files/pictures/DSCF4623...,2775490
7,/m/0279d9r,http://ibc.lynxeds.com/files/pictures/DSCF4623...,2775491
8,/m/0f0zs6,http://ibc.lynxeds.com/files/pictures/DSCF4623...,2775492
9,/m/026x134,http://ibc.lynxeds.com/files/pictures/DSCF4623...,2775493
10,/m/0117sm6v,http://ibc.lynxeds.com/files/pictures/DSCF4623...,2775494


In [9]:
bird_images_2_df.shape

(4460889, 3)

In [10]:
%%time

bird_images_master_df = pd.concat([bird_images_1_df, bird_images_2_df], axis=0)

bird_images_master_df.head(10)

CPU times: user 240 ms, sys: 143 ms, total: 382 ms
Wall time: 383 ms


Unnamed: 0_level_0,species_id,image_url,image_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,/m/02w6csr,http://www.taenos.com/img/ITIS/Dacnis-lineata-...,1
2,/m/02w4t3s,http://i.ytimg.com/vi/58c2xiuei-k/hqdefault.jpg,2
3,/m/02w4t3s,http://i4.ytimg.com/vi/58c2xiuei-k/hqdefault.jpg,2
4,/m/02w4t3s,https://i.ytimg.com/vi/58c2xiuei-k/hqdefault.jpg,2
5,/m/02w6w6c,http://farm9.static.flickr.com/8508/8585961853...,3
6,/m/0108t9qd,http://farm4.static.flickr.com/3246/2338256799...,4
7,/m/0108t9qd,https://c2.staticflickr.com/4/3246/2338256799_...,4
8,/m/0108t9qd,https://c4.staticflickr.com/4/3246/2338256799_...,4
9,/m/0108t9qd,https://c4.staticflickr.com/4/3246/2338256799_...,4
10,/m/0202dn,http://www.zooeco.com/Im3/Spinus%20spinus3.jpg,5


In [11]:
bird_images_master_df.shape

(10357820, 3)

### Plausability check | collecting some descriptive statistics about the data frame

Number of links per image:

In [12]:
%%time

bird_images_master_df['image_id'].value_counts()

CPU times: user 914 ms, sys: 248 ms, total: 1.16 s
Wall time: 1.27 s


image_id
732763     420
3124904    400
1442064    400
3124905    400
2721885    400
          ... 
11           1
10           1
9            1
6            1
3665413      1
Name: count, Length: 4887217, dtype: int64

Number of different images:

In [13]:
diff_img = bird_images_master_df['image_id'].nunique()
diff_img

4887217

Number of species:

In [14]:
num_spec = bird_images_master_df['species_id'].nunique()
num_spec

10983

Average number of images / species

In [15]:
img_p_spec = diff_img / num_spec
img_p_spec

444.980151142675

In [16]:
# bird_images_master_df.to_csv('bird_images_master_df.csv', index=True)

In [17]:
# ! pwd

## Joining all three tables

In [18]:
%%time

bird_master_df = pd.merge(bird_images_master_df, birds_df, on='species_id', how='inner')

columns_ = ['species_id']

bird_master_df = bird_master_df.drop(columns=columns_)

bird_master_df

CPU times: user 6.57 s, sys: 547 ms, total: 7.12 s
Wall time: 7.19 s


Unnamed: 0,image_url,image_id,scientific_name,species_no
0,http://www.taenos.com/img/ITIS/Dacnis-lineata-...,1,dacnis lineata,7319
1,http://41.media.tumblr.com/tumblr_m81kchMiQn1r...,8100,dacnis lineata,7319
2,http://madarbarat-lexikon.hu/images/dacnis%20c...,17844,dacnis lineata,7319
3,http://t3.gstatic.com/images?q=tbn:ANd9GcSG87H...,20557,dacnis lineata,7319
4,https://c2.staticflickr.com/4/3836/14737184263...,22123,dacnis lineata,7319
...,...,...,...,...
10357814,http://c5.statcounter.com/counter.php?sc_proje...,3371585,scolopax rosenbergi,181
10357815,https://c.statcounter.com/10601132/0/b39882cb/0/,3371585,scolopax rosenbergi,181
10357816,http://i74.servimg.com/u/f74/15/84/98/82/final...,3767414,scolopax rosenbergi,181
10357817,http://ecx.images-amazon.com/images/I/513ReDIt...,4612387,scolopax rosenbergi,181


In [26]:
bird_master_df.to_csv('bird_master_df.csv', index=True)

In [27]:
! pwd

/Users/tristanspilker/code/brunothormaehlen/birdies/notebooks


### Plausibility check

Number of links per image:

In [19]:
%%time

bird_master_df['image_id'].value_counts()

CPU times: user 668 ms, sys: 237 ms, total: 906 ms
Wall time: 915 ms


image_id
732763     420
1442064    400
1442070    400
1442067    400
1442066    400
          ... 
3315523      1
3345747      1
3350989      1
3445866      1
3496841      1
Name: count, Length: 4887216, dtype: int64

Number of different images:

In [20]:
diff_img = bird_images_master_df['image_id'].nunique()
diff_img

4887217

Number of species:

In [21]:
%%time

num_spec = bird_master_df['species_no'].nunique()
num_spec

CPU times: user 63.5 ms, sys: 13.1 ms, total: 76.5 ms
Wall time: 78.3 ms


10982

Average number of images / species

In [22]:
img_p_spec = diff_img / num_spec
img_p_spec

445.0206701875797

## Looping over joined data frame to retrieve images for selected birds (MVP #1)

In [23]:
# Importing list of selected bird provided by data analytics team

filepath_4 = '/Users/tristanspilker/code/reklips/Birds/selected_birds.csv'

selec_birds_df = pd.read_csv(filepath_4, delimiter=',')

# Drop the first row in place
selec_birds_df.drop(0, inplace=True)

# Rename the second column to 'scientific_name'
selec_birds_df = selec_birds_df.rename(columns={'Unnamed: 1': 'scientific_name'})

# Display the modified DataFrame
selec_birds_df

Unnamed: 0,Selected Birds - Berlin Birds,scientific_name
1,"acanthis cabaret,/m/02p1n8h",acanthis cabaret
2,"gavia arctica,/m/019277",gavia arctica
3,"melanitta fusca,/m/01xqg0",melanitta fusca
4,"aythya nyroca,/m/01srm0",aythya nyroca
5,"melanitta nigra,/m/01wz_x",melanitta nigra
6,"podiceps nigricollis,/m/01cybj",podiceps nigricollis
7,"gavia stellata,/m/019299",gavia stellata
8,"larus cachinnans,/m/0211f4",larus cachinnans
9,"poecile montanus,/m/01kvsq",poecile montanus
10,"geronticus eremita,/m/01jfd5",geronticus eremita


In [24]:
# Introduce species number based on birds input and dropping all redundant information

selec_birds_df  = pd.merge(selec_birds_df, birds_df, on='scientific_name', how='inner')

columns_tbd = ['Selected Birds - Berlin Birds', 'species_id', 'scientific_name']

selec_birds_df = selec_birds_df.drop(columns=columns_tbd)

selec_birds_df

Unnamed: 0,species_no
0,9369
1,10474
2,10477
3,10491
4,10492
5,10502
6,10512
7,10530
8,10535
9,10588


# Function (!) that retrieves images, drops them into folders, labels the files, etc.

In [25]:
### Viktor's initial suggestion


def test(df_images, species, mydir=False):
    size=(256, 256)
    number = # to be defined
    if mydir=False:
        mydir = os.getcwd()
        im_dir = f'{mydir}/{species}_{number}.png'

    df_ = df_images[df_images == species]
    url_im = df_.image_url
    response = requests.get(url_im)
    if response.status_code = 200:
        im = Image.open(BytesIO(response.content))
        im.thumbnail(size,Image.ANTIALIAS)
        im.save(im_dir)
    else:

SyntaxError: invalid syntax (1234571744.py, line 6)

In [None]:
#### Starting point


### Function that takes species_no as input

import requests
import PIL import Image
from io import BytesIO

def image_retrieval(bird_master_df, selection_df, mydir=False):
    
    size = (256, 256)
    
    # Create one folder per species_no and label corresponding to temp_df['species_no'], if it doesn't exist yet
    if mydir = False:
        mydir = os.getcw()
        # number should be 4 digits per species_no, start from 0001 and count up
        im_dir = f'{mydir}/{species_no}_{number}.png'

    ## Temporary merged data frame based on filtered version of bird_master_df
    # Should contain every row from bird_master_df that contains species_no of selection_df['species_no'] 
    temp_df = bird_master_df[bird_master_df['species_no'] == selection_df['species_no']]
    
    # Locate download links
    url_im = temp_df['image_url']
    
    # Attempt download via link ('image_url') in resulting data frame
    # Add image_id to a list if download was successful (i. e. response.status_code = 200)
    # If image_id in list, skip, otherwise, try next link
    if response.status_code = 200:
        im = Image.open(BytesIO(response.content))
        
        # Resize image using im.thumbnail(size, Image.ANTIALIAS) with 256 as maximum dimension while keeping original image proportions
        im.thumbnail(size,Image.ANTIALIAS)
        
        # Save resulting image as species_no_0000 via im.save in corresponding folder created above
        im.save(im_dir)
    else:

In [None]:
#### Current best version

import os
import requests
from PIL import Image
from io import BytesIO

def image_retrieval(bird_master_df, selection_df, mydir=None):
    
    # Set default size
    size = (256, 256)
    
    # Initialize image numbering variable
    number = 1

    # Temporary merged data frame based on filtered version of bird_master_df
    temp_df = bird_master_df[bird_master_df['species_no'] == selection_df['species_no'].iloc[0]]
    
    # Create one folder per species_no and label corresponding to temp_df['species_no'], if it doesn't exist yet
    if mydir is None:
        mydir = os.getcwd()

    # Iterate over rows in temp_df
    for index, row in temp_df.iterrows():
        
        # Locate download links
        url_im = row['image_url']

        # Attempt download via link ('image_url') from temp_df
        response = requests.get(url_im)

        # Check if the download was successful
        if response.status_code == 200:
            im = Image.open(BytesIO(response.content))
            
            # Resize image using im.thumbnail(size, Image.ANTIALIAS) with 256 as the maximum dimension while keeping original image proportions
            im.thumbnail(size, Image.ANTIALIAS)
            
            # Create folder if it doesn't exist
            species_folder = os.path.join(mydir, f"{row['species_no']}")
            os.makedirs(species_folder, exist_ok=True)

            # Save resulting image via im.save as [species_no]_[xxxx].png in the corresponding folder created above
            im_dir = os.path.join(species_folder, f"{row['species_no']}_{number:04d}.png")
            
            # Check if the file already exists, if not, save the image
            if not os.path.exists(im_dir):
                im.save(im_dir)
                print(f"Saved: {im_dir}")
            else:
                print(f"Skipped (already exists): {im_dir}")
            
            # Increment the numbering variable for the next image
            number += 1