# Convolutional Neural Network for identifying the 3 most commonly mistaken species of gulls in Poland - *Larus argentatus*, *Larus cachinnans* and *Larus michahellis*

In [65]:
# Import Tensorflow 2.0
%tensorflow_version 2.x
import tensorflow as tf 

import matplotlib.pyplot as plt
import numpy as np
import random
import re
from tqdm import tqdm
from bs4 import BeautifulSoup as bs
import requests
import collections

# This notebook requires a GPU runtime. If the check below fails, switch via
#   Runtime > Change Runtime Type > GPU
assert tf.config.list_physical_devices('GPU')

Download images from the [Gull Research website](http://www.gull-research.org/)

In [70]:
# Index pages to crawl, one per sub-directory hg1cy ... hg5cy of the site.
urls = [
    'http://www.gull-research.org/hg/hg1cy/',
    'http://www.gull-research.org/hg/hg2cy/',
    'http://www.gull-research.org/hg/hg3cy/',
    'http://www.gull-research.org/hg/hg4cy/',
    'http://www.gull-research.org/hg/hg5cy/',
]

# Collected image URLs grouped by key; a missing key starts as an empty list.
image_urls = collections.defaultdict(list)

def list_files(url, ext='html'):
    """Return the addresses of the links on page ``url`` that end with ``ext``.

    Args:
        url: Address of the page to fetch. Every matching href is appended
            to it verbatim, so it is expected to end with ``/``.
        ext: Suffix an href must end with to be included.

    Returns:
        A list of ``url + href`` strings, in document order.
    """
    page = requests.get(url).text
    soup = bs(page, 'html.parser')
    hrefs = (node.get('href') for node in soup.find_all('a'))
    # Anchors without an href attribute yield None here; skip them instead of
    # failing on None.endswith.
    return [url + href for href in hrefs if href is not None and href.endswith(ext)]
  
def get_all_image_urls(site):
    """Collect the URLs of the JPG/PNG images embedded in the page at ``site``.

    Args:
        site: URL of the HTML page to scan.

    Returns:
        A list of image URLs in document order. Sources that already start
        with ``http://`` or ``https://`` are returned unchanged; every other
        source is resolved against ``site``. Images whose source does not end
        in ``/<name>.jpg`` or ``/<name>.png`` (lower or upper case extension)
        are skipped.
    """
    from urllib.parse import urljoin

    response = requests.get(site)

    # The file imports BeautifulSoup under the alias ``bs``; the unaliased
    # name is not defined.
    soup = bs(response.text, 'html.parser')

    image_urls = []
    for img in soup.find_all('img'):
        src = img.get('src')
        if src is None:
            # <img> tags without a src attribute carry nothing to download.
            continue
        if not re.search(r'/([\w_-]+[.](jpg|png|JPG|PNG))$', src):
            continue
        if not src.startswith(('http://', 'https://')):
            # Relative source: resolve it against the page it was found on.
            # urljoin handles '../', nested and root-relative paths alike.
            src = urljoin(site, src)
        image_urls.append(src)
    return image_urls

# Crawl every index page in ``urls`` and store the image URLs found on its
# linked pages under that index page's position in the list (0, 1, 2, ...).
for age, url in enumerate(urls):
    for site in list_files(url):
        urls_from_site = get_all_image_urls(site)
        # get_all_image_urls always returns a list, so no None check is
        # needed; skipping empty results avoids creating empty entries.
        if urls_from_site:
            image_urls[age].extend(urls_from_site)

print(image_urls)
print(image_urls[2])


defaultdict(<class 'list'>, {0: ['http://www.gull-research.org/hg/201701/00_38A4308.jpg', 'http://www.gull-research.org/hg/201701/00_38A4303.jpg', 'http://www.gull-research.org/hg/201701/00_38A4316.jpg', 'http://www.gull-research.org/hg/images01/00gr_38A8211.jpg', 'http://www.gull-research.org/hg/images01/00gr_38A8217.jpg', 'http://www.gull-research.org/hg/images01/00gr_38A8227.jpg', 'http://www.gull-research.org/hg/201802/hg_01CY10_w03_38A0742.jpg', 'http://www.gull-research.org/hg/201802/hg_01CY10_w03_38A3175.jpg', 'http://www.gull-research.org/hg/0a/w03117954_08cb6af46c_o.jpg', 'http://www.gull-research.org/hg/0d/48843638296_bd26cab781_o.jpg', 'http://www.gull-research.org/hg/0d/_38A2035.JPG', 'http://www.gull-research.org/hg/0a/w0g797_4dc8c858bf_o.jpg', 'http://www.gull-research.org/hg/0a/0gbarf9bd_o.jpg', 'http://www.gull-research.org/hg/0a/w0g420_f0e1863390_o.jpg', 'http://www.gull-research.org/hg/images21/b0-20141223-1.jpg', 'http://www.gull-research.org/hg/images21/b0-20141223-