In [2]:
import os
import sys
from io import BytesIO
from re import finditer, sub
from urllib.parse import urlsplit

import requests
from bs4 import BeautifulSoup
from PIL import Image

In [3]:
def download_file(url, dest_dir, filename=None, timeout=30):
    """Download the image at ``url`` and save it inside ``dest_dir``.

    Images that carry an alpha channel ('RGBA' or 'LA') are flattened onto a
    solid background with ``img_alpha_to_colour`` before being written.

    Args:
        url: Address of the image to fetch.
        dest_dir: Directory the image is written to; created if it is missing.
        filename: Name of the output file. When ``None``, the last segment of
            the URL path is used.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    if filename is None:
        filename = urlsplit(url).path.split("/")[-1]

    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of handing an HTML error page to the image decoder.
    response.raise_for_status()
    image = Image.open(BytesIO(response.content))

    if image.mode in ('RGBA', 'LA'):
        # Normalise to RGBA first so the alpha band is always the fourth one,
        # which is what the flattening helper indexes.
        image = img_alpha_to_colour(image.convert('RGBA'))

    os.makedirs(dest_dir, exist_ok=True)
    image.save(os.path.join(dest_dir, filename))

In [4]:
def img_alpha_to_colour(image, color=(255, 255, 255)):
    """Flatten an image with an alpha channel onto a solid RGB background.

    Args:
        image: PIL image to flatten.
        color: RGB tuple used as the background colour (white by default).

    Returns:
        A new 'RGB' image of the same size. If ``image`` has no band named
        'A', it is simply converted to 'RGB'.
    """
    image.load()  # make sure the pixel data is read before split()

    bands = image.getbands()
    if 'A' not in bands:
        # Nothing to composite; indexing a fixed band here would either fail
        # or use a colour band as the mask.
        return image.convert('RGB')

    background = Image.new('RGB', image.size, color)
    # Look the alpha band up by name: it is index 3 for 'RGBA' but 1 for 'LA'.
    alpha = image.split()[bands.index('A')]
    background.paste(image, mask=alpha)
    return background

In [5]:
def get_file_urls(soup):
    """Collect the link targets of every element with class ``fileThumb``.

    Protocol-relative links (``//host/path``) are given an explicit
    ``https:`` scheme; links that already have a scheme are returned as is.

    Args:
        soup: Parsed document exposing ``find_all(attrs=...)``.

    Returns:
        List of URL strings in document order. Elements without an ``href``
        attribute are skipped.
    """
    file_urls = []

    for anchor in soup.find_all(attrs={'class': 'fileThumb'}):
        href = anchor.get('href')
        if not href:
            continue
        # Only the leading "//" marks a missing scheme; later "//" sequences
        # belong to the path and must not be rewritten.
        if href.startswith("//"):
            href = "https:" + href
        file_urls.append(href)
    return file_urls

In [6]:
def get_filenames(soup):
    """Extract one filename from every element with class ``fileText``.

    The filename is taken to be the second space-separated token of the
    element's text.

    Args:
        soup: Parsed document exposing ``find_all(attrs=...)``.

    Returns:
        List of filename strings, one per matching element, in document order.

    Raises:
        IndexError: If an element's text has fewer than two tokens.
    """
    # Append each name exactly once so the result stays aligned with the
    # list of file URLs gathered from the same page.
    return [
        element.get_text().split(" ")[1]
        for element in soup.find_all(attrs={'class': 'fileText'})
    ]

In [12]:
# Scraped images go into a "dataset_3" folder under the current working directory.
dest_base_dir = os.getcwd()
dest_dir = "/".join((dest_base_dir, "dataset_3"))

In [15]:
def main(url, dest_dir, timeout=30):
    """Scrape every image linked from the page at ``url`` into ``dest_dir``.

    The page is parsed with BeautifulSoup; ``get_file_urls`` and
    ``get_filenames`` then look for elements with the classes ``fileThumb``
    and ``fileText``, so the page must use that markup for anything to be
    downloaded.

    Args:
        url: Address of the page to scrape.
        dest_dir: Directory the downloaded images are saved to.
        timeout: Seconds to wait for the page request before giving up.

    Raises:
        requests.HTTPError: If the page request returns an error status.
    """
    print("┌─────────────────────────────────────────────────────────────────────┐")
    print("│           Image Scraping Tool built by Steven Karmaniolos           │")
    print("└─────────────────────────────────────────────────────────────────────┘")
    print("Downloading images from " + url + " ...")

    response = requests.get(url, timeout=timeout)
    # Do not parse an error page as if it were the requested listing.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    file_urls = get_file_urls(soup)
    filenames = get_filenames(soup)

    if len(file_urls) != len(filenames):
        # URLs and names come from different elements, so the counts can
        # diverge; say so instead of failing partway through the loop.
        print("Warning: found " + str(len(file_urls)) + " file URLs but "
              + str(len(filenames)) + " filenames; only matching pairs are downloaded.")

    for file_url, filename in zip(file_urls, filenames):
        print("Downloading file: " + filename)
        download_file(file_url, dest_dir, filename)

    print("┌─────────────────────────────────────────────────────────────────────┐")
    print("│             Scrape completed. Your files are now ready.             │")
    print("└─────────────────────────────────────────────────────────────────────┘")

In [16]:
# Show the resolved destination directory before running the scraper.
dest_dir

'/home/helpthx/Desktop/TCC-1/TCC-1-UnB/dataset_3'

In [20]:
# Run the scraper on a search-results page and save the images into dest_dir.
# NOTE(review): get_file_urls/get_filenames only match elements with the classes
# 'fileThumb' and 'fileText' — confirm this page uses that markup, otherwise
# nothing will be downloaded.
main("http://www.webcrawler.com/serp?q=Actinic+keratosis&sc=qAhiEuh410VRRoZs2CMIbjhWoH2tXmqsl6LlH24XMJnnxzjnbhIyINd4eIElt5x5UeKRSloPT7lo9EKKiDTp8qK_lhGq2Jjn2zpryot8Y0OPnA0rlZuvunjFrAY8DoFb7mOCNHf62Xn5aCWbip1D94L40s8dhrJLtRCWgF8YwO_vqYPvAcR8CZ48q7m8KBM0cWotmzvpOWc7r9Ne7OEJ4CjA1_sckr0OTlsOsYdPU39s9YkK4oD1bs3z_PT3XOweS1nxPvPfujoRivqhqaUYkJxFGstPGArWdVcpdQpjd4Lt8Fjj2jpsJ32g0PP9-GoFhyx-fcjLgSOSHdgVjoFMrx2BlPH25u5s6bOUz1GyO0J8T8FVabWCsqfjB3-PATNLoeJ2LU0GgPh4TwRTv5OMKeS5BADQkCU81ATNeku6hkcbuCmgoHdSlJrrDOf3Cxz7sGxgrMpyQNiRUdoCjJjhTpssk-yJKceNDjGY262HjLbMwDuYPjawtelE28t8NpEf1REAbPiLGEumqT2nW36oMoVPK2q9bSvyF1EBE0ejfRg3XYYfHBJjaM4mb7E35o49wU0R4qg59mt3D2tOuG07hEgn9Ft8AqIYaTBg8ht9BhFChl_qXXPsSR8HmbBdBmRK-gAJEnmNSK4l0hv8xvof1GOJrt6jW1absg4sLmYLT7At702l6f9cMxUTEhkmfjeMEEo9FfXdGV1rdVn9CJRC-iUy9HstWSCBnuSqaFRoe3b5oYBOfm0ERcL6dKs0ZDDe1oX57hgQXCjTx-NUZqCLcvLrIS6-_zff_WzUmADgUxIywpjr1lvM4-WhAADe3aaaWNZX8mPXGU-Xv5B67WO9F-UneoL0aOd7Lh6-pLq5RIgK18g7-dikx84v43MaVioylM-AKuu38Si0xugDXqzPcyOItlTleSKPnCMKxHiiPc6PdCwaX7fSCDwTY7FBcr_CgfKveaWZyzYwVDVnT00NCJyL8qefZzNozKDFX1aBSL6vrNrFFDtnTXQ7uMLsh1VFrmtBmPrpvMyZLE518PW162e8F-KlYjGYr8qmvdDK35DXWhokbdzZtwn2p1_LevmWcHnk1LMcD8goyvKYbFoE7V4X_eWvSRrOOx09yDGAeRkpu7IsHlXistVviTN_NqDPQ3-ZG1JO7N0epvNGQFO2Rq3_LBRWBLz8uRg2In1SnLGDm6Su5Uq-WZWxwLlTDKRUK93odxV40DixNmeRCmPvMgEyApUEpGxknX1V3dcZPUhDBFck9ke5tJgf-N_A3jUYOdjo7MqzhLTVsT1ottJ1qep80pdWNe9_Ov4SKA1GuJoY_W4SCM7rMRFxLdWe-Gl5cawsmT_cgjFp0dCrFkV8KxPxznj7PWjIqaCg1bpfaQMo4pnOITRtBEpR0_taW0X19SDFk2qOtSMUjqn_C3ueKhtBxdTwcDntML-gJ1riL9O6Tc8yNTUGnE3hAdm8C3q5NSecFy1tAAvcpVNiGymPvx3eNoXQyA_eNpO-3myEdG773saXdwQUEbV-FYMe0vAjEkOmw8-0tb7ycCi1kJKdLg-sGpN1GCdUeMnWmFdmAr4eY4VnSL0a3u31tH9e0H5tshjITSx8yQAOi5ZtCv58vuG2DsCdsZ40alOvsIcQJd5SWgovSNxsVt-azGIhsc7xm0AJplHQipLpkujw5QMlsv0hHGoCKd51mDRpMUs", dest_dir)

┌─────────────────────────────────────────────────────────────────────┐
│           Image Scraping Tool built by Steven Karmaniolos           │
└─────────────────────────────────────────────────────────────────────┘
Downloading images from http://www.webcrawler.com/serp?q=Actinic+keratosis&sc=qAhiEuh410VRRoZs2CMIbjhWoH2tXmqsl6LlH24XMJnnxzjnbhIyINd4eIElt5x5UeKRSloPT7lo9EKKiDTp8qK_lhGq2Jjn2zpryot8Y0OPnA0rlZuvunjFrAY8DoFb7mOCNHf62Xn5aCWbip1D94L40s8dhrJLtRCWgF8YwO_vqYPvAcR8CZ48q7m8KBM0cWotmzvpOWc7r9Ne7OEJ4CjA1_sckr0OTlsOsYdPU39s9YkK4oD1bs3z_PT3XOweS1nxPvPfujoRivqhqaUYkJxFGstPGArWdVcpdQpjd4Lt8Fjj2jpsJ32g0PP9-GoFhyx-fcjLgSOSHdgVjoFMrx2BlPH25u5s6bOUz1GyO0J8T8FVabWCsqfjB3-PATNLoeJ2LU0GgPh4TwRTv5OMKeS5BADQkCU81ATNeku6hkcbuCmgoHdSlJrrDOf3Cxz7sGxgrMpyQNiRUdoCjJjhTpssk-yJKceNDjGY262HjLbMwDuYPjawtelE28t8NpEf1REAbPiLGEumqT2nW36oMoVPK2q9bSvyF1EBE0ejfRg3XYYfHBJjaM4mb7E35o49wU0R4qg59mt3D2tOuG07hEgn9Ft8AqIYaTBg8ht9BhFChl_qXXPsSR8HmbBdBmRK-gAJEnmNSK4l0hv8xvof1GOJrt6jW1absg4sLmYLT7At702l6f9cMxUTEhkmfjeMEE

In [21]:
# NOTE(review): the line below is a shell command, not Python, so this cell
# fails with a SyntaxError as written. To run it from the notebook, prefix it
# with `!` and quote the URL (it contains `&`); otherwise remove the cell.
image-scraper http://www.webcrawler.com/serp?q=Actinic+keratosis&sc=qAhiEuh410VRRoZs2CMIbjhWoH2tXmqsl6LlH24XMJnnxzjnbhIyINd4eIElt5x5UeKRSloPT7lo9EKKiDTp8qK_lhGq2Jjn2zpryot8Y0OPnA0rlZuvunjFrAY8DoFb7mOCNHf62Xn5aCWbip1D94L40s8dhrJLtRCWgF8YwO_vqYPvAcR8CZ48q7m8KBM0cWotmzvpOWc7r9Ne7OEJ4CjA1_sckr0OTlsOsYdPU39s9YkK4oD1bs3z_PT3XOweS1nxPvPfujoRivqhqaUYkJxFGstPGArWdVcpdQpjd4Lt8Fjj2jpsJ32g0PP9-GoFhyx-fcjLgSOSHdgVjoFMrx2BlPH25u5s6bOUz1GyO0J8T8FVabWCsqfjB3-PATNLoeJ2LU0GgPh4TwRTv5OMKeS5BADQkCU81ATNeku6hkcbuCmgoHdSlJrrDOf3Cxz7sGxgrMpyQNiRUdoCjJjhTpssk-yJKceNDjGY262HjLbMwDuYPjawtelE28t8NpEf1REAbPiLGEumqT2nW36oMoVPK2q9bSvyF1EBE0ejfRg3XYYfHBJjaM4mb7E35o49wU0R4qg59mt3D2tOuG07hEgn9Ft8AqIYaTBg8ht9BhFChl_qXXPsSR8HmbBdBmRK-gAJEnmNSK4l0hv8xvof1GOJrt6jW1absg4sLmYLT7At702l6f9cMxUTEhkmfjeMEEo9FfXdGV1rdVn9CJRC-iUy9HstWSCBnuSqaFRoe3b5oYBOfm0ERcL6dKs0ZDDe1oX57hgQXCjTx-NUZqCLcvLrIS6-_zff_WzUmADgUxIywpjr1lvM4-WhAADe3aaaWNZX8mPXGU-Xv5B67WO9F-UneoL0aOd7Lh6-pLq5RIgK18g7-dikx84v43MaVioylM-AKuu38Si0xugDXqzPcyOItlTleSKPnCMKxHiiPc6PdCwaX7fSCDwTY7FBcr_CgfKveaWZyzYwVDVnT00NCJyL8qefZzNozKDFX1aBSL6vrNrFFDtnTXQ7uMLsh1VFrmtBmPrpvMyZLE518PW162e8F-KlYjGYr8qmvdDK35DXWhokbdzZtwn2p1_LevmWcHnk1LMcD8goyvKYbFoE7V4X_eWvSRrOOx09yDGAeRkpu7IsHlXistVviTN_NqDPQ3-ZG1JO7N0epvNGQFO2Rq3_LBRWBLz8uRg2In1SnLGDm6Su5Uq-WZWxwLlTDKRUK93odxV40DixNmeRCmPvMgEyApUEpGxknX1V3dcZPUhDBFck9ke5tJgf-N_A3jUYOdjo7MqzhLTVsT1ottJ1qep80pdWNe9_Ov4SKA1GuJoY_W4SCM7rMRFxLdWe-Gl5cawsmT_cgjFp0dCrFkV8KxPxznj7PWjIqaCg1bpfaQMo4pnOITRtBEpR0_taW0X19SDFk2qOtSMUjqn_C3ueKhtBxdTwcDntML-gJ1riL9O6Tc8yNTUGnE3hAdm8C3q5NSecFy1tAAvcpVNiGymPvx3eNoXQyA_eNpO-3myEdG773saXdwQUEbV-FYMe0vAjEkOmw8-0tb7ycCi1kJKdLg-sGpN1GCdUeMnWmFdmAr4eY4VnSL0a3u31tH9e0H5tshjITSx8yQAOi5ZtCv58vuG2DsCdsZ40alOvsIcQJd5SWgovSNxsVt-azGIhsc7xm0AJplHQipLpkujw5QMlsv0hHGoCKd51mDRpMUs

image-scraper  http://www.webcrawler.com/serp?q=Actinic+keratosis&sc=qAhiEuh410VRRoZs2CMIbjhWoH2tXmqsl6LlH24XMJnnxzjnbhIyINd4eIElt5x5UeKRSloPT7lo9EKKiDTp8qK_lhGq2Jjn2zpryot8Y0OPnA0rlZuvunjFrAY8DoFb7mOCNHf62Xn5aCWbip1D94L40s8dhrJLtRCWgF8YwO_vqYPvAcR8CZ48q7m8KBM0cWotmzvpOWc7r9Ne7OEJ4CjA1_sckr0OTlsOsYdPU39s9YkK4oD1bs3z_PT3XOweS1nxPvPfujoRivqhqaUYkJxFGstPGArWdVcpdQpjd4Lt8Fjj2jpsJ32g0PP9-GoFhyx-fcjLgSOSHdgVjoFMrx2BlPH25u5s6bOUz1GyO0J8T8FVabWCsqfjB3-PATNLoeJ2LU0GgPh4TwRTv5OMKeS5BADQkCU81ATNeku6hkcbuCmgoHdSlJrrDOf3Cxz7sGxgrMpyQNiRUdoCjJjhTpssk-yJKceNDjGY262HjLbMwDuYPjawtelE28t8NpEf1REAbPiLGEumqT2nW36oMoVPK2q9bSvyF1EBE0ejfRg3XYYfHBJjaM4mb7E35o49wU0R4qg59mt3D2tOuG07hEgn9Ft8AqIYaTBg8ht9BhFChl_qXXPsSR8HmbBdBmRK-gAJEnmNSK4l0hv8xvof1GOJrt6jW1absg4sLmYLT7At702l6f9cMxUTEhkmfjeMEEo9FfXdGV1rdVn9CJRC-iUy9HstWSCBnuSqaFRoe3b5oYBOfm0ERcL6dKs0ZDDe1oX57hgQXCjTx-NUZqCLcvLrIS6-_zff_WzUmADgUxIywpjr1lvM4-WhAADe3aaaWNZX8mPXGU-Xv5B67WO9F-UneoL0aOd7Lh6-pLq5RIgK18g7-dikx84v43MaVioylM-AKuu38Si0xugDXqzPcyOItlTleSKPnCMKxHiiPc6PdCwaX7fSCDwTY7FBcr_CgfKveaWZyzYwVDVnT00NCJyL8qefZzNozKDFX1aBSL6vrNrFFDtnTXQ7uMLsh1VFrmtBmPrpvMyZLE518PW162e8F-KlYjGYr8qmvdDK35DXWhokbdzZtwn2p1_LevmWcHnk1LMcD8goyvKYbFoE7V4X_eWvSRrOOx09yDGAeRkpu7IsHlXistVviTN_NqDPQ3-ZG1JO7N0epvNGQFO2Rq3_LBRWBLz8uRg2In1SnLGDm6Su5Uq-WZWxwLlTDKRUK93odxV40DixNmeRCmPvMgEyApUEpGxknX1V3dcZPUhDBFck9ke5tJgf-N_A3jUYOdjo7MqzhLTVsT1ottJ1qep80pdWNe9_Ov4SKA1GuJoY_W4SCM7rMRFxLdWe-Gl5cawsmT_cgjFp0dCrFkV8KxPxznj7PWjIqaCg1bpfaQMo4pnOITRtBEpR0_taW0X19SDFk2qOtSMUjqn_C3ueKhtBxdTwcDntML-gJ1riL9O6Tc8yNTUGnE3hAdm8C3q5NSecFy1tAAvcpVNiGymPvx3eNoXQyA_eNpO-3myEdG773saXdwQUEbV-FYMe0vAjEkOmw8-0tb7ycCi1kJKdLg-sGpN1GCdUeMnWmFdmAr4eY4VnSL0a3u31tH9e0H5tshjITSx8yQAOi5ZtCv58vuG2DsCdsZ40alOvsIcQJd5SWgovSNxsVt-azGIhsc7xm0AJplHQipLpkujw5QMlsv0hHGoCKd51mDRpMUs