From b8d75f23c7c6514bf9b0b6c52caafbe8ddc3715a Mon Sep 17 00:00:00 2001 From: Max Seelig <7907436+m-a-x-s-e-e-l-i-g@users.noreply.github.com> Date: Wed, 19 Jul 2023 23:55:26 +0200 Subject: [PATCH 01/12] Removed commented old logic --- src/scraping/search_engines.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/src/scraping/search_engines.py b/src/scraping/search_engines.py index e7835ba..6e4e39e 100644 --- a/src/scraping/search_engines.py +++ b/src/scraping/search_engines.py @@ -151,22 +151,6 @@ def get_image_urls(self, thumbnail_results): query.pop("h", None) img_url = img_url._replace(query=urlencode(query, True)) img_urls.append(img_url.geturl()) - - # * Old logic, where clicking on thumbnail opened more details on image - # try: - # thumbnail.click() # try to click thumbnail to get img src - # self.sleep() - # except Exception: - # continue - # - # image = retry(self.wd.find_element, [By.ID, 'img']) - # if image.get_attribute('src') and 'http' in image.get_attribute('src'): - # img_urls.append(image.get_attribute('src')) - # - # # close view - # close = retry(self.wd.find_elements, [By.CLASS_NAME, "close"])[0] - # close.click() - # self.sleep() return img_urls def click_show_more_button(self): From 2a8dac7476aaa955b8fb30234771f4dc7f487420 Mon Sep 17 00:00:00 2001 From: Max Seelig <7907436+m-a-x-s-e-e-l-i-g@users.noreply.github.com> Date: Wed, 19 Jul 2023 23:58:28 +0200 Subject: [PATCH 02/12] Updated readme --- readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/readme.md b/readme.md index 5b320e1..d4def47 100644 --- a/readme.md +++ b/readme.md @@ -24,7 +24,7 @@ the [project page][project page] if you are interested in creation a dataset for Start the front end with a single command (adjust the `/PATH/TO/OUTPUT` to your desired output path) ```shell -docker run -it --rm --name easy_image_scraping --network host --mount type=bind,source=/PATH/TO/OUTPUT,target=/usr/src/app/output -p 5000:5000 
ghcr.io/a-nau/easy-image-scraping:latest +docker run -it --rm --name easy_image_scraping --mount type=bind,source=/PATH/TO/OUTPUT,target=/usr/src/app/output -p 5000:5000 ghcr.io/a-nau/easy-image-scraping:latest ``` Enter your query and wait for the results to show in the `output` folder. The web applications also shows a preview of From 590f5b1befed37d8a9b12dbd83b6b131903fd566 Mon Sep 17 00:00:00 2001 From: Max Seelig <7907436+m-a-x-s-e-e-l-i-g@users.noreply.github.com> Date: Wed, 19 Jul 2023 23:59:31 +0200 Subject: [PATCH 03/12] Added currently used engine to output --- src/scraping/fetch_image_urls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scraping/fetch_image_urls.py b/src/scraping/fetch_image_urls.py index 3535be3..8d00b38 100644 --- a/src/scraping/fetch_image_urls.py +++ b/src/scraping/fetch_image_urls.py @@ -37,7 +37,7 @@ def fetch_image_urls(query: str, max_links_to_fetch: int, wd: webdriver.Chrome, print("Found: {} image links. Continue ...".format(number_results)) scroll_count += 1 print( - f"Found: {number_results} search results. Extracting first {max_links_to_fetch} links" + f"Found: {number_results} search results using {engine}. 
Extracting first {max_links_to_fetch} links" ) image_urls = search_engine.get_image_urls(thumbnail_results[:max_links_to_fetch]) print(f"Found: {len(image_urls[:max_links_to_fetch])} image links, done!") From 6321d1d06a797bfc8675299ef73a07806730dd1b Mon Sep 17 00:00:00 2001 From: Max Seelig <7907436+m-a-x-s-e-e-l-i-g@users.noreply.github.com> Date: Thu, 20 Jul 2023 00:07:30 +0200 Subject: [PATCH 04/12] Undone some changes to make Dockerfile work --- Dockerfile | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/Dockerfile b/Dockerfile index 8aa307d..d9b8b42 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,20 +2,16 @@ FROM python:3.9-slim # install google chrome RUN apt-get update \ - && apt-get install -y wget curl gnupg -ARG CHROME_VERSION="114.0.5735.198-1" + && apt-get install -y wget curl gnupg RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - -RUN sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list' +RUN sh -c 'echo "deb [arch=amd64] https://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list' RUN apt-get -y update -RUN apt-get install -y google-chrome-stable=${CHROME_VERSION} - +RUN apt-get install -y google-chrome-stable # install chromedriver RUN apt-get install -yqq unzip -# Install ChromeDriver -RUN CHROME_DRIVER_VERSION=`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE_${CHROME_VERSION%.*}` \ - && wget -O /tmp/chromedriver.zip http://chromedriver.storage.googleapis.com/${CHROME_DRIVER_VERSION}/chromedriver_linux64.zip \ - && unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/ +RUN wget -O /tmp/chromedriver.zip https://chromedriver.storage.googleapis.com/`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE`/chromedriver_linux64.zip +RUN unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/ RUN mkdir -p /usr/src/app WORKDIR /usr/src/app From 
b8f50a7911372508e290c58aa7c0e8e22a24e48e Mon Sep 17 00:00:00 2001 From: Max Seelig <7907436+m-a-x-s-e-e-l-i-g@users.noreply.github.com> Date: Tue, 25 Jul 2023 23:22:18 +0200 Subject: [PATCH 05/12] Added setting to clear output folder --- src/tools/clear_output_folder.py | 18 ++++++++++++++++++ src/tools/frontend.py | 4 ++++ 2 files changed, 22 insertions(+) create mode 100644 src/tools/clear_output_folder.py diff --git a/src/tools/clear_output_folder.py b/src/tools/clear_output_folder.py new file mode 100644 index 0000000..fa1bd61 --- /dev/null +++ b/src/tools/clear_output_folder.py @@ -0,0 +1,18 @@ +import sys +import os + +from pathlib import Path + +base_path = Path(__file__).parent.parent.parent +sys.path.append(base_path.as_posix()) +from src.config import TARGET_PATH + +def clear_output_folder(): + if not os.path.isdir(TARGET_PATH): + return + for root, dirs, files in os.walk(TARGET_PATH, topdown=False): + for name in files: + os.remove(os.path.join(root, name)) + for name in dirs: + os.rmdir(os.path.join(root, name)) + print(f"Output folder cleared.") \ No newline at end of file diff --git a/src/tools/frontend.py b/src/tools/frontend.py index 36a7662..dc5c3cf 100644 --- a/src/tools/frontend.py +++ b/src/tools/frontend.py @@ -8,6 +8,7 @@ sys.path.append(base_path.as_posix()) from src.tools.search_by_keyword import search_by_keyword +from src.tools.clear_output_folder import clear_output_folder def main(): @@ -27,9 +28,12 @@ def main(): num_images = st.slider( "Number of Images to Download", min_value=1, max_value=500, value=10 ) + clear_output = st.checkbox("Clear Output Folder", value=False, help="Remove all files and folders in output folder.") # Run search if st.button("Start Search", disabled=search_query == ""): + if clear_output: + clear_output_folder() for query in search_query.split(";"): query = query.strip() search_by_keyword( From 1b60e8c24880020960dace2ba5f1e52c9cbf85c6 Mon Sep 17 00:00:00 2001 From: Max Seelig 
<7907436+m-a-x-s-e-e-l-i-g@users.noreply.github.com> Date: Wed, 26 Jul 2023 00:19:34 +0200 Subject: [PATCH 06/12] Updated readme --- readme.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/readme.md b/readme.md index d4def47..bb004c8 100644 --- a/readme.md +++ b/readme.md @@ -56,10 +56,11 @@ This is optional - you can also directly use our provided container. ### Docker -You can also build the image yourself using +You can also build and run the image yourself using ```shell -docker build -t easy_image_scraping . +docker build -t easy_image_scraping . ; docker run -it --rm --name easy_image_scraping -p 5000:5000 --mount type=bind,source=/PATH/TO/OUTPUT,target=/usr/src/app/output easy_image_scraping + ```
From 1e62c1a01dac357a98ca6c208211e36fef4e7b8e Mon Sep 17 00:00:00 2001 From: Max Seelig <7907436+m-a-x-s-e-e-l-i-g@users.noreply.github.com> Date: Wed, 26 Jul 2023 00:25:18 +0200 Subject: [PATCH 07/12] Updated readme --- readme.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/readme.md b/readme.md index bb004c8..961a31e 100644 --- a/readme.md +++ b/readme.md @@ -59,8 +59,7 @@ This is optional - you can also directly use our provided container. You can also build and run the image yourself using ```shell -docker build -t easy_image_scraping . ; docker run -it --rm --name easy_image_scraping -p 5000:5000 --mount type=bind,source=/PATH/TO/OUTPUT,target=/usr/src/app/output easy_image_scraping - +docker build -t easy_image_scraping . && docker run -it --rm --name easy_image_scraping -p 5000:5000 --mount type=bind,source=/PATH/TO/OUTPUT,target=/usr/src/app/output easy_image_scraping ```
From 20954e086c5d4d9e7402badb27e329db78a13fc5 Mon Sep 17 00:00:00 2001 From: Max Seelig <7907436+m-a-x-s-e-e-l-i-g@users.noreply.github.com> Date: Wed, 26 Jul 2023 00:41:20 +0200 Subject: [PATCH 08/12] Added new line on EOF --- src/tools/clear_output_folder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tools/clear_output_folder.py b/src/tools/clear_output_folder.py index fa1bd61..c134804 100644 --- a/src/tools/clear_output_folder.py +++ b/src/tools/clear_output_folder.py @@ -15,4 +15,4 @@ def clear_output_folder(): os.remove(os.path.join(root, name)) for name in dirs: os.rmdir(os.path.join(root, name)) - print(f"Output folder cleared.") \ No newline at end of file + print(f"Output folder cleared.") From bb8350033c36f669886461ea8862ddcebb2a9896 Mon Sep 17 00:00:00 2001 From: Max Seelig <7907436+m-a-x-s-e-e-l-i-g@users.noreply.github.com> Date: Fri, 28 Jul 2023 15:14:27 +0200 Subject: [PATCH 09/12] Updated readme --- readme.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/readme.md b/readme.md index 961a31e..f598ebd 100644 --- a/readme.md +++ b/readme.md @@ -56,10 +56,14 @@ This is optional - you can also directly use our provided container. ### Docker -You can also build and run the image yourself using +You can also build the image yourself using ```shell -docker build -t easy_image_scraping . && docker run -it --rm --name easy_image_scraping -p 5000:5000 --mount type=bind,source=/PATH/TO/OUTPUT,target=/usr/src/app/output easy_image_scraping +docker build -t easy_image_scraping . +``` +Then run it by using +```shell +docker run -it --rm --name easy_image_scraping -p 5000:5000 --mount type=bind,source=/PATH/TO/OUTPUT,target=/usr/src/app/output easy_image_scraping ```
From e93f7c6cfb8226022f251d0e78bbd8322fb95bd5 Mon Sep 17 00:00:00 2001 From: Max Seelig <7907436+m-a-x-s-e-e-l-i-g@users.noreply.github.com> Date: Fri, 28 Jul 2023 21:42:05 +0200 Subject: [PATCH 10/12] Revert "Undone some changes to make Dockerfile work" This reverts commit 6321d1d06a797bfc8675299ef73a07806730dd1b. --- Dockerfile | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index d9b8b42..8aa307d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,16 +2,20 @@ FROM python:3.9-slim # install google chrome RUN apt-get update \ - && apt-get install -y wget curl gnupg + && apt-get install -y wget curl gnupg +ARG CHROME_VERSION="114.0.5735.198-1" RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - -RUN sh -c 'echo "deb [arch=amd64] https://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list' +RUN sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list' RUN apt-get -y update -RUN apt-get install -y google-chrome-stable +RUN apt-get install -y google-chrome-stable=${CHROME_VERSION} + # install chromedriver RUN apt-get install -yqq unzip -RUN wget -O /tmp/chromedriver.zip https://chromedriver.storage.googleapis.com/`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE`/chromedriver_linux64.zip -RUN unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/ +# Install ChromeDriver +RUN CHROME_DRIVER_VERSION=`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE_${CHROME_VERSION%.*}` \ + && wget -O /tmp/chromedriver.zip http://chromedriver.storage.googleapis.com/${CHROME_DRIVER_VERSION}/chromedriver_linux64.zip \ + && unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/ RUN mkdir -p /usr/src/app WORKDIR /usr/src/app From c9c9ed135ec970b4107ff28a6d26622339b035f6 Mon Sep 17 00:00:00 2001 From: Max Seelig <7907436+m-a-x-s-e-e-l-i-g@users.noreply.github.com> 
Date: Fri, 28 Jul 2023 22:01:54 +0200 Subject: [PATCH 11/12] Fix for installing old Google Chrome version --- Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 8aa307d..5cd3eb4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,8 +7,8 @@ ARG CHROME_VERSION="114.0.5735.198-1" RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - RUN sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list' RUN apt-get -y update -RUN apt-get install -y google-chrome-stable=${CHROME_VERSION} - +RUN wget https://dl.google.com/linux/chrome/deb/pool/main/g/google-chrome-stable/google-chrome-stable_${CHROME_VERSION}_amd64.deb +RUN apt-get install -y ./google-chrome-stable_${CHROME_VERSION}_amd64.deb # install chromedriver RUN apt-get install -yqq unzip From ea710a378592b12ba302f575f6ea2a05ac4f1940 Mon Sep 17 00:00:00 2001 From: Max Seelig <7907436+m-a-x-s-e-e-l-i-g@users.noreply.github.com> Date: Fri, 28 Jul 2023 23:02:53 +0200 Subject: [PATCH 12/12] Moved function --- src/tools/clear_output_folder.py | 18 ------------------ src/tools/frontend.py | 12 +++++++++++- 2 files changed, 11 insertions(+), 19 deletions(-) delete mode 100644 src/tools/clear_output_folder.py diff --git a/src/tools/clear_output_folder.py b/src/tools/clear_output_folder.py deleted file mode 100644 index c134804..0000000 --- a/src/tools/clear_output_folder.py +++ /dev/null @@ -1,18 +0,0 @@ -import sys -import os - -from pathlib import Path - -base_path = Path(__file__).parent.parent.parent -sys.path.append(base_path.as_posix()) -from src.config import TARGET_PATH - -def clear_output_folder(): - if not os.path.isdir(TARGET_PATH): - return - for root, dirs, files in os.walk(TARGET_PATH, topdown=False): - for name in files: - os.remove(os.path.join(root, name)) - for name in dirs: - os.rmdir(os.path.join(root, name)) - print(f"Output folder cleared.") diff --git 
a/src/tools/frontend.py b/src/tools/frontend.py index dc5c3cf..4b8ca53 100644 --- a/src/tools/frontend.py +++ b/src/tools/frontend.py @@ -1,3 +1,4 @@ +import os import random import sys from pathlib import Path @@ -7,8 +8,8 @@ base_path = Path(__file__).parent.parent.parent sys.path.append(base_path.as_posix()) +from src.config import TARGET_PATH from src.tools.search_by_keyword import search_by_keyword -from src.tools.clear_output_folder import clear_output_folder def main(): @@ -52,6 +53,15 @@ def main(): images_sample = random.sample(images, min(len(images), 20)) ex.image(images_sample, width=200) +def clear_output_folder(): + if not os.path.isdir(TARGET_PATH): + return + for root, dirs, files in os.walk(TARGET_PATH, topdown=False): + for name in files: + os.remove(os.path.join(root, name)) + for name in dirs: + os.rmdir(os.path.join(root, name)) + print(f"Output folder cleared.") if __name__ == "__main__": main()