Skip to content

Commit

Permalink
Merge pull request #5 from m-a-x-s-e-e-l-i-g/main
Browse files Browse the repository at this point in the history
Clear output folder setting, readme changes, Dockerfile changes
  • Loading branch information
a-nau committed Aug 4, 2023
2 parents e99f4ab + ea710a3 commit a3a8af3
Show file tree
Hide file tree
Showing 5 changed files with 22 additions and 20 deletions.
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ ARG CHROME_VERSION="114.0.5735.198-1"
RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add -
RUN sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list'
RUN apt-get -y update
RUN apt-get install -y google-chrome-stable=${CHROME_VERSION}

RUN wget https://dl.google.com/linux/chrome/deb/pool/main/g/google-chrome-stable/google-chrome-stable_${CHROME_VERSION}_amd64.deb
RUN apt-get install -y ./google-chrome-stable_${CHROME_VERSION}_amd64.deb

# install chromedriver
RUN apt-get install -yqq unzip
Expand Down
6 changes: 5 additions & 1 deletion readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ the [project page][project page] if you are interested in creating a dataset for
Start the front end with a single command (adjust the `/PATH/TO/OUTPUT` to your desired output path)

```shell
docker run -it --rm --name easy_image_scraping --network host --mount type=bind,source=/PATH/TO/OUTPUT,target=/usr/src/app/output -p 5000:5000 ghcr.io/a-nau/easy-image-scraping:latest
docker run -it --rm --name easy_image_scraping --mount type=bind,source=/PATH/TO/OUTPUT,target=/usr/src/app/output -p 5000:5000 ghcr.io/a-nau/easy-image-scraping:latest
```

Enter your query and wait for the results to show in the `output` folder. The web application also shows a preview of
Expand Down Expand Up @@ -61,6 +61,10 @@ You can also build the image yourself using
```shell
docker build -t easy_image_scraping .
```
Then run it using
```shell
docker run -it --rm --name easy_image_scraping -p 5000:5000 --mount type=bind,source=/PATH/TO/OUTPUT,target=/usr/src/app/output easy_image_scraping
```

<details>
<summary>For Local Setup, check this</summary>
Expand Down
2 changes: 1 addition & 1 deletion src/scraping/fetch_image_urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def fetch_image_urls(query: str, max_links_to_fetch: int, wd: webdriver.Chrome,
print("Found: {} image links. Continue ...".format(number_results))
scroll_count += 1
print(
f"Found: {number_results} search results. Extracting first {max_links_to_fetch} links"
f"Found: {number_results} search results using {engine}. Extracting first {max_links_to_fetch} links"
)
image_urls = search_engine.get_image_urls(thumbnail_results[:max_links_to_fetch])
print(f"Found: {len(image_urls[:max_links_to_fetch])} image links, done!")
Expand Down
16 changes: 0 additions & 16 deletions src/scraping/search_engines.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,22 +151,6 @@ def get_image_urls(self, thumbnail_results):
query.pop("h", None)
img_url = img_url._replace(query=urlencode(query, True))
img_urls.append(img_url.geturl())

# * Old logic, where clicking on thumbnail opened more details on image
# try:
# thumbnail.click() # try to click thumbnail to get img src
# self.sleep()
# except Exception:
# continue
#
# image = retry(self.wd.find_element, [By.ID, 'img'])
# if image.get_attribute('src') and 'http' in image.get_attribute('src'):
# img_urls.append(image.get_attribute('src'))
#
# # close view
# close = retry(self.wd.find_elements, [By.CLASS_NAME, "close"])[0]
# close.click()
# self.sleep()
return img_urls

def click_show_more_button(self):
Expand Down
14 changes: 14 additions & 0 deletions src/tools/frontend.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
import random
import sys
from pathlib import Path
Expand All @@ -7,6 +8,7 @@
base_path = Path(__file__).parent.parent.parent
sys.path.append(base_path.as_posix())

from src.config import TARGET_PATH
from src.tools.search_by_keyword import search_by_keyword


Expand All @@ -27,9 +29,12 @@ def main():
num_images = st.slider(
"Number of Images to Download", min_value=1, max_value=500, value=10
)
clear_output = st.checkbox("Clear Output Folder", value=False, help="Remove all files and folders in output folder.")

# Run search
if st.button("Start Search", disabled=search_query == ""):
if clear_output:
clear_output_folder()
for query in search_query.split(";"):
query = query.strip()
search_by_keyword(
Expand All @@ -48,6 +53,15 @@ def main():
images_sample = random.sample(images, min(len(images), 20))
ex.image(images_sample, width=200)

def clear_output_folder(target_path=None):
    """Delete everything inside the output folder, keeping the folder itself.

    Args:
        target_path: Folder whose contents are removed. When ``None`` (the
            default), the module-level ``TARGET_PATH`` imported from
            ``src.config`` is used, which matches the previous behaviour.

    Returns:
        None. If the folder does not exist the function returns silently
        without printing anything.

    Raises:
        OSError: If a file or directory cannot be removed (e.g. permissions).
    """
    if target_path is None:
        target_path = TARGET_PATH
    if not os.path.isdir(target_path):
        return

    # Walk bottom-up so every directory is already empty when it is removed.
    for root, dirs, files in os.walk(target_path, topdown=False):
        for name in files:
            os.remove(os.path.join(root, name))
        for name in dirs:
            path = os.path.join(root, name)
            if os.path.islink(path):
                # os.walk reports symlinks to directories in ``dirs`` without
                # descending into them; os.rmdir would fail on the link, so
                # remove the link itself and leave its target untouched.
                os.remove(path)
            else:
                os.rmdir(path)
    print("Output folder cleared.")

if __name__ == "__main__":
main()

0 comments on commit a3a8af3

Please sign in to comment.