#3 Bug was fixed, dedup added

artiomn committed Sep 9, 2020
1 parent 9555439 commit 54ef3df

Showing 4 changed files with 171 additions and 31 deletions.
16 changes: 15 additions & 1 deletion README.md
@@ -2,6 +2,8 @@
 
 # Markdown articles image links fixer
 
+Version 0.0.2.
+
 Simple script to download images and replace image links in markdown documents.
 I.e. you have Markdown document with HTTP links.
 This script will find all links to images, download images and fix links in the document.
@@ -12,7 +14,13 @@ This script will find all links to images, download images and fix links in the
 Syntax:
 
 ```
-./images_extractor.py [-h] [-s SKIP_LIST] article_file_path
+usage: images_extractor.py [-h] [-s SKIP_LIST] [-d IMAGES_DIRNAME]
+                           [-p IMAGES_PUBLICPATH] [-a]
+                           [-t DOWNLOADING_TIMEOUT] [-D] [--version]
+                           article_file_path
+
+Simple script to download images and replace image links in markdown
+documents.
 
 positional arguments:
   article_file_path     path to the article file in the Markdown format
@@ -28,6 +36,12 @@ optional arguments:
                         Public path to the folder of downloaded images
   -a, --skip-all-incorrect
                         skip all incorrect images
+  -t DOWNLOADING_TIMEOUT, --downloading-timeout DOWNLOADING_TIMEOUT
+                        how many seconds to wait before the download fails
+  -D, --dedup-with-hash
+                        deduplicate images using content hash
   --version             return version number
 ```

Example:
111 changes: 82 additions & 29 deletions images_extractor.py
@@ -5,6 +5,7 @@
 """
 
 import argparse
+import hashlib
 from mimetypes import guess_extension, types_map
 import os
 import re
@@ -66,23 +67,28 @@ def get_filename_from_url(req: requests.Response) -> Optional[str]:
 class ImageDownloader:
     allowed_url_prefixes = {'http', 'ftp'}
 
-    def __init__(self, article_path: str, skip_list: Optional[List[str]] = None, skip_all: bool = False,
-                 img_dir_name: str = 'images', img_public_path: str = ''):
-        self.img_dir_name = img_dir_name
-        self.img_public_path = img_public_path
+    def __init__(self, article_path: str, skip_list: Optional[List[str]] = None, skip_all_errors: bool = False,
+                 img_dir_name: str = 'images', img_public_path: str = '', downloading_timeout: float = -1,
+                 deduplication: bool = False):
+        self._img_dir_name = img_dir_name
+        self._img_public_path = img_public_path
         self._article_file_path = article_path
-        self._skip_list = sorted(skip_list) if skip_list is not None else []
-        self._images_dir = os.path.join(os.path.dirname(self._article_file_path), self.img_dir_name)
-        self._skip_all = skip_all
+        self._skip_list = set(skip_list) if skip_list is not None else []
+        self._images_dir = os.path.join(os.path.dirname(self._article_file_path), self._img_dir_name)
+        self._skip_all_errors = skip_all_errors
+        self._downloading_timeout = downloading_timeout
+        self._deduplication = deduplication
 
     def download_images(self, images: List[str]) -> dict:
         replacement_mapping = {}
+        hash_to_path_mapping = {}
         skip_list = self._skip_list
         img_count = len(images)
        path_join = os.path.join
-        img_dir_name = self.img_dir_name
-        img_public_path = self.img_public_path
+        img_dir_name = self._img_dir_name
+        img_public_path = self._img_public_path
         images_dir = self._images_dir
+        deduplication = self._deduplication
 
         try:
             os.makedirs(self._images_dir)
@@ -104,52 +110,93 @@ def download_images(self, images: List[str]) -> dict:
             print(f'Downloading image {img_num + 1} of {img_count} from "{img_url}"...')
 
             try:
-                img_response = ImageDownloader._download_image(img_url)
+                img_response = self._download_image(img_url)
             except Exception as e:
-                if self._skip_all:
+                if self._skip_all_errors:
                     print(f'Warning: can\'t download image {img_num + 1}, error: [{str(e)}], '
-                          'but processing will be continued, because `skip_all` flag is set')
+                          'processing will continue, because the `skip_all_errors` flag is set')
                     continue
                 raise
 
             img_filename = get_filename_from_url(img_response)
-            img_path = path_join(images_dir, img_filename)
-            replacement_mapping.setdefault(img_url, path_join(img_public_path or img_dir_name, img_filename))
+            image_content = img_response.content
+
+            if deduplication:
+                # Reuse the already downloaded file when another URL returned identical content.
+                new_content_hash = hashlib.sha256(image_content).digest()
+                existed_file_name = hash_to_path_mapping.get(new_content_hash)
+                if existed_file_name is not None:
+                    img_filename = existed_file_name
+                    document_img_path = path_join(img_public_path or img_dir_name, img_filename)
+                    replacement_mapping.setdefault(img_url, document_img_path)
+                    continue
+                else:
+                    hash_to_path_mapping[new_content_hash] = img_filename
+
+            document_img_path = path_join(img_public_path or img_dir_name, img_filename)
+            img_filename, document_img_path = self._correct_paths(replacement_mapping, document_img_path, img_url,
+                                                                  img_filename)
+
+            real_img_path = path_join(images_dir, img_filename)
+            replacement_mapping.setdefault(img_url, document_img_path)
 
-            ImageDownloader._write_image(img_path, img_response.content)
+            ImageDownloader._write_image(real_img_path, image_content)
 
         return replacement_mapping
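The deduplication added here keys each downloaded payload by a SHA-256 digest of its bytes, so two URLs serving identical content end up sharing one file on disk. A minimal standalone sketch of the same idea (names are illustrative, not from the commit):

```
import hashlib
from typing import Dict

seen: Dict[bytes, str] = {}

def filename_for(content: bytes, fresh_name: str) -> str:
    # Return the filename already registered for identical bytes,
    # or register fresh_name for this content and return it.
    return seen.setdefault(hashlib.sha256(content).digest(), fresh_name)
```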

-    @staticmethod
-    def _download_image(image_url):
+    def _download_image(self, image_url: str):
         """
         Download image file from the URL.
         :param image_url: URL to download.
         """
 
+        # requests/urllib3 reject non-positive timeouts, so map the -1 sentinel to None (no limit).
+        timeout = self._downloading_timeout if self._downloading_timeout > 0 else None
         try:
-            img_response = requests.get(image_url, allow_redirects=True)
+            img_response = requests.get(image_url, allow_redirects=True, timeout=timeout)
         except requests.exceptions.SSLError:
             print('Incorrect SSL certificate, trying to download without verifying...')
-            img_response = requests.get(image_url, allow_redirects=True, verify=False)
+            img_response = requests.get(image_url, allow_redirects=True, verify=False,
+                                        timeout=timeout)
 
         if img_response.status_code != 200:
             raise OSError(str(img_response))
 
         return img_response

     @staticmethod
-    def _write_image(img_path, data):
+    def _write_image(img_path: str, data: bytes):
         """
         Write image data into the file.
         """
 
         print(f'Image will be written to the file "{img_path}"...')
         with open(img_path, 'wb') as img_file:
             img_file.write(data)

-    def _is_allowed_url_prefix(self, url):
-        return url in self.allowed_url_prefixes
+    def _is_allowed_url_prefix(self, url: str) -> bool:
+        for prefix in self.allowed_url_prefixes:
+            if url.startswith(prefix):
+                return True
+
+        return False
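This is the fix from the commit title: the old check tested whole-URL membership in the prefix set, so `'http://example.com/a.png' in {'http', 'ftp'}` was always False; the new loop tests prefixes instead. For illustration, an equivalent one-liner (not in the commit) that relies on `str.startswith` accepting a tuple:

```
def is_allowed(url: str) -> bool:
    # 'https://...' also passes, since it starts with 'http'.
    return url.startswith(('http', 'ftp'))
```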

+    def _correct_paths(self, replacement_mapping, document_img_path, img_url, img_filename):
+        # Different URLs can share a filename; keep the original name when possible
+        # and disambiguate collisions with a hash prefix.
+        for url, path in replacement_mapping.items():
+            if document_img_path == path and img_url != url:
+                img_filename = f'{hashlib.md5(img_url.encode()).hexdigest()}_{img_filename}'
+                document_img_path = os.path.join(self._img_public_path or self._img_dir_name, img_filename)
+                break
+
+        return img_filename, document_img_path
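To see what `_correct_paths` does: when a second, different URL would be written to an already-claimed target path, the filename gets an MD5-of-URL prefix. A hypothetical collision (values illustrative):

```
import hashlib

second_url = 'https://example.com/b/logo.png'  # collides on 'images/logo.png'
new_name = f'{hashlib.md5(second_url.encode()).hexdigest()}_logo.png'
# The document then references 'images/<md5hex>_logo.png' for this URL only.
```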


 def main(arguments):
     """
     Entrypoint.
     """
 
-    article_file = os.path.expanduser(arguments.article_file_path)
+    article_path = os.path.expanduser(arguments.article_file_path)
     skip_list = arguments.skip_list
     skip_all = arguments.skip_all_incorrect
@@ -164,13 +211,15 @@ def main(arguments):
     else:
         skip_list = [s.strip() for s in skip_list.split(',')]
 
-    ArticleTransformer(article_file,
+    ArticleTransformer(article_path,
                        ImageDownloader(
-                           article_file,
-                           skip_list,
-                           skip_all,
-                           arguments.images_dirname,
-                           arguments.images_publicpath
+                           article_path=article_path,
+                           skip_list=skip_list,
+                           skip_all_errors=skip_all,
+                           img_dir_name=arguments.images_dirname,
+                           img_public_path=arguments.images_publicpath,
+                           downloading_timeout=arguments.downloading_timeout,
+                           deduplication=arguments.dedup_with_hash
                        )
                        ).run()

@@ -189,6 +238,10 @@ def main(arguments):
                         help='Public path to the folder of downloaded images')
     parser.add_argument('-a', '--skip-all-incorrect', default=False, action='store_true',
                         help='skip all incorrect images')
+    parser.add_argument('-t', '--downloading-timeout', type=float, default=-1,
+                        help='how many seconds to wait before the download fails')
+    parser.add_argument('-D', '--dedup-with-hash', default=False, action='store_true',
+                        help='deduplicate images using content hash')
     parser.add_argument('--version', action='version', version=f'%(prog)s {__version__}', help='return version number')
 
     args = parser.parse_args()
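For reference, an invocation exercising the two new options might look like this (the article name is hypothetical): `-t 5` bounds each download at five seconds, and `-D` collapses byte-identical images into one file.

```
./images_extractor.py -t 5 -D article.md
```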
74 changes: 74 additions & 0 deletions pkg/transformers/html/transformer.py
@@ -0,0 +1,74 @@
"""
Images extractor from an HTML document.
"""

from typing import List, Set

from lxml import html


__all__ = ['ArticleTransformer']


class ImgExtractor:
    def run(self, doc: str) -> List[str]:
        """
        Find all image links in the HTML document.
        """

        tree = html.fromstring(doc)
        images = tree.xpath('//img/@src')
        # links = tree.xpath('//a/@href')

        return images



class ArticleTransformer:
    """
    HTML article transformation class.
    """

    def __init__(self, article_path: str, image_downloader):
        self._image_downloader = image_downloader
        self._article_file_path = article_path
        self._img_extractor = ImgExtractor()
        self._replacement_mapping = {}

    def _read_article(self) -> Set[str]:
        with open(self._article_file_path, 'r') as m_file:
            image_links = self._img_extractor.run(m_file.read())

        print(f'Images links count = {len(image_links)}')
        images = set(image_links)
        print(f'Unique images links count = {len(images)}')

        return images

    def _fix_document_urls(self) -> None:
        print('Replacing images urls in the document...')
        replacement_mapping = self._replacement_mapping
        lines = []
        with open(self._article_file_path, 'r') as infile:
            for line in infile:
                for src, target in replacement_mapping.items():
                    line = line.replace(src, target)
                lines.append(line)

        with open(self._article_file_path, 'w') as outfile:
            for line in lines:
                outfile.write(line)

    def run(self):
        """
        Run article conversion.
        """

        self._replacement_mapping = self._image_downloader.download_images(self._read_article())
        self._fix_document_urls()
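A sketch of how this transformer is meant to be driven, mirroring the markdown pipeline in images_extractor.py; the stub downloader and the article name below are illustrative, not part of the commit:

```
class StubDownloader:
    def download_images(self, images):
        # Pretend each image was saved under images/ with its original name.
        return {url: 'images/' + url.rsplit('/', 1)[-1] for url in images}


ArticleTransformer('article.html', StubDownloader()).run()
```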

1 change: 0 additions & 1 deletion pkg/transformers/md/transformer.py
@@ -71,4 +71,3 @@ def run(self):
 
         self._replacement_mapping = self._image_downloader.download_images(self._read_article())
         self._fix_document_urls()
-