#3 Bug was fixed, dedup added

artiomn committed Sep 9, 2020
1 parent 9555439 commit 54ef3df

Showing 4 changed files with 171 additions and 31 deletions.
16 changes: 15 additions & 1 deletion README.md
@@ -2,6 +2,8 @@
 
 # Markdown articles image links fixer
 
+Version 0.0.2.
+
 Simple script to download images and replace image links in markdown documents.
 I.e. you have Markdown document with HTTP links.
 This script will find all links to images, download images and fix links in the document.
@@ -12,7 +14,13 @@ This script will find all links to images, download images and fix links in the
 Syntax:
 
 ```
-./images_extractor.py [-h] [-s SKIP_LIST] article_file_path
+usage: images_extractor.py [-h] [-s SKIP_LIST] [-d IMAGES_DIRNAME]
+                           [-p IMAGES_PUBLICPATH] [-a]
+                           [-t DOWNLOADING_TIMEOUT] [-D] [--version]
+                           article_file_path
+
+Simple script to download images and replace image links in markdown
+documents.
 
 positional arguments:
   article_file_path     path to the article file in the Markdown format
@@ -28,6 +36,12 @@ optional arguments:
                         Public path to the folder of downloaded images
   -a, --skip-all-incorrect
                         skip all incorrect images
+  -t DOWNLOADING_TIMEOUT, --downloading-timeout DOWNLOADING_TIMEOUT
+                        how many seconds to wait before the download fails
+  -D, --dedup-with-hash
+                        deduplicate images using content hash
   --version             return version number
 ```

Example:
111 changes: 82 additions & 29 deletions images_extractor.py
@@ -5,6 +5,7 @@
 """
 
 import argparse
+import hashlib
 from mimetypes import guess_extension, types_map
 import os
 import re
@@ -66,23 +67,28 @@ def get_filename_from_url(req: requests.Response) -> Optional[str]:
 class ImageDownloader:
     allowed_url_prefixes = {'http', 'ftp'}
 
-    def __init__(self, article_path: str, skip_list: Optional[List[str]] = None, skip_all: bool = False,
-                 img_dir_name: str = 'images', img_public_path: str = ''):
-        self.img_dir_name = img_dir_name
-        self.img_public_path = img_public_path
+    def __init__(self, article_path: str, skip_list: Optional[List[str]] = None, skip_all_errors: bool = False,
+                 img_dir_name: str = 'images', img_public_path: str = '', downloading_timeout: float = -1,
+                 deduplication: bool = False):
+        self._img_dir_name = img_dir_name
+        self._img_public_path = img_public_path
         self._article_file_path = article_path
-        self._skip_list = sorted(skip_list) if skip_list is not None else []
-        self._images_dir = os.path.join(os.path.dirname(self._article_file_path), self.img_dir_name)
-        self._skip_all = skip_all
+        self._skip_list = set(skip_list) if skip_list is not None else []
+        self._images_dir = os.path.join(os.path.dirname(self._article_file_path), self._img_dir_name)
+        self._skip_all_errors = skip_all_errors
+        self._downloading_timeout = downloading_timeout
+        self._deduplication = deduplication
 
     def download_images(self, images: List[str]) -> dict:
         replacement_mapping = {}
+        hash_to_path_mapping = {}
         skip_list = self._skip_list
         img_count = len(images)
        path_join = os.path.join
-        img_dir_name = self.img_dir_name
-        img_public_path = self.img_public_path
+        img_dir_name = self._img_dir_name
+        img_public_path = self._img_public_path
         images_dir = self._images_dir
+        deduplication = self._deduplication
 
         try:
             os.makedirs(self._images_dir)
@@ -104,52 +110,93 @@ def download_images(self, images: List[str]) -> dict:
             print(f'Downloading image {img_num + 1} of {img_count} from "{img_url}"...')
 
             try:
-                img_response = ImageDownloader._download_image(img_url)
+                img_response = self._download_image(img_url)
             except Exception as e:
-                if self._skip_all:
+                if self._skip_all_errors:
                     print(f'Warning: can\'t download image {img_num + 1}, error: [{str(e)}], '
-                          'but processing will be continued, because `skip_all` flag is set')
+                          'processing will continue, because the `skip_all_errors` flag is set')
                     continue
                 raise
 
             img_filename = get_filename_from_url(img_response)
-            img_path = path_join(images_dir, img_filename)
-            replacement_mapping.setdefault(img_url, path_join(img_public_path or img_dir_name, img_filename))
+            image_content = img_response.content
+
+            if deduplication:
+                # Reuse the already downloaded file when another URL returned identical content.
+                new_content_hash = hashlib.sha256(image_content).digest()
+                existed_file_name = hash_to_path_mapping.get(new_content_hash)
+                if existed_file_name is not None:
+                    img_filename = existed_file_name
+                    document_img_path = path_join(img_public_path or img_dir_name, img_filename)
+                    replacement_mapping.setdefault(img_url, document_img_path)
+                    continue
+                else:
+                    hash_to_path_mapping[new_content_hash] = img_filename
+
+            document_img_path = path_join(img_public_path or img_dir_name, img_filename)
+            img_filename, document_img_path = self._correct_paths(replacement_mapping, document_img_path, img_url,
+                                                                  img_filename)
+
+            real_img_path = path_join(images_dir, img_filename)
+            replacement_mapping.setdefault(img_url, document_img_path)
 
-            ImageDownloader._write_image(img_path, img_response.content)
+            ImageDownloader._write_image(real_img_path, image_content)
 
         return replacement_mapping
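The deduplication added here keys each downloaded payload by a SHA-256 digest of its bytes, so two URLs serving identical content end up sharing one file on disk. A minimal standalone sketch of the same idea (names are illustrative, not from the commit):

```
import hashlib
from typing import Dict

seen: Dict[bytes, str] = {}

def filename_for(content: bytes, fresh_name: str) -> str:
    # Return the filename already registered for identical bytes,
    # or register fresh_name for this content and return it.
    return seen.setdefault(hashlib.sha256(content).digest(), fresh_name)
```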

-    @staticmethod
-    def _download_image(image_url):
+    def _download_image(self, image_url: str):
         """
         Download image file from the URL.
         :param image_url: URL to download.
         """
 
+        # requests/urllib3 reject non-positive timeouts, so map the -1 sentinel to None (no limit).
+        timeout = self._downloading_timeout if self._downloading_timeout > 0 else None
         try:
-            img_response = requests.get(image_url, allow_redirects=True)
+            img_response = requests.get(image_url, allow_redirects=True, timeout=timeout)
         except requests.exceptions.SSLError:
             print('Incorrect SSL certificate, trying to download without verifying...')
-            img_response = requests.get(image_url, allow_redirects=True, verify=False)
+            img_response = requests.get(image_url, allow_redirects=True, verify=False,
+                                        timeout=timeout)
 
         if img_response.status_code != 200:
             raise OSError(str(img_response))
 
         return img_response

     @staticmethod
-    def _write_image(img_path, data):
+    def _write_image(img_path: str, data: bytes):
         """
         Write image data into the file.
         """
 
         print(f'Image will be written to the file "{img_path}"...')
         with open(img_path, 'wb') as img_file:
             img_file.write(data)

-    def _is_allowed_url_prefix(self, url):
-        return url in self.allowed_url_prefixes
+    def _is_allowed_url_prefix(self, url: str) -> bool:
+        for prefix in self.allowed_url_prefixes:
+            if url.startswith(prefix):
+                return True
+
+        return False
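This is the fix from the commit title: the old check tested whole-URL membership in the prefix set, so `'http://example.com/a.png' in {'http', 'ftp'}` was always False; the new loop tests prefixes instead. For illustration, an equivalent one-liner (not in the commit) that relies on `str.startswith` accepting a tuple:

```
def is_allowed(url: str) -> bool:
    # 'https://...' also passes, since it starts with 'http'.
    return url.startswith(('http', 'ftp'))
```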

+    def _correct_paths(self, replacement_mapping, document_img_path, img_url, img_filename):
+        # Different URLs can share a filename; keep the original name when possible
+        # and disambiguate collisions with a hash prefix.
+        for url, path in replacement_mapping.items():
+            if document_img_path == path and img_url != url:
+                img_filename = f'{hashlib.md5(img_url.encode()).hexdigest()}_{img_filename}'
+                document_img_path = os.path.join(self._img_public_path or self._img_dir_name, img_filename)
+                break
+
+        return img_filename, document_img_path
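To see what `_correct_paths` does: when a second, different URL would be written to an already-claimed target path, the filename gets an MD5-of-URL prefix. A hypothetical collision (values illustrative):

```
import hashlib

second_url = 'https://example.com/b/logo.png'  # collides on 'images/logo.png'
new_name = f'{hashlib.md5(second_url.encode()).hexdigest()}_logo.png'
# The document then references 'images/<md5hex>_logo.png' for this URL only.
```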


 def main(arguments):
     """
     Entrypoint.
     """
 
-    article_file = os.path.expanduser(arguments.article_file_path)
+    article_path = os.path.expanduser(arguments.article_file_path)
     skip_list = arguments.skip_list
     skip_all = arguments.skip_all_incorrect
@@ -164,13 +211,15 @@ def main(arguments):
     else:
         skip_list = [s.strip() for s in skip_list.split(',')]
 
-    ArticleTransformer(article_file,
+    ArticleTransformer(article_path,
                        ImageDownloader(
-                           article_file,
-                           skip_list,
-                           skip_all,
-                           arguments.images_dirname,
-                           arguments.images_publicpath
+                           article_path=article_path,
+                           skip_list=skip_list,
+                           skip_all_errors=skip_all,
+                           img_dir_name=arguments.images_dirname,
+                           img_public_path=arguments.images_publicpath,
+                           downloading_timeout=arguments.downloading_timeout,
+                           deduplication=arguments.dedup_with_hash
                        )
                        ).run()

@@ -189,6 +238,10 @@ def main(arguments):
                         help='Public path to the folder of downloaded images')
     parser.add_argument('-a', '--skip-all-incorrect', default=False, action='store_true',
                         help='skip all incorrect images')
+    parser.add_argument('-t', '--downloading-timeout', type=float, default=-1,
+                        help='how many seconds to wait before the download fails')
+    parser.add_argument('-D', '--dedup-with-hash', default=False, action='store_true',
+                        help='deduplicate images using content hash')
     parser.add_argument('--version', action='version', version=f'%(prog)s {__version__}', help='return version number')
 
     args = parser.parse_args()
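For reference, an invocation exercising the two new options might look like this (the article name is hypothetical): `-t 5` bounds each download at five seconds, and `-D` collapses byte-identical images into one file.

```
./images_extractor.py -t 5 -D article.md
```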
74 changes: 74 additions & 0 deletions pkg/transformers/html/transformer.py
@@ -0,0 +1,74 @@
"""
Images extractor from an HTML document.
"""

from typing import List, Set

from lxml import html


__all__ = ['ArticleTransformer']


class ImgExtractor:
    def run(self, doc: str) -> List[str]:
        """
        Find all image links in the HTML document.
        """

        tree = html.fromstring(doc)
        images = tree.xpath('//img/@src')
        # links = tree.xpath('//a/@href')

        return images



class ArticleTransformer:
    """
    HTML article transformation class.
    """

    def __init__(self, article_path: str, image_downloader):
        self._image_downloader = image_downloader
        self._article_file_path = article_path
        self._img_extractor = ImgExtractor()
        self._replacement_mapping = {}

    def _read_article(self) -> Set[str]:
        with open(self._article_file_path, 'r') as m_file:
            image_links = self._img_extractor.run(m_file.read())

        print(f'Images links count = {len(image_links)}')
        images = set(image_links)
        print(f'Unique images links count = {len(images)}')

        return images

    def _fix_document_urls(self) -> None:
        print('Replacing images urls in the document...')
        replacement_mapping = self._replacement_mapping
        lines = []
        with open(self._article_file_path, 'r') as infile:
            for line in infile:
                for src, target in replacement_mapping.items():
                    line = line.replace(src, target)
                lines.append(line)

        with open(self._article_file_path, 'w') as outfile:
            for line in lines:
                outfile.write(line)

    def run(self):
        """
        Run article conversion.
        """

        self._replacement_mapping = self._image_downloader.download_images(self._read_article())
        self._fix_document_urls()
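A sketch of how this transformer is meant to be driven, mirroring the markdown pipeline in images_extractor.py; the stub downloader and the article name below are illustrative, not part of the commit:

```
class StubDownloader:
    def download_images(self, images):
        # Pretend each image was saved under images/ with its original name.
        return {url: 'images/' + url.rsplit('/', 1)[-1] for url in images}


ArticleTransformer('article.html', StubDownloader()).run()
```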

1 change: 0 additions & 1 deletion pkg/transformers/md/transformer.py
@@ -71,4 +71,3 @@ def run(self):
 
         self._replacement_mapping = self._image_downloader.download_images(self._read_article())
         self._fix_document_urls()
-