Merge cfcf96c into 2201745

airbnb · Oct 4, 2018 · 821c489
2 parents 2201745 + cfcf96c
commit 821c489
Show file tree

Hide file tree

Showing 4 changed files with 129 additions and 3 deletions.
diff --git a/knowledge_repo/converter.py b/knowledge_repo/converter.py
@@ -11,9 +11,16 @@
 
 def get_format(filename, format=None):
     if format is None:
-        format = os.path.splitext(filename)[1]
-        if format.startswith('.'):
-            format = format[1:]
+        if filename.startswith('https://docs.google.com/document/d/'):
+            format = 'gdoc'
+        elif filename.startswith('http://') or filename.startswith('https://'):
+            format = 'proxy'
+        elif '.' in filename:
+            format = os.path.splitext(filename)[1]
+            if format.startswith('.'):
+                format = format[1:]
+        else:
+            raise RuntimeError("Unable to determine a format automatically. Please manually specify the format, and try again.")
     return format
 
 

diff --git a/knowledge_repo/converters/docx.py b/knowledge_repo/converters/docx.py
@@ -0,0 +1,46 @@
+import os
+import re
+import shutil
+import tempfile
+
+from ..converter import KnowledgePostConverter
+
+
+class DocxConverter(KnowledgePostConverter):
+
+    _registry_keys = ['docx']
+
+    @property
+    def dependencies(self):
+        # Dependencies required for this converter on top of core knowledge-repo dependencies
+        return ['pypandoc']
+
+    def from_file(self, filename, **opts):
+        wd = tempfile.mkdtemp()
+        target_file = os.path.join(wd, 'post.md')
+        try:
+            import pypandoc
+
+            pypandoc.convert_file(
+                filename,
+                format='docx',
+                to='markdown',
+                outputfile=target_file,
+                extra_args=[
+                    '--standalone',
+                    '--wrap=none',
+                    '--extract-media={}'.format(wd)
+                ]
+            )
+
+            with open(target_file) as f:
+                md = f.read()
+
+            # Image embeddings exported from docx files have fixed sizes in inches
+            # which browsers do not understand. We remove these annotations.
+            md = re.sub('(\!\[\]\([^\)]+\))\{[^\}]+\}', lambda m: m.group(1), md)
+
+            # Write markdown content to knowledge post (images will be extracted later)
+            self.kp_write(md)
+        finally:
+            shutil.rmtree(wd)
diff --git a/knowledge_repo/converters/gdoc.py b/knowledge_repo/converters/gdoc.py
@@ -0,0 +1,53 @@
+import os
+import re
+import sys
+import time
+
+import webbrowser
+
+import knowledge_repo
+import subprocess
+
+from .docx import DocxConverter
+
+
+class GDocConverter(DocxConverter):
+    _registry_keys = ['gdoc']
+
+    def _find_doc(self, path, after=None):
+        count = 0
+        while count < 60:
+            count += 1
+            for filename in os.listdir(path):
+                if filename.endswith('.docx'):
+                    fpath = os.path.join(path, filename)
+                    if os.path.getmtime(fpath) > after:
+                        return fpath
+            time.sleep(1)
+        raise RuntimeError("Cannot find document.")
+
+    def from_file(self, url, download_path=None, **opts):
+        m = re.match('https://docs.google.com/document/d/(?P<doc_id>[^/]+)/', url)
+
+        if not m:
+            raise ValueError("Invalid Google Docs url.")
+
+        doc_id = m.group('doc_id')
+        download_url = "https://docs.google.com/document/d/{doc_id}/export?format=doc".format(doc_id=doc_id)
+
+        time_start = time.time()
+        webbrowser.open(download_url)
+
+        time.sleep(2)
+
+        download_path = download_path or os.path.expanduser('~/Downloads')
+        filename = self._find_doc(download_path, after=time_start)
+
+        DocxConverter.from_file(self, filename, **opts)
+
+        headers = self.kp.headers
+        if headers['title'].startswith('[]'):
+            headers['title'] = re.sub('\[\]\{[^\}]+\}', '', headers['title'])
+        if headers['subtitle'].startswith('[]'):
+            headers['subtitle'] = re.sub('\[\]\{[^\}]+\}', '', headers['subtitle'])
+        self.kp.update_headers(**headers)
diff --git a/knowledge_repo/converters/proxy.py b/knowledge_repo/converters/proxy.py
@@ -0,0 +1,20 @@
+import re
+
+from ..converter import KnowledgePostConverter
+
+
+class ProxyConverter(KnowledgePostConverter):
+
+    _registry_keys = ['proxy']
+
+    def from_file(self, url, **opts):
+
+        # Deal with special cases, whereby url should be mutated before being
+        # added to post headers.
+
+        # Google presentations should be embedded in "embed" mode.
+        gpres = re.match('^https://docs.google.com/presentation/d/(?P<pres_id>[^/]+)/edit(?P<slide_query>.*)$', url)
+        if gpres:
+            url = "https://docs.google.com/presentation/d/{}/embed{}".format(*gpres.groups())
+
+        self.kp_write("", headers={'proxy': url})