Skip to content

Commit

Permalink
Merge cfcf96c into 2201745
Browse files Browse the repository at this point in the history
  • Loading branch information
matthewwardrop committed Oct 4, 2018
2 parents 2201745 + cfcf96c commit 821c489
Show file tree
Hide file tree
Showing 4 changed files with 129 additions and 3 deletions.
13 changes: 10 additions & 3 deletions knowledge_repo/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,16 @@

def get_format(filename, format=None):
    """Return the converter format key to use for `filename`.

    An explicitly supplied `format` is returned unchanged. Otherwise the
    format is inferred from `filename`:

    * Google Docs document URLs map to ``'gdoc'``;
    * any other ``http://`` or ``https://`` URL maps to ``'proxy'``;
    * anything else uses the file extension, without its leading dot.

    Args:
        filename: Path or URL of the source document.
        format: Optional format key that overrides auto-detection.

    Returns:
        The format key as a string.

    Raises:
        RuntimeError: If no format was given and `filename` is neither a
            recognised URL nor a name with a non-empty extension.
    """
    if format is not None:
        return format

    # The Google Docs prefix must be tested before the generic URL prefixes,
    # because every Google Docs URL is also an https URL.
    if filename.startswith('https://docs.google.com/document/d/'):
        return 'gdoc'
    if filename.startswith(('http://', 'https://')):
        return 'proxy'

    # Test the extension itself rather than "is there a dot anywhere": a dot
    # in a directory name ('my.dir/post') or a trailing dot ('post.') yields
    # no usable extension and must not produce an empty format.
    extension = os.path.splitext(filename)[1][1:]
    if not extension:
        raise RuntimeError("Unable to determine a format automatically. Please manually specify the format, and try again.")
    return extension


Expand Down
46 changes: 46 additions & 0 deletions knowledge_repo/converters/docx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import os
import re
import shutil
import tempfile

from ..converter import KnowledgePostConverter


class DocxConverter(KnowledgePostConverter):
    """Convert a Word ``.docx`` document into a knowledge post using pandoc."""

    _registry_keys = ['docx']

    # Matches a markdown image immediately followed by a pandoc attribute
    # block, e.g. ``![alt](media/image1.png){width="5in" height="3in"}``.
    # Group 1 captures the bare image reference without the attributes.
    _IMAGE_WITH_ATTRIBUTES = re.compile(r'(!\[[^\]]*\]\([^)]+\))\{[^}]+\}')

    @property
    def dependencies(self):
        # Dependencies required for this converter on top of core knowledge-repo dependencies
        return ['pypandoc']

    def from_file(self, filename, **opts):
        """Convert the docx file at `filename` to markdown and write it to the post.

        Args:
            filename: Path of the ``.docx`` file to convert.
            **opts: Accepted for interface compatibility; not used here.
        """
        # Imported at call time so that this module can be imported even
        # when the optional pypandoc dependency is not installed.
        import io
        import pypandoc

        wd = tempfile.mkdtemp()
        target_file = os.path.join(wd, 'post.md')
        try:
            pypandoc.convert_file(
                filename,
                format='docx',
                to='markdown',
                outputfile=target_file,
                extra_args=[
                    '--standalone',
                    '--wrap=none',
                    '--extract-media={}'.format(wd),
                ]
            )

            # Pandoc writes UTF-8; decode explicitly rather than relying on
            # the platform's default encoding.
            with io.open(target_file, encoding='utf-8') as f:
                md = f.read()

            # Image embeddings exported from docx files have fixed sizes in inches
            # which browsers do not understand. We remove these annotations
            # (for images with or without alt text).
            md = self._IMAGE_WITH_ATTRIBUTES.sub(r'\1', md)

            # Write markdown content to knowledge post (images will be extracted later)
            self.kp_write(md)
        finally:
            # The working directory holds the intermediate markdown and the
            # extracted media; always remove it, even if conversion failed.
            shutil.rmtree(wd)
53 changes: 53 additions & 0 deletions knowledge_repo/converters/gdoc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import os
import re
import sys
import time

import webbrowser

import knowledge_repo
import subprocess

from .docx import DocxConverter


class GDocConverter(DocxConverter):
    """Convert a Google Docs document by downloading its Word export.

    The export URL is opened in the user's web browser; the resulting file
    is then looked up in a download directory and handed to `DocxConverter`.
    """

    _registry_keys = ['gdoc']

    # Captures the document id from a Google Docs URL. The id ends at the
    # next '/', '?', '#' or at the end of the URL.
    _DOC_URL = re.compile(r'https://docs\.google\.com/document/d/(?P<doc_id>[^/?#]+)')

    # ``[]{...}``: an empty bracket pair followed by a brace-delimited block.
    _EMPTY_SPAN = re.compile(r'\[\]\{[^}]+\}')

    def _find_doc(self, path, after=None, attempts=60, interval=1):
        """Poll `path` until a ``.docx`` file newer than `after` appears.

        Args:
            path: Directory to scan.
            after: Timestamp (seconds since the epoch); only files modified
                strictly later are accepted. ``None`` accepts any file.
            attempts: Number of directory scans before giving up.
            interval: Seconds to sleep between scans.

        Returns:
            The full path of the first matching file found.

        Raises:
            RuntimeError: If no matching file appears within `attempts` scans.
        """
        for _ in range(attempts):
            for filename in os.listdir(path):
                if not filename.endswith('.docx'):
                    continue
                fpath = os.path.join(path, filename)
                # Comparing against None is a TypeError on Python 3, so the
                # "no lower bound" case is handled explicitly.
                if after is None or os.path.getmtime(fpath) > after:
                    return fpath
            time.sleep(interval)
        raise RuntimeError("Cannot find document.")

    def from_file(self, url, download_path=None, **opts):
        """Download the Google Doc at `url` and convert it to a knowledge post.

        Args:
            url: A ``https://docs.google.com/document/d/<id>...`` URL.
            download_path: Directory the browser saves downloads to;
                defaults to ``~/Downloads``.
            **opts: Forwarded to `DocxConverter.from_file`.

        Raises:
            ValueError: If `url` is not a Google Docs document URL.
            RuntimeError: If the downloaded file cannot be found.
        """
        m = self._DOC_URL.match(url)
        if not m:
            raise ValueError("Invalid Google Docs url.")

        # NOTE(review): the export requests `format=doc` while `_find_doc`
        # only looks for `.docx` files -- confirm the export really yields .docx.
        download_url = "https://docs.google.com/document/d/{doc_id}/export?format=doc".format(doc_id=m.group('doc_id'))

        # Record the time before triggering the download so that only files
        # written afterwards are picked up.
        time_start = time.time()
        webbrowser.open(download_url)

        time.sleep(2)

        download_path = download_path or os.path.expanduser('~/Downloads')
        filename = self._find_doc(download_path, after=time_start)

        DocxConverter.from_file(self, filename, **opts)

        # Strip `[]{...}` markers from the title and subtitle when the value
        # starts with `[]`. Either header may be absent (e.g. a document
        # without a subtitle), in which case it is left alone.
        headers = self.kp.headers
        for key in ('title', 'subtitle'):
            value = headers.get(key)
            if value is not None and value.startswith('[]'):
                headers[key] = self._EMPTY_SPAN.sub('', value)
        self.kp.update_headers(**headers)
20 changes: 20 additions & 0 deletions knowledge_repo/converters/proxy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import re

from ..converter import KnowledgePostConverter


class ProxyConverter(KnowledgePostConverter):
    """Create a post with an empty body whose ``proxy`` header holds a URL."""

    _registry_keys = ['proxy']

    def from_file(self, url, **opts):
        """Write an empty post whose ``proxy`` header is `url`.

        Some URLs are rewritten before being stored in the header; see the
        special case below. `opts` is accepted but not used.
        """
        proxied_url = url

        # Special case: a Google presentation "edit" link is turned into the
        # corresponding "embed" link, keeping whatever followed "/edit".
        match = re.match('^https://docs.google.com/presentation/d/(?P<pres_id>[^/]+)/edit(?P<slide_query>.*)$', url)
        if match is not None:
            pres_id, slide_query = match.group('pres_id', 'slide_query')
            proxied_url = "https://docs.google.com/presentation/d/{}/embed{}".format(pres_id, slide_query)

        self.kp_write("", headers={'proxy': proxied_url})

0 comments on commit 821c489

Please sign in to comment.