Skip to content

Commit

Permalink
fixed: filenames are corrupted
Browse files Browse the repository at this point in the history
Most servers now output filenames in UTF-8 URL-encoded. But FlexGet
decodes to latin-1.
Let's allow to define filename encoding with a config 'encoding'.
  • Loading branch information
ashumkin committed Apr 4, 2016
1 parent 32d410e commit 900bd35
Showing 1 changed file with 14 additions and 12 deletions.
26 changes: 14 additions & 12 deletions flexget/plugins/output/download.py 100755 → 100644
Expand Up @@ -58,6 +58,7 @@ class PluginDownload(object):
'type': 'object',
'properties': {
'path': {'type': 'string', 'format': 'path'},
'encoding': {'type': 'string'},
'fail_html': {'type': 'boolean', 'default': True},
'overwrite': {'type': 'boolean', 'default': False},
'temp': {'type': 'string', 'format': 'path'}
Expand All @@ -78,6 +79,7 @@ def process_config(self, config):
if not config.get('path'):
config['require_path'] = True
config.setdefault('fail_html', True)
config.setdefault('encoding', 'latin1')
return config

def on_task_download(self, task, config):
Expand All @@ -87,10 +89,10 @@ def on_task_download(self, task, config):
tmp = config.get('temp', os.path.join(task.manager.config_base, 'temp'))

self.get_temp_files(task, require_path=config.get('require_path', False), fail_html=config['fail_html'],
tmp_path=tmp)
tmp_path=tmp, encoding=config['encoding'])

def get_temp_file(self, task, entry, require_path=False, handle_magnets=False, fail_html=True,
tmp_path=tempfile.gettempdir()):
tmp_path=tempfile.gettempdir(), encoding='latin1'):
"""
Download entry content and store in temporary folder.
Fails entry with a reason if there was problem.
Expand Down Expand Up @@ -125,7 +127,7 @@ def get_temp_file(self, task, entry, require_path=False, handle_magnets=False, f
# Don't fail here, there might be a magnet later in the list of urls
log.debug('Skipping url %s because there is no path for download' % url)
continue
error = self.process_entry(task, entry, url, tmp_path)
error = self.process_entry(task, entry, url, tmp_path, encoding)

# disallow html content
html_mimes = ['html', 'text/html']
Expand Down Expand Up @@ -158,7 +160,7 @@ def save_error_page(self, entry, task, page):
outfile.write(page)

def get_temp_files(self, task, require_path=False, handle_magnets=False, fail_html=True,
tmp_path=tempfile.gettempdir()):
tmp_path=tempfile.gettempdir(), encoding='latin1'):
"""Download all task content and store in temporary folder.
:param bool require_path:
Expand All @@ -172,10 +174,10 @@ def get_temp_files(self, task, require_path=False, handle_magnets=False, fail_ht
path to use for temporary files while downloading
"""
for entry in task.accepted:
self.get_temp_file(task, entry, require_path, handle_magnets, fail_html, tmp_path)
self.get_temp_file(task, entry, require_path, handle_magnets, fail_html, tmp_path, encoding)

# TODO: a bit silly method, should be get rid of now with simplier exceptions ?
def process_entry(self, task, entry, url, tmp_path):
def process_entry(self, task, entry, url, tmp_path, encoding):
"""
Processes `entry` by using `url`. Does not use entry['url'].
Does not fail the `entry` if there is a network issue, instead just logs and returns a string error.
Expand All @@ -192,7 +194,7 @@ def process_entry(self, task, entry, url, tmp_path):
else:
if not task.manager.unit_test:
log.info('Downloading: %s' % entry['title'])
self.download_entry(task, entry, url, tmp_path)
self.download_entry(task, entry, url, tmp_path, encoding)
except RequestException as e:
log.warning('RequestException %s, while downloading %s' % (e, url))
return 'Network error during request: %s' % e
Expand All @@ -213,7 +215,7 @@ def process_entry(self, task, entry, url, tmp_path):
log.debug(msg, exc_info=True)
return msg

def download_entry(self, task, entry, url, tmp_path):
def download_entry(self, task, entry, url, tmp_path, encoding):
"""Downloads `entry` by using `url`.
:raises: Several types of exceptions ...
Expand Down Expand Up @@ -305,7 +307,7 @@ def download_entry(self, task, entry, url, tmp_path):
# prefer content-disposition naming, note: content-disposition can be disabled completely
# by setting entry field `content-disposition` to False
if entry.get('content-disposition', True):
self.filename_from_headers(entry, response)
self.filename_from_headers(entry, response, encoding)
else:
log.info('Content-disposition disabled for %s' % entry['title'])
self.filename_ext_from_mime(entry)
Expand All @@ -316,7 +318,7 @@ def download_entry(self, task, entry, url, tmp_path):
entry['filename'] = filename
log.debug('Finishing download_entry() with filename %s' % entry.get('filename'))

def filename_from_headers(self, entry, response):
def filename_from_headers(self, entry, response, encoding):
"""Checks entry filename if it's found from content-disposition"""
if not response.headers.get('content-disposition'):
# No content disposition header, nothing we can do
Expand All @@ -326,8 +328,8 @@ def filename_from_headers(self, entry, response):
if filename:
# try to decode to unicode, specs allow latin1, some may do utf-8 anyway
try:
filename = filename.decode('latin1')
log.debug('filename header latin1 decoded')
filename = filename.decode(encoding)
log.debug('filename header %s decoded' % encoding)
except UnicodeError:
try:
filename = filename.decode('utf-8')
Expand Down

0 comments on commit 900bd35

Please sign in to comment.