Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use xtractmime.mimegroups to determine response classes #2

Merged
merged 1 commit into from
Aug 2, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
25 changes: 23 additions & 2 deletions scrapy/responsetypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,15 @@

from xtractmime import RESOURCE_HEADER_BUFFER_LENGTH, extract_mime
from xtractmime._utils import contains_binary
from xtractmime.mimegroups import (
is_html_mime_type,
is_javascript_mime_type,
is_json_mime_type,
is_xml_mime_type,
)

from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.http import Response
from scrapy.http import HtmlResponse, Response, TextResponse, XmlResponse
from scrapy.utils.misc import load_object
from scrapy.utils.python import binary_is_text, to_bytes, to_unicode

Expand Down Expand Up @@ -136,6 +142,21 @@ def _guess_content_type(self, headers=None, url=None, filename=None):

return None

def _guess_response_type(self, mime_type):
    """Return the Response class that corresponds to ``mime_type``.

    ``mime_type`` is compared against a bytes prefix (``b'text/'``), so it
    is expected to be a bytes MIME type or a falsy value such as ``None``
    or ``b''``.

    Checks run in this order, first match wins:

    * falsy ``mime_type``: ``Response``
    * HTML MIME type: ``HtmlResponse``
    * XML MIME type: ``XmlResponse``
    * ``text/*``, JSON or JavaScript MIME type: ``TextResponse``
    * anything else: ``Response``
    """
    if mime_type:
        # The HTML and XML checks run before the generic textual check
        # below, so a MIME type such as text/html gets HtmlResponse
        # rather than the broader TextResponse.
        if is_html_mime_type(mime_type):
            return HtmlResponse
        if is_xml_mime_type(mime_type):
            return XmlResponse
        is_textual = (
            mime_type.startswith(b'text/')
            or is_json_mime_type(mime_type)
            or is_javascript_mime_type(mime_type)
        )
        if is_textual:
            return TextResponse
    # No MIME type at all, or one that matched none of the groups above.
    return Response

def from_args(self, headers=None, url=None, filename=None, body=None):
"""Guess the most appropriate Response class based on
the given arguments."""
Expand All @@ -156,7 +177,7 @@ def from_args(self, headers=None, url=None, filename=None, body=None):
http_origin = not url or urlparse(url).scheme in ("http", "https")
content_types = self._guess_content_type(headers=headers, url=url, filename=filename)
mime_type = extract_mime(body, content_types=content_types, http_origin=http_origin)
cls = self.from_mimetype(mime_type.decode())
cls = self._guess_response_type(mime_type)

return cls

Expand Down