'''
  This file is part of the PyPhantomJS project.

  Copyright (C) 2011 James Roe

  Big thanks to the Chromium Authors, as much of this code was derived
  from their own hard work. :)

  This program is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program.  If not, see <http://www.gnu.org/licenses/>.
'''


def _magic_number(type_, magic):
    '''Build a table entry that must match byte-for-byte at offset 0.

    A '.' inside ``magic`` is a wildcard that matches any single byte.
    '''
    return (type_, magic, False)


def _magic_string(type_, magic):
    '''Build a table entry that is searched for case-insensitively.'''
    return (type_, magic, True)


class Sniff(object):
    '''Guess the MIME type of a payload from (at most) its first 1024 bytes.

    After construction ``mime_type`` holds the sniffed type; the ``is*``
    properties classify it. ``data`` is normally a byte string (``str`` on
    Python 2); ``bytes``/``bytearray`` on Python 3 are accepted as well and
    are mapped 1:1 to characters via latin-1 so the tables below apply.
    '''

    # Every table entry is a (mime_type, magic, is_string) tuple.

    # NOTE(review): the entries of _magic_tags, _magic_Xml, _magic_images and
    # the first half of _magic_numbers were unreadable in the reviewed copy of
    # this file; they are restored here from Chromium's sniffing tables, which
    # this module says it is derived from -- verify against upstream.

    # HTML Tags
    _magic_tags = (
        # XML processing directive. Although this is not an HTML mime type, we sniff
        # for this in the HTML phase because text/xml is just as powerful as HTML.
        _magic_string('text/xml', '<?xml'),
        # DOCTYPEs
        _magic_string('text/html', '<!DOCTYPE html'),
        # Sniffable tags, ordered by how often they occur in sniffable documents.
        _magic_string('text/html', '<script'),
        _magic_string('text/html', '<html'),
        _magic_string('text/html', '<!--'),
        _magic_string('text/html', '<head'),
        _magic_string('text/html', '<iframe'),
        _magic_string('text/html', '<h1'),
        _magic_string('text/html', '<div'),
        _magic_string('text/html', '<font'),
        _magic_string('text/html', '<table'),
        _magic_string('text/html', '<a'),
        _magic_string('text/html', '<style'),
        _magic_string('text/html', '<title'),
        _magic_string('text/html', '<b'),
        _magic_string('text/html', '<body'),
        _magic_string('text/html', '<br'),
        _magic_string('text/html', '<p'),
    )

    # XML
    _magic_Xml = (
        _magic_string('application/xhtml+xml', '<html xmlns="http://www.w3.org/1999/xhtml"'),
        _magic_string('application/atom+xml', '<feed'),
        _magic_string('application/rss+xml', '<rss'),
    )

    # Images
    _magic_images = (
        # Source: HTML5 specification
        _magic_number('image/gif', 'GIF87a'),
        _magic_number('image/gif', 'GIF89a'),
        _magic_number('image/png', '\x89PNG\x0D\x0A\x1A\x0A'),
        _magic_number('image/jpeg', '\xFF\xD8\xFF'),
        _magic_number('image/bmp', 'BM'),
        # Source: Chrome
        _magic_number('image/tiff', 'I I'),
        _magic_number('image/tiff', 'II*'),
        _magic_number('image/tiff', 'MM\x00*'),
        _magic_number('image/webp', 'RIFF....WEBPVP8 '),  # trailing space is intended
    )

    # General magic numbers
    _magic_numbers = (
        # Source: HTML5 specification
        _magic_number('application/pdf', '%PDF-'),
        _magic_number('application/postscript', '%!PS-Adobe-'),
        # Source: Mozilla
        _magic_number('text/plain', '#!'),  # Script
        _magic_number('text/plain', '%!'),  # Script, similar to PS
        _magic_number('text/plain', 'From'),
        _magic_number('text/plain', '>From'),
        # Source: Chrome
        _magic_number('application/x-gzip', '\x1F\x8B\x08'),
        _magic_number('application/zip', 'PK\x03\x04'),
        _magic_number('application/x-rar-compressed', 'Rar!\x1A\x07\x00'),
        _magic_number('application/x-msmetafile', '\xD7\xCD\xC6\x9A'),
        _magic_number('application/octet-stream', 'MZ'),  # EXE
    )

    # Audio
    _magic_audio = (
        # Source: Chrome
        # :TODO: we don't handle partial byte matches yet
        # _magic_number('audio/mpeg', '\xFF\xE'),
        # _magic_number('audio/mpeg', '\xFF\xF'),
        _magic_number('audio/x-pn-realaudio', '\x2E\x52\x4D\x46'),
        _magic_number('audio/mpeg', 'ID3'),
    )

    # Video
    _magic_video = (
        # Source: Chrome
        # :TODO: we don't handle partial byte matches yet
        # _magic_number('video/mpeg', '\x00\x00\x01\xB'),
        _magic_number('video/x-ms-asf', '\x30\x26\xB2\x75\x8E\x66\xCF\x11\xA6\xD9\x00\xAA\x00\x62\xCE\x6C'),
        _magic_number('video/webm', '\x1A\x45\xDF\xA3'),
    )

    # Byte order marks
    _magic_Bom = (
        _magic_number('text/plain', '\xFE\xFF'),  # UTF-16BE
        _magic_number('text/plain', '\xFF\xFE'),  # UTF-16LE
        _magic_number('text/plain', '\xEF\xBB\xBF'),  # UTF-8
    )

    # Bytes that look like they are part of binary content.
    # Source: HTML5 specification. Same set the former 256-entry lookup table
    # encoded: 0x00-0x08, 0x0B, 0x0E-0x1A and 0x1C-0x1F.
    _binary_chars = frozenset(
        chr(code)
        for code in (list(range(0x00, 0x09)) + [0x0B] +
                     list(range(0x0E, 0x1B)) + list(range(0x1C, 0x20)))
    )

    def __init__(self, data):
        '''Sniff ``data`` and store the result in ``self.mime_type``.'''
        # Use only 1024 bytes at maximum to compare against
        data = data[:1024]

        # Python 3 byte containers: map every byte to the character with the
        # same ordinal so that the '\xNN' literals in the tables still match.
        if isinstance(data, (bytes, bytearray)) and not isinstance(data, str):
            data = bytes(data).decode('latin-1')

        # Phases run in order and the first table with a hit wins.
        phases = (
            (data.lstrip(), self._magic_tags),  # HTML; leading whitespace skipped
            (data, self._magic_Xml),
            (data, self._magic_images),
            (data, self._magic_numbers),
            (data, self._magic_audio),
            (data, self._magic_video),
            # if we have a BOM, buffer is probably not binary content
            (data, self._magic_Bom),
        )
        for candidate, table in phases:
            if self._check_for_magic_numbers(candidate, table):
                return

        # Binary sniffer; one binary-looking byte is enough. Characters
        # outside the byte range are simply "not binary" instead of an error.
        if any(char in self._binary_chars for char in data):
            self.mime_type = 'application/octet-stream'
        else:
            # fall back to text/plain
            self.mime_type = 'text/plain'

    # Magic helper methods

    def _check_for_magic_numbers(self, data, magic):
        '''Return True (and set ``mime_type``) on the first entry of ``magic`` matching ``data``.'''
        for magic_entry in magic:
            if self._match_magic_number(data, magic_entry):
                return True
        return False

    def _match_magic_number(self, data, magic_entry):
        '''Test ``data`` against one table entry; set ``mime_type`` on a match.

        String entries match anywhere in ``data``, ignoring case. Number
        entries must match at offset 0, with '.' as a single-byte wildcard;
        data shorter than the signature never matches.
        '''
        mime_type, magic, is_string = magic_entry

        if is_string:
            # NOTE(review): a substring search (kept from the original) can
            # flag binary data that merely contains e.g. '<p' as HTML --
            # confirm this is wanted before anchoring it to the start.
            match = magic.lower() in data.lower()
        else:
            match = len(data) >= len(magic) and all(
                expected == '.' or expected == actual
                for expected, actual in zip(magic, data)
            )

        if match:
            self.mime_type = mime_type
        return match

    def _is_listed_in(self, table):
        '''Return True when the sniffed type is one of the types ``table`` can yield.'''
        return any(self.mime_type == magic_entry[0] for magic_entry in table)

    # Public methods

    @property
    def isAudio(self):
        '''True when the sniffed type appears in the audio table.'''
        return self._is_listed_in(self._magic_audio)

    @property
    def isBinary(self):
        '''True when the sniffed type is application/octet-stream.'''
        return self.mime_type == 'application/octet-stream'

    @property
    def isHtml(self):
        '''True when the sniffed type appears in the tag table (this includes text/xml).'''
        return self._is_listed_in(self._magic_tags)

    @property
    def isImage(self):
        '''True when the sniffed type appears in the image table.'''
        return self._is_listed_in(self._magic_images)

    @property
    def isText(self):
        '''True when the sniffed type is text/plain.'''
        return self.mime_type == 'text/plain'

    @property
    def isVideo(self):
        '''True when the sniffed type appears in the video table.'''
        return self._is_listed_in(self._magic_video)

    @property
    def isXml(self):
        '''True when the sniffed type appears in the XML table.'''
        return self._is_listed_in(self._magic_Xml)
def handleUnsupportedContent(self, reply):
    '''Sniff a reply the page reported as unsupported and load it into its frame.

    Slot for the page's ``unsupportedContent`` signal. A reply announced as
    an attachment is aborted. Otherwise the first received chunk is sniffed:
    text, HTML, XML and image payloads are pushed into the frame that
    requested them once the reply has finished; anything else is aborted and,
    for the main frame, reported through ``loadFinished(False)``.
    '''
    # NOTE(review): __init__ creates this dict with
    # `self.m_paperSize = self.m_replies = {}`, so both names share ONE dict
    # and every reply stored here also shows up in m_paperSize -- give
    # m_replies its own dict there.

    def _isMainFrameReply():
        return self.m_mainFrame.requestedUrl() == reply.url()

    def _loopFrames(frame):
        # Snapshot the items so a re-entrant signal cannot change the dict
        # while it is being iterated.
        for pending, mime_type in list(self.m_replies.items()):
            if frame.requestedUrl() == pending.url():
                # NOTE(review): body() is not part of QNetworkReply; presumably
                # the reply is a buffering proxy -- confirm.
                frame.setContent(pending.body(), mime_type, pending.url())
            else:
                # A separate name keeps `frame` intact for the remaining
                # replies (it used to be rebound by this loop).
                for child in frame.childFrames():
                    _loopFrames(child)

    def _onReady():
        # readyRead fires for every chunk, but only the first chunk holds the
        # leading bytes the sniffer needs. Running once also avoids stacking
        # duplicate loadFinished/finished connections.
        reply.readyRead.disconnect(_onReady)

        sniffedReply = Sniff(str(reply.readAll()))

        # reconnect mainFrame signal
        if _isMainFrameReply():
            self.m_webPage.loadFinished.connect(self.finish)

        valid_type = (sniffedReply.isText or sniffedReply.isHtml or
                      sniffedReply.isXml or sniffedReply.isImage)
        if valid_type:
            # Only replies that will actually be rendered are registered.
            self.m_replies[reply] = sniffedReply.mime_type
            reply.finished.connect(lambda: _loopFrames(self.m_mainFrame))
            return

        # :TODO: file download implementation. In the meantime, abort the reply,
        # and send the failed signal
        reply.abort()
        if _isMainFrameReply():
            self.m_webPage.loadFinished.emit(False)

    # make sure it's not a file we should download instead; the header value
    # may carry parameters ("attachment; filename=...") and any letter case
    disposition = str(reply.rawHeader('Content-Disposition')).strip().lower()
    if disposition.startswith('attachment'):
        # :TODO: file download implementation. In the meantime, abort the reply
        reply.abort()
        return

    # if no 'Content-Type' header is set in mainFrame,
    # ignore loadFinished until the reply is done
    if _isMainFrameReply():
        self.m_webPage.loadFinished.disconnect(self.finish)
    reply.readyRead.connect(_onReady)
QPalette palette = m_webPage->palette(); @@ -151,6 +153,48 @@ void WebPage::setNetworkAccessManager(QNetworkAccessManager *networkAccessManage SIGNAL(resourceReceived(QVariant))); } +void WebPage::loopFrames(QWebFrame * frame) +{ + QNetworkReply *r; + foreach(r, replies) { + if(frame->requestedUrl() == r->url()) { + NetworkReplyProxy *nrp = qobject_cast(r); + frame->setHtml(nrp->body(), r->url()); + } else { + QWebFrame *f; + foreach(f, frame->childFrames()) { + loopFrames(f); + } + } + } +} + +void WebPage::unsupportedFinish() +{ + // Reconnect mainFrame signal + if(m_mainFrame->requestedUrl() == replies.last()->url()) + { + connect(m_webPage, SIGNAL(loadFinished(bool)), this, SLOT(finish(bool))); + } + loopFrames(m_mainFrame); +} + +void WebPage::handleUnsupportedContent(QNetworkReply *reply) +{ + // Make sure it's not a file we should download instead + if(reply->rawHeader("Content-Disposition") != "attachment") + { + replies << reply; + // If no 'Content-Type' header is set in mainFrame, + // ignore loadFinished until the reply is done + if(m_mainFrame->requestedUrl() == reply->url()) + { + disconnect(m_webPage, SIGNAL(loadFinished(bool)), this, SLOT(finish(bool))); + } + connect(reply, SIGNAL(finished()), SLOT(unsupportedFinish())); + } +} + QString WebPage::content() const { return m_mainFrame->toHtml(); diff --git a/src/webpage.h b/src/webpage.h index bac6864f7..f8b2e302b 100644 --- a/src/webpage.h +++ b/src/webpage.h @@ -34,6 +34,7 @@ #include #include #include +#include class CustomPage; class Phantom; @@ -95,6 +96,10 @@ public slots: private slots: void finish(bool ok); + void unsupportedFinish(); + +protected slots: + void handleUnsupportedContent(QNetworkReply *reply); private: CustomPage *m_webPage; @@ -103,10 +108,12 @@ private slots: QPoint m_scrollPosition; QVariantMap m_paperSize; // For PDF output via render() QString m_libraryPath; + QList replies; QImage renderImage(); bool renderPdf(const QString &fileName); void applySettings(const QVariantMap 
&defaultSettings); + void loopFrames(QWebFrame * frame); QString userAgent() const; void emitAlert(const QString &msg);