Skip to content

Commit

Permalink
fix for target check for all formats
Browse files Browse the repository at this point in the history
Move the target check out of the individual format classes and run it on the final parsed JSON, so that it works regardless of the input content type.
  • Loading branch information
aaronpk committed Nov 16, 2019
1 parent 1213ee0 commit 989d42a
Show file tree
Hide file tree
Showing 6 changed files with 220 additions and 55 deletions.
2 changes: 1 addition & 1 deletion controllers/Parse.php
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ public function parse(Request $request, Response $response) {
$data = [
'data' => $parsed['data'],
'url' => $result['url'],
'code' => $result['code']
'code' => $result['code'],
];
if(isset($parsed['info']))
$data['info'] = $parsed['info'];
Expand Down
32 changes: 0 additions & 32 deletions lib/XRay/Formats/Format.php
Original file line number Diff line number Diff line change
Expand Up @@ -104,36 +104,4 @@ protected static function stripHTML($html) {
return trim(str_replace(['<br>','<br />'],"\n", $sanitized));
}

/**
 * Collects the attribute values in the document that equal $target.
 *
 * Inspects the href of every <a> element and the src of every <img>,
 * <video> and <audio> element, comparing each value loosely (==) to $target.
 *
 * @param mixed $xpath  XPath object handed on to xPathFindNodeWithAttribute()
 * @param mixed $target The URL to look for
 * @return array Matching attribute values as keys, each mapped to null; empty if nothing matched
 */
protected static function findLinksInDocument(&$xpath, $target) {
  $matches = [];

  // One callback shared by every element/attribute pair below
  $collect = function($candidate) use($target, &$matches){
    if($candidate == $target) {
      $matches[$candidate] = null;
    }
  };

  $sources = [
    ['a', 'href'],
    ['img', 'src'],
    ['video', 'src'],
    ['audio', 'src'],
  ];
  foreach($sources as $pair) {
    self::xPathFindNodeWithAttribute($xpath, $pair[0], $pair[1], $collect);
  }

  return $matches;
}

/**
 * Invokes $callback with the value of $attr for every $node element in the
 * document that carries that attribute.
 *
 * @param mixed    $xpath    Object whose query() method is run with the built expression
 * @param string   $node     Element name to search for, e.g. "a"
 * @param string   $attr     Attribute that must be present, e.g. "href"
 * @param callable $callback Called once per matching element with the attribute value
 */
public static function xPathFindNodeWithAttribute($xpath, $node, $attr, $callback) {
  $expression = '//'.$node.'[@'.$attr.']';
  $elements = $xpath->query($expression);
  foreach($elements as $element) {
    $callback($element->getAttribute($attr));
  }
}

}
23 changes: 2 additions & 21 deletions lib/XRay/Formats/HTML.php
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ public static function parse($http, $http_response, $opts=[]) {
],
'url' => $url,
'code' => $http_response['code'],
'html' => $html,
];

// attempt to parse the page as HTML
Expand All @@ -45,26 +46,6 @@ public static function parse($http, $http_response, $opts=[]) {
}
}

// If a target parameter was provided, make sure a link to it exists on the page
if(isset($opts['target'])) {
$target = $opts['target'];

$found = [];
if($target) {
$found = self::findLinksInDocument($xpath, $target);
}

if(!$found) {
return [
'error' => 'no_link_found',
'error_description' => 'The source document does not have a link to the target URL',
'code' => isset($result['code']) ? $result['code'] : 200,
'url' => $url,
'debug' => $result
];
}
}

// If the URL has a fragment ID, find the DOM starting at that node and parse it instead
$fragment = parse_url($url, PHP_URL_FRAGMENT);
if($fragment) {
Expand Down Expand Up @@ -108,7 +89,7 @@ public static function parse($http, $http_response, $opts=[]) {
]);
// Skip and fall back to parsing the HTML if anything about this request fails
if(!$jsonpage['error'] && $jsonpage['body']) {
$jsondata = json_decode($jsonpage['body'],true);
$jsondata = json_decode($jsonpage['body'], true);
if($jsondata) {
$jsonpage['body'] = $jsondata;
$data = Formats\Mf2::parse($jsonpage, $http, $opts);
Expand Down
122 changes: 121 additions & 1 deletion lib/XRay/Parser.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
namespace p3k\XRay;

use p3k\XRay\Formats;
use DOMDocument, DOMXPath;

class Parser {
private $http;
Expand All @@ -11,6 +12,42 @@ public function __construct($http) {
}

/**
 * Parses the response with parse_document() and, when a target option is
 * given, verifies that the parsed result links to that target.
 *
 * @param mixed $http_response Passed through to parse_document()
 * @param array $opts          Options; a non-empty 'target' enables the link check
 * @return array The parsed document without its 'html' key, or a
 *               'no_link_found' error array when the target was not found
 */
public function parse($http_response, $opts=[]) {
  $document = $this->parse_document($http_response, $opts);

  // Only check for the target when parsing succeeded and a target was actually provided
  if(!isset($document['error']) && !empty($opts['target'])) {
    $target = $opts['target'];
    $unparsed = isset($document['data']['type']) && $document['data']['type'] == 'unknown';

    if(!$unparsed) {
      // Search the parsed data for the target
      $linked = $this->_findLinkInTree($target, $document['data']);
    } elseif(isset($document['html'])) {
      // Couldn't parse the page, so check for the link manually assuming HTML content
      $linked = $this->_findLinkInHTML($target, $document['html']);
    } else {
      // Unparsed result with no raw HTML to inspect: treated as not linking to the target
      $linked = false;
    }

    if(!$linked) {
      $status = isset($document['code']) ? $document['code'] : 200;
      return [
        'error' => 'no_link_found',
        'error_description' => 'The source document does not have a link to the target URL',
        'code' => $status,
        'url' => $document['url'],
        'debug' => $document['data']
      ];
    }
  }

  // The document may carry the full HTML under the 'html' key for the target check above,
  // but we don't want to return that in the output, so remove it here
  unset($document['html']);

  return $document;
}

public function parse_document($http_response, $opts=[]) {
if(isset($opts['timeout']))
$this->http->set_timeout($opts['timeout']);
if(isset($opts['max_redirects']))
Expand Down Expand Up @@ -46,8 +83,15 @@ public function parse($http_response, $opts=[]) {
$body = $http_response['body'];

// Check if an mf2 JSON object was passed in
if(is_array($body) && isset($body['items'][0]['type']) && isset($body['items'][0]['properties'])) {
if(is_array($body) && isset($body['items']) && isset($body['rels']) && isset($body['rel-urls'])) {
$data = Formats\Mf2::parse($http_response, $this->http, $opts);
if($data == false) {
$data = [
'data' => [
'type' => 'unknown',
]
];
}
$data['source-format'] = 'mf2+json';
return $data;
}
Expand Down Expand Up @@ -96,4 +140,80 @@ public function parse($http_response, $opts=[]) {
return $data;
}

/**
 * Recursively searches a parsed document tree for the given link.
 *
 * Scalar leaves are compared to the link as strings. String values stored
 * under an 'html' key are parsed as HTML and searched for matching
 * elements via _findLinkInHTML(). Arrays are walked recursively.
 *
 * @param string $link     The URL to look for
 * @param mixed  $document The tree (or subtree, or leaf value) to search
 * @return bool True if the link was found anywhere in the tree
 * @throws \Exception If a non-empty value is neither a string, a number nor an array
 */
private function _findLinkInTree($link, $document) {
  // Empty values (null, false, 0, '', []) can never contain the link
  if(!$document)
    return false;

  if(is_string($document) || is_numeric($document)) {
    // Strict string comparison: loose == would treat numeric-looking
    // strings such as "1e3" and "1000" as equal
    return (string)$document === (string)$link;
  }

  if(is_array($document)) {
    foreach($document as $key=>$value) {
      if($key === 'html' && is_string($value)) {
        $found = $this->_findLinkInHTML($link, $value);
      } else {
        // Also covers a non-string value stored under an 'html' key, which is walked like any other subtree
        $found = $this->_findLinkInTree($link, $value);
      }
      if($found) {
        return true;
      }
    }
    return false;
  }

  // Fully qualified: this file is namespaced, so a bare "Exception" would
  // resolve to a class in the current namespace rather than the global one
  throw new \Exception('Unexpected value in tree');
}

/**
 * Parses a string as HTML and looks for elements that link to $link.
 *
 * @param string $link The URL to look for
 * @param mixed  $html The HTML source to search; anything but a non-blank string yields false
 * @return array|false The result of _findLinksInDOMDocument() (empty when
 *                     nothing matched), or false if the HTML could not be loaded
 */
private function _findLinkInHTML($link, $html) {
  // Nothing to search. This also avoids handing loadHTML() an empty string,
  // which throws a ValueError on PHP 8 that "@" does not suppress.
  if(!is_string($html) || trim($html) === '')
    return false;

  $encoded = self::_toHtmlEntities($html);
  if(!is_string($encoded) || $encoded === '')
    return false;

  $doc = new DOMDocument();
  // "@" silences the warnings libxml emits for malformed markup; the return
  // value still tells us whether the document could be loaded at all
  $loaded = @$doc->loadHTML($encoded);
  if(!$loaded)
    return false;

  $xpath = new DOMXPath($doc);

  return self::_findLinksInDOMDocument($xpath, $link);
}

/**
 * Collects the attribute values in the document that are identical to $target.
 *
 * Inspects the href of every <a> element and the src of every <img>,
 * <video> and <audio> element.
 *
 * @param DOMXPath $xpath  XPath object for the document to search
 * @param string   $target The URL to look for
 * @return array Matching attribute values as keys, each mapped to null; empty if nothing matched
 */
private static function _findLinksInDOMDocument(&$xpath, $target) {
  $found = [];
  $wanted = (string)$target;

  // Shared by every element/attribute pair. Uses a strict comparison so that
  // numeric-looking values such as "1e3" and "1000" are not treated as equal.
  $collect = function($u) use($wanted, &$found){
    if($u === $wanted) {
      $found[$u] = null;
    }
  };

  $sources = [
    ['a', 'href'],
    ['img', 'src'],
    ['video', 'src'],
    ['audio', 'src'],
  ];
  foreach($sources as $source) {
    self::_xPathFindNodeWithAttribute($xpath, $source[0], $source[1], $collect);
  }

  return $found;
}

/**
 * Invokes $callback with the value of $attr for every $node element in the
 * document that carries that attribute.
 *
 * @param mixed    $xpath    Object whose query() method is run with the built expression
 * @param string   $node     Element name to search for, e.g. "a"
 * @param string   $attr     Attribute that must be present, e.g. "href"
 * @param callable $callback Called once per matching element with the attribute value
 */
private static function _xPathFindNodeWithAttribute($xpath, $node, $attr, $callback) {
  $expression = '//'.$node.'[@'.$attr.']';
  $elements = $xpath->query($expression);
  foreach($elements as $element) {
    $callback($element->getAttribute($attr));
  }
}

/**
 * Rewrites every non-ASCII character of $input as a numeric HTML entity.
 *
 * ASCII characters, including markup such as "<" and "&", are left untouched.
 *
 * @param string $input Text in any encoding that mb_detect_encoding() recognises
 * @return string The input with all non-ASCII characters replaced by numeric entities
 */
private static function _toHtmlEntities($input) {
  // mb_detect_encoding() returns false when it cannot tell; assume UTF-8 then
  // instead of passing false on as the source encoding
  $encoding = mb_detect_encoding($input);
  if(!$encoding) {
    $encoding = 'UTF-8';
  }

  // The 'HTML-ENTITIES' target of mb_convert_encoding() is deprecated since
  // PHP 8.2; normalise to UTF-8 and encode everything from U+0080 upwards instead
  $utf8 = mb_convert_encoding($input, 'UTF-8', $encoding);
  return mb_encode_numericentity($utf8, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');
}

}
30 changes: 30 additions & 0 deletions tests/LibraryTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,34 @@ public function testInputIsParsedMf2HCard() {
$this->assertEquals('Barnaby Walters', $data['data']['name']);
}

/**
 * Runs an HTML snippet through Mf2\parse(), passes the result to
 * XRay::process(), and expects the resulting data type to be "unknown".
 */
public function testNoHEntryMarkupMF2JSON() {
  $sourceUrl = 'http://example.com/';
  $markup = '<p><a href="http://target.example.com/">Target</a></p>';
  $parsedMf2 = Mf2\parse($markup, $sourceUrl);

  $processor = new p3k\XRay();
  $result = $processor->process($sourceUrl, $parsedMf2);

  $this->assertEquals('unknown', $result['data']['type']);
}

/**
 * Passes an HTML snippet to XRay::parse() and expects the resulting data
 * type to be "unknown".
 */
public function testNoHEntryMarkup() {
  $sourceUrl = 'http://example.com/';
  $markup = '<p><a href="http://target.example.com/">Target</a></p>';

  $parser = new p3k\XRay();
  $result = $parser->parse($sourceUrl, $markup);

  $this->assertEquals('unknown', $result['data']['type']);
}

/**
 * Passes an HTML snippet to XRay::parse() together with a target that the
 * snippet links to. Expects type "unknown", no error, and no 'html' key in
 * the result.
 */
public function testNoHEntryMarkupWithTarget() {
  $sourceUrl = 'http://example.com/';
  $markup = '<p><a href="http://target.example.com/">Target</a></p>';
  $options = ['target' => 'http://target.example.com/'];

  $parser = new p3k\XRay();
  $result = $parser->parse($sourceUrl, $markup, $options);

  $this->assertEquals('unknown', $result['data']['type']);
  $this->assertArrayNotHasKey('error', $result);
  $this->assertArrayNotHasKey('html', $result);
}

}
66 changes: 66 additions & 0 deletions tests/ParseTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,19 @@ public function testTargetFound() {
$this->assertObjectNotHasAttribute('error', $data);
}

/**
 * Requests the Atom fixture URL with a target and expects a
 * 'no_link_found' error that reports code 200 and the requested URL.
 */
public function testTargetNotFoundInXML() {
  $feedUrl = 'http://feed.example.com/atom';
  $response = $this->parse([
    'url' => $feedUrl,
    'target' => 'http://example.net',
  ]);

  $payload = $response->getContent();
  $this->assertEquals(200, $response->getStatusCode());

  $decoded = json_decode($payload);
  $this->assertObjectHasAttribute('error', $decoded);
  $this->assertEquals('no_link_found', $decoded->error);
  $this->assertEquals('200', $decoded->code);
  $this->assertEquals($feedUrl, $decoded->url);
}

public function testHTMLContent() {
$url = 'http://source.example.com/html-content';
$response = $this->parse(['url' => $url]);
Expand Down Expand Up @@ -217,6 +230,47 @@ public function testFindTargetLinkIsAudio() {
$this->assertEquals('This page has an audio tag with the target URL.', $data->data->content->text);
}

/**
 * Requests the JSON feed fixture URL with a target and expects a 200
 * response whose body has no 'error' property.
 */
public function testFindTargetLinkInFeed() {
  $response = $this->parse([
    'url' => 'http://feed.example.com/jsonfeed',
    'target' => 'http://www.manton.org/2017/11/5993.html',
  ]);

  $payload = $response->getContent();
  $this->assertEquals(200, $response->getStatusCode());

  $decoded = json_decode($payload);
  $this->assertObjectNotHasAttribute('error', $decoded);
}

/**
 * Requests the JSON feed fixture URL with a different target and expects a
 * 200 response whose body has no 'error' property.
 */
public function testFindTargetLinkInHTMLInFeed() {
  $response = $this->parse([
    'url' => 'http://feed.example.com/jsonfeed',
    'target' => 'http://www.manton.org/2016/11/todays-social-networks-are-broken.html',
  ]);

  $payload = $response->getContent();
  $this->assertEquals(200, $response->getStatusCode());

  $decoded = json_decode($payload);
  $this->assertObjectNotHasAttribute('error', $decoded);
}

/**
 * Requests the JSON feed fixture URL with the target http://example.com/
 * and expects a 'no_link_found' error in the response body.
 */
public function testNotFindTargetLinkInHTMLInFeed() {
  $response = $this->parse([
    'url' => 'http://feed.example.com/jsonfeed',
    'target' => 'http://example.com/',
  ]);

  $payload = $response->getContent();
  $this->assertEquals(200, $response->getStatusCode());

  $decoded = json_decode($payload);
  $this->assertObjectHasAttribute('error', $decoded);
  $this->assertEquals('no_link_found', $decoded->error);
}

/**
 * Requests the multiple-urls fixture with an absolute photo URL as the
 * target and expects a 200 response whose body has no 'error' property.
 */
public function testFindRelativeTargetLink() {
  $response = $this->parse([
    'url' => 'http://source.example.com/multiple-urls',
    'target' => 'http://source.example.com/photo.jpg',
  ]);

  $payload = $response->getContent();
  $this->assertEquals(200, $response->getStatusCode());

  $decoded = json_decode($payload);
  $this->assertObjectNotHasAttribute('error', $decoded);
}

public function testTextContent() {
$url = 'http://source.example.com/text-content';
$response = $this->parse(['url' => $url]);
Expand Down Expand Up @@ -316,6 +370,18 @@ public function testNoHEntryMarkup() {
$this->assertEquals(200, $response->getStatusCode());
$data = json_decode($body);
$this->assertEquals('unknown', $data->data->type);
$this->assertObjectNotHasAttribute('html', $data);
}

/**
 * Requests the no-h-entry fixture with a target and expects a 200 response
 * with no 'error' property and a data type of "unknown".
 */
public function testFindTargetInNoParsedResult() {
  $response = $this->parse([
    'url' => 'http://source.example.com/no-h-entry',
    'target' => 'http://target.example.com',
  ]);

  $payload = $response->getContent();
  $this->assertEquals(200, $response->getStatusCode());

  $decoded = json_decode($payload);
  $this->assertObjectNotHasAttribute('error', $decoded);
  $this->assertEquals('unknown', $decoded->data->type);
}

public function testReplyIsURL() {
Expand Down

0 comments on commit 989d42a

Please sign in to comment.