Skip to content

Commit

Permalink
Merge branch 'feature/issue-2' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
shulard committed Feb 4, 2016
2 parents a92196e + 41a11b4 commit 9901365
Show file tree
Hide file tree
Showing 3 changed files with 210 additions and 54 deletions.
102 changes: 102 additions & 0 deletions src/Expression.php
@@ -0,0 +1,102 @@
<?php

namespace Bee4\RobotsTxt;

/**
* Class Expression
* Represent a matching expression rule
*
* @copyright Bee4 2016
* @author Stephane HULARD <s.hulard@chstudio.fr>
*/
class Expression
{
    /**
     * Raw definition, exactly as given to the constructor
     * @var string
     */
    private $raw;

    /**
     * Regex fragment built lazily from the raw definition (null until built)
     * @var string|null
     */
    private $pattern;

    /**
     * Initialize expression
     *
     * @param string $rule     Raw rule definition
     * @param mixed  $operator Accepted for backward compatibility and ignored.
     *                         Its default used to be self::ALLOW, a constant
     *                         this class never declares, so constructing an
     *                         Expression with a single argument failed with
     *                         "Undefined class constant".
     */
    public function __construct($rule, $operator = null)
    {
        $this->raw = $rule;
    }

    /**
     * Retrieve the raw rule definition
     * @return string
     */
    public function getRaw()
    {
        return $this->raw;
    }

    /**
     * Transform the raw definition into a regex fragment (no delimiters,
     * no anchors) usable for matching:
     *  - a trailing "$" means the rule must match up to the end, otherwise
     *    ".*" is appended so that the rule behaves as a prefix;
     *  - every "*" becomes ".*";
     *  - everything else is escaped with "/" as the expected delimiter.
     *
     * @return string
     */
    private function build()
    {
        $raw = $this->raw;

        // Must be detected before the trailing characters are stripped
        $ended = substr($raw, -1) === '$';
        $raw = rtrim($raw, '*');
        $raw = rtrim($raw, '$');

        $parts = array_map(function ($part) {
            return preg_quote($part, '/');
        }, explode('*', $raw));

        return implode('.*', $parts).($ended ? '' : '.*');
    }

    /**
     * Check if current expression is contained in another
     * @param Expression $exp
     * @return boolean
     */
    public function contained(Expression $exp)
    {
        return $exp->contains($this);
    }

    /**
     * Check if current expression contains another: the raw definition of
     * the given expression must be fully matched by the current pattern
     * @param Expression $exp
     * @return boolean
     */
    public function contains(Expression $exp)
    {
        return preg_match('/^'.$this->getPattern().'$/', $exp->getRaw()) === 1;
    }

    /**
     * Retrieve the regex pattern corresponding to the Expression
     * @return string
     */
    public function getPattern()
    {
        // Strict null check: an empty pattern (raw definition "$") is a valid
        // cached value and must not trigger a rebuild on every call.
        if (null === $this->pattern) {
            $this->pattern = $this->build();
        }

        return $this->pattern;
    }

    /**
     * Transform expression to string
     * @return string
     */
    public function __toString()
    {
        return $this->getPattern();
    }
}
99 changes: 70 additions & 29 deletions src/Rule.php
Expand Up @@ -11,24 +11,41 @@
*/
class Rule
{
const COMPILED = 'compiled';
const DIRTY = 'dirty';

/**
* Rule status (compiled or dirty)
* @var string
*/
private $state;

/**
* The regex patterns that identify whether the rule matches or not!
* Expression collection with allow / disallow segments
* @var array
*/
protected $patterns = [
'allow' => [],
private $exp = [
'allow' => [],
'disallow' => []
];

/**
* Compiled regex pattern with allow / disallow segments
* @var array
*/
private $patterns = [
'allow' => '',
'disallow' => ''
];

/**
* Add a pattern to match in the current rule by allowing
* @param string $pattern
* @return Rule
*/
public function allow($pattern)
{
    // Wrap the raw pattern, then register it on the "allow" side of the rule
    $expression = new Expression($pattern);

    return $this->addExpression($expression, 'allow');
}

/**
Expand All @@ -38,26 +55,41 @@ public function allow($pattern)
*/
public function disallow($pattern)
{
    // Wrap the raw pattern, then register it on the "disallow" side of the rule
    $expression = new Expression($pattern);

    return $this->addExpression($expression, 'disallow');
}

/**
 * Register an expression in the current rule and flag the rule as dirty
 * so that the global patterns are rebuilt
 * @param Expression $exp Expression to register
 * @param string $mode Expression mode (allow / disallow)
 * @return Rule
 */
private function addExpression(Expression $exp, $mode)
{
    $this->exp[$mode][] = $exp;
    $this->state = self::DIRTY;

    return $this;
}

/**
* Transform current pattern to be used for matching
* @param string $pattern
* @return string
* Compile expressions to a global pattern
* @return boolean
*/
private function handlePattern($pattern)
public function compile()
{
    // Nothing was added since the last compilation: keep the cached patterns
    if (self::COMPILED === $this->state) {
        return true;
    }

    // The list is taken by reference on purpose: it is sorted in place so
    // that the position of each expression stays aligned with the position
    // of its capture group in the generated pattern.
    $process = function (array &$expressions) {
        usort($expressions, function ($a, $b) {
            // Longest raw definition first. usort() expects an integer
            // (<0, 0, >0); a boolean cannot tell "longer" from "equal".
            return strlen($b->getRaw()) - strlen($a->getRaw());
        });

        return '/^(('.implode(')|(', $expressions).'))$/';
    };
    $this->patterns['allow'] = $process($this->exp['allow']);
    $this->patterns['disallow'] = $process($this->exp['disallow']);

    // Remember the result so the early return above is actually reachable
    $this->state = self::COMPILED;

    return true;
}

/**
Expand All @@ -67,19 +99,28 @@ private function handlePattern($pattern)
*/
public function match($url)
{
    $this->compile();

    // Without any disallow expression nothing can be forbidden. The check
    // is explicit because an empty list compiles to a pattern that matches
    // the empty URL.
    if (0 === count($this->exp['disallow'])) {
        return true;
    }

    if (1 !== preg_match($this->patterns['disallow'], $url, $disallowed)) {
        return true;
    }

    // Disallowed, and no allow expression can override it
    if (0 === count($this->exp['allow'])
        || 1 !== preg_match($this->patterns['allow'], $url, $allowed)
    ) {
        return false;
    }

    // Capture group 0 is the whole match and group 1 the wrapping group of
    // the compiled pattern, so the expression at position i owns group i + 2.
    $a = $this->lastFilledIndex($allowed);
    $d = $this->lastFilledIndex($disallowed);

    // The longest raw definition wins; allow wins when both have the same length
    $allowLength = strlen($this->exp['allow'][$a - 2]->getRaw());
    $disallowLength = strlen($this->exp['disallow'][$d - 2]->getRaw());

    return $allowLength >= $disallowLength;
}

/**
 * Retrieve the key of the last non-empty entry of a given array
 * (null when every entry is empty)
 * @param array $data
 * @return integer
 */
private function lastFilledIndex(array $data)
{
    // Drop empty entries while keeping the original keys
    $filled = array_filter($data);
    // Keep only the last remaining entry, still under its original key
    $last = array_slice($filled, -1, 1, true);

    return key($last);
}
}
63 changes: 38 additions & 25 deletions test/units/ParserTest.php
Expand Up @@ -22,40 +22,52 @@
*/
class ParserTest extends \PHPUnit_Framework_TestCase
{
protected $content = "User-agent: *
protected $content = "User-agent: *
Disallow: /mentions-legales/some-page.html
Disallow: /mentions-legales/
User-agent: google-bot
Allow: /truite.php
disallow: /";
disallow: /
protected $duplicateRuleContent = "User-agent: *
User-agent: bing
allow: /
disallow: /some-page.html";

protected $duplicateRuleContent = "User-agent: *
Disallow: /mentions-legales/
User-agent: *
Allow: /truite.php";

public function testParse() {
$content = new Content($this->content);
$rules = Parser::parse($content);
public function testParse()
{
$content = new Content($this->content);
$rules = Parser::parse($content);

$rule = $rules->get('*');
$this->assertInstanceOf('Bee4\RobotsTxt\Rule', $rule);
$rule = $rules->get('*');
$this->assertInstanceOf('Bee4\RobotsTxt\Rule', $rule);

$this->assertFalse($rule->match('/mentions-legales/'));
$this->assertTrue($rule->match('/another-page.html'));
$this->assertFalse($rule->match('/mentions-legales/'));
$this->assertFalse($rule->match('/mentions-legales/some-page.html'));
$this->assertFalse($rule->match('/mentions-legales/another-page.html'));
$this->assertTrue($rule->match('/another-page.html'));

$this->assertFalse($rules->match('Google-Bot v01', '/toto'));
$this->assertTrue($rules->match('Google-Bot v01', '/truite.php'));
}
$this->assertFalse($rules->match('Google-Bot v01', '/toto'));
$this->assertTrue($rules->match('Google-Bot v01', '/truite.php'));

public function testEmptyContentParse() {
$rules = Parser::parse("");
$this->assertTrue($rules->match('bing', '/toto'));
$this->assertFalse($rules->match('bing', '/some-page.html'));
}

$rule = $rules->get(Rules::DEFAULT_UA);
$this->assertInstanceOf('Bee4\RobotsTxt\Rule', $rule);
$this->assertTrue($rule->match('/another-page.html'));
}
public function testEmptyContentParse()
{
$rules = Parser::parse("");

$rule = $rules->get(Rules::DEFAULT_UA);
$this->assertInstanceOf('Bee4\RobotsTxt\Rule', $rule);
$this->assertTrue($rule->match('/another-page.html'));
}

/**
* @expectedException Bee4\RobotsTxt\Exception\DuplicateRuleException
Expand All @@ -64,11 +76,12 @@ public function testDuplicateRuleParse() {
Parser::parse($this->duplicateRuleContent);
}

public function testParserFactory() {
$content = ContentFactory::build('http://'.WEBSERVER_HOST.':'.WEBSERVER_PORT);
$this->assertInstanceOf('Bee4\RobotsTxt\Content', $content);
public function testParserFactory()
{
$content = ContentFactory::build('http://'.WEBSERVER_HOST.':'.WEBSERVER_PORT);
$this->assertInstanceOf('Bee4\RobotsTxt\Content', $content);

$rules = Parser::parse($content);
$this->assertInstanceOf('Bee4\RobotsTxt\Rule', $rules->get('*'));
}
$rules = Parser::parse($content);
$this->assertInstanceOf('Bee4\RobotsTxt\Rule', $rules->get('*'));
}
}

0 comments on commit 9901365

Please sign in to comment.