From f3e5a3158547ed32073f104b1bb37ff77e872376 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 10 Oct 2012 21:54:37 +0000 Subject: [PATCH] NUTCH-706 (applied correct patch) git-svn-id: https://svn.apache.org/repos/asf/nutch/trunk@1396817 13f79535-47bb-0310-9956-ffa450edef68 --- conf/regex-normalize.xml.template | 2 +- .../sample/regex-normalize-default.test | 7 ++++++- .../urlnormalizer-regex/sample/regex-normalize-default.xml | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/conf/regex-normalize.xml.template b/conf/regex-normalize.xml.template index 4bc1611a6a..d14174eaac 100644 --- a/conf/regex-normalize.xml.template +++ b/conf/regex-normalize.xml.template @@ -29,7 +29,7 @@ - ([;_]?\b((?i)l|j|bv_)?((?i)sid|phpsessid|sessionid)=.*?)(\?|&|#|$) + (?i)(;?\b_?(l|j|bv_)?(sid|phpsessid|sessionid)=.*?)(\?|&|#|$) $4 diff --git a/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test b/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test index 09fb9188c7..dbb08b5680 100644 --- a/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test +++ b/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test @@ -11,8 +11,13 @@ http://www.foo.com/foo.html;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED http://w http://www.foo.com/foo.html?param=1&another=2;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED http://www.foo.com/foo.html?param=1&another=2 http://www.foo.com/foo.html;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED?param=1&another=2 http://www.foo.com/foo.html?param=1&another=2 http://www.foo.com/foo.php?&x=1&sid=xyz&something=1 http://www.foo.com/foo.php?x=1&something=1 -# but NewsId is not a session id (NUTCH-706, NUTCH-1328) +http://www.foo.com/foo.html?_sessionID=824A6C0A13a7e11205wxN28F44E3 http://www.foo.com/foo.html +http://www.foo.com/foo.php?_sessionid=qmyrcedt&outputformat=html&path=/3_images/foo http://www.foo.com/foo.php?outputformat=html&path=/3_images/foo +http://www.foo.com/foo.php?_pid=2&_spid=0&lang=en&_sessionid=e36902d5bb2d0d922fc24b43 http://www.foo.com/foo.php?_pid=2&_spid=0&lang=en +http://www.foo.com/foo.php?app=content&content=overview&lang=en&_sid=587fba8f825b05844526519fdb7d75c8&b=35&m=47 http://www.foo.com/foo.php?app=content&content=overview&lang=en&b=35&m=47 +# but NewsId (and similar) is not a session id (NUTCH-706, NUTCH-1328) http://www.foo.com/fa/newsdetail.aspx?NewsID=1567539 http://www.foo.com/fa/newsdetail.aspx?NewsID=1567539 +http://www.foo.com/home.cfm?language=en&country=uk&addressid=250646&pagingpos=0 http://www.foo.com/home.cfm?language=en&country=uk&addressid=250646&pagingpos=0 # test removal default pages http://www.foo.com/home/index.html http://www.foo.com/home/ diff --git a/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml b/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml index 14e2b4fb8b..69a8eba26a 100644 --- a/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml +++ b/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml @@ -13,7 +13,7 @@ - ([;_]?\b((?i)l|j|bv_)?((?i)sid|phpsessid|sessionid)=.*?)(\?|&|#|$) + (?i)(;?\b_?(l|j|bv_)?(sid|phpsessid|sessionid)=.*?)(\?|&|#|$) $4