Permalink
Browse files

NUTCH-706 (applied correct patch)

git-svn-id: https://svn.apache.org/repos/asf/nutch/trunk@1396817 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information...
1 parent 965ecf5 · commit f3e5a3158547ed32073f104b1bb37ff77e872376 · @sebastian-nagel committed Oct 10, 2012
@@ -29,7 +29,7 @@
<!-- removes session ids from urls (such as jsessionid and PHPSESSID) -->
<regex>
- <pattern>([;_]?\b((?i)l|j|bv_)?((?i)sid|phpsessid|sessionid)=.*?)(\?|&amp;|#|$)</pattern>
+ <pattern>(?i)(;?\b_?(l|j|bv_)?(sid|phpsessid|sessionid)=.*?)(\?|&amp;|#|$)</pattern>
<substitution>$4</substitution>
</regex>
@@ -11,8 +11,13 @@ http://www.foo.com/foo.html;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED http://w
http://www.foo.com/foo.html?param=1&another=2;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED http://www.foo.com/foo.html?param=1&another=2
http://www.foo.com/foo.html;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED?param=1&another=2 http://www.foo.com/foo.html?param=1&another=2
http://www.foo.com/foo.php?&x=1&sid=xyz&something=1 http://www.foo.com/foo.php?x=1&something=1
-# but NewsId is not a session id (NUTCH-706, NUTCH-1328)
+http://www.foo.com/foo.html?_sessionID=824A6C0A13a7e11205wxN28F44E3 http://www.foo.com/foo.html
+http://www.foo.com/foo.php?_sessionid=qmyrcedt&outputformat=html&path=/3_images/foo http://www.foo.com/foo.php?outputformat=html&path=/3_images/foo
+http://www.foo.com/foo.php?_pid=2&_spid=0&lang=en&_sessionid=e36902d5bb2d0d922fc24b43 http://www.foo.com/foo.php?_pid=2&_spid=0&lang=en
+http://www.foo.com/foo.php?app=content&content=overview&lang=en&_sid=587fba8f825b05844526519fdb7d75c8&b=35&m=47 http://www.foo.com/foo.php?app=content&content=overview&lang=en&b=35&m=47
+# but NewsId (and similar) is not a session id (NUTCH-706, NUTCH-1328)
http://www.foo.com/fa/newsdetail.aspx?NewsID=1567539 http://www.foo.com/fa/newsdetail.aspx?NewsID=1567539
+http://www.foo.com/home.cfm?language=en&country=uk&addressid=250646&pagingpos=0 http://www.foo.com/home.cfm?language=en&country=uk&addressid=250646&pagingpos=0
# test removal default pages
http://www.foo.com/home/index.html http://www.foo.com/home/
@@ -13,7 +13,7 @@
<!-- removes session ids from urls (such as jsessionid and PHPSESSID) -->
<regex>
- <pattern>([;_]?\b((?i)l|j|bv_)?((?i)sid|phpsessid|sessionid)=.*?)(\?|&amp;|#|$)</pattern>
+ <pattern>(?i)(;?\b_?(l|j|bv_)?(sid|phpsessid|sessionid)=.*?)(\?|&amp;|#|$)</pattern>
<substitution>$4</substitution>
</regex>

0 comments on commit f3e5a31

Please sign in to comment.