Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Amey Dhavle
committed
Nov 30, 2011
0 parents
commit 5b98eb4
Showing
11 changed files
with
346 additions
and
0 deletions.
There are no files selected for viewing
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
|
||
<config charset="ISO-8859-1"> | ||
<include path="..//setup//functions.xml"/> | ||
|
||
<!-- Crawl definition: calls download-multipage-list with the start page (pageUrl), the XPath of the next-page link (nextXPath), the XPath of one result item (itemXPath) and a page limit (maxloops), and stores the returned items in the "products" variable --> | ||
<var-def name="products"> | ||
<call name="download-multipage-list"> | ||
<call-param name="pageUrl">http://topsy.com/s?q=Nyquil</call-param> | ||
<call-param name="nextXPath">//a[starts-with(., 'next')]/@href</call-param> | ||
<call-param name="itemXPath">//span[@class="twitter-post-text translatable language-en"]</call-param> | ||
<call-param name="maxloops">10</call-param> | ||
</call> | ||
</var-def> | ||
|
||
<!-- Writes nyquil.xml (UTF-8): a literal <reviews> wrapper around one <review> per item in "products"; the XQuery copies the whitespace-normalized text of the tweet span into <comment> and of the 'author-name' element into <reviewer> --> | ||
<file action="write" path="nyquil.xml" charset="UTF-8"> | ||
<![CDATA[ <reviews> ]]> | ||
<loop item="item" index="i"> | ||
<list><var name="products"/></list> | ||
<body> | ||
<xquery> | ||
<xq-param name="item" type="node()"><var name="item"/></xq-param> | ||
<xq-expression><![CDATA[ | ||
declare variable $item as node() external; | ||
let $desc := data($item//*[@class='twitter-post-text translatable language-en']) | ||
let $user := data($item//*[@class='author-name']) | ||
return | ||
<review> | ||
<comment>{normalize-space($desc)}</comment> | ||
<reviewer>{normalize-space($user)}</reviewer> | ||
</review> | ||
]]></xq-expression> | ||
</xquery> | ||
</body> | ||
</loop> | ||
<![CDATA[ </reviews> ]]> | ||
</file> | ||
|
||
</config> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
================== ML Crawler ================== | ||
This crawler wraps the functionality provided by an open-source crawler. | ||
It takes crawling instructions in XML format and returns the | ||
crawling results in XML format for ML ingestion. | ||
*** For internal use only *** | ||
|
||
Steps: | ||
1. Unzip the folder | ||
2. Folder Structure: | ||
a) ML-Crawler.jar: Jar file which wraps the functionality provided | ||
by the open-source crawler. | ||
b) configurations: | ||
setup files which should not be touched | ||
i) functions.xml | ||
ii) xquery.xml | ||
iii) crawler.xml | ||
Along with the setup files mentioned above, this folder also holds the crawling instructions | ||
(e.g. sample-setup.xml). | ||
The crawler requires the following items to be configured before it is executed. To find the values for these items, do a 'view-source' on the HTML page you want to crawl. | ||
|
||
<!-- ======== Defining the crawling premise =========== --> | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<config charset="ISO-8859-1"> | ||
<include path="..//setup//functions.xml"/> | ||
<!-- Crawling definition begins here: the items returned by download-multipage-list are stored in "products" --> | ||
<var-def name="products"> | ||
<call name="download-multipage-list"> | ||
<call-param name="pageUrl">http://topsy.com/s?q=Nyquil</call-param> <!-- URL of the first page to crawl --> | ||
<call-param name="nextXPath">//a[starts-with(., 'next')]/@href</call-param> <!-- XPath that yields the URL of the next page for paginated content --> | ||
<call-param name="itemXPath">//span[@class="twitter-post-text translatable language-en"]</call-param> <!-- XPath that selects each piece of content to capture --> | ||
<call-param name="maxloops">10</call-param><!-- Maximum number of pages the crawler downloads --> | ||
</call> | ||
</var-def> | ||
<!-- ========== Building the result files =============== --> | ||
<!-- Iterates over all crawled items in "products" and writes one <review> per item --> | ||
<file action="write" path="nyquil.xml" charset="UTF-8"> <!-- Set the output filename --> | ||
|
||
<!-- build the basic doc structure: literal <reviews> root around the generated reviews --> | ||
<![CDATA[ <reviews> ]]> | ||
<loop item="item" index="i"> | ||
<list><var name="products"/></list> | ||
<body> | ||
<xquery> | ||
<xq-param name="item" type="node()"><var name="item"/></xq-param> | ||
<xq-expression><![CDATA[ | ||
declare variable $item as node() external; | ||
(: Specify the HTML pattern to capture the text. Notes inside the query must use XQuery comment syntax; an XML comment here is parsed as an expression and breaks the query. :) | ||
let $desc := data($item//*[@class='twitter-post-text translatable language-en']) | ||
let $user := data($item//*[@class='author-name']) | ||
return | ||
<review> | ||
<comment>{normalize-space($desc)}</comment> | ||
<reviewer>{normalize-space($user)}</reviewer> | ||
</review> | ||
]]></xq-expression> | ||
</xquery> | ||
</body> | ||
</loop> | ||
<![CDATA[ </reviews> ]]> | ||
</file> | ||
</config> |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<config charset="ISO-8859-1"> | ||
<include path="functions.xml"/> | ||
<!-- Collects the crawled items: calls download-multipage-list with the start page, next-page XPath, item XPath and page limit, and stores the returned items in "products" --> | ||
<var-def name="products"> | ||
<call name="download-multipage-list"> | ||
<call-param name="pageUrl">http://topsy.com/s?q=Nyquil</call-param> | ||
<call-param name="nextXPath">//a[starts-with(., 'next')]/@href</call-param> | ||
<call-param name="itemXPath">//span[@class="twitter-post-text translatable language-en"]</call-param> | ||
<call-param name="maxloops">10</call-param> | ||
</call> | ||
</var-def> | ||
|
||
<!-- Writes canon/nyquil.xml (UTF-8): a literal <reviews> wrapper around one <review> per item in "products"; the XQuery copies the whitespace-normalized text of the tweet span into <comment> and of the 'author-name' element into <reviewer> --> | ||
<file action="write" path="canon/nyquil.xml" charset="UTF-8"> | ||
<![CDATA[ <reviews> ]]> | ||
<loop item="item" index="i"> | ||
<list><var name="products"/></list> | ||
<body> | ||
<xquery> | ||
<xq-param name="item" type="node()"><var name="item"/></xq-param> | ||
<xq-expression><![CDATA[ | ||
declare variable $item as node() external; | ||
let $desc := data($item//*[@class='twitter-post-text translatable language-en']) | ||
let $user := data($item//*[@class='author-name']) | ||
return | ||
<review> | ||
<comment>{normalize-space($desc)}</comment> | ||
<reviewer>{normalize-space($user)}</reviewer> | ||
</review> | ||
]]></xq-expression> | ||
</xquery> | ||
</body> | ||
</loop> | ||
<![CDATA[ </reviews> ]]> | ||
</file> | ||
|
||
</config> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
|
||
<config charset="UTF-8"> | ||
|
||
<!-- Initial page: seeds the unvisited set below and is the base URL passed to sys.fullUrl when links are resolved --> | ||
<var-def name="home">http://web-harvest.sourceforge.net/index.php</var-def> | ||
|
||
<!-- Script setup: defines isValidUrl (same-site URLs ending in .php) and makeFilename (strips the URL scheme), creates the "visited" set, and publishes the initial unvisited set to the context as "unvisitedVar" --> | ||
<script><![CDATA[ | ||
/* checks if specified URL is valid for download */ | ||
boolean isValidUrl(String url) { | ||
String urlSmall = url.toLowerCase(); | ||
return urlSmall.startsWith("http://web-harvest.sourceforge.net/") && urlSmall.endsWith(".php"); | ||
} | ||
/* create filename based on specified URL */ | ||
String makeFilename(String url) { | ||
return url.replaceAll("http://|https://|file://", ""); | ||
} | ||
/* set of unvisited URLs */ | ||
Set unvisited = new HashSet(); | ||
unvisited.add(home); | ||
/* pushes to web-harvest context initial set of unvisited pages */ | ||
SetContextVar("unvisitedVar", unvisited); | ||
/* set of visited URLs */ | ||
Set visited = new HashSet(); | ||
]]></script> | ||
|
||
<!-- Breadth-first crawl: each pass of the while downloads every URL currently in "unvisitedVar", saves it under spider/, and gathers the not-yet-seen valid links; those links become "unvisitedVar" for the next pass. The loop ends when a pass finds no new links. --> | ||
<while condition="${unvisitedVar.toList().size() != 0}"> | ||
<!-- Reset the accumulator once per pass. Creating it inside the per-page loop would discard the links found on every page except the last one processed in the pass. --> | ||
<script><![CDATA[ | ||
Set newLinks = new HashSet(); | ||
]]></script> | ||
|
||
<loop item="currUrl"> | ||
<list><var name="unvisitedVar"/></list> | ||
<body> | ||
<empty> | ||
<!-- download the page and convert it to XML so it can be queried with XPath --> | ||
<var-def name="content"> | ||
<html-to-xml> | ||
<http url="${currUrl}"/> | ||
</html-to-xml> | ||
</var-def> | ||
|
||
<!-- resolve the current URL against the home page; used to build the output file name --> | ||
<script><![CDATA[ | ||
currentFullUrl = sys.fullUrl(home, currUrl); | ||
]]></script> | ||
|
||
<!-- saves downloaded page --> | ||
<file action="write" path="spider/${makeFilename(currentFullUrl)}.html"> | ||
<var name="content"/> | ||
</file> | ||
|
||
<!-- adds current URL to the list of visited and logs it --> | ||
<script><![CDATA[ | ||
visited.add(sys.fullUrl(home, currUrl)); | ||
print(currUrl); | ||
]]></script> | ||
|
||
<!-- loop through all collected links on the downloaded page; keep only valid links that are neither visited nor already queued --> | ||
<loop item="currLink"> | ||
<list> | ||
<xpath expression="//a/@href"> | ||
<var name="content"/> | ||
</xpath> | ||
</list> | ||
<body> | ||
<script><![CDATA[ | ||
String fullLink = sys.fullUrl(home, currLink); | ||
if ( isValidUrl(fullLink.toString()) && !visited.contains(fullLink) && !unvisitedVar.toList().contains(fullLink) ) { | ||
newLinks.add(fullLink); | ||
} | ||
]]></script> | ||
</body> | ||
</loop> | ||
</empty> | ||
</body> | ||
</loop> | ||
|
||
<!-- unvisited links are now all the new links collected from every page downloaded in this pass --> | ||
<script><![CDATA[ | ||
SetContextVar("unvisitedVar", newLinks); | ||
]]></script> | ||
</while> | ||
|
||
</config> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
|
||
<config> | ||
<!-- | ||
Download a multi-page list of items. | ||
Each pass downloads pageUrl and converts the HTML to XML, evaluates nextXPath on it, | ||
resolves that result against the current pageUrl to obtain the next pageUrl, and then | ||
yields the nodes matched by itemXPath on the downloaded page. | ||
The loop runs while pageUrl is a non-empty string, bounded by maxloops. | ||
@param pageUrl - URL of the starting page | ||
@param itemXPath - XPath expression selecting a single item in the list | ||
@param nextXPath - XPath expression yielding the URL of the next page | ||
@param maxloops - maximum number of pages downloaded | ||
@return list of all downloaded items | ||
NOTE(review): stopping before maxloops presumably relies on sys.fullUrl returning an empty string when no next link is found; confirm. | ||
--> | ||
<function name="download-multipage-list"> | ||
<return> | ||
<while condition="${pageUrl.toString().length() != 0}" maxloops="${maxloops}" index="i"> | ||
<empty> | ||
<var-def name="content"> | ||
<html-to-xml> | ||
<http url="${pageUrl}"/> | ||
</html-to-xml> | ||
</var-def> | ||
|
||
<var-def name="nextLinkUrl"> | ||
<xpath expression="${nextXPath}"> | ||
<var name="content"/> | ||
</xpath> | ||
</var-def> | ||
|
||
<var-def name="pageUrl"> | ||
<template>${sys.fullUrl(pageUrl.toString(), nextLinkUrl.toString())}</template> | ||
</var-def> | ||
</empty> | ||
|
||
<xpath expression="${itemXPath}"> | ||
<var name="content"/> | ||
</xpath> | ||
</while> | ||
</return> | ||
</function> | ||
</config> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
|
||
<config charset="UTF-8"> | ||
|
||
<!-- Writes xquerytest.xml from a single XQuery that exercises each xq-param kind declared below (boolean, double, integer sequence, string, and an XML fragment passed without a type attribute), a Java extension function (java.lang.Math via the math prefix) and a user-defined function (fn:myFunc) --> <file action="write" path="xquerytest.xml"> | ||
<xquery> | ||
<xq-param name="logicvalue" type="boolean">true</xq-param> | ||
<xq-param name="numbervalue" type="double">12</xq-param> | ||
<xq-param name="intseq" type="integer*"> | ||
3 | ||
7 | ||
14 | ||
18 | ||
27 | ||
</xq-param> | ||
<xq-param name="stringvalue" type="string">ABCDEFGH123456</xq-param> | ||
<xq-param name="doc"><![CDATA[ | ||
<table> | ||
<tr> | ||
<td>one</td> | ||
<td>two</td> | ||
<td>tree</td> | ||
</tr> | ||
<tr> | ||
<td>four</td> | ||
<td>five</td> | ||
<td>six</td> | ||
</tr> | ||
</table> | ||
]]></xq-param> | ||
|
||
<xq-expression><![CDATA[ | ||
(: example of external namespace and usage of Java function library inside the XQuery :) | ||
declare namespace math ="java:java.lang.Math"; | ||
(: declare namespace for user-defined functions :) | ||
declare namespace fn ="f:ns"; | ||
(: declaration of external variables with names and types matching to those in xq-param :) | ||
declare variable $logicvalue as xs:boolean external; | ||
declare variable $numbervalue as xs:double external; | ||
declare variable $intseq as xs:integer* external; | ||
declare variable $stringvalue as xs:string external; | ||
declare variable $doc as node()* external; | ||
(: user-defined function inside XQuery :) | ||
declare function fn:myFunc($items as xs:integer*) as node()* { | ||
for $num in $items return | ||
<value> | ||
{ math:sqrt($num) + 1 } | ||
</value> | ||
}; | ||
(: resuting XML :) | ||
<test> | ||
<logictest> | ||
{ if ($logicvalue) then 1 else 2 }, | ||
{ $logicvalue eq ($numbervalue gt 15) } | ||
</logictest> | ||
<doubletest> | ||
{ $numbervalue * 2 + 10 }, | ||
{ $numbervalue instance of xs:float }, | ||
{ round($numbervalue) } | ||
</doubletest> | ||
<stringtest> | ||
{ concat($stringvalue, $logicvalue, $numbervalue) } | ||
</stringtest> | ||
<numberseqtest> | ||
{ fn:myFunc($intseq) } | ||
{ concat($intseq[1], "mama") } | ||
</numberseqtest> | ||
<doctest> | ||
{ for $td in $doc//td return $td } | ||
</doctest> | ||
</test> | ||
]]></xq-expression> | ||
</xquery> | ||
</file> | ||
|
||
</config> |