
Commit

initial version 1.0
Amey Dhavle committed Nov 30, 2011
0 parents commit 5b98eb4
Showing 11 changed files with 346 additions and 0 deletions.
Binary file added .DS_Store
Binary file added ML-Crawler/.DS_Store
Binary file added ML-Crawler/ML-Crawler.jar
Binary file added ML-Crawler/configurations/.DS_Store
40 changes: 40 additions & 0 deletions ML-Crawler/configurations/sample-setup.xml
@@ -0,0 +1,40 @@
<?xml version="1.0" encoding="UTF-8"?>

<config charset="ISO-8859-1">
    <include path="..//setup//functions.xml"/>

    <!-- Crawling definition begins here -->
    <var-def name="products">
        <call name="download-multipage-list">
            <call-param name="pageUrl">http://topsy.com/s?q=Nyquil</call-param>
            <call-param name="nextXPath">//a[starts-with(., 'next')]/@href</call-param>
            <call-param name="itemXPath">//span[@class="twitter-post-text translatable language-en"]</call-param>
            <call-param name="maxloops">10</call-param>
        </call>
    </var-def>

    <!-- iterates over all crawled items and extracts desired data -->
    <file action="write" path="nyquil.xml" charset="UTF-8">
        <![CDATA[ <reviews> ]]>
        <loop item="item" index="i">
            <list><var name="products"/></list>
            <body>
                <xquery>
                    <xq-param name="item" type="node()"><var name="item"/></xq-param>
                    <xq-expression><![CDATA[
                        declare variable $item as node() external;
                        let $desc := data($item//*[@class='twitter-post-text translatable language-en'])
                        let $user := data($item//*[@class='author-name'])
                        return
                            <review>
                                <comment>{normalize-space($desc)}</comment>
                                <reviewer>{normalize-space($user)}</reviewer>
                            </review>
                    ]]></xq-expression>
                </xquery>
            </body>
        </loop>
        <![CDATA[ </reviews> ]]>
    </file>

</config>
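
For reference, running the configuration above would produce nyquil.xml shaped roughly like the sketch below; the comment and reviewer values here are placeholder sample data, not real crawl output:

<reviews>
    <review>
        <comment>Sample tweet text mentioning Nyquil</comment>
        <reviewer>sample_author</reviewer>
    </review>
    <!-- one <review> element per crawled item, across up to 10 pages -->
</reviews>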
61 changes: 61 additions & 0 deletions ML-Crawler/readme.txt
@@ -0,0 +1,61 @@
================== ML Crawler ==================
This crawler wraps the functionality provided by an open-source crawler.
It takes crawling instructions in XML format and returns the
crawling results in XML format for ML ingestion.
*** For internal use only ***

Steps:
1. Unzip the folder
2. Folder Structure:
a) ML-Crawler.jar: Jar file which wraps the functionality provided
by the open-source crawler.
b) configurations:
setup files which should not be touched
i) functions.xml
ii) xquery.xml
iii) crawler.xml
Along with the setup files mentioned above, this folder is also the placeholder
for the crawling instructions (e.g. sample-setup.xml).
The crawler requires the following items to be configured prior to execution.
To determine the values for these items, do a 'view-source' on the target HTML page.

<!-- ======== Defining the crawling premise =========== -->
<?xml version="1.0" encoding="UTF-8"?>
<config charset="ISO-8859-1">
<include path="..//setup//functions.xml"/>
<!-- Crawling definition begins here -->
<var-def name="products">
<call name="download-multipage-list">
<call-param name="pageUrl">http://topsy.com/s?q=Nyquil</call-param> <!-- Define the URL to be crawled -->
<call-param name="nextXPath">//a[starts-with(., 'next')]/@href</call-param> <!-- Define the HTML pattern to determine the next page URL for paginated content -->
<call-param name="itemXPath">//span[@class="twitter-post-text translatable language-en"]</call-param> <!-- Define the HTML pattern to capture desired content -->
<call-param name="maxloops">10</call-param><!-- Looping variable to define the number of iterations crawler should perform -->
</call>
</var-def>
<!-- ========== Building the result files =============== -->
<!-- iterates over all crawled items and extracts desired data -->
<file action="write" path="nyquil.xml" charset="UTF-8"> <!-- Set the output filename -->

<!-- build the basic doc structure -->
<![CDATA[ <reviews> ]]>
<loop item="item" index="i">
<list><var name="products"/></list>
<body>
<xquery>
<xq-param name="item" type="node()"><var name="item"/></xq-param>
<xq-expression><![CDATA[
declare variable $item as node() external;
(: Specify the HTML pattern to capture the text :)
let $desc := data($item//*[@class='twitter-post-text translatable language-en'])
let $user := data($item//*[@class='author-name'])
return
<review>
<comment>{normalize-space($desc)}</comment>
<reviewer>{normalize-space($user)}</reviewer>
</review>
]]></xq-expression>
</xquery>
</body>
</loop>
<![CDATA[ </reviews> ]]>
</file>
</config>
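
How to launch the jar is not covered above; a plausible invocation, assuming the wrapper simply takes the path of the crawling-instruction file as its argument (this argument convention is an assumption, the actual command-line interface is not documented here):

    java -jar ML-Crawler.jar configurations/sample-setup.xml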
Binary file added ML-Crawler/sample/.DS_Store
38 changes: 38 additions & 0 deletions ML-Crawler/sample/configurations/nyquil.xml
@@ -0,0 +1,38 @@
<?xml version="1.0" encoding="UTF-8"?>
<config charset="ISO-8859-1">
<include path="functions.xml"/>
<!-- collects all tables for individual products -->
<var-def name="products">
<call name="download-multipage-list">
<call-param name="pageUrl">http://topsy.com/s?q=Nyquil</call-param>
<call-param name="nextXPath">//a[starts-with(., 'next')]/@href</call-param>
<call-param name="itemXPath">//span[@class="twitter-post-text translatable language-en"]</call-param>
<call-param name="maxloops">10</call-param>
</call>
</var-def>

<!-- iterates over all collected products and extracts desired data -->
<file action="write" path="canon/nyquil.xml" charset="UTF-8">
<![CDATA[ <reviews> ]]>
<loop item="item" index="i">
<list><var name="products"/></list>
<body>
<xquery>
<xq-param name="item" type="node()"><var name="item"/></xq-param>
<xq-expression><![CDATA[
declare variable $item as node() external;
let $desc := data($item//*[@class='twitter-post-text translatable language-en'])
let $user := data($item//*[@class='author-name'])
return
<review>
<comment>{normalize-space($desc)}</comment>
<reviewer>{normalize-space($user)}</reviewer>
</review>
]]></xq-expression>
</xquery>
</body>
</loop>
<![CDATA[ </reviews> ]]>
</file>

</config>
86 changes: 86 additions & 0 deletions ML-Crawler/setup/crawler.xml
@@ -0,0 +1,86 @@
<?xml version="1.0" encoding="UTF-8"?>

<config charset="UTF-8">

<!-- set initial page -->
<var-def name="home">http://web-harvest.sourceforge.net/index.php</var-def>

<!-- define script functions and variables -->
<script><![CDATA[
/* checks if specified URL is valid for download */
boolean isValidUrl(String url) {
String urlSmall = url.toLowerCase();
return urlSmall.startsWith("http://web-harvest.sourceforge.net/") && urlSmall.endsWith(".php");
}
/* create filename based on specified URL */
String makeFilename(String url) {
return url.replaceAll("http://|https://|file://", "");
}
/* set of unvisited URLs */
Set unvisited = new HashSet();
unvisited.add(home);
/* pushes to web-harvest context initial set of unvisited pages */
SetContextVar("unvisitedVar", unvisited);
/* set of visited URLs */
Set visited = new HashSet();
]]></script>

<!-- loop while there are any unvisited links -->
<while condition="${unvisitedVar.toList().size() != 0}">
<loop item="currUrl">
<list><var name="unvisitedVar"/></list>
<body>
<empty>
<var-def name="content">
<html-to-xml>
<http url="${currUrl}"/>
</html-to-xml>
</var-def>

<script><![CDATA[
currentFullUrl = sys.fullUrl(home, currUrl);
]]></script>

<!-- saves downloaded page -->
<file action="write" path="spider/${makeFilename(currentFullUrl)}.html">
<var name="content"/>
</file>

<!-- adds current URL to the list of visited -->
<script><![CDATA[
visited.add(sys.fullUrl(home, currUrl));
Set newLinks = new HashSet();
print(currUrl);
]]></script>

<!-- loop through all collected links on the downloaded page -->
<loop item="currLink">
<list>
<xpath expression="//a/@href">
<var name="content"/>
</xpath>
</list>
<body>
<script><![CDATA[
String fullLink = sys.fullUrl(home, currLink);
if ( isValidUrl(fullLink.toString()) && !visited.contains(fullLink) && !unvisitedVar.toList().contains(fullLink) ) {
newLinks.add(fullLink);
}
]]></script>
</body>
</loop>
</empty>
</body>
</loop>

<!-- unvisited links are now all the collected new links from downloaded pages -->
<script><![CDATA[
SetContextVar("unvisitedVar", newLinks);
]]></script>
</while>

</config>
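
As a worked example of the helper functions above (derived from the script, not from an actual run), makeFilename only strips the protocol prefix, so the start page is saved as follows:

    makeFilename("http://web-harvest.sourceforge.net/index.php");
    // returns "web-harvest.sourceforge.net/index.php", so the <file> step
    // writes the downloaded page to spider/web-harvest.sourceforge.net/index.php.html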
41 changes: 41 additions & 0 deletions ML-Crawler/setup/functions.xml
@@ -0,0 +1,41 @@
<?xml version="1.0" encoding="UTF-8"?>

<config>
<!--
Download multi-page list of items.
@param pageUrl - URL of starting page
@param itemXPath - XPath expression to obtain single item in the list
@param nextXPath - XPath expression that selects the URL of the next page
@param maxloops - maximum number of pages downloaded
@return list of all downloaded items
-->
<function name="download-multipage-list">
<return>
<while condition="${pageUrl.toString().length() != 0}" maxloops="${maxloops}" index="i">
<empty>
<var-def name="content">
<html-to-xml>
<http url="${pageUrl}"/>
</html-to-xml>
</var-def>

<var-def name="nextLinkUrl">
<xpath expression="${nextXPath}">
<var name="content"/>
</xpath>
</var-def>

<var-def name="pageUrl">
<template>${sys.fullUrl(pageUrl.toString(), nextLinkUrl.toString())}</template>
</var-def>
</empty>

<xpath expression="${itemXPath}">
<var name="content"/>
</xpath>
</while>
</return>
</function>
</config>
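
A crawling-instruction file invokes this function by passing the documented parameters as <call-param> elements; a hypothetical call against a made-up listing site (the URL and XPath selectors below are illustrative only, not real selectors):

<call name="download-multipage-list">
    <call-param name="pageUrl">http://example.com/reviews?page=1</call-param>
    <call-param name="nextXPath">//a[@rel='next']/@href</call-param>
    <call-param name="itemXPath">//div[@class='review-body']</call-param>
    <call-param name="maxloops">5</call-param>
</call>

sample-setup.xml above shows the same pattern applied to a real target page.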
80 changes: 80 additions & 0 deletions ML-Crawler/setup/xquery.xml
@@ -0,0 +1,80 @@
<?xml version="1.0" encoding="UTF-8"?>

<config charset="UTF-8">

<file action="write" path="xquerytest.xml">
<xquery>
<xq-param name="logicvalue" type="boolean">true</xq-param>
<xq-param name="numbervalue" type="double">12</xq-param>
<xq-param name="intseq" type="integer*">
3
7
14
18
27
</xq-param>
<xq-param name="stringvalue" type="string">ABCDEFGH123456</xq-param>
<xq-param name="doc"><![CDATA[
<table>
<tr>
<td>one</td>
<td>two</td>
<td>three</td>
</tr>
<tr>
<td>four</td>
<td>five</td>
<td>six</td>
</tr>
</table>
]]></xq-param>

<xq-expression><![CDATA[
(: example of external namespace and usage of Java function library inside the XQuery :)
declare namespace math ="java:java.lang.Math";
(: declare namespace for user-defined functions :)
declare namespace fn ="f:ns";
(: declaration of external variables with names and types matching those in xq-param :)
declare variable $logicvalue as xs:boolean external;
declare variable $numbervalue as xs:double external;
declare variable $intseq as xs:integer* external;
declare variable $stringvalue as xs:string external;
declare variable $doc as node()* external;
(: user-defined function inside XQuery :)
declare function fn:myFunc($items as xs:integer*) as node()* {
for $num in $items return
<value>
{ math:sqrt($num) + 1 }
</value>
};
(: resulting XML :)
<test>
<logictest>
{ if ($logicvalue) then 1 else 2 },
{ $logicvalue eq ($numbervalue gt 15) }
</logictest>
<doubletest>
{ $numbervalue * 2 + 10 },
{ $numbervalue instance of xs:float },
{ round($numbervalue) }
</doubletest>
<stringtest>
{ concat($stringvalue, $logicvalue, $numbervalue) }
</stringtest>
<numberseqtest>
{ fn:myFunc($intseq) }
{ concat($intseq[1], "mama") }
</numberseqtest>
<doctest>
{ for $td in $doc//td return $td }
</doctest>
</test>
]]></xq-expression>
</xquery>
</file>

</config>
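
Worked out by hand from the parameters above (values are approximate and whitespace will differ in the real output), the document written to xquerytest.xml looks roughly like:

<test>
    <logictest>1, false</logictest>
    <doubletest>34, false, 12</doubletest>
    <stringtest>ABCDEFGH123456true12</stringtest>
    <numberseqtest>
        <value>2.73...</value>
        <!-- one <value> per integer in $intseq, each holding sqrt(n) + 1 -->
        3mama
    </numberseqtest>
    <doctest>
        <td>one</td>
        <!-- ...followed by the remaining td elements of $doc... -->
    </doctest>
</test>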
