diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..380d2e2 Binary files /dev/null and b/.DS_Store differ diff --git a/ML-Crawler/.DS_Store b/ML-Crawler/.DS_Store new file mode 100644 index 0000000..1ff656f Binary files /dev/null and b/ML-Crawler/.DS_Store differ diff --git a/ML-Crawler/ML-Crawler.jar b/ML-Crawler/ML-Crawler.jar new file mode 100644 index 0000000..b3873f2 Binary files /dev/null and b/ML-Crawler/ML-Crawler.jar differ diff --git a/ML-Crawler/configurations/.DS_Store b/ML-Crawler/configurations/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/ML-Crawler/configurations/.DS_Store differ diff --git a/ML-Crawler/configurations/sample-setup.xml b/ML-Crawler/configurations/sample-setup.xml new file mode 100644 index 0000000..41b0b23 --- /dev/null +++ b/ML-Crawler/configurations/sample-setup.xml @@ -0,0 +1,40 @@ + + + + + + + + + http://topsy.com/s?q=Nyquil + //a[starts-with(., 'next')]/@href + //span[@class="twitter-post-text translatable language-en"] + 10 + + + + + + ]]> + + + + + + + {normalize-space($desc)} + {normalize-space($user)} + + ]]> + + + + ]]> + + + \ No newline at end of file diff --git a/ML-Crawler/readme.txt b/ML-Crawler/readme.txt new file mode 100644 index 0000000..0eec362 --- /dev/null +++ b/ML-Crawler/readme.txt @@ -0,0 +1,61 @@ +================== ML Crawler ================== +This crawler wraps the functionality provided by an open-source crawler. +It takes crawling instructions in XML format and returns the +crawling results in XML format for ML Ingestion + *** For internal use only *** + +Steps: +1. Unzip the folder +2. Folder Structure: + a) ML-Crawler.jar: Jar file which wraps the functionality provided + by the open source crawler. 
+ b) configurations: + setup files which should not be touched + i) functions.xml + ii) xquery.xml + iii) crawler.xml + Along with the setup files mentioned above, this folder is also the placeholder for the crawling instructions + This folder is the placeholder for the crawling instructions (e.g. sample-setup.xml). + The crawler requires the following items to be configured prior to executing; to specify the items, you would need to do a 'view-source' on the HTML page + + + + + + + + + http://topsy.com/s?q=Nyquil + //a[starts-with(., 'next')]/@href + //span[@class="twitter-post-text translatable language-en"] + 10 + + + + + + + + ]]> + + + + + + + let $desc := data($item//*[@class='twitter-post-text translatable language-en']) + let $user := data($item//*[@class='author-name']) + return + + {normalize-space($desc)} + {normalize-space($user)} + + ]]> + + + + ]]> + + \ No newline at end of file diff --git a/ML-Crawler/sample/.DS_Store b/ML-Crawler/sample/.DS_Store new file mode 100644 index 0000000..7316d3b Binary files /dev/null and b/ML-Crawler/sample/.DS_Store differ diff --git a/ML-Crawler/sample/configurations/nyquil.xml b/ML-Crawler/sample/configurations/nyquil.xml new file mode 100644 index 0000000..13ce730 --- /dev/null +++ b/ML-Crawler/sample/configurations/nyquil.xml @@ -0,0 +1,38 @@ + + + + + + + http://topsy.com/s?q=Nyquil + //a[starts-with(., 'next')]/@href + //span[@class="twitter-post-text translatable language-en"] + 10 + + + + + + ]]> + + + + + + + {normalize-space($desc)} + {normalize-space($user)} + + ]]> + + + + ]]> + + + \ No newline at end of file diff --git a/ML-Crawler/setup/crawler.xml b/ML-Crawler/setup/crawler.xml new file mode 100644 index 0000000..430b22d --- /dev/null +++ b/ML-Crawler/setup/crawler.xml @@ -0,0 +1,86 @@ + + + + + + http://web-harvest.sourceforge.net/index.php + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/ML-Crawler/setup/functions.xml 
b/ML-Crawler/setup/functions.xml new file mode 100644 index 0000000..4c53097 --- /dev/null +++ b/ML-Crawler/setup/functions.xml @@ -0,0 +1,41 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/ML-Crawler/setup/xquery.xml b/ML-Crawler/setup/xquery.xml new file mode 100644 index 0000000..207b2a3 --- /dev/null +++ b/ML-Crawler/setup/xquery.xml @@ -0,0 +1,80 @@ + + + + + + + true + 12 + + 3 + 7 + 14 + 18 + 27 + + ABCDEFGH123456 + + + one + two + tree + + + four + five + six + + + ]]> + + + { math:sqrt($num) + 1 } + + }; + + (: resuting XML :) + + + { if ($logicvalue) then 1 else 2 }, + { $logicvalue eq ($numbervalue gt 15) } + + + { $numbervalue * 2 + 10 }, + { $numbervalue instance of xs:float }, + { round($numbervalue) } + + + { concat($stringvalue, $logicvalue, $numbervalue) } + + + { fn:myFunc($intseq) } + { concat($intseq[1], "mama") } + + + { for $td in $doc//td return $td } + + + ]]> + + + + \ No newline at end of file