forked from ukwa/ukwa-services
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add initial Crawl Log DB on SolrCloud setup.
- Loading branch information
Showing
5 changed files
with
72 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
Crawl Log DB | ||
============ | ||
|
||
The idea is to run a database of recent crawl log data, so we can see what's been going on. The log data could come from Kafka and/or the crawl log files and/or direct from Heritrix. | ||
|
||
This database is implemented as a SolrCloud collection. Using SolrCloud mode means the service will support the SQL query mode. | ||
|
||
For development, the scripts here can run a suitable SolrCloud server, set up the `crawl_log_fc` alias for a `crawl_log_fc_1` collection, and add field definitions using Solr's Schema API. Using an alias means we can add and remove collections over time, and so keep the size of the database manageable. | ||
|
||
The clients should use `<CRAWL_TIMESTAMP>:<URL>` as the record ID and send all data [as updates](https://lucene.apache.org/solr/guide/6_6/updating-parts-of-documents.html). As the ID parameters are fixed by Heritrix, this means we can update the fields from multiple sources, or the same source multiple times, and the results will be stable and correct (i.e. consistent and idempotent). _This is the TrackDB way._ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
|
||
# Development SolrCloud server for the crawl log database.
version: '3.7'

services:
  solrcloud:
    image: solr:8.7
    #command: solr -f -cloud
    # Run init-var-solr first, then replace the shell with Solr itself
    # (`exec`) started with the -f and -cloud flags.
    entrypoint:
      - bash
      - "-c"
      - "init-var-solr; exec solr -f -cloud"
    # CURRENT_UID must be set in the environment of whoever invokes compose.
    user: "${CURRENT_UID}"
    volumes:
      # Solr's /var/solr is kept in ./cores next to this file.
      - "./cores:/var/solr"
    ports:
      # Host port 8913 maps to port 8983 inside the container.
      - "8913:8983"
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
{
  "add-field":{ "name":"timestamp", "type":"pdate", "stored":true, "docValues":true },
  "add-field":{ "name":"url", "type":"string", "stored":true, "docValues":true },
  "add-field":{ "name":"host", "type":"string", "stored":true, "docValues":true },
  "add-field":{ "name":"http_method", "type":"string", "stored":true, "docValues":true },
  "add-field":{ "name":"status_code", "type":"plong", "stored":true, "docValues":true },
  "add-field":{ "name":"wire_bytes", "type":"plong", "stored":true, "docValues":true },
  "add-field":{ "name":"content_type", "type":"string", "stored":true, "docValues":true },
  "add-field":{ "name":"content_length", "type":"plong", "stored":true, "docValues":true },
  "add-field":{ "name":"content_digest", "type":"string", "stored":true, "docValues":true },
  "add-field":{ "name":"start_time_plus_duration", "type":"string", "stored":true, "docValues":true },
  "add-field":{ "name":"start_time", "type":"pdate", "stored":true, "docValues":true },
  "add-field":{ "name":"duration", "type":"plong", "stored":true, "docValues":true },
  "add-field":{ "name":"warc_filename", "type":"string", "stored":true, "docValues":true },
  "add-field":{ "name":"warc_offset", "type":"plong", "stored":true, "docValues":true },
  "add-field":{ "name":"warc_length", "type":"plong", "stored":true, "docValues":true },
  "add-field":{ "name":"warc_content_type", "type":"string", "stored":true, "docValues":true },
  "add-field":{ "name":"warc_type", "type":"string", "stored":true, "docValues":true },
  "add-field":{ "name":"warc_id", "type":"string", "stored":true, "docValues":true }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# Set up the crawl log collection on a running SolrCloud server:
#   1. create the collection,
#   2. add the field definitions from schema_fields.json via the Schema API,
#   3. create the alias that points at the collection.
#
# Must be run from the directory that contains schema_fields.json.
# SOLR_URL, COLLECTION and ALIAS may be overridden from the environment;
# the defaults are the values this script has always used.
set -eu

SOLR_URL="${SOLR_URL:-http://localhost:8913/solr}"
COLLECTION="${COLLECTION:-crawl_log_fc_1}"
ALIAS="${ALIAS:-crawl_log_fc}"
SCHEMA_FIELDS="schema_fields.json"

# Check for the payload before touching the server, so a wrong working
# directory does not leave a collection behind with no fields defined.
if [ ! -f "${SCHEMA_FIELDS}" ]; then
  echo "Cannot find ${SCHEMA_FIELDS} in $(pwd)" >&2
  exit 1
fi

# --fail makes curl exit non-zero on an HTTP error response; without it
# curl exits 0 and `set -e` never notices that a step was rejected.
echo "Create collection..."
curl --fail "${SOLR_URL}/admin/collections?action=CREATE&name=${COLLECTION}&numShards=1&replicationFactor=1"

echo "Add fields..."
curl --fail -X POST -H 'Content-type:application/json' \
  --data-binary "@${SCHEMA_FIELDS}" \
  "${SOLR_URL}/${COLLECTION}/schema"

echo "Create alias '${ALIAS}'..."
curl --fail "${SOLR_URL}/admin/collections?action=CREATEALIAS&name=${ALIAS}&collections=${COLLECTION}"
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# Start the `solrcloud` service from docker-compose.dev.yml in the background.
#
# Exports CURRENT_UID as "<uid>:<gid>" of the invoking user; the compose
# file is expected to read it from the environment.
set -eu

# Assign first, export second, so a failing `id` is not masked by `export`.
CURRENT_UID="$(id -u):$(id -g)"
export CURRENT_UID

echo "${CURRENT_UID}"

# -p keeps re-runs quiet when the directory already exists.
# NOTE(review): presumably created here so it is owned by the current user
# rather than being created by Docker on first start — confirm.
mkdir -p cores

docker-compose -f docker-compose.dev.yml up -d solrcloud
|