Added separate ingest and search scripts for the search examples; updated README
commit 5e2053f7d9dc45f14bb8e2f478ede344c8a005d7 (parent: 61c3052)
jt6211 authored
README.md (34 changes)
@@ -83,13 +83,37 @@ Example use of Combiners for Analytics
python examples/analytics.py
-Example use Intersecting Iterator for search
+Example use of the Intersecting Iterator for search.
- python examples/intersecting_iterator.py
-
-Example use Document Intersecting Iterator for search
+ # index all the files in the pyaccumulo directory
+ $ python examples/intersecting_iterator/ingest.py ii_file_search *
+ Creating table: ii_file_search
+ indexing file examples/analytics.py
+ indexing file examples/regex_search.py
+ indexing file examples/simple.py
+ indexing file examples/indexed_doc_iterator/ingest.py
+ ...
+
+ # Now search the "ii_file_search" table for files that contain "assert_called_with" and "assertEquals"
+ $ python examples/intersecting_iterator/search.py ii_file_search assert_called_with assertEquals
+ tests/core_tests.py
+ tests/iterator_tests.py
+
+Example use of the Document Intersecting Iterator for search. This example indexes the data in a slightly different way, so the iterator returns the document value instead of requiring a separate fetch.
- python examples/doc_search.py
+ # index all the files in the pyaccumulo directory
+ $ python examples/indexed_doc_iterator/ingest.py dociter_file_search *
+ Creating table: dociter_file_search
+ indexing file examples/analytics.py
+ indexing file examples/regex_search.py
+ indexing file examples/simple.py
+ indexing file examples/indexed_doc_iterator/ingest.py
+ ...
+
+ # Now search the "dociter_file_search" table for files that contain "hashlib" and "search_terms"
+ $ python examples/indexed_doc_iterator/search.py dociter_file_search hashlib search_terms
+ examples/indexed_doc_iterator/search.py
+ examples/intersecting_iterator/search.py
Example use of Regex Filter for regex based searching
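The two README entries above use different table layouts. Below is a minimal sketch (not part of this commit; the shard id, token, and file path are purely illustrative) of how a single file ends up stored under each scheme, based on the two ingest scripts added in the diffs that follow:

    import hashlib
    from pyaccumulo import Mutation

    path = "examples/simple.py"
    uuid = hashlib.md5(path).hexdigest()   # document id, as in get_uuid()
    shard = "s02"                          # illustrative; get_shard() picks s00-s03
    tok = "hashlib"                        # one token extracted from the file

    # intersecting_iterator layout: a document row plus a sharded term index.
    doc = Mutation(uuid)
    doc.put(cf="e", cq="", val=path)       # row=<uuid>: the document value itself
    idx = Mutation(shard)
    idx.put(cf=tok, cq=uuid, val="")       # row=<shard>: term -> document id

    # indexed_doc_iterator layout: document and index entries share the shard row,
    # which is what lets IndexedDocIterator return the document value in one scan.
    m = Mutation(shard)
    m.put(cf="e\0file", cq=uuid, val=path)
    m.put(cf="i", cq="%s\0file\0%s\0info" % (tok, uuid), val="")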
examples/indexed_doc_iterator/ingest.py (76 changes)
@@ -0,0 +1,76 @@
+#!/usr/bin/env python
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pyaccumulo import Accumulo, Mutation, Range
+from pyaccumulo.iterators import *
+
+from pyaccumulo.proxy.ttypes import IteratorSetting, IteratorScope
+from util import hashcode
+import hashlib, re
+import settings
+import sys
+import os
+
+NUM_SHARDS=4
+
+def usage(msg=None):
+ print "Usage: %s <table> <dir1> [<dir2> <dir3> ...]"%sys.argv[0]
+ sys.exit(1)
+
+def get_uuid(filePath):
+ return hashlib.md5(filePath).hexdigest()
+
+def get_shard(uuid):
+ return "s%02d"% ((hashcode(uuid) & 0x0ffffffff)%NUM_SHARDS)
+
+def get_tokens(f):
+ return set([item for sublist in [re.split('[^\w]+', line.lower()) for line in f] for item in sublist if len(item) > 3])
+
+def write_mutations(writer, shard, uuid, value, tokens):
+ m = Mutation(shard)
+ m.put(cf="e\0file", cq=uuid, val=value)
+ for tok in tokens:
+ m.put(cf="i", cq="%s\0file\0%s\0info"%(tok, uuid), val="")
+ if len(m.updates) > 1000:
+ writer.add_mutation(m)
+ m = Mutation(shard)
+
+ if len(m.updates) > 0:
+ writer.add_mutation(m)
+
+try:
+ table = sys.argv[1]
+ input_dirs = sys.argv[2:]
+except:
+ usage()
+
+conn = Accumulo(host=settings.HOST, port=settings.PORT, user=settings.USER, password=settings.PASSWORD)
+
+if not conn.table_exists(table):
+ print "Creating table: %s"%table
+ conn.create_table(table)
+
+wr = conn.create_batch_writer(table)
+
+for indir in input_dirs:
+ for root, subFolders, files in os.walk(indir):
+ for filename in files:
+ filePath = os.path.join(root, filename)
+ print "indexing file %s"%filePath
+ uuid = get_uuid(filePath)
+ with open( filePath, 'r' ) as f:
+ write_mutations(wr, get_shard(uuid), uuid, filePath, get_tokens(f))
+wr.close()
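After running this ingest script, a plain scan of one shard row shows both kinds of entries it writes. A minimal sketch (not part of this commit; it assumes the dociter_file_search table from the README and picks shard row s00 arbitrarily):

    from pyaccumulo import Accumulo, Range
    import settings

    conn = Accumulo(host=settings.HOST, port=settings.PORT,
                    user=settings.USER, password=settings.PASSWORD)
    for entry in conn.batch_scan("dociter_file_search",
                                 scanranges=[Range(srow="s00", erow="s00")]):
        # cf "e\0file" entries carry the document value (the file path);
        # cf "i" entries are the per-token index entries
        print entry.cf, entry.cq, entry.val
    conn.close()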
examples/intersecting_iterator.py → examples/indexed_doc_iterator/search.py (41 changes)
@@ -21,40 +21,21 @@
from util import hashcode
import hashlib, re
import settings
+import sys
conn = Accumulo(host=settings.HOST, port=settings.PORT, user=settings.USER, password=settings.PASSWORD)
-table = "search"
-if conn.table_exists(table):
- conn.delete_table(table)
-conn.create_table(table)
+table = sys.argv[1]
+if not conn.table_exists(table):
+ print "Table '%s' does not exist."%table
+ sys.exit(1)
-wr = conn.create_batch_writer(table)
+search_terms = [term.lower() for term in sys.argv[2:] if len(term) > 3]
-license_file = "LICENSE"
-linenum = 0
-
-with file(license_file) as infile:
- for line in infile:
- linenum += 1
- line = line.strip()
- uuid = str(linenum)
-
- m = Mutation(uuid)
- m.put(cf="e", cq="", val=line)
- wr.add_mutation(m)
-
- m = Mutation("s%02d"% ((hashcode(uuid) & 0x0ffffffff)%4))
- for tok in set(re.split('[\W]+', line.lower())):
- m.put(tok, cq=uuid, val="")
- wr.add_mutation(m)
-wr.close()
-
-uuids = []
-for e in conn.batch_scan(table, scanranges=[Range(srow="s0", erow="s1")], iterators=[IntersectingIterator(priority=21, terms=["software", "source", "code"])]):
- uuids.append(e.cq)
-
-for doc in conn.batch_scan(table, scanranges=[Range(srow=uuid, erow=uuid) for uuid in uuids]):
- print doc
+if len(search_terms) < 2:
+ print "More than one term of length > 3 is required for this example"
+ sys.exit(1)
+for e in conn.batch_scan(table, iterators=[IndexedDocIterator(priority=21, terms=search_terms)]):
+ print e.val
conn.close()
examples/intersecting_iterator/ingest.py (79 changes)
@@ -0,0 +1,79 @@
+#!/usr/bin/env python
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pyaccumulo import Accumulo, Mutation, Range
+from pyaccumulo.iterators import *
+
+from pyaccumulo.proxy.ttypes import IteratorSetting, IteratorScope
+from util import hashcode
+import hashlib, re
+import settings
+import sys
+import os
+
+NUM_SHARDS=4
+
+def usage(msg=None):
+ print "Usage: %s <table> <dir1> [<dir2> <dir3> ...]"%sys.argv[0]
+ sys.exit(1)
+
+def get_uuid(filePath):
+ return hashlib.md5(filePath).hexdigest()
+
+def get_shard(uuid):
+ return "s%02d"% ((hashcode(uuid) & 0x0ffffffff)%NUM_SHARDS)
+
+def get_tokens(f):
+ return set([item for sublist in [re.split('[^\w]+', line.lower()) for line in f] for item in sublist if len(item) > 3])
+
+def write_mutations(writer, shard, uuid, value, tokens):
+ m = Mutation(uuid)
+ m.put(cf="e", cq="", val=value)
+ writer.add_mutation(m)
+
+ m = Mutation(shard)
+ for tok in tokens:
+ m.put(tok, cq=uuid, val="")
+ if len(m.updates) > 1000:
+ writer.add_mutation(m)
+ m = Mutation(shard)
+
+ if len(m.updates) > 0:
+ writer.add_mutation(m)
+
+try:
+ table = sys.argv[1]
+ input_dirs = sys.argv[2:]
+except:
+ usage()
+
+conn = Accumulo(host=settings.HOST, port=settings.PORT, user=settings.USER, password=settings.PASSWORD)
+
+if not conn.table_exists(table):
+ print "Creating table: %s"%table
+ conn.create_table(table)
+
+wr = conn.create_batch_writer(table)
+
+for indir in input_dirs:
+ for root, subFolders, files in os.walk(indir):
+ for filename in files:
+ filePath = os.path.join(root, filename)
+ print "indexing file %s"%filePath
+ uuid = get_uuid(filePath)
+ with open( filePath, 'r' ) as f:
+ write_mutations(wr, get_shard(uuid), uuid, filePath, get_tokens(f))
+wr.close()
examples/doc_search.py → examples/intersecting_iterator/search.py (37 changes)
@@ -21,32 +21,29 @@
from util import hashcode
import hashlib, re
import settings
+import sys
conn = Accumulo(host=settings.HOST, port=settings.PORT, user=settings.USER, password=settings.PASSWORD)
-table = "doc_search"
-if conn.table_exists(table):
- conn.delete_table(table)
-conn.create_table(table)
+table = sys.argv[1]
+if not conn.table_exists(table):
+ print "Table '%s' does not exist."%table
+ sys.exit(1)
-wr = conn.create_batch_writer(table)
+search_terms = [term.lower() for term in sys.argv[2:] if len(term) > 3]
-license_file = "LICENSE"
-linenum = 0
+if len(search_terms) < 2:
+ print "More than one term of length > 3 is required for this example"
+ sys.exit(1)
-with file(license_file) as infile:
- for line in infile:
- linenum += 1
- line = line.strip()
- uuid = str(linenum)
+uuids = []
+for e in conn.batch_scan(table, scanranges=[Range(srow="s", erow="t")], iterators=[IntersectingIterator(priority=21, terms=search_terms)]):
+ uuids.append(e.cq)
- m = Mutation("s%02d"% ((hashcode(uuid) & 0x0ffffffff)%4))
- m.put(cf="e\0license", cq=uuid, val=line)
- for tok in set(re.split('[^\w.]+', line.lower())):
- m.put(cf="i", cq="%s\0license\0%s\0info"%(tok, uuid), val="")
- wr.add_mutation(m)
-wr.close()
+if len(uuids) > 0:
+ for doc in conn.batch_scan(table, scanranges=[Range(srow=uuid, erow=uuid) for uuid in uuids]):
+ print doc.val
+else:
+ print "No results found"
-for e in conn.batch_scan(table, scanranges=[Range(srow="s0", erow="s1")], iterators=[IndexedDocIterator(priority=21, terms=["derived", "from"])]):
- print e
conn.close()
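Note the structural difference between the two search scripts: this one needs two batch_scan passes (an IntersectingIterator scan over the shard rows to collect document uuids, then a second scan over those uuid rows to fetch the file paths), whereas the indexed_doc_iterator version above returns the document value directly from a single IndexedDocIterator scan.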