Skip to content

Commit

Permalink
separated ngrams and cooc into mongodb
Browse files Browse the repository at this point in the history
  • Loading branch information
elishowk committed Jan 31, 2011
1 parent 943fb55 commit 9c98d45
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 24 deletions.
38 changes: 14 additions & 24 deletions cableextractor.py
Expand Up @@ -58,38 +58,28 @@ def update_cable_cooc(self, cable):
cooccache={}
for ng1, ng2 in itertools.combinations(cable['edges']['NGram'].keys(), 2):
coocid12 = ng1+"_"+ng2
coocid21 = ng2+"_"+ng1
if coocid12 not in cooccache:
if coocid21 not in cooccache:
cooccache[coocid12] = self.mongodb.cooc.find_one({'_id': coocid12})
if cooccache[coocid12] is None:
del cooccache[coocid12]
cooccache[coocid21] = self.mongodb.cooc.find_one({'_id': coocid21})
if cooccache[coocid21] is None:
del cooccache[coocid21]
### nothing in cache nor in mongo
cooccache[coocid12] = { '_id': coocid12 }
cooccache[coocid12] = addEdge(cooccache[coocid12], "NGram", ng2, 1)
else:
### new in cache but was in mongo
cooccache[coocid21] = addEdge(cooccache[coocid21], "NGram", ng1, 1)
else:
cooccache[coocid12] = addEdge(cooccache[coocid12], "NGram", ng2, 1)
cooc12 = self.mongodb.cooc.find_one({'_id': coocid12})
if cooc12 is None:
coocid21 = ng2+"_"+ng1
cooc21 = self.mongodb.cooc.find_one({'_id': coocid21})
if cooc21 is None:
cooc12 = { '_id': coocid12 }
cooc12 = addEdge(coocid12, "NGram", ng2, 1)
self.mongodb.cooc.save(cooc12)
continue
else:
cooccache[coocid21] = addEdge(cooccache[coocid21], "NGram", ng1, 1)
else:
cooccache[coocid12] = addEdge(cooccache[coocid12], "NGram", ng2, 1)

for cooc in cooccache.itervalues():
self.mongodb.cooc.save(cooc)
coocid21 = addEdge(coocid21, "NGram", ng1, 1)
self.mongodb.cooc.save(cooc21)
continue

def extract(self, ngramizer, filters, postagger, overwrite):
"""
Gets all the cables from storage, then extracts n-grams and produces network edges and weights
"""
if overwrite is True and "ngrams" in self.mongodb.collection_names():
self.mongodb.ngrams.remove()

if overwrite is True and "cooc" in self.mongodb.collection_names():
self.mongodb.cooc.remove()
for cable in self.mongodb.cables.find(timeout=False):
if cable is None:
logging.warning("cable %d not found in the database, skipping"%cable_id)
Expand Down
1 change: 1 addition & 0 deletions cabletokenizer.py
Expand Up @@ -161,6 +161,7 @@ def ngramize(self, document, minSize, maxSize, tagTokens, filters, stemmer):
"""
documentnode = get_node(self.graphdb, document['_id'])
if documentnode is None:
del document['content']
documentnode = add_node(self.graphdb, document)
# content is the list of words from tagTokens
content = self.getContent(tagTokens)
Expand Down

0 comments on commit 9c98d45

Please sign in to comment.