Skip to content

Commit

Permalink
starting cable network producer
Browse files Browse the repository at this point in the history
  • Loading branch information
elishowk committed Dec 23, 2010
1 parent 6ca8ae7 commit 1c080ac
Show file tree
Hide file tree
Showing 4 changed files with 115 additions and 39 deletions.
8 changes: 4 additions & 4 deletions cableextractor.py → cableindexer.py
Expand Up @@ -23,13 +23,13 @@

from tinasoft.pytextminer import stopwords, filtering, tagger, stemmer

class CableExtractor(object):
class CableIndexer(object):
"""
Reads all database entries to produce a network
usage :
indexer = CableIndexer(storage, config)
"""
def __init__(self, storage, config, minoccs=1):
def __init__(self, storage, config):
self.storage = storage
self.config = config
filters = self._get_extraction_filters()
Expand All @@ -39,9 +39,9 @@ def __init__(self, storage, config, minoccs=1):
trained_pickle = self.config['extraction']['tagger']
)
self.storage.ngrams.remove()
self.index_cables(NGramizer(self.storage, self.config['extraction']), filters, postagger, minoccs)
self.index_cables(NGramizer(self.storage, self.config['extraction']), filters, postagger)

def index_cables(self, ngramizer, filters, postagger, minoccs):
def index_cables(self, ngramizer, filters, postagger):
"""
gets a document from storage then extracts n-grams
"""
Expand Down
66 changes: 66 additions & 0 deletions cablenetwork.py
@@ -0,0 +1,66 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2010 elishowk@nonutc.fr
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import logging
logging.basicConfig(level=logging.DEBUG, format="%(levelname)-8s %(message)s")

from bson.code import Code

class CoocNetwork(object):
    """
    Produces the n-gram cooccurrence edges from the stored cables.

    Instantiating the class stores its arguments and immediately runs a
    map-reduce job on the ``cables`` collection of ``storage``.
    """

    # JavaScript map function, executed by MongoDB once per cable document.
    # For every n-gram id found under ``edges.NGram`` it emits an object whose
    # keys are all the *other* n-gram ids of the same cable, each with value 1.
    MAPPER_JS = """
        function() {
            for (var ngramid in this.edges.NGram) {
                var coocslice = {};
                for (var neighbourid in this.edges.NGram) {
                    if (neighbourid != ngramid) {
                        coocslice[neighbourid] = 1;
                    }
                }
                emit(ngramid, coocslice);
            }
        }
    """

    # JavaScript reduce function: sums, neighbour by neighbour, the slices
    # emitted for one n-gram id. The returned object has the same shape as an
    # emitted slice.
    REDUCER_JS = """
        function(ngramid, coocslices) {
            var totalcooc = {};
            for (var i = 0; i < coocslices.length; i++) {
                // index the array: a for-in loop would only yield the indices
                var slice = coocslices[i];
                for (var neighbourid in slice) {
                    if (neighbourid in totalcooc) {
                        totalcooc[neighbourid] += slice[neighbourid];
                    } else {
                        totalcooc[neighbourid] = slice[neighbourid];
                    }
                }
            }
            return totalcooc;
        }
    """

    def __init__(self, storage, config, minoccs=1):
        """
        Keeps references to the storage and the configuration, then runs the
        map-reduce job.

        storage -- database handle exposing a ``cables`` collection
        config -- configuration object (stored, not read by this class)
        minoccs -- forwarded to map_reduce()
        """
        self.storage = storage
        self.config = config
        # passed by keyword: the first positional slots of map_reduce() are
        # the (unused) ngramizer, filters and postagger parameters
        self.map_reduce(minoccs=minoccs)

    def map_reduce(self, ngramizer=None, filters=None, postagger=None, minoccs=1):
        """
        Executes a map-reduce operation on the mongodb cable documents to
        produce the cooccurrence edges matrix.

        ngramizer, filters, postagger and minoccs are accepted for signature
        compatibility only; none of them is used by the job yet.

        Returns whatever ``storage.cables.map_reduce`` returns.
        """
        # NOTE(review): recent pymongo releases require an explicit ``out``
        # argument for map_reduce -- confirm against the installed version.
        result = self.storage.cables.map_reduce(
            self.get_mapper(),
            self.get_reducer()
        )
        logging.info("CoocNetwork.map_reduce is done")
        return result

    def get_mapper(self):
        """
        Returns the map function wrapped in a bson Code object.
        """
        return Code(self.MAPPER_JS)

    def get_reducer(self):
        """
        Returns the reduce function wrapped in a bson Code object.
        """
        return Code(self.REDUCER_JS)
56 changes: 30 additions & 26 deletions cabletokenizer.py
Expand Up @@ -201,34 +201,38 @@ def ngramize(self, document, doc_ngrams, minSize, maxSize, tagTokens, filters, s

else:
# id made from the stemmedcontent and label made from the real tokens
ngram = {
'_id': ngid,
'id': ngid,
'label': label,
'content': content[i:n+i],
'edges': {
'postag' : { label : tags[i:n+i] },
'label': { label : 1 },
'Document': { document['id'] : 1 }
},
'postag' : tags[i:n+i]
}
# application defined filtering
if filtering.apply_filters(ngram, filters) is True:
doc_ngrams += [ngid]
self.storage.ngrams.insert(ngram)
self.storage.cables.update(
{ '_id': document['id'] },
{
'$inc': {
'edges': {
'NGram': {
ngid : 1
try:
ngram = {
'_id': ngid,
'id': ngid,
'label': label,
'content': content[i:n+i],
'edges': {
'postag' : { label : tags[i:n+i] },
'label': { label : 1 },
'Document': { document['id'] : 1 }
},
'postag' : tags[i:n+i]
}
# application defined filtering
if filtering.apply_filters(ngram, filters) is True:
doc_ngrams += [ngid]
self.storage.ngrams.insert(ngram)
self.storage.cables.update(
{ '_id': document['id'] },
{
'$inc': {
'edges': {
'NGram': {
ngid : 1
}
}
}
}
}
)
#logging.debug( self.storage.ngrams.find_one({ '_id': ngid }, { 'edges': 1 }) )
)
#logging.debug( self.storage.ngrams.find_one({ '_id': ngid }, { 'edges': 1 }) )

except Exception, exc:
logging.error("error inserting new ngram %s : %s"%(label, exc))

return doc_ngrams
24 changes: 15 additions & 9 deletions execute.py
Expand Up @@ -21,16 +21,16 @@

from mongodbhandler import CablegateDatabase
from cableimporter import CableImporter
from cableextractor import CableExtractor

from datamodel import NGram
from cableindexer import CableIndexer
from cablenetwork import CoocNetwork

import yaml

def get_parser():
    """
    Builds the command line parser of the script.

    Options (all stored as strings, None when omitted):
    -e/--execute      action to run, stored in ``execute``
    -a/--archive      cablegate archive path, stored in ``archive``
    -o/--occurrences  minimum keyphrases' occurrences, stored in ``minoccs``
    -c/--config       config yaml file path, stored in ``config``
    """
    parser = OptionParser()
    parser.add_option("-e", "--execute", dest="execute", help="execution action: import, index, graph or print", metavar="ACTION")
    parser.add_option("-a", "--archive", dest="archive", help="cablegate archive path", metavar="FILE")
    # registered exactly once: adding the same option string twice makes
    # optparse raise OptionConflictError
    parser.add_option("-o", "--occurrences", dest="minoccs", help="minimum keyphrases' occurrences", metavar="int")
    parser.add_option("-c", "--config", dest="config", help="config yaml file path", metavar="FILE")
    return parser

Expand All @@ -42,8 +42,14 @@ def get_parser():
config = yaml.safe_load( file( options.config, 'rU' ) )

mongoconnection = CablegateDatabase("localhost")
#importer = CableImporter( mongoconnection["cablegate"], options.archive )
extractor = CableExtractor(mongoconnection["cablegate"], config, int(options.minoccs))
for ngram in mongoconnection["cablegate"].ngrams.find().limit(10):
obj = NGram(ngram)
logging.debug( obj.data )
# Run the action selected with -e/--execute; any other value does nothing.
if options.execute == 'import':
    importer = CableImporter( mongoconnection["cablegate"], options.archive )
if options.execute == 'index':
    extractor = CableIndexer(mongoconnection["cablegate"], config)
if options.execute == 'graph':
    # -o is optional: fall back to 1 instead of failing on int(None)
    minoccs = 1 if options.minoccs is None else int(options.minoccs)
    # CoocNetwork is the name this script imports from cablenetwork
    extractor = CoocNetwork(mongoconnection["cablegate"], config, minoccs)
if options.execute == 'print':
    # debugging aid: dump a few stored n-grams and cables to the log
    for ngram in mongoconnection["cablegate"].ngrams.find().limit(10):
        logging.debug( ngram )
    for doc in mongoconnection["cablegate"].cables.find().limit(2):
        logging.debug( doc )

0 comments on commit 1c080ac

Please sign in to comment.