From 1c080ac2884795dd9a7e6acab756459a7885587d Mon Sep 17 00:00:00 2001 From: elishowk Date: Thu, 23 Dec 2010 19:36:23 +0100 Subject: [PATCH] starting cable network producer --- cableextractor.py => cableindexer.py | 8 ++-- cablenetwork.py | 66 ++++++++++++++++++++++++++++ cabletokenizer.py | 56 ++++++++++++----------- execute.py | 24 ++++++---- 4 files changed, 115 insertions(+), 39 deletions(-) rename cableextractor.py => cableindexer.py (92%) create mode 100644 cablenetwork.py diff --git a/cableextractor.py b/cableindexer.py similarity index 92% rename from cableextractor.py rename to cableindexer.py index 9bab5f4..ac5bcd4 100644 --- a/cableextractor.py +++ b/cableindexer.py @@ -23,13 +23,13 @@ from tinasoft.pytextminer import stopwords, filtering, tagger, stemmer -class CableExtractor(object): +class CableIndexer(object): """ Reads all database entries to produce a network usage : extractor = Exporter(minoccs=2) """ - def __init__(self, storage, config, minoccs=1): + def __init__(self, storage, config): self.storage = storage self.config = config filters = self._get_extraction_filters() @@ -39,9 +39,9 @@ def __init__(self, storage, config, minoccs=1): trained_pickle = self.config['extraction']['tagger'] ) self.storage.ngrams.remove() - self.index_cables(NGramizer(self.storage, self.config['extraction']), filters, postagger, minoccs) + self.index_cables(NGramizer(self.storage, self.config['extraction']), filters, postagger) - def index_cables(self, ngramizer, filters, postagger, minoccs): + def index_cables(self, ngramizer, filters, postagger): """ gets the a document from storage then extract n-grams """ diff --git a/cablenetwork.py b/cablenetwork.py new file mode 100644 index 0000000..612eb4c --- /dev/null +++ b/cablenetwork.py @@ -0,0 +1,66 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2010 elishowk@nonutc.fr +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the 
Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +import logging +logging.basicConfig(level=logging.DEBUG, format="%(levelname)-8s %(message)s") + +from bson.code import Code + +class CoocNetwork(object): + """ + Reads all database entries to produce a network + """ + def __init__(self, storage, config, minoccs=1): + self.storage = storage + self.config = config + self.map_reduce(minoccs) + + def map_reduce(self, ngramizer, filters, postagger, minoccs): + """ + execute a map-reduce operation on mongodb documents to produce the coocurrence edges matrix + """ + result = self.storage.cables.map_reduce( self.get_mapper(), self.get_reducer() ) + logging.info("CableExtractor.map_reduce is done") + + def get_mapper(self): + return Code( + function() { + for (var ngramid in this.edges.NGram) { + var coocslice = {}; + for (var neighbourid in this.edges.NGram) { + if (neighbourid != ngramid) { + coocslice[neighbourid] = 1; + } + } + emit(ngramid, coocslice); + } + } + ) + + def get_reducer(self): + return Code( + function(ngramid, coocslices) { + totalcooc = {}; + for ( var slice in coocslices ) { + for ( var neighbourid in slice ) { + if ( neighbourid in totalcooc ) + totalcooc[neighbourid] += slice[neighbourid] + else + totalcooc[neighbourid] = slice[neighbourid] + } + } + return totalcooc; + } + ) \ No newline at end of file diff --git a/cabletokenizer.py b/cabletokenizer.py index a2c70ed..ca94000 100644 --- a/cabletokenizer.py +++ b/cabletokenizer.py @@ -201,34 +201,38 @@ def ngramize(self, document, doc_ngrams, minSize, maxSize, tagTokens, filters, s else: # id 
made from the stemmedcontent and label made from the real tokens - ngram = { - '_id': ngid, - 'id': ngid, - 'label': label, - 'content': content[i:n+i], - 'edges': { - 'postag' : { label : tags[i:n+i] }, - 'label': { label : 1 }, - 'Document': { document['id'] : 1 } - }, - 'postag' : tags[i:n+i] - } - # application defined filtering - if filtering.apply_filters(ngram, filters) is True: - doc_ngrams += [ngid] - self.storage.ngrams.insert(ngram) - self.storage.cables.update( - { '_id': document['id'] }, - { - '$inc': { - 'edges': { - 'NGram': { - ngid : 1 + try: + ngram = { + '_id': ngid, + 'id': ngid, + 'label': label, + 'content': content[i:n+i], + 'edges': { + 'postag' : { label : tags[i:n+i] }, + 'label': { label : 1 }, + 'Document': { document['id'] : 1 } + }, + 'postag' : tags[i:n+i] + } + # application defined filtering + if filtering.apply_filters(ngram, filters) is True: + doc_ngrams += [ngid] + self.storage.ngrams.insert(ngram) + self.storage.cables.update( + { '_id': document['id'] }, + { + '$inc': { + 'edges': { + 'NGram': { + ngid : 1 + } } } } - } - ) - #logging.debug( self.storage.ngrams.find_one({ '_id': ngid }, { 'edges': 1 }) ) + ) + #logging.debug( self.storage.ngrams.find_one({ '_id': ngid }, { 'edges': 1 }) ) + + except Exception, exc: + logging.error("error inserting new ngram %s : %s"%(label, exc)) return doc_ngrams \ No newline at end of file diff --git a/execute.py b/execute.py index 85218ab..5d02f91 100644 --- a/execute.py +++ b/execute.py @@ -21,16 +21,16 @@ from mongodbhandler import CablegateDatabase from cableimporter import CableImporter -from cableextractor import CableExtractor - -from datamodel import NGram +from cableindexer import CableIndexer +from cablenetwork import CoocNetwork import yaml def get_parser(): parser = OptionParser() + parser.add_option("-e", "--execute", dest="execute", help="execution action", metavar="FILE") parser.add_option("-a", "--archive", dest="archive", help="cablegate archive path", metavar="FILE") - 
parser.add_option("-o", "--occurrences", dest="minoccs", help="minimum keyphrase occurrence", metavar="int") + parser.add_option("-o", "--occurrences", dest="minoccs", help="minimum keyphrases' occurrences", metavar="int") parser.add_option("-c", "--config", dest="config", help="config yaml file path", metavar="FILE") return parser @@ -42,8 +42,14 @@ def get_parser(): config = yaml.safe_load( file( options.config, 'rU' ) ) mongoconnection = CablegateDatabase("localhost") - #importer = CableImporter( mongoconnection["cablegate"], options.archive ) - extractor = CableExtractor(mongoconnection["cablegate"], config, int(options.minoccs)) - for ngram in mongoconnection["cablegate"].ngrams.find().limit(10): - obj = NGram(ngram) - logging.debug( obj.data ) \ No newline at end of file + if options.execute == 'import': + importer = CableImporter( mongoconnection["cablegate"], options.archive ) + if options.execute == 'index': + extractor = CableIndexer(mongoconnection["cablegate"], config) + if options.execute == 'graph': + extractor = CoocGraph(mongoconnection["cablegate"], config, int(options.minoccs)) + if options.execute == 'print': + for ngram in mongoconnection["cablegate"].ngrams.find().limit(10): + logging.debug( ngram ) + for doc in mongoconnection["cablegate"].cables.find().limit(2): + logging.debug( doc ) \ No newline at end of file