diff --git a/INSTALL.md b/INSTALL.md new file mode 100644 index 0000000..f67a0cf --- /dev/null +++ b/INSTALL.md @@ -0,0 +1,41 @@ +##INSTALLATION AND USAGE + +`python setup.py develop` + + - this will check and install required dependencies + +###PREREQUISITES + +- Local copy of the cablegate torrent into data/cablegate.wikileaks.org/ +- MongoDB (http://www.mongodb.org) +- please fetch manually a stable version of pymongo for your system (the pypi one is broken) : http://api.mongodb.org/python/1.9%2B/index.html + +#### Gephi Software + +- You can try Gephi yourself by downloading it from the [downloads page](http://gephi.org/users/download/). +- For any question try the [Gephi forums](http://forum.gephi.org) or [@Gephi](http://twitter.com/gephi) on Twitter. +- You can follow the 5-min tutorial here: [quick-start tutorial](http://gephi.org/users/quick-start/) + +###USAGE + + - start your mongod daemon + - find command-line help : + +`python cablegate_semnet.py -h` + + - to export all the data run: + +`mongoexport -d cables -c cables -o dump/cables.json` + + +###JSON OUTPUT + +`cable = { + "_id" : "XXX", + "origin" : "EMBASSY NAME", + "date_time" : "2000-00-00 00:00", + "classification" : "TAG", + "content" : "text", + "label" : "label", + "id" : "XXX" +}` \ No newline at end of file diff --git a/cableindexer.py b/cableindexer.py index ac5bcd4..c7b573f 100644 --- a/cableindexer.py +++ b/cableindexer.py @@ -43,7 +43,7 @@ def __init__(self, storage, config): def index_cables(self, ngramizer, filters, postagger): """ - gets the a document from storage then extract n-grams + gets the all cables from storage then extract n-grams """ for cable in self.storage.cables.find(): if cable is None: diff --git a/cablenetwork.py b/cablenetwork.py index 612eb4c..9a16cd9 100644 --- a/cablenetwork.py +++ b/cablenetwork.py @@ -16,7 +16,7 @@ import logging logging.basicConfig(level=logging.DEBUG, format="%(levelname)-8s %(message)s") -from bson.code import Code +from pymongo import bson class CoocNetwork(object): """ @@ -31,36 +31,36 @@ def map_reduce(self, ngramizer, filters, postagger, minoccs): """ execute a map-reduce operation on mongodb documents to produce the coocurrence edges matrix """ - result = self.storage.cables.map_reduce( self.get_mapper(), self.get_reducer() ) + result = self.storage.cables.map_reduce( self.get_mapper(), self.get_reducer(), out="cooccurrences", verbose="true" ) logging.info("CableExtractor.map_reduce is done") def get_mapper(self): - return Code( - function() { - for (var ngramid in this.edges.NGram) { - var coocslice = {}; - for (var neighbourid in this.edges.NGram) { - if (neighbourid != ngramid) { - coocslice[neighbourid] = 1; - } - } - emit(ngramid, coocslice); - } - } + return bson.code.Code( + "function() {" + " for (var ngramid in this.edges.NGram) {" + " var coocslice = {};" + " for (var neighbourid in this.edges.NGram[ngramid]) {" + " if (neighbourid != ngramid) {" + " coocslice[neighbourid] = 1;" + " }" + " }" + " emit(ngramid, coocslice);" + " }" + "}" ) def get_reducer(self): - return Code( - function(ngramid, coocslices) { - totalcooc = {}; - for ( var slice in coocslices ) { - for ( var neighbourid in slice ) { - if ( neighbourid in totalcooc ) - totalcooc[neighbourid] += slice[neighbourid] - else - totalcooc[neighbourid] = slice[neighbourid] - } - } - return totalcooc; - } + return bson.code.Code( + "function(ngramid, coocslices) {" + " totalcooc = {};" + " for ( var slice in coocslices ) {" + " for ( var neighbourid in slice ) {" + " if ( neighbourid in totalcooc )" + " totalcooc[neighbourid] += slice[neighbourid];" + " else" + " totalcooc[neighbourid] = slice[neighbourid];" + " }" + " }" + " return totalcooc;" + "}" ) \ No newline at end of file diff --git a/cabletokenizer.py b/cabletokenizer.py index ca94000..8c432dd 100644 --- a/cabletokenizer.py +++ b/cabletokenizer.py @@ -171,17 +171,18 @@ def ngramize(self, document, doc_ngrams, minSize, maxSize, tagTokens, filters, s ngram = self.storage.ngrams.find_one({'_id': ngid}) if ngram is not None: # general edges updates + #ngram.edges.label[label] += 1; + #ngram.edges.Document[document['_id']] += 1; + #ngram.edges.postag[label] = tags[i:n+i]; self.storage.ngrams.update( { '_id': ngid }, { "$inc" : { - 'edges': { - 'label' : { label : 1 }, - 'Document' : { document['_id'] : 1 } - } + 'edges.label': { label : 1 }, + 'edges.Document' : { document['_id'] : 1 } }, - 'edges': { - 'postag' : { label: tags[i:n+i] } + "$set": { + 'edges.postag' : { label: tags[i:n+i] } } } ) @@ -189,11 +190,7 @@ def ngramize(self, document, doc_ngrams, minSize, maxSize, tagTokens, filters, s { '_id': document['id'] }, { '$inc': { - 'edges': { - 'NGram': { - ngid : 1 - } - } + 'edges.NGram': { ngid : 1 } } } ) @@ -221,12 +218,8 @@ def ngramize(self, document, doc_ngrams, minSize, maxSize, tagTokens, filters, s self.storage.cables.update( { '_id': document['id'] }, { - '$inc': { - 'edges': { - 'NGram': { - ngid : 1 - } - } + '$set': { + 'edges.NGram': { ngid : 1 } } } ) diff --git a/execute.py b/execute.py index 5d02f91..20ee8d3 100644 --- a/execute.py +++ b/execute.py @@ -51,5 +51,5 @@ def get_parser(): if options.execute == 'print': for ngram in mongoconnection["cablegate"].ngrams.find().limit(10): logging.debug( ngram ) - for doc in mongoconnection["cablegate"].cables.find().limit(2): + for doc in mongoconnection["cablegate"].cables.find({"$ne": { edges.NGram.length: 0 }}).limit(2): logging.debug( doc ) \ No newline at end of file