Skip to content

Commit

Permalink
moved usage to INSTALL.md
Browse files Browse the repository at this point in the history
  • Loading branch information
elishowk committed Dec 24, 2010
1 parent 4618f0a commit 7c27487
Show file tree
Hide file tree
Showing 5 changed files with 80 additions and 46 deletions.
41 changes: 41 additions & 0 deletions INSTALL.md
@@ -0,0 +1,41 @@
## INSTALLATION AND USAGE

`python setup.py develop`

- this will check and install required dependencies

### PREREQUISITES

- A local copy of the cablegate torrent, placed in data/cablegate.wikileaks.org/
- MongoDB (http://www.mongodb.org)
- Please manually fetch a stable version of pymongo for your system (the PyPI one is broken): http://api.mongodb.org/python/1.9%2B/index.html

#### Gephi Software

- You can try Gephi yourself by downloading it from the [downloads page](http://gephi.org/users/download/).
- For any questions, try the [Gephi forums](http://forum.gephi.org) or [@Gephi](http://twitter.com/gephi) on Twitter.
- You can follow the 5-min tutorial here: [quick-start tutorial](http://gephi.org/users/quick-start/)

### USAGE

- start your mongod daemon
- display the command-line help:

`python cablegate_semnet.py -h`

- to export all the data run:

`mongoexport -d cables -c cables -o dump/cables.json`


### JSON OUTPUT

```
cable = {
"_id" : "XXX",
"origin" : "EMBASSY NAME",
"date_time" : "2000-00-00 00:00",
"classification" : "TAG",
"content" : "text",
"label" : "label",
"id" : "XXX"
}
```
2 changes: 1 addition & 1 deletion cableindexer.py
Expand Up @@ -43,7 +43,7 @@ def __init__(self, storage, config):

def index_cables(self, ngramizer, filters, postagger):
"""
gets the a document from storage then extract n-grams
gets the all cables from storage then extract n-grams
"""
for cable in self.storage.cables.find():
if cable is None:
Expand Down
54 changes: 27 additions & 27 deletions cablenetwork.py
Expand Up @@ -16,7 +16,7 @@
import logging
logging.basicConfig(level=logging.DEBUG, format="%(levelname)-8s %(message)s")

from bson.code import Code
from pymongo import bson

class CoocNetwork(object):
"""
Expand All @@ -31,36 +31,36 @@ def map_reduce(self, ngramizer, filters, postagger, minoccs):
"""
execute a map-reduce operation on mongodb documents to produce the coocurrence edges matrix
"""
result = self.storage.cables.map_reduce( self.get_mapper(), self.get_reducer() )
result = self.storage.cables.map_reduce( self.get_mapper(), self.get_reducer(), out="cooccurrences", verbose="true" )
logging.info("CableExtractor.map_reduce is done")

def get_mapper(self):
return Code(
function() {
for (var ngramid in this.edges.NGram) {
var coocslice = {};
for (var neighbourid in this.edges.NGram) {
if (neighbourid != ngramid) {
coocslice[neighbourid] = 1;
}
}
emit(ngramid, coocslice);
}
}
return bson.code.Code(
"function() {"
" for (var ngramid in this.edges.NGram) {"
" var coocslice = {};"
" for (var neighbourid in this.edges.NGram[ngramid]) {"
" if (neighbourid != ngramid) {"
" coocslice[neighbourid] = 1;"
" }"
" }"
" emit(ngramid, coocslice);"
" }"
"}"
)

def get_reducer(self):
return Code(
function(ngramid, coocslices) {
totalcooc = {};
for ( var slice in coocslices ) {
for ( var neighbourid in slice ) {
if ( neighbourid in totalcooc )
totalcooc[neighbourid] += slice[neighbourid]
else
totalcooc[neighbourid] = slice[neighbourid]
}
}
return totalcooc;
}
return bson.code.Code(
"function(ngramid, coocslices) {"
" totalcooc = {};"
" for ( var slice in coocslices ) {"
" for ( var neighbourid in slice ) {"
" if ( neighbourid in totalcooc )"
" totalcooc[neighbourid] += slice[neighbourid];"
" else"
" totalcooc[neighbourid] = slice[neighbourid];"
" }"
" }"
" return totalcooc;"
"}"
)
27 changes: 10 additions & 17 deletions cabletokenizer.py
Expand Up @@ -171,29 +171,26 @@ def ngramize(self, document, doc_ngrams, minSize, maxSize, tagTokens, filters, s
ngram = self.storage.ngrams.find_one({'_id': ngid})
if ngram is not None:
# general edges updates
#ngram.edges.label[label] += 1;
#ngram.edges.Document[document['_id']] += 1;
#ngram.edges.postag[label] = tags[i:n+i];
self.storage.ngrams.update(
{ '_id': ngid },
{
"$inc" : {
'edges': {
'label' : { label : 1 },
'Document' : { document['_id'] : 1 }
}
'edges.label': { label : 1 },
'edges.Document' : { document['_id'] : 1 }
},
'edges': {
'postag' : { label: tags[i:n+i] }
"$set": {
'edges.postag' : { label: tags[i:n+i] }
}
}
)
self.storage.cables.update(
{ '_id': document['id'] },
{
'$inc': {
'edges': {
'NGram': {
ngid : 1
}
}
'edges.NGram': { ngid : 1 }
}
}
)
Expand Down Expand Up @@ -221,12 +218,8 @@ def ngramize(self, document, doc_ngrams, minSize, maxSize, tagTokens, filters, s
self.storage.cables.update(
{ '_id': document['id'] },
{
'$inc': {
'edges': {
'NGram': {
ngid : 1
}
}
'$set': {
'edges.NGram': { ngid : 1 }
}
}
)
Expand Down
2 changes: 1 addition & 1 deletion execute.py
Expand Up @@ -51,5 +51,5 @@ def get_parser():
if options.execute == 'print':
for ngram in mongoconnection["cablegate"].ngrams.find().limit(10):
logging.debug( ngram )
for doc in mongoconnection["cablegate"].cables.find().limit(2):
for doc in mongoconnection["cablegate"].cables.find({"$ne": { edges.NGram.length: 0 }}).limit(2):
logging.debug( doc )

0 comments on commit 7c27487

Please sign in to comment.