From d3725f94b59ab3c6a5b3a39efe4bdab97bd19952 Mon Sep 17 00:00:00 2001
From: elishowk
Date: Thu, 3 Feb 2011 20:25:18 +0100
Subject: [PATCH] new tfidf script

---
Review notes (ignored by git-am, not part of the commit message):
 - The line structure of this patch was restored; it had been collapsed
   onto a single line and could not be applied.
 - Payload amended in review: the reference-corpus lookup is hoisted out
   of the loop, print is written in the form valid on Python 2 and 3, and
   comments were added.  The blob id on the "index" line predates these
   amendments.

 tfidf.py | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 tfidf.py

diff --git a/tfidf.py b/tfidf.py
new file mode 100644
index 0000000..9e45410
--- /dev/null
+++ b/tfidf.py
@@ -0,0 +1,31 @@
+"""Annotate every n-gram in the 'cablegate' MongoDB database with a tf-idf score.
+
+Each document of the ``ngrams`` collection gets a ``tfidf`` field computed from
+its ``label`` against the 'news' and 'government' categories of the Brown corpus.
+"""
+import pymongo
+from pymongo import Connection
+import nltk
+from nltk.corpus import brown
+from nltk.text import TextCollection
+
+MONGODB_PORT = 27017
+
+mongodb = Connection("localhost", MONGODB_PORT)['cablegate']
+
+# Look the reference words up once: the same sequence seeds the collection and
+# is the text handed to every tf_idf() call, so it is loop-invariant.
+reference_words = brown.words(categories=['news', 'government'])
+# NOTE(review): TextCollection is documented as taking a list of texts; a flat
+# word sequence is passed here, as before -- confirm the scores are as intended.
+browntext = TextCollection(reference_words)
+
+count = 0
+# timeout=False asks the server not to time the cursor out while the loop runs.
+for ng in mongodb.ngrams.find(timeout=False):
+    score = browntext.tf_idf(ng['label'], reference_words)
+    mongodb.ngrams.update({"_id": ng["_id"]}, {"$set": {"tfidf": score}})
+    count += 1
+# NOTE(review): the collapsed original did not show whether this line sat inside
+# the loop; it is emitted once, after the loop, as a summary -- confirm.
+print("updated tfidf for %d topics" % count)