Skip to content

Commit

Permalink
Added training function to pull streams from various lat/lons to get …
Browse files Browse the repository at this point in the history
…language training data
  • Loading branch information
brendannee committed Mar 24, 2012
1 parent e2657d2 commit 6ff856c
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 30 deletions.
62 changes: 47 additions & 15 deletions lib/train.js
@@ -1,16 +1,48 @@
{
"name": "naive-baysian-twitter",
"description": "",
"version": "0.0.0",
"author": "Brendan Nee <me@bn.ee>",
"dependencies": {
"express": ">= 2.5.8"
, "mongoose": ">= 2.5.7"
, "async": ">= 0.1.16"
, "request": ">= 2.9.152"
, "underscore": ">= 1.3.1"
, "ntwitter": ">=0.2.10"
, "socket.io": ">=0.9.1-1"
},
"main": "index"
var models = require('../models/models')
, async = require('async')
, _ = require('underscore')
, languages = require('./languages');

module.exports = function train(app, cb){
var Tweet = app.set('db').model('Tweet')
, Probability = app.set('db').model('Probability')
, twit = app.set('twit');

async.forEachSeries(languages, trainLanguage, cb);

function trainLanguage(language, cb){
try{
console.log('Training for ' + language.name);
//get approx 100 mile bounding box around location
var boxWidth = 100/69;
var sampleTime = 60000
twit.search('place:' + language.loc, {rpp:100}, function(err, data) {
console.log(data.results.length);
async.forEachSeries(data.results, processTweet, function(e, results){
cb();
});

});
} catch(e) {
cb();
}

function processTweet(data, cb){
try{
//classify tweet based on language
var tweet = new Tweet(data);

tweet.trained_language = language.code;
tweet.trained = true;
tweet.autotrained = true;
tweet.save(function(e, result){
cb();
});
} catch(e) {
console.log('error');
cb();
}
}
}

}
12 changes: 7 additions & 5 deletions models/models.js
@@ -1,5 +1,6 @@
var mongoose = require('mongoose')
, _ = require('underscore');
, _ = require('underscore')
, languages = require('../lib/languages');



Expand All @@ -17,6 +18,7 @@ var TweetSchema = new mongoose.Schema({
, predicted_language: { type: String }
, trained_language : { type: String }
, trained : { type: Boolean, index: true, default: false }
, autotrained : { type: Boolean, default: false }
});

TweetSchema.methods.getWords = function getWords(cb){
Expand Down Expand Up @@ -69,17 +71,17 @@ TweetSchema.methods.classify = function classify(cb){
.run(function(e, results){
var update = {};
languages.forEach(function(language){
var product = _.reduce(results, function(memo, word){ return memo * word.probability[language] || memo; }, 1);
var subtract = _.reduce(results, function(memo, word){ return memo * (1 - word.probability[language]) || memo; }, 1);
var product = _.reduce(results, function(memo, word){ return memo * word.probability[language.code] || memo; }, 1);
var subtract = _.reduce(results, function(memo, word){ return memo * (1 - word.probability[language.code]) || memo; }, 1);

//minimum probability of 0.01
var result = product / ( product + subtract ) || 0.01;

probability[language] = Math.round(result*100000)/100000;
probability[language.code] = Math.round(result*100000)/100000;

if(result > max_prob) {
max_prob = result;
predicted_language = language;
predicted_language = language.code;
}
});

Expand Down
26 changes: 16 additions & 10 deletions routes/index.js
@@ -1,7 +1,8 @@
var models = require('../models/models')
, train = require('../lib/train')
, async = require('async')
, _ = require('underscore')
, languages = ['en', 'es', 'pt', 'fr', 'other'];
, languages = require('../lib/languages');

module.exports = function routes(app){

Expand Down Expand Up @@ -90,17 +91,17 @@ module.exports = function routes(app){
.run(function(e, results){
var update = {};
languages.forEach(function(language){
var product = _.reduce(results, function(memo, word){ return memo * word.probability[language] || memo; }, 1);
var subtract = _.reduce(results, function(memo, word){ return memo * (1 - word.probability[language]) || memo; }, 1);
var product = _.reduce(results, function(memo, word){ return memo * word.probability[language.code] || memo; }, 1);
var subtract = _.reduce(results, function(memo, word){ return memo * (1 - word.probability[language.code]) || memo; }, 1);

//minimum probability of 0.01
var result = product / ( product + subtract ) || 0.01;

probability[language] = Math.round(result*100000)/100000;
probability[language.code] = Math.round(result*100000)/100000;

if(result > max_prob) {
max_prob = result;
predicted_language = language;
predicted_language = language.code;
}
});

Expand All @@ -114,7 +115,6 @@ module.exports = function routes(app){

});
}

});


Expand Down Expand Up @@ -219,9 +219,9 @@ module.exports = function routes(app){

//Get counts for each tweet and word
async.forEach(languages, function(language, cb){
wordCount[language] = word.count[language] || 0;
Tweet.count({ trained_language: language }, function(e, count){
tweetCount[language] = count;
wordCount[language.code] = word.count[language.code] || 0;
Tweet.count({ trained_language: language.code }, function(e, count){
tweetCount[language.code] = count;
cb();
});
}, function(e, results){
Expand All @@ -234,7 +234,7 @@ module.exports = function routes(app){
//Minimum probability of 0.01

languages.forEach(function(language){
word.probability[language] = Math.max(0.01, ( wordCount[language] / tweetCount[language] ) / ( ( wordCount[language] / tweetCount[language] ) + ( ( totalWordCount - wordCount[language] ) / ( totalTweetCount - tweetCount[language] ) ) ) ) || 0.01;
word.probability[language.code] = Math.max(0.01, ( wordCount[language.code] / tweetCount[language.code] ) / ( ( wordCount[language.code] / tweetCount[language.code] ) + ( ( totalWordCount - wordCount[language.code] ) / ( totalTweetCount - tweetCount[language.code] ) ) ) ) || 0.01;
});

//save probabilities
Expand All @@ -256,6 +256,12 @@ module.exports = function routes(app){
});
}

// GET /api/train: runs the training routine and answers once it calls back.
app.get('/api/train', function trainRoute(req, res){
  function onTrained(){
    res.json({status: 'completed'});
  }

  train(app, onTrained);
});


//Nothing specified
app.all('*', function notFound(req, res) {
Expand Down

0 comments on commit 6ff856c

Please sign in to comment.