Skip to content

Commit

Permalink
Updated training script to use lat/lon
Browse files Browse the repository at this point in the history
  • Loading branch information
brendannee committed Mar 31, 2012
1 parent a908ac6 commit 787816d
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 76 deletions.
45 changes: 20 additions & 25 deletions lib/languages.js
Expand Up @@ -2,106 +2,101 @@ module.exports = [
// Language table entries. Fields as they are used by scripts/train.js:
//   code   - language code written to tweet.trained_language
//   name   - display name used in log output
//   loc    - identifier appended to the 'place:' search query
//   coords - [longitude, latitude]; train.js sends them as "lat,lon,50mi" in the geocode parameter
{
code: 'ar'
, name: 'Arabic'
, loc: 'e5ac80d8b565c01f' //Yemen
, coords: [46.75, 24.73] //Riyadh, Saudi Arabia
}
, {
code: 'zh'
, name: 'Chinese'
, loc: '204a435ce97d5de4' //Taiwan
, coords: [106.54, 29.55] //Chongqing, China
}
, {
code: 'de'
, name: 'German'
, loc: 'f5f67396c9566816' //Hamburg, Germany
, coords: [10.497, 52.277] //Braunschweig, Germany
}
, {
code: 'el'
, name: 'Greek'
, loc: '2ee7eeaa84dbe65a' //Greece
, coords: [22.41, 39.627] //Larissa, Greece
}
, {
code: 'en'
, name: 'English'
, loc: 'fc7c2e706034396b' //Normal, IL
, coords: [-88.984, 40.5125] //Normal, IL
}
, {
code: 'fi'
, name: 'Finnish'
, loc: 'e3ba9e096a0fc232' //Tampere, Finland
, coords: [23.75, 61.49] //Tampere, Finland
}
, {
code: 'fr'
, name: 'French'
, loc: '3a4da89bf023f210' //Bordeaux, France
, coords: [4.81, 45.75] // Lyon, France
}
, {
code: 'id'
, name: 'Indonesian'
, loc: '881b4601a1463b28' //Surakarta, Indonesia
, coords: [110.8177, -7.5669] //Surakarta, Indonesia
}
, {
code: 'it'
, name: 'Italian'
, loc: '45ced083529b6f19' //Bari, Italy
, coords: [14.16, 37.481] //Sicily, Italy
}
, {
code: 'ja'
, name: 'Japanese'
, loc: 'c68b1ffd6bd34468' //Nagoya, Japan
, coords: [135.503, 34.6936] //Osaka, Japan
}
, {
code: 'ko'
, name: 'Korean'
, loc: 'c00e5392b3fa46fb' //South Korea
// NOTE(review): this entry has no coords, and train.js builds its geocode query from coords,
// so Korean cannot be geo-searched as written - confirm whether the South Korea coords below belong here
}
, {
code: 'mn'
, name: 'Mongolian'
, loc: '8bd61930ae4f642d' //Mongolia
// NOTE(review): these coords are labelled South Korea but sit on the Mongolian entry - confirm
, coords: [127.37, 36.36] //South Korea
}
, {
code: 'nn'
, name: 'Norweigian' // NOTE(review): misspelling of "Norwegian"; runtime string left unchanged
, loc: 'e214513434b11e13' //Trondheim, Norway
, coords: [10.43, 63.38] //Trondheim, Norway
}
, {
code: 'fa'
, name: 'Persian'
, loc: '272596500e51c07a' //Iran
, coords: [51.69, 32.63] //Isfahan, Iran
}
, {
code: 'pt'
, name: 'Portugese' // NOTE(review): misspelling of "Portuguese"; runtime string left unchanged
, loc: 'f207b85be9f1513e' //Recife, Brazil
, coords: [-43.98, -19.92] //Belo Horizonte, Brazil
}
, {
code: 'ru'
, name: 'Russian'
, loc: '22deffea18a2f1d2' //Moscow, Russia
, coords: [44.1, 56.37] //Nizhny Novgorod, Russia
}
, {
code: 'es'
, name: 'Spanish'
, loc: 'df18487281b6c832' //Burgos, Spain
, coords: [-4.726, 41.645] //Valladolid, Spain
}
, {
code: 'sv'
, name: 'Swedish'
, loc: 'c14a026657bbfe31' //Sundsvall, Sweden
, coords: [15.16, 59.27] //Orebro, Sweden
}
, {
code: 'th'
, name: 'Thai'
, loc: '974c290e10850494' //Thailand
, coords: [102.11, 14.97] //Nakhon Ratchasima, Thailand
}
, {
code: 'tr'
, name: 'Turkish'
, loc: '682c5a667856ef42' //Turkey
, coords: [32.52, 37.87] //Konya, Turkey
}
, {
code: 'vi'
, name: 'Vietnamese'
, loc: '2371490f9d073edc' //Vietnam
, coords: [105.70, 18.67] //Vinh, Vietnam
}
];
105 changes: 54 additions & 51 deletions scripts/train.js
Expand Up @@ -39,55 +39,51 @@ function getTrainingData(cb){
async.forEachSeries(languages, trainLanguage, cb);

/**
 * Downloads tweets posted near a language's reference coordinates and stores
 * each one as an auto-trained sample for that language.
 *
 * Searching repeats until more than 300 trained tweets exist for the language,
 * a search returns no further results, or 10 requests have been made.
 *
 * @param {Object} language - entry from the language table; reads `code`,
 *   `name` and `coords` ([longitude, latitude]).
 * @param {Function} cb - called with no arguments once this language is done.
 *   Failures are logged rather than propagated so the remaining languages
 *   still get processed.
 */
function trainLanguage(language, cb){
  var coords = language.coords
    , tweetCount = 0
    , requestCount = 0
    , id_str = null
    , noMoreTweets = false
    , geocode;

  //entries without a [lon, lat] pair cannot be geo-searched, so skip them
  if(!coords || coords.length < 2){
    console.log('Skipping ' + language.name + ': no coords to search by');
    return cb();
  }

  //the geocode parameter is "latitude,longitude,radius"; coords are stored [lon, lat]
  geocode = coords[1] + ',' + coords[0] + ',50mi';

  async.until(
    function(){ return (tweetCount > 300 || noMoreTweets || requestCount >= 10); },
    getTweets,
    function(e){
      if(e){
        console.error('Error downloading ' + language.name + ': ' + (e.message || e));
      }
      console.log('Downloaded ' + language.name + ': ' + tweetCount + ' tweets');
      cb();
    }
  );

  //fetches one page of search results and stores every tweet in it
  function getTweets(cb){
    twit.search('', {rpp: 100, max_id: id_str, geocode: geocode}, function(e, data){
      var results = data && data.results;

      requestCount++;
      if(e){
        //stop paging this language; the final handler logs the error
        return cb(e);
      }

      if(results && results.length > 1){
        //page backwards from the last tweet of this page on the next request
        id_str = results[results.length - 1].id_str;
        async.forEachSeries(results, processTweet, function(e){
          //find out how many tweets we have in that language
          Tweet
            .where('trained', true)
            .where('trained_language', language.code)
            .count(function(e, count){
              //keep the previous count if the count query failed
              if(!e){
                tweetCount = count;
              }
              cb();
            });
        });
      } else {
        noMoreTweets = true;
        cb();
      }
    });
  }

  //stores one search result as an auto-trained tweet for this language
  function processTweet(data, cb){
    var tweet = new Tweet(data);

    tweet.trained_language = language.code;
    tweet.trained = true;
    tweet.autotrained = true;
    //best effort: a failed save must not abort the rest of the page
    tweet.save(function(e, result){
      cb();
    });
  }
}
}
Expand Down Expand Up @@ -118,12 +114,19 @@ function countWords(cb){
});

/**
 * Increments the per-language counter of every word in a trained tweet.
 *
 * Each word is upserted into Probability with `count.<trained_language>`
 * incremented by one. Tweets with fewer than 4 words are skipped.
 *
 * @param {Object} tweet - reads `tweet.getWords()` and `tweet.trained_language`.
 * @param {Function} cb - called exactly once: immediately for short tweets,
 *   otherwise by async.forEach when the word updates have finished.
 */
function parseTweet(tweet, cb){
  var words = tweet.getWords()
    , updateField = "count." + tweet.trained_language;

  //don't process short tweets with fewer than 4 words
  if(words.length <= 3){
    return cb();
  }

  //single pass over the words already extracted above
  async.forEach(words, function(word, cb){
    var update = {$inc: {}};
    update.$inc[updateField] = 1;
    Probability.update({word: word}, update, {upsert: true}, cb);
  }, cb);
}
},
cb
Expand Down

0 comments on commit 787816d

Please sign in to comment.