Skip to content
Browse files

Updated training script to use lat/lon

  • Loading branch information...
1 parent a908ac6 commit 787816d790ade32eae612fcf198a7dc2ede323ee @brendannee committed Mar 31, 2012
Showing with 74 additions and 76 deletions.
  1. +20 −25 lib/languages.js
  2. +54 −51 scripts/train.js
View
45 lib/languages.js
@@ -2,106 +2,101 @@ module.exports = [
{
code: 'ar'
, name: 'Arabic'
- , loc: 'e5ac80d8b565c01f' //Yemen
+ , coords: [46.75, 24.73] //Riyadh, Saudi Arabia
}
, {
code: 'zh'
, name: 'Chinese'
- , loc: '204a435ce97d5de4' //Taiwan
+ , coords: [106.54, 29.55] //Chongqing, China
}
, {
code: 'de'
, name: 'German'
- , loc: 'f5f67396c9566816' //Hamburg, Germany
+ , coords: [10.497, 52.277] //Braunschweig, Germany
}
, {
code: 'el'
, name: 'Greek'
- , loc: '2ee7eeaa84dbe65a' //Greece
+ , coords: [22.41, 39.627] //Larissa, Greece
}
, {
code: 'en'
, name: 'English'
- , loc: 'fc7c2e706034396b' //Normal, IL
+ , coords: [-88.984, 40.5125] //Normal, IL
}
, {
code: 'fi'
, name: 'Finnish'
- , loc: 'e3ba9e096a0fc232' //Tampere, Finland
+ , coords: [23.75, 61.49] //Tampere, Finland
}
, {
code: 'fr'
, name: 'French'
- , loc: '3a4da89bf023f210' //Bordeaux, France
+ , coords: [4.81, 45.75] // Lyon, France
}
, {
code: 'id'
, name: 'Indonesian'
- , loc: '881b4601a1463b28' //Surakarta, Indonesia
+ , coords: [110.8177, -7.5669] //Surakarta, Indonesia
}
, {
code: 'it'
, name: 'Italian'
- , loc: '45ced083529b6f19' //Bari, Italy
+ , coords: [14.16, 37.481] //Sicily, Italy
}
, {
code: 'ja'
, name: 'Japanese'
- , loc: 'c68b1ffd6bd34468' //Nagoya, Japan
+ , coords: [135.503, 34.6936] //Osaka, Japan
}
, {
code: 'ko'
, name: 'Korean'
- , loc: 'c00e5392b3fa46fb' //South Korea
- }
-, {
- code: 'mn'
- , name: 'Mongolian'
- , loc: '8bd61930ae4f642d' //Mongolia
+ , coords: [127.37, 36.36] //Daejeon, South Korea
}
, {
code: 'nn'
, name: 'Norweigian'
- , loc: 'e214513434b11e13' //Trondheim, Norway
+ , coords: [10.43, 63.38] //Trondheim, Norway
}
, {
code: 'fa'
, name: 'Persian'
- , loc: '272596500e51c07a' //Iran
+ , coords: [51.69, 32.63] //Isfahan, Iran
}
, {
code: 'pt'
, name: 'Portugese'
- , loc: 'f207b85be9f1513e' //Recife, Brazil
+ , coords: [-43.98, -19.92] //Belo Horizonte, Brazil
}
, {
code: 'ru'
, name: 'Russian'
- , loc: '22deffea18a2f1d2' //Moscow, Russia
+ , coords: [44.1, 56.37] //Nizhny Novgorod, Russia
}
, {
code: 'es'
, name: 'Spanish'
- , loc: 'df18487281b6c832' //Burgos, Spain
+ , coords: [-4.726, 41.645] //Valladolid, Spain
}
, {
code: 'sv'
, name: 'Swedish'
- , loc: 'c14a026657bbfe31' //Sundsvall, Sweden
+ , coords: [15.16, 59.27] //Orebro, Sweden
}
, {
code: 'th'
, name: 'Thai'
- , loc: '974c290e10850494' //Thailand
+ , coords: [102.11, 14.97] //Nakhon Ratchasima, Thailand
}
, {
code: 'tr'
, name: 'Turkish'
- , loc: '682c5a667856ef42' //Turkey
+ , coords: [32.52, 37.87] //Konya, Turkey
}
, {
code: 'vi'
, name: 'Vietnamese'
- , loc: '2371490f9d073edc' //Vietnam
+ , coords: [105.70, 18.67] //Vinh, Vietnam
}
];
View
105 scripts/train.js
@@ -39,55 +39,51 @@ function getTrainingData(cb){
async.forEachSeries(languages, trainLanguage, cb);
function trainLanguage(language, cb){
- if(language.loc){
- var tweetCount
- , requestCount = 0
- , id_str = null
- , noMoreTweets = false;
- async.until(
- function(){ return (tweetCount > 300 || noMoreTweets || requestCount >= 10) },
- getTweets,
- function(e) {
- console.log('Downloaded ' + language.name + ': ' + tweetCount + ' tweets');
+ var tweetCount
+ , requestCount = 0
+ , id_str = null
+ , noMoreTweets = false;
+ async.until(
+ function(){ return (tweetCount > 300 || noMoreTweets || requestCount >= 10) },
+ getTweets,
+ function(e) {
+ console.log('Downloaded ' + language.name + ': ' + tweetCount + ' tweets');
+ cb();
+ }
+ );
+
+ function getTweets(cb){
+ twit.search('', {rpp: 100, max_id: id_str, geocode: language.coords[1] + ',' + language.coords[0] + ',50mi'}, function(e, data) {
+ requestCount++;
+ if(data.results && data.results.length > 1){
+ id_str = data.results[data.results.length - 1].id_str;
+ async.forEachSeries(data.results, processTweet, function(e){
+ //find out how many tweets we have in that language
+ Tweet
+ .where('trained', true)
+ .where('trained_language', language.code)
+ .count(function(e, count){
+ tweetCount = count;
+ cb();
+ });
+ });
+ } else {
+ noMoreTweets = true;
cb();
}
- );
-
- function getTweets(cb){
- twit.search('place:' + language.loc, {rpp: 100, max_id: id_str}, function(e, data) {
- requestCount++;
- if(data.results && data.results.length > 1){
- id_str = data.results[data.results.length - 1].id_str;
- async.forEachSeries(data.results, processTweet, function(e){
- //find out how many tweets we have in that language
- Tweet
- .where('trained', true)
- .where('trained_language', language.code)
- .count(function(e, count){
- tweetCount = count;
- cb();
- });
- });
- } else {
- noMoreTweets = true;
- cb();
- }
- });
- }
+ });
+ }
- function processTweet(data, cb){
- //classify tweet based on language
- var tweet = new Tweet(data);
+ function processTweet(data, cb){
+ //classify tweet based on language
+ var tweet = new Tweet(data);
- tweet.trained_language = language.code;
- tweet.trained = true;
- tweet.autotrained = true;
- tweet.save(function(e, result){
- cb();
- });
- }
- } else {
- cb();
+ tweet.trained_language = language.code;
+ tweet.trained = true;
+ tweet.autotrained = true;
+ tweet.save(function(e, result){
+ cb();
+ });
}
}
}
@@ -118,12 +114,19 @@ function countWords(cb){
});
function parseTweet(tweet, cb){
- async.forEach(tweet.getWords(), function(word, cb){
- var updateField = "count." + tweet.trained_language
- , update = {$inc: {}};
- update.$inc[updateField] = 1;
- Probability.update({word: word}, update, {upsert: true}, cb);
- }, cb);
+ var words = tweet.getWords();
+
+ //don't process short tweets with fewer than 4 words
+ if(words.length > 3){
+ async.forEach(tweet.getWords(), function(word, cb){
+ var updateField = "count." + tweet.trained_language
+ , update = {$inc: {}};
+ update.$inc[updateField] = 1;
+ Probability.update({word: word}, update, {upsert: true}, cb);
+ }, cb);
+ } else {
+ cb();
+ }
}
},
cb

0 comments on commit 787816d

Please sign in to comment.
Something went wrong with that request. Please try again.