Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP

Comparing changes

Choose two branches to see what’s changed or to start a new pull request. If you need to, you can also compare across forks.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks.
base: 9183173fa4
...
compare: 39ab2d59c2
  • 5 commits
  • 3 files changed
  • 0 commit comments
  • 1 contributor
Commits on May 04, 2012
@akavlie Remove classification of words ending in '-ing' as verbs.
This doesn't hold for several words: thing, fling, sing, etc.
0001fc7
@akavlie Add merge() function to mimic set functionality.
ActivityItem code repeats a pattern of searching through an existing array for existing
matches before pushing a new term. Was able to cut down on most of those by
extracting to a merge() function.

Also includes a bit of cleanup and commenting.
dfcb768
@akavlie Function call spaces b gone 37b338f
@akavlie Correct variable in merge() function 1a5a510
@akavlie Misc. tidying 39ab2d5
View
2  controllers/AnalysisController.js
@@ -74,6 +74,8 @@ module.exports = {
}
function analyze_item (_item, cb) {
+ console.log('AnalysisController analyze_item()');
+
var error = null;
var item = _item;
View
1  controllers/TwitterController.js
@@ -37,7 +37,6 @@ module.exports = {
var attr = task.attributes || {};
if (attr.connected && attr.last_ping.getTime() > Date.now() - stream_timeout) {
- console.log('already connected')
return res.send("Already streaming...");
}
View
189 models/ActivityItem.js
@@ -4,17 +4,25 @@
var mongoose = require('mongoose'),
unshorten = require('unshorten'),
- Schema = mongoose.Schema,
natural = require('natural'),
+ Schema = mongoose.Schema,
ObjectId = Schema.ObjectId;
-var tokenizer = new natural.TreebankWordTokenizer();
+var NGrams = natural.NGrams,
+ wordnet = new natural.WordNet('./cache'),
+ tokenizer = new natural.TreebankWordTokenizer();
natural.LancasterStemmer.attach();
-var wordnet = new natural.WordNet('./cache');
-var NGrams = natural.NGrams;
+
/**
 * Append `newItem` to `array` only if it is not already present, so the
 * array can be used like a set.
 *
 * Membership is decided with strict equality (the semantics of
 * Array.prototype.indexOf), matching the `===` comparison this helper was
 * written with.
 *
 * @param {Array} array - Array to add to; mutated in place.
 * @param {*} newItem - Value to append when no strictly-equal element exists.
 * @returns {undefined}
 */
function merge(array, newItem) {
  // A `return` inside a forEach callback only leaves that callback, not
  // merge() itself, so a forEach-based guard would always fall through to
  // push(). Test membership directly instead.
  if (array.indexOf(newItem) === -1) {
    array.push(newItem);
  }
}
var ActivityItemSchema = new Schema({
-
guid: {type: String, unique: true, index: true, required: true},
user: {type: ObjectId, index: true, ref: "Identity"},
message: {type: String},
@@ -33,18 +41,18 @@ var ActivityItemSchema = new Schema({
activity: [{}],
analyzed_at: {type: Date, default: (function () { return new Date(1); })},
created_at: {type: Date, default: Date.now}
-
});
-ActivityItemSchema.methods.analyze = function (cb) {
+ActivityItemSchema.methods.analyze = function(cb) {
var self = this;
var item = self;
var AI = this;
- (function analyze_me (item, cb) {
- var mongoose = require('mongoose');
- var Topic = mongoose.model('Topic');
- var JunkTopic = mongoose.model('JunkTopic');
- var AI = mongoose.model('ActivityItem');
+ (function analyze_me(item, cb) {
+ console.log('Schema analyze_me');
+
+ var Topic = mongoose.model('Topic'),
+ JunkTopic = mongoose.model('JunkTopic'),
+ AI = mongoose.model('ActivityItem');
var error = null;
@@ -57,50 +65,42 @@ ActivityItemSchema.methods.analyze = function (cb) {
var message = item.message.toLowerCase();
var url_pattern = /\(?\bhttps?:\/\/[-A-Za-z0-9+&@#\/%?=~_()|!:,.;]*[-A-Za-z0-9+&@#\/%=~_()|]/gi;
- var urls = message.match(url_pattern);
var hash_pattern = /#[a-zA-Z_0-9]*/gi;
- var hashtags = message.match(hash_pattern);
- urls = urls || [];
- hashtags = hashtags || [];
+ var urls = message.match(url_pattern) || [];
+ var hashtags = message.match(hash_pattern) || [];
var keywords = [];
- urls.forEach(function (url) {
- domain = url.substr(url.indexOf("//")+2);
- domain = domain.substr(0, domain.indexOf("/"));
- var found = false;
- keywords.forEach(function (existing) {
- if (existing == domain) {
- found = true;
- }
- });
- if (!found) {
- keywords.push(domain);
- }
+
+ // Add URLs to list of keywords
+ urls.forEach(function(url) {
+ var domain = url.substr(url.indexOf("//") + 2);
+ // Cut off at first slash in URL, if one exists
+ domain = domain.substr(0, domain.indexOf("/")) || domain;
+ merge(keywords, domain);
});
- hashtags.forEach(function (tag) {
+
+ // Add hashtags
+ hashtags.forEach(function(tag) {
tag = tag.substring(1);
- var found = false;
- keywords.forEach(function (existing) {
- if (existing == tag) {
- found = true;
- }
- });
- if (!found) {
- keywords.push(tag);
- }
+ merge(keywords, tag);
});
- message = message.remove_urls().remove_hashtags().remove_screen_names().replace_punctuation();
+ // Natural language analysis of remainder of message
+ message = message
+ .remove_urls()
+ .remove_hashtags()
+ .remove_screen_names()
+ .replace_punctuation();
var chunks = message.split(" ");
var new_chunks = [];
- chunks.forEach(function (chunk) {
+ chunks.forEach(function(chunk) {
+ // Not sure the purpose of this
if (chunk.indexOf("'") == -1) {
new_chunks.push(chunk);
}
});
- message = chunks.join(" ");
- var words = tokenizer.tokenize(message);//.tokenizeAndStem();
+ var words = tokenizer.tokenize(message);
var ngram_length = words.length;
var phrases = [];
@@ -138,16 +138,7 @@ ActivityItemSchema.methods.analyze = function (cb) {
if (!err && topics) {
topics.forEach(function (topic) {
- var found = false;
- words.forEach(function (word) {
- if (word == topic.text) {
- found = true;
- }
- });
- if (!found) {
- console.log("ADDING FROM PHRASE: "+topic.text);
- words.push(topic.text);
- }
+ merge(words, topic.text);
});
}
remove_topics_from_words();
@@ -213,24 +204,20 @@ ActivityItemSchema.methods.analyze = function (cb) {
var verb = 0;
if (word.length > 3) {
- //console.log("> Word: "+word);
- if (word.substring(0, word.length-3) == "ing") {
- verb = 100;
- classify_word(word, noun, verb, neither);
- } else
+ // Numbers
if (word.match(/^[0-9]*$/)) {
classify_word(word, noun, verb, neither);
+ // Everything else
} else {
wordnet.lookup(word, function(results) {
-
results.forEach(function(result) {
if (result.pos == "n") {
noun++;
- } else
- if (result.pos == "v") {
+ } else if (result.pos == "v") {
verb++;
- } else
- if (result.pos == "a" || result.pos == "r" || result.pos == "s") {
+ } else if (result.pos == "a" ||
+ result.pos == "r" ||
+ result.pos == "s") {
neither++;
}
});
@@ -238,43 +225,35 @@ ActivityItemSchema.methods.analyze = function (cb) {
});
return;
}
+ // Automatically junk any word under 4 characters
} else {
save_junk_topic(word);
lookup_next_word();
}
}
- function classify_word (w, n, v, neither) {
- if (n == 0 && neither > v) {
+
+ function classify_word(w, n, v, neither) {
+ if (n === 0 && neither > v) {
save_junk_topic(w);
- //console.log(w+": is ("+n+":"+v+"/"+neither+") I don't know.. Junk?");
} else {
- //console.log(w+": noun:"+n+" / verb:"+v+" / neither:"+neither);
- var found = false;
- keywords.forEach(function (existing) {
- if (existing == w) {
- found = true;
- }
- });
- if (!found) {
- keywords.push(w);
- }
+ merge(keywords, w);
}
lookup_next_word();
}
- function save_junk_topic (word) {
- JunkTopic.findOne({text: word}, function (err, topic) {
+ function save_junk_topic(word) {
+ JunkTopic.findOne({text: word}, function(err, topic) {
if (err || (topic && topic.text == word)) {
return;
}
var junker = new JunkTopic({text: word});
- junker.save(function (err) {
+ junker.save(function(err) {
// err?
});
});
}
- function add_topics () {
+ function add_topics() {
var existing_topics = [];
//console.log("Getting topics");
@@ -287,7 +266,7 @@ ActivityItemSchema.methods.analyze = function (cb) {
done_with_topics();
}
- function add_new_topics () {
+ function add_new_topics() {
var new_topics = [];
keywords.forEach(function (keyword) {
var found = false;
@@ -301,49 +280,41 @@ ActivityItemSchema.methods.analyze = function (cb) {
}
});
- function add_each_topic () {
+ function add_each_topic() {
if (new_topics.length == 0) {
return done_adding();
}
topic_text = new_topics.shift();
var topic = new Topic({text: topic_text, ratings: {overall: 0}});
- topic.save(function (err) {
+ topic.save(function(err) {
add_each_topic();
});
}
add_each_topic();
}
- function done_adding () {
+ function done_adding() {
var topic_ids = [];
var topic_texts = [];
- Topic.find({text: {"$in": keywords}}, function (err, t) {
- t.forEach(function (topic) {
+ Topic.find({text: {"$in": keywords}}, function(err, t) {
+ t.forEach(function(topic) {
topic_ids.push(topic.id);
topic_texts.push(topic.text);
});
if (!item.topics || !(item.topics.length > 0)) {
item.topics = topic_ids;
} else {
- topic_ids.forEach(function (new_topic) {
- var found = false;
- item.topics.forEach(function (existing) {
- if (new_topic == existing) {
- found = true;
- }
- });
- if (!found) {
- item.topics.push(new_topic);
- }
+ topic_ids.forEach(function(new_topic) {
+ merge(item.topics, new_topic);
});
}
//console.log("ADDED TOPIC IDS! "+item.topics.length);
//console.log(self.message);
//console.log("Topics: "+topic_texts.join(", "));
item.commit("topics");
- item.save(function (err) {
- t.forEach(function (topic) {
- topic.save(function (err) {
+ item.save(function(err) {
+ t.forEach(function(topic) {
+ topic.save(function(err) {
// err
});
});
@@ -354,13 +325,13 @@ ActivityItemSchema.methods.analyze = function (cb) {
}
- function done_with_topics () {
+ function done_with_topics() {
//console.log("calling back... "+item.topics.length);
rate_that_shit();
}
- function rate_that_shit () {
+ function rate_that_shit() {
var topic_ratings = 0;
var topic_count = 0;
var char_ratings = 0;
@@ -372,9 +343,9 @@ ActivityItemSchema.methods.analyze = function (cb) {
.populate("characteristics")
.populate("topics")
.populate("user")
- .run(function (err, _item) {
+ .run(function(err, _item) {
if (!err && _item) {
- _item.topics.forEach(function (topic) {
+ _item.topics.forEach(function(topic) {
if (topic.ratings.overall > 0 || topic.ratings.overall < 0) {
if (parseInt(topic.ratings.overall) != 0) {
topic_ratings += parseFloat(topic.ratings.overall);
@@ -386,7 +357,7 @@ ActivityItemSchema.methods.analyze = function (cb) {
factors.push(topic_ratings/topic_count);
}
- _item.characteristics.forEach(function (ch) {
+ _item.characteristics.forEach(function(ch) {
if (parseFloat(ch.ratings.overall) != 0) {
char_ratings += parseFloat(ch.ratings.overall)*.2;
char_count++;
@@ -433,7 +404,7 @@ ActivityItemSchema.methods.analyze = function (cb) {
}
-ActivityItemSchema.pre('save', function (next) {
+ActivityItemSchema.pre('save', function(next) {
var self = this;
if (!this.ratings) {
@@ -445,10 +416,10 @@ ActivityItemSchema.pre('save', function (next) {
var orig = self.message
//console.log("Starting message: "+self.message);
- unshorten_urls(self.message, function (m) {
+ unshorten_urls(self.message, function(m) {
//console.log("New message (1): "+m);
if (self.message != m) {
- unshorten_urls(m, function (m) {
+ unshorten_urls(m, function(m) {
//console.log("New message (2): "+m);
self.message = m;
next();
@@ -486,7 +457,7 @@ ActivityItemSchema.pre('save', function (next) {
/**/
});
-function unshorten_urls (_message, cb) {
+function unshorten_urls(_message, cb) {
var regex = /\(?\bhttps?:\/\/[-A-Za-z0-9+&@#\/%?=~_()|!:,.;]*[-A-Za-z0-9+&@#\/%=~_()|]/gi;
var matches = _message.match(regex);
if (matches) {
@@ -498,7 +469,7 @@ function unshorten_urls (_message, cb) {
if (match.length > 28) {
unshorten_next();
} else {
- unshorten(match, function (url) {
+ unshorten(match, function(url) {
if (match != url) {
match = match.replace(/ /g, "%20");
_message = _message.replace(match, url);
@@ -508,7 +479,7 @@ function unshorten_urls (_message, cb) {
}
}
unshorten_next();
- function finished () {
+ function finished() {
cb(_message);
}
} else {

No commit comments for this range

Something went wrong with that request. Please try again.