Implement the user_words algorithm, which assigns lambdas based on the commonly used words of each user.

It gives a positive score so far, with a success rate of about 50%. Now testing it with run_algorithms.
amrav · Jan 26, 2012 · d47cb8f · d47cb8f
1 parent 7e88d61
commit d47cb8f
Show file tree

Hide file tree

Showing 2 changed files with 91 additions and 3 deletions.
diff --git a/algorithms/user_words.py b/algorithms/user_words.py
@@ -0,0 +1,87 @@
+from __future__ import division
+import re
+import math
+
+class user:
+    """Per-user record of word usage."""
+    def __init__(self):
+        # Maps a word to the number of times this user has used it;
+        # starts empty and is filled in by build_user_words().
+        self.words = {}
+
+# Tunable parameters of this algorithm. The three lists are parallel:
+# params[i] is the current value of parameter i, while ranges[i] and
+# steps[i] are presumably the search interval and step size used when the
+# parameter is optimised -- TODO confirm against the optimiser.
+params, ranges, steps = [], [], []
+
+# match_threshold: run() only counts a word for a user when that user has
+# used it more than this many times.
+params.append(10)
+ranges.append((5, 15))
+steps.append(2)
+
+# lambda_threshold, expressed as a percentage (run() divides it by 100).
+params.append(80)
+ranges.append((20, 99))
+steps.append(5)
+
+def build_user_words(statements):
+    """Count how often each user uses each word.
+
+    Every statement whose ``issued_by`` is not '$$$' contributes the words
+    of its ``text_str`` to the tally of the user named by ``issued_by``.
+    Only words longer than three characters are counted. ('$$$' presumably
+    marks a statement whose author is unknown -- TODO confirm.)
+
+    Returns a dict mapping each user name to a ``user`` object whose
+    ``words`` dict maps word -> occurrence count. A user appears in the
+    result even if none of their words were long enough to be counted.
+    """
+    word_pattern = re.compile(r"\W?(\w+)\W?")
+    users = {}
+    for stat in statements:
+        author = stat.issued_by
+        if author == '$$$':
+            continue
+        if author not in users:
+            users[author] = user()
+        counts = users[author].words
+        for word in word_pattern.findall(stat.text_str):
+            if len(word) <= 3:
+                continue
+            counts[word] = counts.get(word, 0) + 1
+    return users
+
+def run(statements):
+    """Assign lambdas to '$$$' statements from each user's word habits.
+
+    First tallies, via build_user_words(), how often every user uses every
+    word. Then, for each statement whose ``issued_by`` is '$$$', every user
+    is scored by summing their usage counts of the statement's words,
+    considering only words longer than three characters that the user has
+    used more than ``match_threshold`` (params[0]) times. The log of each
+    score, divided by the mean log score, is stored in
+    ``stat.alg_lambda[user]``.
+
+    If no user scores at all, ``stat.alg_lambda`` is reset to ``{}``. When
+    at least one user scores, entries already present in
+    ``stat.alg_lambda`` are left in place. Finally, if the statement has
+    more than two lambdas and the second largest is within
+    ``lambda_threshold`` (params[1] / 100) of the largest, the result is
+    considered ambiguous and ``stat.alg_lambda`` is cleared.
+
+    The statements are modified in place; nothing is returned.
+    """
+    match_threshold = params[0]
+    # Stored as a percentage; relies on true division (__future__.division).
+    lambda_threshold = params[1]/100
+
+    users = build_user_words(statements)
+
+    for stat in statements:
+        if stat.issued_by != '$$$':
+            continue
+
+        matches = re.findall(r"\W?(\w+)\W?", stat.text_str)
+
+        # Score every user by how heavily they use this statement's words.
+        userscore = {}
+        for luser in users:
+            words = users[luser].words
+            for match in matches:
+                if len(match) > 3 and words.get(match, 0) > match_threshold:
+                    userscore[luser] = userscore.get(luser, 0) + words[match]
+
+        if userscore:
+            avg = sum(math.log(x) for x in userscore.values())
+            avg /= len(userscore)
+            if avg != 0:
+                for luser in userscore:
+                    sc = math.log(userscore[luser])/avg
+                    if sc != 0:
+                        stat.alg_lambda[luser] = sc
+            else:
+                # Every score was 1, so there is nothing to normalise by.
+                # Clear the whole mapping; the original code stored a dict
+                # under a stale user key here.
+                stat.alg_lambda = {}
+        else:
+            stat.alg_lambda = {}
+
+        # Drop the result when the two best candidates are too close.
+        # NOTE(review): the guard skips this check when there are exactly
+        # two candidates -- confirm whether ``>= 2`` was intended.
+        maxes = sorted(stat.alg_lambda.values(), reverse=True)
+        if len(maxes) > 2 and maxes[1]/maxes[0] > lambda_threshold:
+            stat.alg_lambda = {}
+
+
+
+
+
+
+
+
diff --git a/settings.py b/settings.py
@@ -4,13 +4,14 @@
 import line_context
 import bracket
 import addressal
+import user_words
 
 #for test_algorithm.py
 prev_display_scope = 4
 next_display_scope = 4
 
 alg_list = [line_context, bracket, addressal]
-test_alg = addressal
-opt_alg = addressal
-opt_runs = 3
+test_alg = user_words
+opt_alg = user_words
+opt_runs = 5