Skip to content

Commit

Permalink
latest changes
Browse files Browse the repository at this point in the history
  • Loading branch information
cyrus- committed Mar 31, 2014
1 parent 439f313 commit ebbdf9e
Show file tree
Hide file tree
Showing 3 changed files with 128 additions and 18 deletions.
17 changes: 12 additions & 5 deletions README.md
@@ -1,16 +1,22 @@
# Ace

Ace is an extensible, statically-typed programming language that borrows Python's syntax and uses Python as a type-level metalanguage. Or to put it more simply -- it is embedded within Python.
Ace is a programming language with a modularly extensible static type system. It borrows Python's syntax directly and uses Python as a compile-time metalanguage. That is, to put it more simply, Ace is embedded within Python.

# Extensions
## OpenCL
## Ace.OpenCL

The most well-developed set of extensions included with Ace implement the entirety of the OpenCL kernel programming language, (and include a wrapper around the host API as well). Ace is the best way to write low-level GPU kernels in OpenCL today, no contest.
The most well-developed set of extensions included with Ace implements the entirety of the OpenCL kernel programming language (and includes a convenient wrapper around the host API as well, although its use is optional).

## C99
Ace is the best way to write low-level GPU kernels in OpenCL today, no contest.

## Ace.C99

A not-quite-complete set of extensions that implement the C99 programming language is also included.

## Ace.FP

Some functional programming constructs as a demonstration.

# Installation Instructions
## Unix, Linux and Mac

Expand All @@ -21,4 +27,5 @@ A not-quite-complete set of extensions that implement the C99 programming langua
# License

# Contributors
Cyrus Omar (http://www.cs.cmu.edu/~comar)
Ace was conceived and implemented by [Cyrus Omar](http://www.cs.cmu.edu/~comar). Some extensions and work on correctness checking were contributed by Nathan Fulton.

99 changes: 86 additions & 13 deletions cypy/ngram.py
Expand Up @@ -4,13 +4,15 @@ def sum_leaves(d):
def sum_leaves(d):
    """Recursively sum every numeric leaf value in a nested dict of n-gram counts."""
    total = 0  # renamed from `sum`, which shadowed the builtin
    for value in d.itervalues():
        if isinstance(value, dict):  # TODO: use correct checks
            total += sum_leaves(value)
        elif isinstance(value, int):  # TODO: check for numbers more generally
            total += value
    return total

class Corpus(object):
    """Represents the n-gram frequencies derived from some corpus."""

    # Class-level default; shadowed by the per-instance assignment in __init__.
    data = None

    def __init__(self, data=None):
        """Wrap an n-gram count tree.

        data -- nested dict mapping token -> (count | sub-tree); a fresh
                empty dict is created when omitted, so instances never
                share a mutable default.
        """
        if data is None:
            data = {}
        self.data = data
Expand Down Expand Up @@ -80,29 +82,100 @@ def get_count(self, tokens):
except KeyError:
return 0

c = cur[tokens[0]]
if isinstance(c, dict): # TODO: use correct way of checking for dictionary
return sum_leaves(c)
try:
c = cur[tokens[0]]
except KeyError:
return 0
else:
return c
if isinstance(c, dict): # TODO: use correct way of checking for dictionary
return sum_leaves(c)
else:
return c

def prob(self, token, prefix):
    """Returns the probability of the given token given the lead tokens.
    >>> corpus.prob('in', ['the', 'cat'])
    0.9
    """
    prefix_count = float(self.get_count(prefix))
    tokens = list(prefix); tokens.append(token)
    count = float(self.get_count(tokens))
    # A prefix never seen in the corpus would otherwise divide by zero.
    if prefix_count == 0:
        return 0
    else:
        return count / prefix_count

def seq_prob(self, tokens, prefix):
    """Returns the probability of a given sequence given the lead tokens.

    The prefix acts as a sliding window: after each token is scored the
    window is shifted forward one position, so its length stays constant.
    """
    assert len(tokens) > 0
    window = list(prefix)
    result = 1.0
    for tok in tokens:
        result *= self.prob(tok, window)
        window.pop(0)
        window.append(tok)
    return result

def test_filename(filename):
lines = open(filename).readlines()
corpus = Corpus.from_cmulm_lines(lines)
corpus = Corpus.from_cmulm_filename(filename)
print corpus.get_count(["!=", "null"])
print corpus.prob("&&", ["!=", "null"])
print corpus.seq_prob(["%", "ROW_MAX)", "==", "ROW_MAX", "-", "1)"], ["if", "((i"])

# TODO: pull this out into another file
#import cypy.ngram
import re, numpy

def tokenize(expr):
    """Split *expr* into word tokens, discarding punctuation and whitespace."""
    tokens = []
    for piece in re.split(r"(\W)", expr):
        if re.match(r"\w", piece):
            tokens.append(piece)
    return tokens

def test_expressions(corpus_filename, test_filename):
    """Score each line of *test_filename* against the n-gram corpus.

    Each line is expected to be two prefix tokens followed by an
    expression; malformed lines are scored 0.  Returns a numpy array of
    per-line sequence probabilities.
    """
    corpus = Corpus.from_cmulm_filename(corpus_filename)
    lines = open(test_filename).readlines()
    prob = numpy.empty((len(lines),))
    for i, line in enumerate(lines):
        try:
            # expected format: "<prefix1> <prefix2> <expression ...>"
            prefix1, prefix2, expr = line.strip().split(" ", 2)
        except ValueError:
            print "BAD"
            prob[i] = 0.0
            continue
        prefix = [prefix1, prefix2]
        tokens = tokenize(expr)
        prob[i] = corpus.seq_prob(tokens, prefix)

    return prob

def process_flavio_data(filename, training_dir, training_out, test_out):
    """Split a data-description file into a training file and a test file.

    The first line of *filename* holds the number of training file names,
    which follow one per line.  The test portion starts after a second
    block of the same length (preserved from the original indexing --
    TODO confirm this matches the intended data format).
    """
    lines = open(filename).readlines()
    num_training_files = int(lines[0])
    training_files = [line.strip() for line in lines[1:num_training_files + 1]]
    write_training_file(training_files, training_dir, training_out)
    # Equivalent to the original `lines[i + num_training_files + 1:]`, which
    # reused the loop variable after the loop and raised NameError when
    # num_training_files == 0; this closed form is always defined.
    test_lines = lines[2 * num_training_files:]
    write_test_file(test_lines, test_out)

def write_training_file(training_files, training_dir, training_out):
    """Concatenate the listed training files (resolved relative to
    training_dir by simple string concatenation) into training_out."""
    contents = [open(training_dir + name).read() for name in training_files]
    with open(training_out, 'w') as out:
        out.write("".join(contents))

def write_test_file(test_lines, test_out):
    """Write the given test lines, concatenated, to the file test_out."""
    with open(test_out, 'w') as out:
        out.write("".join(test_lines))

def run_tests(n, corpus_filename_fmt, test_filename_fmt):
probs = [ ]
for i in xrange(n):
prob = test_expressions(corpus_filename_fmt % i, test_filename_fmt % i)
probs.append(numpy.mean(prob))
print numpy.mean(probs)

if __name__ == "__main__":
import sys
Expand Down
30 changes: 30 additions & 0 deletions cypy/ngram_scratch.txt
@@ -0,0 +1,30 @@
/Users/cyrus/Dropbox/Class/ML/project/data/axion
/Users/cyrus/Dropbox/Class/ML/project/projects/axion

corpus_fmt = "/Users/cyrus/Dropbox/Class/ML/project/data/batik/corpus%d.txt"
test_fmt = "/Users/cyrus/Dropbox/Class/ML/project/data/aoi/test%d.txt"

for i in xrange(10):
import ngram; ngram.process_flavio_data("/Users/cyrus/Dropbox/Class/ML/project/data/batik/data.tokens%d" % i, "/Users/cyrus/Dropbox/Class/ML/project/projects/batik/", "/Users/cyrus/Dropbox/Class/ML/project/data/batik/training%d.txt" % i, "/Users/cyrus/Dropbox/Class/ML/project/data/batik/test%d.txt" % i)

corpus_fmt = "/Users/cyrus/Dropbox/Class/ML/project/data/ant/corpus2.txt"
test_fmt = "/Users/cyrus/Dropbox/Class/ML/project/data/ant/test2.txt"

import ngram; prob = ngram.test_expressions(corpus_fmt, test_fmt)

corpus_fmt = "/Users/cyrus/Dropbox/Class/ML/project/data/batik/corpus%d.txt"
test_fmt = "/Users/cyrus/Dropbox/Class/ML/project/data/batik/test%d.txt"

import ngram; ngram.run_tests(10, corpus_fmt, test_fmt)

run_tests()
- generate training/test/corpus for all the files
- run the ngram predictor on all the files
- average all the cross-validation folds

axion 0.0333449768082
aoi 0.058308169860052002
findbugs = 0.055
antlr = 0.0311456748392
ant = 0.048
batik = 0.056

0 comments on commit ebbdf9e

Please sign in to comment.