Skip to content

Commit

Permalink
latest changes
Browse files Browse the repository at this point in the history
  • Loading branch information
cyrus- committed Mar 31, 2014
1 parent 439f313 commit ebbdf9e
Show file tree
Hide file tree
Showing 3 changed files with 128 additions and 18 deletions.
17 changes: 12 additions & 5 deletions README.md
@@ -1,16 +1,22 @@
# Ace

Ace is an extensible, statically-typed programming language that borrows Python's syntax and uses Python as a type-level metalanguage. Or to put it more simply -- it is embedded within Python.
Ace is a programming language with a modularly extensible static type system. It borrows Python's syntax directly and uses Python as a compile-time metalanguage. That is, to put it more simply, Ace is embedded within Python.

# Extensions
## OpenCL
## Ace.OpenCL

The most well-developed set of extensions included with Ace implement the entirety of the OpenCL kernel programming language, (and include a wrapper around the host API as well). Ace is the best way to write low-level GPU kernels in OpenCL today, no contest.
The most well-developed set of extensions included with Ace implements the entirety of the OpenCL kernel programming language (and includes a convenient wrapper around the host API as well, although its use is optional).

## C99
Ace is the best way to write low-level GPU kernels in OpenCL today, no contest.

## Ace.C99

A not-quite-complete set of extensions that implement the C99 programming language is also included.

## Ace.FP

Some functional programming constructs as a demonstration.

# Installation Instructions
## Unix, Linux and Mac

Expand All @@ -21,4 +27,5 @@ A not-quite-complete set of extensions that implement the C99 programming langua
# License

# Contributors
Cyrus Omar (http://www.cs.cmu.edu/~comar)
Ace was conceived and implemented by [Cyrus Omar](http://www.cs.cmu.edu/~comar). Some extensions and work on correctness checking were contributed by Nathan Fulton.

99 changes: 86 additions & 13 deletions cypy/ngram.py
Expand Up @@ -4,13 +4,15 @@ def sum_leaves(d):
def sum_leaves(d):
    """Recursively sum every numeric leaf value in a nested dict of n-gram counts."""
    total = 0  # renamed from `sum`, which shadowed the builtin
    for value in d.itervalues():
        if isinstance(value, dict):  # TODO: use correct checks
            total += sum_leaves(value)
        elif isinstance(value, int):  # TODO: check for numbers more generally
            total += value
    return total

class Corpus(object):
    """Represents the n-gram frequencies derived from some corpus."""

    # Class-level default; shadowed by the per-instance assignment in __init__.
    data = None

    def __init__(self, data=None):
        """Wrap an n-gram count tree.

        data -- nested dict mapping token -> (count | sub-tree); a fresh
                empty dict is created when omitted, so instances never
                share a mutable default.
        """
        if data is None:
            data = {}
        self.data = data
Expand Down Expand Up @@ -80,29 +82,100 @@ def get_count(self, tokens):
except KeyError:
return 0

c = cur[tokens[0]]
if isinstance(c, dict): # TODO: use correct way of checking for dictionary
return sum_leaves(c)
try:
c = cur[tokens[0]]
except KeyError:
return 0
else:
return c
if isinstance(c, dict): # TODO: use correct way of checking for dictionary
return sum_leaves(c)
else:
return c

def prob(self, token, prefix):
    """Returns the probability of the given token given the lead tokens.
    >>> corpus.prob('in', ['the', 'cat'])
    0.9
    """
    prefix_count = float(self.get_count(prefix))
    tokens = list(prefix); tokens.append(token)
    count = float(self.get_count(tokens))
    # A prefix never seen in the corpus would otherwise divide by zero.
    if prefix_count == 0:
        return 0
    else:
        return count / prefix_count

def seq_prob(self, tokens, prefix):
    """Returns the probability of a given sequence given the lead tokens.

    The prefix acts as a sliding window: after each token is scored the
    window is shifted forward one position, so its length stays constant.
    """
    assert len(tokens) > 0
    window = list(prefix)
    result = 1.0
    for tok in tokens:
        result *= self.prob(tok, window)
        window.pop(0)
        window.append(tok)
    return result

def test_filename(filename):
lines = open(filename).readlines()
corpus = Corpus.from_cmulm_lines(lines)
corpus = Corpus.from_cmulm_filename(filename)
print corpus.get_count(["!=", "null"])
print corpus.prob("&&", ["!=", "null"])
print corpus.seq_prob(["%", "ROW_MAX)", "==", "ROW_MAX", "-", "1)"], ["if", "((i"])

# TODO: pull this out into another file
#import cypy.ngram
import re, numpy

def tokenize(expr):
    """Split *expr* into word tokens, discarding punctuation and whitespace."""
    tokens = []
    for piece in re.split(r"(\W)", expr):
        if re.match(r"\w", piece):
            tokens.append(piece)
    return tokens

def test_expressions(corpus_filename, test_filename):
    """Score each line of *test_filename* against the n-gram corpus.

    Each line is expected to be two prefix tokens followed by an
    expression; malformed lines are scored 0.  Returns a numpy array of
    per-line sequence probabilities.
    """
    corpus = Corpus.from_cmulm_filename(corpus_filename)
    lines = open(test_filename).readlines()
    prob = numpy.empty((len(lines),))
    for i, line in enumerate(lines):
        try:
            # expected format: "<prefix1> <prefix2> <expression ...>"
            prefix1, prefix2, expr = line.strip().split(" ", 2)
        except ValueError:
            print "BAD"
            prob[i] = 0.0
            continue
        prefix = [prefix1, prefix2]
        tokens = tokenize(expr)
        prob[i] = corpus.seq_prob(tokens, prefix)

    return prob

def process_flavio_data(filename, training_dir, training_out, test_out):
    """Split a data-description file into a training file and a test file.

    The first line of *filename* holds the number of training file names,
    which follow one per line.  The test portion starts after a second
    block of the same length (preserved from the original indexing --
    TODO confirm this matches the intended data format).
    """
    lines = open(filename).readlines()
    num_training_files = int(lines[0])
    training_files = [line.strip() for line in lines[1:num_training_files + 1]]
    write_training_file(training_files, training_dir, training_out)
    # Equivalent to the original `lines[i + num_training_files + 1:]`, which
    # reused the loop variable after the loop and raised NameError when
    # num_training_files == 0; this closed form is always defined.
    test_lines = lines[2 * num_training_files:]
    write_test_file(test_lines, test_out)

def write_training_file(training_files, training_dir, training_out):
    """Concatenate the listed training files (resolved relative to
    training_dir by simple string concatenation) into training_out."""
    contents = [open(training_dir + name).read() for name in training_files]
    with open(training_out, 'w') as out:
        out.write("".join(contents))

def write_test_file(test_lines, test_out):
    """Write the given test lines, concatenated, to the file test_out."""
    with open(test_out, 'w') as out:
        out.write("".join(test_lines))

def run_tests(n, corpus_filename_fmt, test_filename_fmt):
probs = [ ]
for i in xrange(n):
prob = test_expressions(corpus_filename_fmt % i, test_filename_fmt % i)
probs.append(numpy.mean(prob))
print numpy.mean(probs)

if __name__ == "__main__":
import sys
Expand Down
30 changes: 30 additions & 0 deletions cypy/ngram_scratch.txt
@@ -0,0 +1,30 @@
/Users/cyrus/Dropbox/Class/ML/project/data/axion
/Users/cyrus/Dropbox/Class/ML/project/projects/axion

corpus_fmt = "/Users/cyrus/Dropbox/Class/ML/project/data/batik/corpus%d.txt"
test_fmt = "/Users/cyrus/Dropbox/Class/ML/project/data/aoi/test%d.txt"

for i in xrange(10):
import ngram; ngram.process_flavio_data("/Users/cyrus/Dropbox/Class/ML/project/data/batik/data.tokens%d" % i, "/Users/cyrus/Dropbox/Class/ML/project/projects/batik/", "/Users/cyrus/Dropbox/Class/ML/project/data/batik/training%d.txt" % i, "/Users/cyrus/Dropbox/Class/ML/project/data/batik/test%d.txt" % i)

corpus_fmt = "/Users/cyrus/Dropbox/Class/ML/project/data/ant/corpus2.txt"
test_fmt = "/Users/cyrus/Dropbox/Class/ML/project/data/ant/test2.txt"

import ngram; prob = ngram.test_expressions(corpus_fmt, test_fmt)

corpus_fmt = "/Users/cyrus/Dropbox/Class/ML/project/data/batik/corpus%d.txt"
test_fmt = "/Users/cyrus/Dropbox/Class/ML/project/data/batik/test%d.txt"

import ngram; ngram.run_tests(10, corpus_fmt, test_fmt)

run_tests()
- generate training/test/corpus for all the files
- run the ngram predictor on all the files
- average all the cross-validation folds

axion 0.0333449768082
aoi 0.058308169860052002
findbugs = 0.055
antlr = 0.0311456748392
ant = 0.048
batik = 0.056

0 comments on commit ebbdf9e

Please sign in to comment.