In [1]:
import random
from scipy.stats import bernoulli


In [2]:
#Configuration
# Number of rewriting rounds used by the first L1 demo (passed as `nest`).
MAX_NESTING = 10
# Largest number of "S" children per bracket pair in the first L1 demo
# (passed as `branch`).
BRANCHING = 1

## Language 1: Just (), exact nesting

In [3]:
class L1():
    """Generator for words over "(" and ")" with an exact nesting depth.

    A word is derived from the start symbol "S".  In each of `nest` rounds
    every "S" is rewritten to a randomly chosen production
    "(" + k*"S" + ")" with 1 <= k <= branch; the "S" symbols that remain
    after the last round are rewritten to "()".  Because every production
    contains at least one "S", each branch reaches depth nest + 1.
    """

    def __init__(self, nest, branch):
        """Set up the grammar.

        nest   -- number of rewriting rounds applied to the start symbol.
        branch -- largest number of "S" children a bracket pair may get.
                  With branch < 1 there are no productions, so
                  generate_word() fails in random.choice unless nest is 0.
        """
        # all possible symbols (excluding the terminator to have exact nesting)
        self.lang_symbols = {i: "(" + (i + 1) * "S" + ")" for i in range(branch)}
        self.lan_terminator = "()"
        self.nesting = nest

    def generate_word(self):
        """Return one random word of the language as a string."""
        # Materialise the productions once per word: random.choice needs a
        # real sequence (a dict view is not one on Python 3).
        productions = list(self.lang_symbols.values())
        curr_string = "S"
        # only generate predetermined nestings (exactly this deeply nested)
        for _ in range(self.nesting):
            # every occurrence of S gets its own, independently drawn production
            curr_string = "".join(
                random.choice(productions) if symbol == "S" else symbol
                for symbol in curr_string
            )
        # terminate string
        return curr_string.replace("S", self.lan_terminator)

    def generate_language(self, length):
        """Return `length` independently generated words joined by single spaces."""
        return " ".join([self.generate_word() for _ in range(length)])
    

In [4]:
# Demo: with BRANCHING = 1 there is a single production, so every word is the
# same fully nested string.
model = L1(MAX_NESTING, BRANCHING)
# print(x) with a single argument prints the same text on Python 2 and 3
print(model.generate_word())
print(model.generate_language(10))


((((((((((()))))))))))
((((((((((())))))))))) ((((((((((())))))))))) ((((((((((())))))))))) ((((((((((())))))))))) ((((((((((())))))))))) ((((((((((())))))))))) ((((((((((())))))))))) ((((((((((())))))))))) ((((((((((())))))))))) ((((((((((()))))))))))


## Language 2: Just (), probabilistic nesting


In [5]:
class L2():
    """Generator for words over "(" and ")" with randomly limited nesting.

    A word is derived from the start symbol "S".  In each of at most `nest`
    rounds every "S" is rewritten to a randomly chosen production
    "(" + k*"S" + ")" with 0 <= k <= branch.  The k = 0 production "()"
    ends a branch early, so `nest` only bounds the depth (at most nest + 1);
    "S" symbols left after the last round are rewritten to "()".
    """

    def __init__(self, nest, branch):
        """Set up the grammar.

        nest   -- maximal number of rewriting rounds.
        branch -- largest number of "S" children a bracket pair may get.
        """
        # all possible symbols, including "()" (key 0), which closes a branch
        self.lang_symbols = {i: "(" + i * "S" + ")" for i in range(branch + 1)}
        self.lan_terminator = "()"
        self.nesting = nest

    def generate_word(self):
        """Return one random word of the language as a string."""
        # Materialise the productions once per word: random.choice needs a
        # real sequence (a dict view is not one on Python 3).
        productions = list(self.lang_symbols.values())
        curr_string = "S"
        # generate at most the predetermined number of nestings
        for _ in range(self.nesting):
            # nothing left to substitute: later rounds could not change the word
            if "S" not in curr_string:
                break
            # every occurrence of S gets its own, independently drawn production
            curr_string = "".join(
                random.choice(productions) if symbol == "S" else symbol
                for symbol in curr_string
            )
        # terminate string
        return curr_string.replace("S", self.lan_terminator)

    def generate_language(self, length):
        """Return `length` independently generated words joined by single spaces."""
        return " ".join([self.generate_word() for _ in range(length)])

In [6]:
# Demo: one word from L2 with up to 100 rewriting rounds and at most one child
# per bracket pair.
model = L2(100, 1)
# print(x) with a single argument prints the same text on Python 2 and 3
print(model.generate_word())
# Disabled corpus export, kept for reference.
# NOTE(review): it would write L2 output to a file named "L1.txt" - confirm the
# intended file name before re-enabling.
#print model.generate_language(10000)
#model_l1 = model.generate_language(10000000)
#text_file = open("L1.txt", "w")
#text_file.write(model_l1)
#text_file.close()

(())


In [7]:
# Demo: L1 with 5 rewriting rounds and up to two children per bracket pair.
model = L1(5, 2)
#print model.generate_word()
# print(x) with a single argument prints the same text on Python 2 and 3
print(model.generate_language(10))

(((((()())(()()))((()())))(((()()))((()))))((((()())(()()))((()()))))) (((((()))((()())(()))))((((())(()())))(((()())(()()))((())(()()))))) (((((())(()()))((())(()())))(((())(()()))((()))))((((()()))))) (((((()())))(((()())(()()))((())(()()))))((((()))))) (((((()()))((()())(())))(((()()))))) (((((()())))(((()))((()())(()))))((((()())(()()))((()))))) (((((())))(((())(()))))((((())(()))((())(()))))) (((((()())(()))((())))(((()))((()()))))((((())(()()))((()())(()))))) (((((()())))(((())(()()))))((((())(()())))(((())(()()))((()())(()))))) (((((()())(()))((()())(()()))))((((()()))((()())))))


## Language 3 - primitive Dyck Language


In [12]:
class L3():
    """Random generator for balanced strings over "(" and ")".

    A word starts as "(S)S".  In each of at most `nest` rounds every "S" is
    independently rewritten to "(S)S" with probability `prob_deeper` and to
    the empty string otherwise; "S" symbols left after the last round are
    dropped.  The nesting depth of a word is therefore at most nest + 1.
    """

    def __init__(self, nest, prob_deeper):
        """Set up the grammar.

        nest        -- maximal number of rewriting rounds.
        prob_deeper -- probability that an "S" is expanded to "(S)S".
        """
        # index 0: close the branch, index 1: open one more bracket pair
        self.lang_symbols = ["", "(S)S"]
        self.lan_terminator = ""
        self.nesting = nest
        # frozen Bernoulli distribution that decides each expansion
        self.bern = bernoulli(prob_deeper)

    def generate_word(self):
        """Return one random word of the language as a string."""
        curr_string = "(S)S"
        # generate at most the predetermined number of nestings
        for _ in range(self.nesting):
            open_symbols = curr_string.count("S")
            # only continue if there are S to substitute
            if not open_symbols:
                break
            # one batched draw per round instead of one scipy call per symbol
            draws = iter(self.bern.rvs(size=open_symbols))
            # every occurrence of S consumes its own draw, left to right
            curr_string = "".join(
                self.lang_symbols[next(draws)] if symbol == "S" else symbol
                for symbol in curr_string
            )
        # terminate string
        return curr_string.replace("S", self.lan_terminator)

    def generate_language(self, length):
        """Return `length` independently generated words concatenated without separator."""
        return "".join([self.generate_word() for _ in range(length)])

In [9]:
# Sample a single L3 word: up to 10 rewriting rounds, 75% chance to expand.
L3(nest=10, prob_deeper=.75).generate_word()

'()(((((()())((((())()))((()))(())())))(()((((()))(()))())))(())(())((()((())())(())()))()()(()())(()))((()(((((())())(())))(()(())())()()()))()()()(()()())((())())(())())((((())(((())())(()))((())())(())())((((())())(()))((())())(()))()()(())())(((()(()))))((((())()))))((()(((())())(())())))'

In [10]:
def get_length_of_language(v):
    """Return how many words to generate for the setting `v`.

    The callers in this notebook pass ten times the expansion probability
    (5 for 0.5, ..., 9 for 0.9).  Settings 4 to 9 have their own word count;
    every other value gets the base count of 30000.
    """
    base = 30000
    # (setting, word count) pairs, checked in this order
    word_counts = (
        (4, base * 40),
        (5, base * 15),
        (6, base * 5),
        (7, base * 2),
        (8, int(base * .6)),
        (9, int(base * .2)),
    )
    for setting, count in word_counts:
        if v == setting:
            return count
    # no dedicated entry: fall back to the base count
    return base
    
def create_lan_files():
    """Write one L3 corpus file for each expansion probability 0.5 ... 0.9.

    For every probability p the file "dyck-<p>.txt" (in the current working
    directory) receives get_length_of_language(10 * p) words generated by
    L3(10, p).  The setting 10 * p is printed before each file is generated.
    """
    for tenths in range(5, 10):
        # progress indicator; print(x) with one argument prints the same text
        # on Python 2 and 3
        print(tenths)
        curr_chance = tenths / 10.
        model = L3(10, curr_chance)
        fname = "dyck-" + str(curr_chance) + ".txt"
        lang = model.generate_language(get_length_of_language(tenths))

        # the with-block closes the file even when the write fails
        with open(fname, "w") as text_file:
            text_file.write(lang)

## Language 4 - Dyck Language with 4 Symbols

In [13]:
class L4():
    """Random generator for balanced strings over "(", ")", "[" and "]".

    A word starts as "(S)S".  In each of at most `nest` rounds every "S" is
    independently expanded with probability `prob_deeper` - to "[S]S" or
    "(S)S", chosen uniformly - and removed otherwise; "S" symbols left after
    the last round are dropped.
    """

    def __init__(self, nest, prob_deeper):
        """Set up the grammar.

        nest        -- maximal number of rewriting rounds.
        prob_deeper -- probability that an "S" is expanded to a bracket pair.
        """
        # the two expanding productions (closing a branch is the empty string)
        self.lang_symbols = ["[S]S", "(S)S"]
        self.lan_terminator = ""
        self.nesting = nest
        # frozen Bernoulli distribution that decides each expansion
        self.bern = bernoulli(prob_deeper)

    def generate_word(self):
        """Return one random word of the language as a string."""
        # NOTE(review): the seed is fixed, so every word opens with a round
        # bracket - confirm this bias is intended.
        curr_string = "(S)S"
        # generate at most the predetermined number of nestings
        for _ in range(self.nesting):
            open_symbols = curr_string.count("S")
            # only continue if there are S to substitute
            if not open_symbols:
                break
            # one batched draw per round instead of one scipy call per symbol
            go_deeper = iter(self.bern.rvs(size=open_symbols))
            pieces = []
            for symbol in curr_string:
                if symbol != "S":
                    pieces.append(symbol)
                elif next(go_deeper):
                    # expand this S with a randomly chosen bracket type
                    pieces.append(random.choice(self.lang_symbols))
                # otherwise this S closes its branch and simply disappears
            curr_string = "".join(pieces)
        # terminate string
        return curr_string.replace("S", self.lan_terminator)

    def generate_language(self, length):
        """Return `length` independently generated words concatenated without separator."""
        return "".join([self.generate_word() for _ in range(length)])

In [16]:
# Sample a single L4 word: up to 10 rewriting rounds, 90% chance to expand.
dyck2 = L4(nest=10, prob_deeper=.9)
dyck2.generate_word()

'()(((([[[[[()][]]()[]][][()][]][(([])[])]()[[]]()]([]((())[])(())[]))[(([([])[]]()[])[(())[]](())[])[[[()]()][()][]]([[]]())()()])[[([((()))[()][]][[()][]][[]][])([[()]]([])[])[[()]()](())[]]][[[([[]]())[[]]()]((())())[()][]]][()[(())()][()]()]([(())[]]()[]))(((([[[]()](())][([])()]([])())([([])()]())((())[])[()][])(([(())[]](())[]))(((())[])([])[])[([])[]][[]]())[[[[[[]]()]]()[[]]()][((())())(())()][[()][]][()]()][([[()][]])([()]())[]()])()([(([()]())([])())[(())[]][[]]][(([])())[()][]]([()]())[()]())[[(([])[])([])][(())()][()]()][[[][]][[]][]]((())())[()])[]([([[][[[]][]](())]([[()][]][[]]()))([((())[])[[]]()]([[]][])([])())([[()][]][()]())[[()]][[]][]][]([((())[])([])()][(())()][[]][]))([[([([])()][()][])()[]()][[[()][]](())()][[[]]]][((([])())([])[])[(())()][[]][]][[([])()](())[]][[()][]]([]))(((([[]])[()][])([()]())([])())[[[[]][]][[]]]([][])()())(((([])())[[]][])(()())[()][])[][([])[]][()]()'

In [18]:
def create_lan_files_L4():
    """Write one L4 corpus file for each expansion probability 0.5 ... 0.9.

    For every probability p the file "dyck2-<p>.txt" (in the current working
    directory) receives get_length_of_language(10 * p) words generated by
    L4(10, p).  The setting 10 * p is printed before each file is generated.
    """
    for tenths in range(5, 10):
        # progress indicator; print(x) with one argument prints the same text
        # on Python 2 and 3
        print(tenths)
        curr_chance = tenths / 10.
        model = L4(10, curr_chance)
        fname = "dyck2-" + str(curr_chance) + ".txt"
        lang = model.generate_language(get_length_of_language(tenths))

        # the with-block closes the file even when the write fails
        with open(fname, "w") as text_file:
            text_file.write(lang)
create_lan_files_L4()

5
6
7
8
9


In [None]:
def create_test_L4(curr_chance=.9):
    """Write a one-word L4 test file "dyck2-<curr_chance>-test.txt".

    curr_chance -- expansion probability handed to L4; it is also embedded
                   in the file name.  Defaults to 0.9.
    """
    # Use L4: the "dyck2-" prefix is the one the L4 corpus files carry.
    model = L4(10, curr_chance)
    fname = "dyck2-" + str(curr_chance) + "-test.txt"
    lang = model.generate_language(1)

    # the with-block closes the file even when the write fails
    with open(fname, "w") as text_file:
        text_file.write(lang)
create_test_L4()