In [71]:
import random
from scipy.stats import bernoulli


In [5]:
#Configuration
MAX_NESTING = 10
BRANCHING = 1

## Language 1: Just (), exact nesting

In [6]:
class L1():
    def __init__(self, nest, branch):
        #all possible symbold (excluding the terminator to have exact nesting)
        self.lang_symbols = {i: "("+(i+1)*"S"+")" for i in xrange(branch)}
        self.lan_terminator = "()"
        self.nesting = nest
    
    def generate_word(self):
        curr_string = "S"
        #only generate predetermined nestings (exactly this deeply nested)
        for i in xrange(self.nesting):
            #only if there are S to substitute
            if curr_string.find("S") != -1:
                #split into substrings (so that you can generate different values for every occurence of S)
                w =  [e+"S" for e in curr_string.split("S")]
                #remove the S from the last substring (it does not belong there)
                w[-1] = w[-1][:-1]
                #initialize the current string 
                curr_string = ""
                for part_string in w:
                    #generate substitution for every S
                    new_part = part_string.replace("S", random.choice(self.lang_symbols.values()))
                    curr_string += new_part
        #terminate string
        return curr_string.replace("S", self.lan_terminator)
    
    def generate_language(self, length):
        return " ".join([self.generate_word() for i in xrange(length)])
    

In [7]:
model = L1(MAX_NESTING, BRANCHING)
print model.generate_word()
print model.generate_language(10)


((((((((((()))))))))))
((((((((((())))))))))) ((((((((((())))))))))) ((((((((((())))))))))) ((((((((((())))))))))) ((((((((((())))))))))) ((((((((((())))))))))) ((((((((((())))))))))) ((((((((((())))))))))) ((((((((((())))))))))) ((((((((((()))))))))))


## Language 2: Just (), probablistic nesting


In [9]:
class L2():
    def __init__(self, nest, branch):
        #all possible symbold (excluding the terminator to have exact nesting)
        self.lang_symbols = {i: "("+i*"S"+")" for i in xrange(branch+1)}
        self.lan_terminator = "()"
        self.nesting = nest
    
    def generate_word(self):
        curr_string = "S"
        #only generate predetermined nestings (exactly this deeply nested)
        for i in xrange(self.nesting):
            #only if there are S to substitute
            if curr_string.find("S") != -1:
                #split into substrings (so that you can generate different values for every occurence of S)
                w =  [e+"S" for e in curr_string.split("S")]
                #remove the S from the last substring (it does not belong there)
                w[-1] = w[-1][:-1]
                #initialize the current string 
                curr_string = ""
                for part_string in w:
                    #generate substitution for every S
                    new_part = part_string.replace("S", random.choice(self.lang_symbols.values()))
                    curr_string += new_part
        #terminate string
        return curr_string.replace("S", self.lan_terminator)
    
    def generate_language(self, length):
        return " ".join([self.generate_word() for i in xrange(length)])

In [10]:
model = L2(100, 1)
print model.generate_word()
#print model.generate_language(10000)
#model_l1 = model.generate_language(10000000)
#text_file = open("L1.txt", "w")
#text_file.write(model_l1)
#text_file.close()

(())


In [11]:
model = L1(5, 2)
#print model.generate_word()
print model.generate_language(10)

(((((()())))(((()()))))((((()())(())))(((())(()))((())(()()))))) (((((())(()()))))) (((((()())))(((())(()()))))) (((((()())(()()))((()()))))) (((((())))(((()()))((()))))) (((((())))(((()()))))) (((((()))))((((())(()())))(((()()))))) (((((()))((())))(((()())(()))))) (((((()))))((((()())(()()))))) (((((()()))((()())(()()))))((((())))))


##Language 3 - primitive Dyck Language



In [84]:
class L3():
    def __init__(self, nest, prob_deeper):
        #all possible symbold (excluding the terminator to have exact nesting)
        self.lang_symbols = ["", "(S)S"]
        self.lan_terminator = ""
        self.nesting = nest
        self.bern = bernoulli(prob_deeper)
    
    def generate_word(self):
        curr_string = "(S)S"
        #only generate predetermined nestings (exactly this deeply nested)
        for i in xrange(self.nesting):
            #only if there are S to substitute
            if curr_string.find("S") == -1:
                break
            #split into substrings (so that you can generate different values for every occurence of S)
            w =  [e+"S" for e in curr_string.split("S")]
            w[-1] = w[-1][:-1]

            #initialize the current string 
            curr_string = ""
            for part_string in w:
                #generate substitution for every S
                new_part = part_string.replace("S", self.lang_symbols[self.bern.rvs()])
                curr_string += new_part
        #terminate string
        return curr_string.replace("S", self.lan_terminator)
    
    def generate_language(self, length):
        return "".join([self.generate_word() for i in xrange(length)])

In [85]:
L3(10, .75).generate_word()

'(((((((((()())(()))(()())(())())())((((())())(())())()(())())()((())())(())())((()()))((()(())())(()())())(()(())())(()())(())())()()(())(()(())())((())()))(()(((()(()))((())())(())())(((())())(())())(()()))((((())())(()))((()))(())())((()())(())()))()()(())()((())())(()))((((()(((())()))((()))(())())((()()()))(((())())))(())())()()(()))(((((((()))(())()))()()(())()))((()(()()))())(()((())())()()))((()(()()()))((()(())()))(()(())())((())))(((((())())(())())((())()))(((())()))()(())())((((())())(())())((())())(())())(()(())())((())()))((((()((((())())(())()))(((()))))())))((()()()((()()))()(())())()(((((()))(())())()(())())(((())())()())((())())(())())((((())())(())))(((())()))(())()())()(())(((((())())()())()))()'

In [87]:
model = L3(10, .75)
%time model_l1 = model.generate_language(30000)
text_file = open("dyck.txt", "w")
text_file.write(model_l1)
text_file.close()

CPU times: user 2min 29s, sys: 2.13 s, total: 2min 31s
Wall time: 2min 40s
