In [48]:
import random
from tqdm.notebook import tqdm

In [64]:
def generate_balanced_brackets(n, max_depth):
    brackets = ['(', ')']
    map = {'(': ')'}
    sequence = ''
    stack = []
    
    for i in range(n):
        open_or_close = random.choice([0, 1]) and stack
        if n-i < len(stack) or (not open_or_close and len(stack) == n-i) or len(stack) == max_depth:
            open_or_close = 1
        if open_or_close:
            sequence += map[stack.pop()]
        else:
            bracket = '('
            sequence += bracket
            if bracket in map:
                stack.append(bracket)
    
    return sequence

def is_balanced(s):
    stack = []
    max_depth = 0
    count = 0
    
    for bracket in s:
        if bracket == ')':
            if stack:
                stack.pop()
            else:
                break
        else:
            stack.append(bracket)
            max_depth = max(max_depth, len(stack))
            
    for bracket in s:
        if bracket == '(':
            count += 1
        else:
            count -= 1
    
    if stack:
        return -1 * max_depth, count
    return max_depth, count

def generate_bracket_sequence(n):
    brackets = ['(', ')']
    
    sequence = '('
    for i in range(n-1):
        sequence += random.choice(brackets)
    if is_balanced(sequence)[1] == 0:
        return sequence
    else:
        return sequence + random.choice(brackets)


In [3]:
s = generate_balanced_brackets(10, 5)
print(is_balanced(s), s)

(2, 0) ()()()(())


In [4]:
import pandas as pd


balanced_df = pd.DataFrame(columns=['sequence', 'stack_depth', 'count'])

seqs = [0 for i in range(31)]

generated_seqs = set()

for j in range(1, 31):
    i = 0
    while True:
        length = random.randint(j*2, 4000)
        length += length % 2
        
        sequence = generate_balanced_brackets(length, j)
        balanced, count = is_balanced(sequence)
        
        if seqs[balanced] == 600 or sequence in generated_seqs:
            i -= 1
            continue
            
        seqs[balanced] += 1
        
        balanced_df.loc[len(balanced_df)] = [sequence, balanced, count]
        generated_seqs.add(sequence)
        i += 1
        
        if seqs[j] >= 600:
            break
    print(seqs)

[0, 600, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 600, 600, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 600, 600, 600, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 600, 600, 600, 600, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 600, 600, 600, 600, 600, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 600, 600, 600, 600, 600, 600, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 600, 600, 600, 600, 600, 600, 600, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 600, 600, 600, 600, 600, 600, 600, 600, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 600, 600, 600, 600, 600, 600, 600, 600, 600, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 0, 0, 0, 0

In [5]:
seq = generate_bracket_sequence(3)
print(seq)
is_balanced(seq)

()(


(-1, 1)

In [6]:
print(len(balanced_df))
balanced_df.head()

18000


Unnamed: 0,sequence,stack_depth,count
0,()()()()()()()()()()()()()()()()()()()()()()()...,1,0
1,()()()()()()()()()()()()()()()()()()()()()()()...,1,0
2,()()()()()()()()()()()()()()()()()()()()()()()...,1,0
3,()()()()()()()()()()()()()()()()()()()()()()()...,1,0
4,()()()()()()()()()()()()()()()()()()()()()()()...,1,0


In [65]:
unbalanced_data = [ generate_bracket_sequence(random.randint(10, 4000)) for _ in tqdm(range(len(balanced_df))) ]
print(len(unbalanced_data))
unbalanced_stack_depth = [ is_balanced(sequence) for sequence in unbalanced_data ]

unbalanced_count = [x[1] for x in unbalanced_stack_depth]
unbalanced_stack_depth = [x[0] for x in unbalanced_stack_depth]

  0%|          | 0/18000 [00:00<?, ?it/s]

18000


In [66]:

unbalanced_df = pd.DataFrame({'sequence': unbalanced_data, 'stack_depth': unbalanced_stack_depth, 'count': unbalanced_count}, columns=['sequence', 'stack_depth', 'count'])

In [67]:
bracket_data = pd.concat([balanced_df, unbalanced_df], ignore_index=True)

bracket_data = bracket_data.sample(frac=1).reset_index(drop=True)

In [68]:
bracket_data

Unnamed: 0,sequence,stack_depth,count
0,(()())())()))(()(()((())))))(((()))((((((())((...,2,13
1,())))))(()((()(((()(()((()()((()()))(())()))))...,1,-21
2,(()(()()()))()(())()()((()())()()(((()))(((()(...,17,0
3,(()((()))(()))()())()))((()()())))((()))()))((...,4,-21
4,(())(())()()(())((((())((())()()()((((()((((((...,29,0
...,...,...,...
35995,(())))(()))()())(((()())(()()((((())()((((((()...,2,45
35996,()()((()()())(())))))()))()())(((((())())(()((...,3,-24
35997,((()(()()((()(((()(())()()))((())(()))())()(((...,44,38
35998,()()))))(((((((()))(())())))(((((()()((())())(...,1,59


In [69]:
# find minimum stack depth
min_stack_depth = bracket_data['stack_depth'].min()

min_stack_depth

-217

In [70]:
max_stack_depth = bracket_data['stack_depth'].max()

max_stack_depth

99

In [71]:
bracket_data.to_csv('balanced_brackets.csv', index=False)