In [2]:
import random
from tqdm.notebook import tqdm
import multiprocessing
import pandas as pd

In [2]:
def generate_balanced_brackets(n):
    sequence = ""
    stack = []

    for i in range(n):
        open_or_close = random.choice([0, 1]) and stack
        left = n - i
        if left < len(stack) or (not open_or_close and len(stack) == left):
            open_or_close = 1
        if open_or_close:
            sequence += ")"
            stack.pop()
        else:
            sequence += "("
            stack.append("(")

    return sequence


def generate_balanced_brackets_with_stack_depth(n, stack_depth):
    sequence = ""
    stack = []

    # for i in range(min_depth):
    #     sequence += '('
    #     stack.append('(')
    start = random.randint(0, n - 2 * stack_depth)
    # print(start)

    for i in range(n):
        if i > start and i < start + stack_depth and stack_depth > len(stack):
            sequence += "("
            stack.append("(")
            continue
        open_or_close = random.choice([0, 1]) and stack
        left = n - i
        # print(left, n, i, min_depth)
        if (
            left < len(stack)
            or (not open_or_close and len(stack) == left)
            or len(stack) == stack_depth
        ):
            open_or_close = 1
        if open_or_close:
            sequence += ")"
            stack.pop()
        else:
            sequence += "("
            stack.append("(")

    return sequence


def is_balanced(s):
    stack = []
    max_depth = 0
    count = 0

    for bracket in s:
        if bracket == ")":
            if stack:
                stack.pop()
            else:
                stack.append(bracket)
                break
        else:
            stack.append(bracket)
            max_depth = max(max_depth, len(stack))

    for bracket in s:
        if bracket == "(":
            count += 1
        else:
            count -= 1

    if stack:
        return -1 * max_depth, count
    return max_depth, count


def generate_bracket_sequence(n):
    brackets = ["(", ")"]
    sequence = ""

    for i in range(n):
        sequence += random.choice(brackets)

    if is_balanced(sequence)[0] < 0:
        return sequence
    else:
        return generate_bracket_sequence(n)


def generate_bracket_sequence_with_stack_depth(n, depth):
    brackets = ["(", ")"]
    start = random.randint(0, n - depth)
    max_depth = 0
    stack = []

    sequence = "("
    for i in range(n - 1):
        if i > start and i < start + depth:
            sequence += "("
            stack.append("(")
            continue
        if len(stack) < depth:
            sequence += random.choice(brackets)
            stack.append(sequence[-1])
        else:
            sequence += ")"
            stack.pop()
        if sequence[-1] == "(":
            stack.append("(")
        max_depth = max(max_depth, len(stack))
    if is_balanced(sequence)[1] == 0:
        return sequence
    else:
        return generate_bracket_sequence_with_stack_depth(n, depth)

def generate_unbalanced_bracket_sequence_with_count(n, min_count, max_count):
    brackets = ["(", ")"]

    sequence = ""
    for _ in range(n):
        sequence += random.choice(brackets)
    bal = is_balanced(sequence)
    if bal[0] < 0:
        if abs(bal[1]) < min_count or abs(bal[1]) > max_count:
            return generate_unbalanced_bracket_sequence_with_count(n, min_count, max_count)
        print(bal[1])
        return sequence
    else:
        return generate_unbalanced_bracket_sequence_with_count(n, min_count, max_count)

In [3]:
generate_unbalanced_bracket_sequence_with_count(10, 2, 4)

-2


'(()))))()('

In [4]:
def generate_unbalanced_brackets(n):
    seq = generate_bracket_sequence(n)
    bal = is_balanced(seq)
    seq2 = seq
    while bal[0] > 0:
        print('yes')
        seq2 = seq
        index = random.randint(0, n-1)
        if seq[index] == '(':
            seq2 = seq[:index] + ')' + seq[index+1:]
        else:
            seq2 = seq[:index] + '(' + seq[index+1:]
        bal = is_balanced(seq2)
    return seq2

In [5]:
def generate_unbalanced_with_stack_depth(n, stack_depth):
    seq = generate_balanced_brackets_with_stack_depth(n, stack_depth)
    bal = is_balanced(seq)
    seq2 = seq
    while bal[0] != -stack_depth:
        seq2 = seq
        index = random.randint(0, n-1)
        if seq[index] == '(':
            seq2 = seq[:index] + ')' + seq[index+1:]
        else:
            seq2 = seq[:index] + '(' + seq[index+1:]
        bal = is_balanced(seq2)
        print(bal[0])
    return seq2

In [6]:
def generate_unbalanced_with_count(len, count, num_samples):
    gen_set = set()
    flag = 0
    open_no = 0
    close_no = 0
    for i in range(num_samples):
        if random.random() > 0.5:
            flag = 1
        if flag == 1:
            open_no = len / 2 + (count) / 2
            close_no = len / 2 - (count) / 2
        else:
            open_no = len / 2 - (count) / 2
            close_no = len / 2 + (count) / 2

        sequence = "(" * int(open_no) + ")" * int(close_no)
        # randomly permute

        sequence = "".join(random.sample(sequence, len(sequence)))
        while is_balanced(sequence)[0] > 0:
            sequence = "".join(random.sample(sequence, len(sequence)))
        if sequence not in gen_set:
            print(is_balanced(sequence)[1])
            gen_set.add(sequence)

    return list(gen_set)

In [7]:
generate_unbalanced_with_count(512, 2)

TypeError: generate_unbalanced_with_count() missing 1 required positional argument: 'num_samples'

In [5]:
generated_strings = pd.DataFrame(columns=['string', 'count'])

for i in range(0, 101, 2):
    generated_string = generate_unbalanced_with_count(512, i)
    for s in generated_string:
        generated_strings.loc[len(generated_strings)] = [s, is_balanced(s)[1]]
    
generated_strings.head()

Unnamed: 0,string,count
0,()()(())()((((()(((())()())()())(()))())((()()...,0
1,((((()(()))(()()()(())))()()(()((()()()))())))...,0
2,)()())())())))()((())())()(((((()())((((((((()...,0
3,(()()))()())())))((())(()))(())()()())(())))((...,0
4,)(()))((((())()))())((())()))()(((())))()())((...,0


In [6]:
generated_strings.to_csv('new_test_data.csv', index=False)

In [5]:
generate_unbalanced_brackets(10)

'(())))))()'

In [6]:
s = generate_balanced_brackets(100)
print(is_balanced(s), s, len(s))

(10, 0) ((()(()())((())(())(((())))(())(()))))((()()()(())))()(()(((((()))(((((()))(()))(())((()(()))))))))) 100


In [7]:
# Generating according to stack depth
balanced_df = pd.DataFrame(columns=['sequence', 'stack_depth', 'count'])

seqs = [0 for i in range(400)]

generated_seqs = set()

gen = [i for i in range(2, 10)] + [i for i in range(10, 101, 10)]
print(f'Generating sequences in the following stack depths: {gen}')

for j in tqdm(gen):
    while True:
        length = 512
        
        sequence = generate_balanced_brackets_with_stack_depth(length, j)
        balanced, count = is_balanced(sequence)
        
        if seqs[balanced] == 1000 or sequence in generated_seqs or balanced not in gen:
            continue
            
        seqs[balanced] += 1
        
        balanced_df.loc[len(balanced_df)] = [sequence, balanced, count]
        generated_seqs.add(sequence)
        
        if seqs[j] >= 1000:
            break
    # print(seqs)

Generating sequences in the following stack depths: [2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]


  0%|          | 0/18 [00:00<?, ?it/s]

In [141]:
# Generating according to count
balanced_df = pd.DataFrame(columns=['sequence', 'stack_depth', 'count'])

seqs = [0 for i in range(400)]

generated_seqs = set()

# gen = [i for i in range(2, 10)] + [i for i in range(10, 101, 10)]
gen = [1]
print(f'Generating sequences in the following stack depths: {gen}')

for j in tqdm(gen):
    while True:
        length = 512
        
        depth = random.randint(2, 50)
        print(seqs[j], end='\r')
        sequence = generate_balanced_brackets(length)
        balanced, count = is_balanced(sequence)
        
        if seqs[j] == 2000 or sequence in generated_seqs:
            continue
            
        seqs[j] += 1
        
        balanced_df.loc[len(balanced_df)] = [sequence, balanced, count]
        generated_seqs.add(sequence)
        
        if seqs[j] >= 2000:
            break
    # print(seqs)

Generating sequences in the following stack depths: [1]


  0%|          | 0/1 [00:00<?, ?it/s]

1999

In [9]:
# Just generating random sequences
balanced_df = pd.DataFrame(columns=['sequence', 'stack_depth', 'count'])

generated_seqs = set()

for j in tqdm(range(20000)):
    length = 512
    sequence = generate_balanced_brackets(length)
    balanced, count = is_balanced(sequence)
    
    if sequence in generated_seqs:
        continue
        
    balanced_df.loc[len(balanced_df)] = [sequence, balanced, count]
    generated_seqs.add(sequence)

  0%|          | 0/20000 [00:00<?, ?it/s]

In [10]:
print(len(balanced_df))
balanced_df.head()

20000


Unnamed: 0,sequence,stack_depth,count
0,(())()()()(())((()))()()()()(((()())))(()(()))...,16,0
1,((()()))()(((())(((())(()()(()()))))))(((())((...,14,0
2,()()(())(((((()((())()()(())(()()((()))(())()(...,14,0
3,(())((()))()((()))()((()))()()((((()(()(()(())...,16,0
4,(())()()()()(()(()()(((()))(()(((()()())))((((...,11,0


In [19]:

unbalanced_data = []
for index, i in tqdm(enumerate(gen), total=len(gen)):
    while len(unbalanced_data) < 1000*(index+1):
        sequence = generate_unbalanced_with_stack_depth(512, i)
        balanced, count = is_balanced(sequence)
        # if balanced != -1*i:
            # print(i, end=' ')
            # continue
        unbalanced_data.append(sequence)
        
    # print(len(unbalanced_data))

print(len(unbalanced_data))
unbalanced_stack_depth = [ is_balanced(sequence) for sequence in unbalanced_data ]

unbalanced_count = [x[1] for x in unbalanced_stack_depth]
unbalanced_stack_depth = [x[0] for x in unbalanced_stack_depth]

unbalanced_df = pd.DataFrame(columns=['sequence', 'stack_depth', 'count'])
unbalanced_df['sequence'] = unbalanced_data
unbalanced_df['stack_depth'] = unbalanced_stack_depth
unbalanced_df['count'] = unbalanced_count

  0%|          | 0/18 [00:00<?, ?it/s]

-4
-4
-4
-4
-4
-4
-4
-2
-2
-2
-4
-2
-4
-4
-2
-2
-4
-2
-2
-2
-4
-2
-2
-4
-4
-4
-4
-4
-4
-4
-2
-4
-4
-2
-2
-2
-4
-4
-2
-2
-4
-4
-4
-2
-2
-2
-2
-4
-2
-1
-4
-4
-4
-4
-4
-2
-2
-4
-2
-4
-2
-4
-4
-2
-4
-4
-2
-4
-2
-2
-4
-2
-4
-4
-2
-4
-4
-4
-2
-2
-4
-2
-4
-4
-2
-2
-4
-4
-2
-2
-4
-4
-4
-4
-2
-2
-4
-4
-4
-4
-2
-4
-2
-4
-2
-4
-4
-2
-4
-2
-2
-2
-2
-2
-2
-4
-4
-4
-4
-2
-2
-2
-2
-2
-4
-4
-4
-4
-1
-4
-2
-4
-4
-4
-2
-2
-2
-4
-4
-4
-4
-4
-2
-4
-2
-4
-2
-2
-2
-4
-4
-2
-4
-2
-2
-2
-4
-2
-2
-4
-4
-2
-4
-2
-2
-4
-2
-2
-2
-2
-2
-4
-2
-4
-4
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-4
-2
-2
-4
-4
-4
-2
-4
-4
-4
-2
-2
-2
-4
-4
-4
-2
-4
-2
-4
-4
-4
-4
-4
-4
-4
-4
-2
-4
-4
-4
-2
-4
-4
-2
-4
-4
-4
-2
-2
-2
-4
-4
-2
-4
-4
-4
-2
-4
-1
-2
-2
-2
-2
-2
-2
-4
-2
-2
-2
-2
-4
-2
-2
-2
-4
-4
-4
-2
-2
-4
-4
-2
-2
-4
-4
-2
-4
-2
-4
-2
-4
-4
-2
-2
-2
-4
-2
-2
-4
-2
-2
-2
-4
-2
-4
-2
-2
-2
-4
-2
-4
-3
-2
-1
-2
-2
-4
-4
-2
-2
-4
-4
-4
-4
-2
-2
-4
-4
-4
-2
-4
-2
-4
-4
-4
-2
-2
-4
-4
-4
-2
-2
-4
-4
-2
-2
-2
-4
-4
-4
-4
-2
-2
-4
-2
-2
-4
-

In [144]:
# FOR COUNT DATA
unbalanced_data = []
for index, i in tqdm(enumerate(gen), total=len(gen)):
    while len(unbalanced_data) < 2000*(index+1):
        print(len(unbalanced_data), end='\r')
        sequence = generate_unbalanced_bracket_sequence_with_count(512, 5)
        balanced, count = is_balanced(sequence)
        # if balanced != -1*i:
            # print(i, end=' ')
            # continue
        unbalanced_data.append(sequence)
        
    # print(len(unbalanced_data))

print(len(unbalanced_data))
unbalanced_stack_depth = [ is_balanced(sequence) for sequence in unbalanced_data ]

unbalanced_count = [x[1] for x in unbalanced_stack_depth]
unbalanced_stack_depth = [x[0] for x in unbalanced_stack_depth]

  0%|          | 0/1 [00:00<?, ?it/s]

2
-4
0
-4
-4
-4
-2
-4
0
-4
-2
-4
-4
-4
-4
45
-2
27
08
09
20
-2
22
-2
-2
-4
46
47
48
-4
-4
01
42
43
24
25
-2
07
28
29
-4
01
02
43
44
-4
-4
07
-4
-2
-4
41
22
-2
-2
-4
46
-2
-2
29
-2
21
22
23
-2
-4
06
27
28
09
20
-2
02
43
-2
-2
-2
47
28
-4
20
21
02
-4
44
-2
-2
27
28
09
-4
-4
02
23
24
-4
26
-4
-2
-4
000
001
-42
003
204
-25
-46
407
208
-29
-40
411
-22
413
414
415
216
217
418
219
220
421
-22
-43
024
025
026
-27
-28
029
030
-41
-42
-23
-24
-45
036
037
-28
439
240
241
242
243
244
045
246
-47
-48
049
-20
-41
052
-43
254
255
056
057
-48
059
260
061
262
263
-24
065
-26
067
-28
-49
470
271
072
-23
474
475
076
277
278
279
480
481
082
483
084
-25
486
-27
288
-29
-40
491
-22
293
294
-25
496
297
-48
-29
-40
401
-22
403
-44
205
406
-27
-48
409
010
-41
012
-43
-44
-25
416
017
418
219
-40
421
222
-23
-44
225
026
027
-48
-29
430
231
-22
233
034
235
236
237
238
-49
040
441
042
043
044
-25
246
047
048
049
450
-41
052
453
-24
-25
-26
-27
058
259
-20
461
062
-23
464
265
-26
067
468
-29
-20
-41
-22
273
474
075

In [9]:
# Random data for CoT
unbalanced_df = pd.DataFrame(columns=['sequence', 'stack_depth', 'count'])

generated_seqs = set()

for i in tqdm(range(4000)):
    sequence = generate_unbalanced_brackets(64)
    balanced, count = is_balanced(sequence)
    
    if sequence in generated_seqs:
        continue
    
    unbalanced_df.loc[len(unbalanced_df)] = [sequence, balanced, count]
    generated_seqs.add(sequence)

  0%|          | 0/4000 [00:00<?, ?it/s]

In [170]:

# unbalanced_df = pd.DataFrame({'sequence': unbalanced_data, 'stack_depth': unbalanced_stack_depth, 'count': unbalanced_count}, columns=['sequence', 'stack_depth', 'count'])

In [15]:
balanced_df.head()

Unnamed: 0,sequence,stack_depth,count
0,()()(())()(()())()(())(()())()(()()())()()()()...,2,0
1,(()())()(())()()(())()()(()())(()())()()()(())...,2,0
2,()()()(()())(())()()(())()()(())()(()())(()()(...,2,0
3,()(()()())(()())(()()()())()()()()()(()()())((...,2,0
4,(()())()()()()(()())(())(()()()()())()(()())((...,2,0


In [146]:
unbalanced_df.head()

Unnamed: 0,sequence,stack_depth,count
0,(()()()()())(())()(())(()()())()(()()())(())((...,-2,-2
1,()()(())(())(())(())()()()(()())(()())(())()()...,-2,-2
2,()()(()())()(())()(())()()(())(())(()()())(()(...,-2,-2
3,(()())(())(())()()()(())(()()()())()(()()())((...,-2,-2
4,()()()()()(()())()(())(()())(()()()())()(())((...,-2,-2


In [22]:
bracket_data = pd.concat([balanced_df, unbalanced_df], ignore_index=True)

bracket_data = bracket_data.sample(frac=1).reset_index(drop=True)

In [23]:
bracket_data

Unnamed: 0,sequence,stack_depth,count
0,((()()())())()(())()(()(()))()((()()(((()((()(...,-40,-2
1,()()()()()()(())()()((()((((()))()())()(()()))...,-8,2
2,((()()())()()())()(()(((()()()()()))))(((())))...,-5,-2
3,()((()(()()(()())))())(())(((()(()))(()((()))(...,-80,2
4,()(((()()()))((()()()()))((()())()()()))((()()...,-4,2
...,...,...,...
35995,()(())(())((()()(((())))()()))()(())()((()()()...,30,0
35996,()(()())()()(())(()())()(())()))(())(())(())()...,-2,-2
35997,(((()()))())(())()()()(())()()()()()()(()(()()...,-4,-2
35998,(())(())()((()()))(())()()()(())(()((()()())))...,-4,-2


In [3]:
bracket_data = pd.read_csv('Data/train-512.csv')

In [4]:
# find minimum stack depth
min_stack_depth = bracket_data['stack_depth'].min()

min_stack_depth

-82

In [5]:
max_stack_depth = bracket_data['stack_depth'].max()

max_stack_depth

84

In [26]:
bracket_data.to_csv('Data/stack-depth-train.csv', index=False)

In [20]:
def generate_no_count(n):
    sequence = ['(' for i in range(n//2)] + [')' for i in range(n//2)]
    sequence = ''.join(sequence)
    # print(sequence, is_balanced(sequence))
    while is_balanced(sequence)[0] > 0:
        # randomly shuffle the sequence
        sequence = ''.join(random.sample(sequence, len(sequence)))
    return sequence

In [48]:
s = generate_no_count(10)
is_balanced(s), s

((-1, 0), '())())()((')

In [104]:
no_count_data = []
for i in tqdm(range(4000)):
    sequence = generate_no_count(512)
    no_count_data.append(sequence)
    
no_count_stack_depth = [ is_balanced(sequence) for sequence in no_count_data ]

no_count_count = [x[1] for x in no_count_stack_depth]
no_count_stack_depth = [x[0] for x in no_count_stack_depth]

no_count_df = pd.DataFrame({'sequence': no_count_data, 'stack_depth': no_count_stack_depth, 'count': no_count_count}, columns=['sequence', 'stack_depth', 'count'])

  0%|          | 0/4000 [00:00<?, ?it/s]

In [105]:
no_count_df.head()

Unnamed: 0,sequence,stack_depth,count
0,))()((((()((((())())((())()((()()())(())))()((...,0,0
1,))(()))(()))(())(((()()((()))(())(()(())(()(((...,0,0
2,)(((()(((()())())((()()(((()()()))())(()(())((...,0,0
3,()))((()())(())))))))(((()(()))())))()(())((()...,-1,0
4,)((()))))()))))((())(()())(())((())()))()))(()...,0,0


In [102]:
no_count_df.to_csv('no_count.csv', index=False)

In [106]:
# append bracket data and no count data
combined_df = pd.concat([bracket_data, no_count_df], ignore_index=True)

combined_df = combined_df.sample(frac=1).reset_index(drop=True)

combined_df.to_csv('balanced_with_no_count.csv', index=False)

In [2]:
import pandas as pd

data = pd.read_csv('balanced_with_no_count.csv')
data2 = pd.read_csv('train-2.csv')

# concat data
data = pd.concat([data, data2], ignore_index=True)

data = data.sample(frac=1).reset_index(drop=True)

data.to_csv('train.csv', index=False)

FileNotFoundError: [Errno 2] No such file or directory: 'train=2.csv'

In [5]:
df = pd.read_csv('Data/train-512.csv')
df2 = pd.read_csv('Data/count_min-0_max-0_per-count-20000_only-unbalanced_testing.csv')

df2 = df2.sample(n=4000).reset_index(drop=True)

# concat data
df = pd.concat([df, df2], ignore_index=True)

df = df.sample(frac=1).reset_index(drop=True)

df.to_csv('Data/train-512+4k.csv', index=False)