In [1]:
import random
from tqdm.notebook import tqdm
import multiprocessing
import pandas as pd

In [2]:
def generate_balanced_brackets(n):
    """Return a random balanced bracket string of exactly ``n`` characters.

    The string is built left to right. For every position a fair coin is
    drawn; it decides between ``(`` and ``)`` unless the choice is forced:

    * with no ``(`` currently unmatched, the next character must be ``(``;
    * once the number of unmatched ``(`` equals the number of positions
      left, every remaining character must be ``)``.

    Args:
        n: Length of the sequence. Must be even. ``0`` (or a negative even
            value) yields the empty string.

    Returns:
        A balanced string made of ``(`` and ``)``.

    Raises:
        ValueError: If ``n`` is odd. No balanced sequence of odd length
            exists; without this check an unbalanced string would be
            returned silently.
    """
    if n % 2:
        raise ValueError(f'a balanced sequence needs an even length, got {n}')

    pieces = []
    depth = 0  # number of '(' that are still waiting for their ')'

    for position in range(n):
        remaining = n - position
        # The coin is drawn on every step, even when the outcome is forced,
        # so exactly one random draw is consumed per character.
        coin = random.choice([0, 1])

        if depth == 0:
            closing = False          # nothing to close yet
        elif depth >= remaining:
            closing = True           # every slot left is needed for a ')'
        else:
            closing = bool(coin)

        if closing:
            pieces.append(')')
            depth -= 1
        else:
            pieces.append('(')
            depth += 1

    return ''.join(pieces)

def is_balanced(s):
    """Measure the nesting depth and the bracket surplus of ``s``.

    Returns a ``(depth, count)`` pair.

    ``depth`` is the deepest nesting reached while scanning from the left.
    The scan stops at the first ``)`` that has nothing to close, so for such
    strings only the prefix before that point contributes. The value is
    negated when the string is not balanced (a stray ``)`` was found, or some
    openers were never closed). Because ``-0 == 0``, an unbalanced string that
    never nested (for example one starting with ``)``) reports ``0``, as does
    the empty string.

    ``count`` is the number of ``(`` minus the number of all other characters
    over the whole string.

    Note that the depth scan treats every character other than ``)`` as an
    opener.
    """
    open_now = 0
    deepest = 0
    stray_close = False

    for ch in s:
        if ch != ')':
            open_now += 1
            if open_now > deepest:
                deepest = open_now
        elif open_now:
            open_now -= 1
        else:
            # A ')' with nothing to match: stop scanning right here.
            stray_close = True
            break

    surplus = sum(1 if ch == '(' else -1 for ch in s)

    unbalanced = stray_close or open_now > 0
    return (-deepest if unbalanced else deepest), surplus

def generate_bracket_sequence(n):
    """Draw random bracket strings of length ``n`` until one is not balanced.

    Each character is chosen independently from ``(`` and ``)``. A candidate
    is returned as soon as ``is_balanced(candidate)[0] <= 0``, i.e. when it is
    unbalanced (or empty, for ``n == 0``); balanced candidates are discarded
    and a fresh one is drawn.
    """
    alphabet = ['(', ')']
    while True:
        candidate = ''.join(random.choice(alphabet) for _ in range(n))
        if is_balanced(candidate)[0] <= 0:
            return candidate
    
def generate_bracket_sequence_2(n, min_count):
    """Sample a random unbalanced bracket string with a small bracket surplus.

    Candidates of length ``n`` are drawn character by character and rejected
    until one satisfies both conditions:

    * ``is_balanced(candidate)[0] < 0`` -- the string is unbalanced and opened
      at least one bracket before the depth scan stopped, so candidates that
      start with ``)`` are never returned;
    * ``abs(is_balanced(candidate)[1]) < min_count``.

    Args:
        n: Length of the sequence.
        min_count: Despite its name, an *exclusive upper bound* on the
            absolute difference between the numbers of ``(`` and ``)``.

    Returns:
        The first candidate that passes both conditions.

    Raises:
        ValueError: If no string of length ``n`` can pass, which would
            otherwise make the sampler run forever.
    """
    # Smallest |count| that an acceptable string of this length can have:
    #   n <= 0 -> nothing is acceptable (the empty string has depth 0)
    #   n == 2 -> only '((' qualifies, with |count| == 2
    #   odd n  -> the surplus is odd, hence at least 1 (e.g. '(()')
    #   else   -> 0 (e.g. '())(' followed by '()' pairs)
    if n <= 0:
        reachable = None
    elif n == 2:
        reachable = 2
    else:
        reachable = n % 2
    if reachable is None or min_count <= reachable:
        raise ValueError(
            f'no unbalanced sequence of length {n} has |count| < {min_count}'
        )

    brackets = ['(', ')']
    # Rejection sampling as a loop: retrying through recursion exhausts the
    # interpreter's recursion limit after a long run of rejected candidates.
    while True:
        sequence = ''.join(random.choice(brackets) for _ in range(n))
        depth, count = is_balanced(sequence)
        if depth < 0 and abs(count) < min_count:
            return sequence

In [3]:
def generate_unbalanced_brackets(n):
    """Return a random bracket string of length ``n`` that is not balanced.

    ``generate_bracket_sequence`` only returns candidates for which
    ``is_balanced(candidate)[0] <= 0``, so its result already meets the
    requirement and is passed through unchanged. A follow-up "flip one
    bracket while the string is balanced" step would never execute and is
    therefore not needed.
    """
    return generate_bracket_sequence(n)

In [4]:
def generate_unbalanced_with_count(n, count, attempts=200):
    """Generate distinct unbalanced strings whose bracket surplus is ``+/-count``.

    For every attempt a fair coin decides whether ``(`` or ``)`` is the
    majority bracket. A string with ``(n + count) // 2`` majority and
    ``(n - count) // 2`` minority brackets is then permuted at random, and
    re-permuted for as long as ``is_balanced`` reports a positive depth.
    Duplicates are dropped.

    Args:
        n: Target sequence length.
        count: Absolute difference between the numbers of ``(`` and ``)``.
            ``n`` and ``count`` should have the same parity and satisfy
            ``abs(count) <= n``; otherwise the generated strings do not have
            length ``n``.
        attempts: Number of sequences drawn before de-duplication.

    Returns:
        A list of the unique sequences (at most ``attempts`` items, in
        arbitrary order).
    """
    majority = (n + count) // 2
    minority = (n - count) // 2
    unique = set()

    for _ in range(attempts):
        # The sign of the surplus is decided afresh for every sample. A flag
        # that is set once and never cleared would lock onto "more opens"
        # after the first heads and skew the whole data set.
        if random.random() > 0.5:
            sequence = '(' * majority + ')' * minority
        else:
            sequence = '(' * minority + ')' * majority

        # Randomly permute; repeat while the permutation happens to be balanced.
        sequence = ''.join(random.sample(sequence, len(sequence)))
        while is_balanced(sequence)[0] > 0:
            sequence = ''.join(random.sample(sequence, len(sequence)))

        unique.add(sequence)

    return list(unique)

In [5]:
# One row per generated string, for every even surplus 0, 2, ..., 100.
# The rows are gathered in a plain list and turned into a DataFrame once;
# growing a frame with .loc[len(df)] copies it on every single insert.
rows = []
for i in range(0, 101, 2):
    for s in generate_unbalanced_with_count(512, i):
        rows.append((s, is_balanced(s)[1]))

generated_strings = pd.DataFrame(rows, columns=['string', 'count'])

generated_strings.head()

Unnamed: 0,string,count
0,()()(())()((((()(((())()())()())(()))())((()()...,0
1,((((()(()))(()()()(())))()()(()((()()()))())))...,0
2,)()())())())))()((())())()(((((()())((((((((()...,0
3,(()()))()())())))((())(()))(())()()())(())))((...,0
4,)(()))((((())()))())((())()))()(((())))()())((...,0


In [6]:
# Persist the generated (string, count) table; the index column is not written.
generated_strings.to_csv('new_test_data.csv', index=False)

In [5]:
# Smoke test: draw one unbalanced sequence of length 10.
generate_unbalanced_brackets(10)

'(())))))()'

In [6]:
# Sanity check: generate one sequence of length 100 and print its
# (depth, count) labels, the sequence itself and its length.
s = generate_balanced_brackets(100)
print(is_balanced(s), s, len(s))

(10, 0) ((()(()())((())(())(((())))(())(()))))((()()()(())))()(()(((((()))(((((()))(()))(())((()(()))))))))) 100


In [None]:
# Generating according to stack depth
# Collects unique balanced sequences of length 512 until the depth currently
# being targeted has 1000 of them; sequences whose depth is another target in
# `gen` are kept as well (capped at 1000 per depth).
#
# NOTE(review): `generate_balanced_brackets` as defined in this notebook takes
# a single argument (n). The three-argument call below raises TypeError against
# that definition; this cell presumably targets a depth-aware version of the
# helper that is not shown here -- confirm before re-running.
balanced_df = pd.DataFrame(columns=['sequence', 'stack_depth', 'count'])

# seqs[d] counts how many sequences with stack depth d have been stored.
seqs = [0 for i in range(400)]

generated_seqs = set()

# Target depths: 2..9, then 10..100 in steps of 10.
gen = [i for i in range(2, 10)] + [i for i in range(10, 101, 10)]
print(f'Generating sequences in the following stack depths: {gen}')

for j in tqdm(gen):
    while True:
        length = 512
        
        sequence = generate_balanced_brackets(length, j, j)
        balanced, count = is_balanced(sequence)
        
        # Skip when this depth's quota is full, the sequence is a duplicate,
        # or its depth is not one of the targets.
        if seqs[balanced] == 1000 or sequence in generated_seqs or balanced not in gen:
            continue
            
        seqs[balanced] += 1
        
        balanced_df.loc[len(balanced_df)] = [sequence, balanced, count]
        generated_seqs.add(sequence)
        
        # Move on to the next target once depth j has reached its quota.
        if seqs[j] >= 1000:
            break
    # print(seqs)

In [None]:
# Generating according to count
# Collects 2000 unique balanced sequences of length 512, each requested with a
# random depth between 2 and 50. `gen` holds a single entry here, so seqs[1]
# simply acts as the running total.
#
# NOTE(review): `generate_balanced_brackets` as defined in this notebook takes
# a single argument (n). The three-argument call below raises TypeError against
# that definition; this cell presumably targets a depth-aware version of the
# helper that is not shown here -- confirm before re-running.
balanced_df = pd.DataFrame(columns=['sequence', 'stack_depth', 'count'])

seqs = [0 for i in range(400)]

generated_seqs = set()

# gen = [i for i in range(2, 10)] + [i for i in range(10, 101, 10)]
gen = [1]
print(f'Generating sequences in the following stack depths: {gen}')

for j in tqdm(gen):
    while True:
        length = 512
        
        depth = random.randint(2, 50)
        # Progress indicator, overwritten in place via the carriage return.
        print(seqs[j], end='\r')
        sequence = generate_balanced_brackets(length, depth, depth)
        balanced, count = is_balanced(sequence)
        
        # Skip duplicates (and anything past the quota).
        if seqs[j] == 2000 or sequence in generated_seqs:
            continue
            
        seqs[j] += 1
        
        balanced_df.loc[len(balanced_df)] = [sequence, balanced, count]
        generated_seqs.add(sequence)
        
        if seqs[j] >= 2000:
            break
    # print(seqs)

In [7]:
# Just generating random sequences
# 4000 draws of balanced sequences of length 64. A duplicate draw is skipped
# rather than retried, so the final frame can hold fewer than 4000 rows.
generated_seqs = set()
balanced_rows = []

length = 64
for j in tqdm(range(4000)):
    sequence = generate_balanced_brackets(length)

    if sequence in generated_seqs:
        continue
    generated_seqs.add(sequence)

    balanced, count = is_balanced(sequence)
    balanced_rows.append((sequence, balanced, count))

# Build the frame once; appending with .loc[len(df)] copies it on every insert.
balanced_df = pd.DataFrame(balanced_rows, columns=['sequence', 'stack_depth', 'count'])

  0%|          | 0/4000 [00:00<?, ?it/s]

In [8]:
# Number of rows kept after de-duplication, followed by a preview.
print(len(balanced_df))
balanced_df.head()

4000


Unnamed: 0,sequence,stack_depth,count
0,()()()(()(())(()(()()(())()))())()(())()()((()...,5,0
1,()(()(((()()())))((()()()(((((()((((((())(((((...,18,0
2,()()()()((()()()((()))()(())())(()())(())((()(...,5,0
3,()((((()((()((())()())())))((((((())())()()))(...,10,0
4,(())()((())()((()(((())((()))()))()()))(((()))...,8,0


In [120]:

# Collects 1000 unbalanced sequences of length 512 per entry of `gen`, then
# labels them with is_balanced and splits the labels into two lists.
#
# NOTE(review): `generate_bracket_sequence` as defined in this notebook takes a
# single argument (n); the two-argument call below raises TypeError against
# that definition. The cell presumably ran against a different version of the
# helper -- confirm before re-running. `gen` must come from an earlier cell.
unbalanced_data = []
for index, i in tqdm(enumerate(gen), total=len(gen)):
    while len(unbalanced_data) < 1000*(index+1):
        sequence = generate_bracket_sequence(512, i)
        balanced, count = is_balanced(sequence)
        # if balanced != -1*i:
            # print(i, end=' ')
            # continue
        unbalanced_data.append(sequence)
        
    # print(len(unbalanced_data))

print(len(unbalanced_data))
# First a list of (depth, count) pairs ...
unbalanced_stack_depth = [ is_balanced(sequence) for sequence in unbalanced_data ]

# ... then split into separate lists (the name is rebound to plain depths).
unbalanced_count = [x[1] for x in unbalanced_stack_depth]
unbalanced_stack_depth = [x[0] for x in unbalanced_stack_depth]

  0%|          | 0/1 [00:00<?, ?it/s]

1000


In [168]:
# FOR COUNT DATA
# Collects 2000 unbalanced sequences of length 512 per entry of `gen`
# (which must already be defined by an earlier cell). Every sequence comes
# from generate_bracket_sequence_2(512, 5), i.e. its |count| is below 5.
unbalanced_data = []
for index, i in tqdm(enumerate(gen), total=len(gen)):
    while len(unbalanced_data) < 2000*(index+1):
        print(len(unbalanced_data), end='\r')
        # No per-sequence filtering happens here (a former depth filter is
        # disabled), so the sequence is stored as generated and labelled
        # once, after the loop.
        unbalanced_data.append(generate_bracket_sequence_2(512, 5))

print(len(unbalanced_data))

# Label every sequence exactly once, then split the (depth, count) pairs.
unbalanced_labels = [is_balanced(sequence) for sequence in unbalanced_data]

unbalanced_stack_depth = [depth for depth, _ in unbalanced_labels]
unbalanced_count = [count for _, count in unbalanced_labels]

  0%|          | 0/1 [00:00<?, ?it/s]

2000


In [9]:
# Random data for CoT
# 4000 draws of unbalanced sequences of length 64. A duplicate draw is skipped
# rather than retried, so the final frame can hold fewer than 4000 rows.
generated_seqs = set()
unbalanced_rows = []

for i in tqdm(range(4000)):
    sequence = generate_unbalanced_brackets(64)

    if sequence in generated_seqs:
        continue
    generated_seqs.add(sequence)

    balanced, count = is_balanced(sequence)
    unbalanced_rows.append((sequence, balanced, count))

# Build the frame once; appending with .loc[len(df)] copies it on every insert.
unbalanced_df = pd.DataFrame(unbalanced_rows, columns=['sequence', 'stack_depth', 'count'])

  0%|          | 0/4000 [00:00<?, ?it/s]

In [170]:

# unbalanced_df = pd.DataFrame({'sequence': unbalanced_data, 'stack_depth': unbalanced_stack_depth, 'count': unbalanced_count}, columns=['sequence', 'stack_depth', 'count'])

In [10]:
# Preview the first rows of the unbalanced set.
unbalanced_df.head()

Unnamed: 0,sequence,stack_depth,count
0,(()((()())((((()()()(())))(()()((())(((())()))...,-10,4
1,((()))(()(()))))))((())(())))))()(()(()(((((()...,-3,-6
2,)(((()))()(((()))(())))))(()((()())()))))(())(...,0,-2
3,())()))(()())))())))((()(()(((((())(()())))(()...,-1,0
4,((()(()(())))())(())())((()))))))())((((()()))...,-5,0


In [11]:
# Stack the balanced and unbalanced examples, shuffle all rows and renumber
# the index from 0.
bracket_data = (
    pd.concat([balanced_df, unbalanced_df], ignore_index=True)
    .sample(frac=1)
    .reset_index(drop=True)
)

In [12]:
# Display the combined, shuffled frame.
bracket_data

Unnamed: 0,sequence,stack_depth,count
0,()(())()()((()))(())(())()(())()((()())((()(((...,9,0
1,))()())(()))((()()())))()())))())((()(()()((((...,0,-6
2,)))))()))()))(((()())()((()))((((()(()(())(())...,0,0
3,()(())()()()()()()()(((()(()(((()())((((()()((...,12,0
4,(()((()())(()(((())))(((()()()()(()()()))))))(...,8,0
...,...,...,...
7995,)(())))((((()(()()((((())(())((())((())()))(((...,0,6
7996,()()((()))()((()(()()))())()(())(((())((((((()...,12,0
7997,))))(())(()())()))))()((()(()())()))(()()))(()...,0,-6
7998,(((()()(()(()(((()))))(()))()()(())(()(()())))...,9,0


In [13]:
# Smallest stack_depth label in the combined data. is_balanced negates the
# depth of unbalanced sequences, so this is the deepest unbalanced example.
min_stack_depth = bracket_data['stack_depth'].min()

min_stack_depth

-27

In [14]:
# Largest stack_depth label in the combined data (positive depths belong to
# balanced sequences).
max_stack_depth = bracket_data['stack_depth'].max()

max_stack_depth

27

In [15]:
# Write the combined, shuffled data set to disk without the index column.
bracket_data.to_csv('Data/train-CoT-8k.csv', index=False)

In [20]:
def generate_no_count(n):
    """Return an unbalanced string with ``n // 2`` opening and ``n // 2`` closing brackets.

    The starting point is the fully nested arrangement (all openers, then all
    closers). As long as ``is_balanced`` reports a positive depth for the
    current arrangement, a new random permutation of it is drawn. The result
    therefore has equally many ``(`` and ``)`` but is not balanced. For
    ``n < 2`` the empty string is returned.
    """
    half = n // 2
    sequence = '(' * half + ')' * half
    while True:
        if is_balanced(sequence)[0] <= 0:
            return sequence
        # Still balanced: try another random permutation.
        sequence = ''.join(random.sample(sequence, len(sequence)))

In [48]:
# Smoke test: one zero-surplus unbalanced sequence of length 10, shown
# together with its (depth, count) labels.
s = generate_no_count(10)
is_balanced(s), s

((-1, 0), '())())()((')

In [104]:
# Draw 4000 unbalanced sequences of length 512 that contain equally many
# opening and closing brackets.
no_count_data = []
for i in tqdm(range(4000)):
    sequence = generate_no_count(512)
    no_count_data += [sequence]

# Label each sequence once and unzip the (depth, count) pairs into two lists.
no_count_stack_depth, no_count_count = (
    list(column) for column in zip(*map(is_balanced, no_count_data))
)

no_count_df = pd.DataFrame(
    {
        'sequence': no_count_data,
        'stack_depth': no_count_stack_depth,
        'count': no_count_count,
    },
    columns=['sequence', 'stack_depth', 'count'],
)

  0%|          | 0/4000 [00:00<?, ?it/s]

In [105]:
# Preview the first rows of the zero-surplus set.
no_count_df.head()

Unnamed: 0,sequence,stack_depth,count
0,))()((((()((((())())((())()((()()())(())))()((...,0,0
1,))(()))(()))(())(((()()((()))(())(()(())(()(((...,0,0
2,)(((()(((()())())((()()(((()()()))())(()(())((...,0,0
3,()))((()())(())))))))(((()(()))())))()(())((()...,-1,0
4,)((()))))()))))((())(()())(())((())()))()))(()...,0,0


In [102]:
# Persist the zero-surplus set without the index column.
no_count_df.to_csv('no_count.csv', index=False)

In [106]:
# Append the zero-surplus set to the bracket data, shuffle all rows,
# renumber the index and write the result to disk.
combined_df = (
    pd.concat([bracket_data, no_count_df], ignore_index=True)
    .sample(frac=1)
    .reset_index(drop=True)
)

combined_df.to_csv('balanced_with_no_count.csv', index=False)

In [2]:
import pandas as pd

# NOTE(review): the error output recorded for this cell names 'train=2.csv',
# while the code reads 'train-2.csv' -- the saved output appears to stem from
# an earlier run; re-run to confirm the file is found.
data = pd.read_csv('balanced_with_no_count.csv')
data2 = pd.read_csv('train-2.csv')

# Append the second file, shuffle all rows and renumber the index.
data = (
    pd.concat([data, data2], ignore_index=True)
    .sample(frac=1)
    .reset_index(drop=True)
)

data.to_csv('train.csv', index=False)

FileNotFoundError: [Errno 2] No such file or directory: 'train=2.csv'

In [18]:
# Down-sample the CoT training file to 1000 randomly chosen rows and save
# the subset next to it.
df = pd.read_csv('Data/train-CoT.csv').sample(n=1000)
df.to_csv('Data/train-CoT-small.csv', index=False)