In [10]:
cfg_rules = {
    # Variables and digits
    "VARIABLE": ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z" ],
    "DIGIT": [str(i) for i in range(256)],

    # Operators
    "ARITHMETIC_OPERATOR": ["+", "-", "*", "/"],
    "RELATIONAL_OPERATOR": ["<", ">", "<=", ">=", "!=", "=="],
    "LOGICAL_OPERATOR_INFIX": ["and", "or"],
    "LOGICAL_OPERATOR_PREFIX": ["not"],
    "LOGICAL_OPERATOR": ["LOGICAL_OPERATOR_INFIX", "LOGICAL_OPERATOR_PREFIX"],
    "OPERATOR": ["ARITHMETIC_OPERATOR"],

    # Formatting
    "NEW_LINE": ["\n"],
    "TAB_INDENT": ["\t"],
    "BRACKET_OPEN": ['('],
    "BRACKET_CLOSE": [')'],
    "EQUALS": ["="],
    "COLON": [":"],
    "COMMA": [","],

    # Keywords
    "IF": ["if"],
    "ELIF": ["elif"],
    "ELSE": ["else"],
    "FOR": ["for"],
    "IN": ["in"],
    "RANGE": ["range"],
    "WHILE": ["while"],
    "PRINT": ["print"],

    # Initializations and assignments
    "IDENTIFIER_INITIALIZATION": ["IDENTIFIER_INITIALIZATION INITIALIZATION",
                                  "INITIALIZATION"],

    "INITIALIZATION": ["VARIABLE SPACE EQUALS SPACE DIGIT NEW_LINE"],
    
	"SIMPLE_ASSIGNMENT": ["VARIABLE SPACE EQUALS SPACE EXPRESSION NEW_LINE"],
    
	"ADVANCED_ASSIGNMENT": ["VARIABLE SPACE EQUALS SPACE SIMPLE_ARITHMETIC_EVALUATION NEW_LINE"],
    
	"SIMPLE_ARITHMETIC_EVALUATION": ["SIMPLE_ARITHMETIC_EVALUATION ARITHMETIC_OPERATOR ENCLOSED_EXPRESSION", 
                                     "ENCLOSED_EXPRESSION",
                                    ],
	
	# Terms and expressions
    "TERM": ["EXPRESSION_IDENTIFIER", "DIGIT"],
    "EXPRESSION": ["TERM SPACE OPERATOR SPACE TERM"],
    "ENCLOSED_EXPRESSION": ["BRACKET_OPEN EXPRESSION BRACKET_CLOSE"],
    "DISPLAY_EXPRESSION": ["EXPRESSION_IDENTIFIER SPACE OPERATOR SPACE EXPRESSION_IDENTIFIER",
                            "EXPRESSION_IDENTIFIER SPACE OPERATOR SPACE DIGIT"],

    # Conditions
    "SIMPLE_IF_STATEMENT": ["IF SPACE CONDITION SPACE COLON NEW_LINE"],
    "ADVANCED_IF_STATEMENT": ["IF SPACE CHAIN_CONDITION SPACE COLON NEW_LINE"],
    "SIMPLE_ELIF_STATEMENT": ["ELIF SPACE CONDITION SPACE COLON NEW_LINE"],
    "ADVANCED_ELIF_STATEMENT": ["ELIF SPACE CHAIN_CONDITION SPACE COLON NEW_LINE"],
    "ELSE_STATEMENT": ["ELSE SPACE COLON NEW_LINE"],

    "CHAIN_CONDITION": ["CHAIN_CONDITION SPACE LOGICAL_OPERATOR_INFIX SPACE ENCLOSED_CONDITION", 
                        "LOGICAL_OPERATOR_PREFIX SPACE ENCLOSED_CONDITION", 
                        "ENCLOSED_CONDITION"],
    "ENCLOSED_CONDITION": ["BRACKET_OPEN CONDITION BRACKET_CLOSE"],
    "CONDITION": ["OPTIONAL_NOT CONDITION_EXPRESSION", "CONDITION_EXPRESSION"],
    "CONDITION_EXPRESSION": ["EXPRESSION_IDENTIFIER SPACE RELATIONAL_OPERATOR SPACE EXPRESSION_IDENTIFIER", 
                                "EXPRESSION_IDENTIFIER SPACE RELATIONAL_OPERATOR SPACE DIGIT"],
    "OPTIONAL_NOT": ["LOGICAL_OPERATOR_PREFIX SPACE", "SPACE"], 

    # Loops
    "FOR_HEADER": ["FOR SPACE EXPRESSION_IDENTIFIER SPACE IN SPACE RANGE BRACKET_OPEN INITIAL COMMA SPACE FINAL COMMA SPACE STEP BRACKET_CLOSE SPACE COLON NEW_LINE", 
                    "FOR SPACE EXPRESSION_IDENTIFIER SPACE IN SPACE RANGE BRACKET_OPEN INITIAL COMMA SPACE FINAL BRACKET_CLOSE SPACE COLON NEW_LINE"],
    "INITIAL": ["DIGIT"],

    "FOR_LOOP": ["FOR_HEADER NEW_LINE TAB_INDENT DISPLAY"],
    "ADVANCED_FOR_LOOP": ["FOR_LOOP",
						  "FOR_HEADER NEW_LINE TAB_INDENT ADVANCED_DISPLAY"],
	
	# Displaying 
	"DISPLAY" : ["PRINT BRACKET_OPEN DISPLAY_IDENTIFIER BRACKET_CLOSE NEW_LINE"],
	"ADVANCED_DISPLAY" : ["DISPLAY",
					   	  "PRINT BRACKET_OPEN DISPLAY_EXPRESSION BRACKET_CLOSE NEW_LINE"],
}

In [4]:
import random

In [23]:
def generate_code(symbol, assigned_identifiers, last_variable, for_init_step)->str:
	"""
	Generate code recursively based on the context-free grammar rules.

	Parameters:
	- symbol (str): The symbol to generate code for.
	- assigned_identifiers (dict): Dictionary of assigned identifiers and their values.
	- last_variable (set): Set of the last used variables.
	- parent (Node): Parent node in the syntax tree.

	Returns:
	- str: The generated code.
	"""
	#node = Node(symbol, parent=parent)
	global init_count
	global max_init
	# If the symbol is a non-terminal <--> it's a production rule (PR)
	if symbol in cfg_rules:
		# We check if the PR is an IDENTIFIER_INITIALIZATION in which case we check if we didn't go past max_init initializations
		if symbol == "IDENTIFIER_INITIALIZATION":
			if init_count < max_init:
				init_count += 1
			else:
				symbol = "INITIALIZATION"
		# We developp the PR
		rule = random.choice(cfg_rules[symbol])
		symbols = rule.split(" ")
		# We call the generate code function to get the string associated with this PR
		generated_symbols = [generate_code(s, assigned_identifiers, last_variable, for_init_step) for s in symbols]
		# If it's an INITIAL=>DIGIT PR , we record the DIGIT=>0..255 value in the for_init_step dictionary (will be used when calculating the FINAL of the for loop)
		if symbol == "INITIAL":
			init = generated_symbols[0]
			for_init_step["initial_value"] = init
		# Elif it's an INITIALIZATION PR, we record the generated VARIABLE and it's DIGIT value in the assigned_identifiers dictionary
		elif symbol == "INITIALIZATION":
			variable_name = generated_symbols[0]
			variable_value = generated_symbols[4]  
			assigned_identifiers[variable_name] = variable_value
		# Elif it's a SIMPLE/ADVANCED_ASSIGNMENTS PR, we record the generated VARIABLE in the last_variable set (for it to be printed later ...)
		elif (symbol == "SIMPLE_ASSIGNMENTS") or (symbol == "ADVANCED_ASSIGNMENTS"):
			# We check if the SIMPLE/ADVANCED_ASSIGNMENTS PR didn't develop to "" (in which case it's just as if didn't exist ...)
			if generated_symbols[0]:
				last_variable.add(generated_symbols[0])
		# Concatenate the generated_sub_codes and return the resulting sub_code
		return ''.join(generated_symbols)

	# Else the symbol is a (meta-)terminal, a terminal being one that is returned as is (the simplest case), and a meta-terminal must be generated based on past generations   
	# If EXPRESSION_IDENTIFIER (like we find in ASSIGNEMENTS, DISPLAYS, and FOR loops), we choose randomly among one of the previously initialized variables
	# NOTE: FOR loops don't require the control variable to be initialized -> this could be a point of generalization
	if symbol == "EXPRESSION_IDENTIFIER":
		identifier = random.choice(tuple(assigned_identifiers.keys())) if assigned_identifiers else random.choice(cfg_rules["DIGIT"])
		return identifier
	# If EXPRESSION_IDENTIFIER_WHILE (i.e. "the declaration" of the control variable of the while loop)
	# NOTE: this one contrary to for loop ... must be one of the existing initialized variables
	if symbol == "EXPRESSION_IDENTIFIER_WHILE":
		initial_var = random.choice(tuple(assigned_identifiers.keys())) if assigned_identifiers else random.choice(cfg_rules["DIGIT"])
		for_init_step["initial_var"] = initial_var
		for_init_step["initial_value"] = assigned_identifiers[initial_var]
		return initial_var    
	# If WHILE_IDENTIFIER (i.e. the "update" of the control variable of the while loop), get it from the for_init_step dictionary (filled by the EXPRESSION_IDENTIFIER_WHILE meta-terminal)
	if symbol == "WHILE_IDENTIFIER":
		return for_init_step.get("initial_var", "*")
	# If the symbol is a FINAL (for the for loop) or FINAL_LESS (for the while <= loop), choose a step and number of executions, compute the FINAL/_LESS using the for_init_step dict, and record the setp for the for loop as it will be needed later to fill the STEP meta-terminal
	if (symbol == "FINAL") or (symbol == "FINAL_LESS"):    
		initial_value = for_init_step.get("initial_value", "0")
		# Generate valid step_value and execution_count
		valid_values = [(1, 2), (2, 1), (2, 2), (2, 3), (3, 2)]
		step_value, execution_count = random.choice(valid_values)
		for_init_step["step"] = str(step_value)
		final_value = step_value * execution_count + int(initial_value) - 1
		return str(final_value)
	# Same thing as for the one before but this one is only meant for the while loop
	if symbol == "FINAL_GREATER":
		initial_value = for_init_step.get("initial_value", "0")
		# Generate valid step_value and execution_count
		valid_values = [(1, 2), (2, 1), (2, 2), (2, 3), (3, 2)]
		step_value, execution_count = random.choice(valid_values)
		for_init_step["step"] = str(step_value)
		final_value = int(initial_value) - step_value * execution_count + 1
		return str(final_value)
	# If the STEP meta variable, fill it with the for_init_step dict  
	if symbol == "STEP":
		return for_init_step.get("step", "0")
	# If DISPLAY_IDENTIFIER, fill it with either the last variable (if there was an ASSIGNEMENTS), or any randomly chosen variable 
	if symbol == "DISPLAY_IDENTIFIER":
		try:
			return f"{tuple(last_variable)[0]}"
		except Exception:
			try:
				return f"{random.choice(tuple(assigned_identifiers.keys()))}"
			except Exception:
				return random.choice(cfg_rules["DIGIT"])
	# If non of the above i.e. its a terminal (not a meta-terminal)
	return symbol

In [76]:
pattern = """
IDENTIFIER_INITIALIZATION
DISPLAY
SIMPLE_IF_STATEMENT
	SIMPLE_ASSIGNMENT
	FOR_HEADER
		IDENTIFIER_INITIALIZATION
		ADVANCED_ASSIGNMENT
		ADVANCED_DISPLAY
SIMPLE_ELIF_STATEMENT
	SIMPLE_IF_STATEMENT
		ADVANCED_ASSIGNMENT
	ELSE_STATEMENT:
		DISPLAY
"""

pattern_vocabulary = {
    "IDENTIFIER_INITIALIZATION",
    "SIMPLE_ASSIGNMENT",
    "ADVANCED_ASSIGNMENT",
    "SIMPLE_IF_STATEMENT",
    "SIMPLE_ELIF_STATEMENT",
    "ELSE_STATEMENT",
    "WHILE_HEADER",
    "FOR_HEADER",
	"DISPLAY",
	"ADVANCED_DISPLAY"
}

pattern_vocab_for_regex = "|".join(pattern_vocabulary)
pattern_vocab_for_regex

'ADVANCED_DISPLAY|IDENTIFIER_INITIALIZATION|WHILE_HEADER|FOR_HEADER|SIMPLE_ASSIGNMENT|ADVANCED_ASSIGNMENT|SIMPLE_ELIF_STATEMENT|ELSE_STATEMENT|DISPLAY|SIMPLE_IF_STATEMENT'

In [83]:
import re
from tqdm import tqdm
re_pattern_line_parser = re.compile(f"(\t*)({pattern_vocab_for_regex})")
re_general_line_finder = re.compile(".+(?:\n|$)")
file = open('data.txt', "w")
for i in tqdm(range(1)):
	init_count = 0
	max_init = 3
	code = ""
	available_identifiers = dict()
	last_variable = set()
	for_init_step = dict()
	available_identifiers_stack = [dict()] # each level of the stack is a dictionary containing the available identifiers and their values
	# Decompose in pattern lines
	pattern_lines = pattern.strip().split("\n")
	# Loop through the pattern lines and generate their codes
	for pattern_line in pattern_lines:
		pattern_line_match = re_pattern_line_parser.match(pattern_line)
		new_indent_level = len(pattern_line_match.group(1))
		current_indent_level = len(available_identifiers_stack)-1
		if new_indent_level > current_indent_level:
			available_identifiers_stack.append(dict(available_identifiers_stack[-1]))
		elif new_indent_level < current_indent_level:
			available_identifiers_stack.pop()
		available_identifiers = available_identifiers_stack[-1]
		keyword = pattern_line_match.group(2)
		keyword_gen_code = generate_code(keyword, available_identifiers, last_variable, for_init_step).replace("SPACE", " ")
		# maybe here check if the keyword is a while header, we could remove the chosen control variable from the available identifiers of this indent level and onwards ...
		code_line = pattern_line_match.group(1) + pattern_line_match.group(1).join(re_general_line_finder.findall(keyword_gen_code))
		code += code_line
	try:
		exec(code,{})
	except ZeroDivisionError:
		pass
	file.write(code+"\n\n")
file.close()

100%|██████████| 1/1 [00:00<?, ?it/s]

166
122





In [None]:
import code_generator
cg = code_generator.CodeGenerator()
cg.generate_and_write_programs(1000000, "ALL")