In [1]:
import re
import json
from collections import Counter, defaultdict
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import sympy as sp

# Configuration (tokenizer and adapter both come from the Se124M100KInfPrompt_endtoken_2 repo)
# Hub repo passed to AutoTokenizer.from_pretrained.
TOKENIZER_REPO = "augustocsc/Se124M100KInfPrompt_endtoken_2"
# Hub repo passed to PeftModel.from_pretrained as the LoRA adapter.
LORA_REPO = "augustocsc/Se124M100KInfPrompt_endtoken_2"
# Base causal LM that the adapter is attached to.
BASE_MODEL = "gpt2"
# Prompt fed to the model; it ends with <startofex>, the opening marker the
# extraction regex looks for.
# NOTE(review): the three lines presumably list variables, allowed operators
# and the constant placeholder -- confirm against the training data format.
PROMPT = "x_1,x_2,x_3\n*, **, +, -, asin, exp, sin, tan\nC\n<startofex>"  
# Batch size / number of returned sequences used by the sampling loop.
GENERATE_BATCH = 10
# Number of times the sampling loop is repeated.
REPEAT_TIMES = 1
# JSON file receiving the raw extracted expressions.
OUTPUT_EXPR_FILE = "generated_expressions.json"
# JSON file receiving the computed analysis metrics.
OUTPUT_ANALYSIS_FILE = "analysis_results.json"



In [2]:
# Regex capturing whatever sits between the start and end expression markers.
pattern = re.compile(r"<startofex>(.*?)<endofex>", flags=re.DOTALL)

# Tokenizer first: its vocabulary size drives the embedding resize below.
print("Loading tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_REPO)

# Base network, grown to the tokenizer's vocabulary, then wrapped by the LoRA adapter.
# NOTE(review): the resize initialises embedding rows for the added tokens;
# whether the adapter checkpoint restores trained values for those rows is not
# visible from this notebook -- confirm.
base_network = AutoModelForCausalLM.from_pretrained(BASE_MODEL)
base_network.resize_token_embeddings(len(tokenizer))
model = PeftModel.from_pretrained(base_network, LORA_REPO)

# Inference only: switch off dropout and other train-time behaviour.
model.eval()


Loading tokenizer and model...


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [3]:
# Collected expression strings (text found between <startofex> and <endofex>).
all_expressions = []

# Resolve once, outside the loop:
# - an explicit pad id silences the "Setting `pad_token_id`..." warning and
#   falls back to EOS when the tokenizer defines no pad token;
# - the inputs must live on the same device as the model weights.
pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
model_device = next(model.parameters()).device

# Generation loop
for run in range(REPEAT_TIMES):
    print(f"Run {run+1}/{REPEAT_TIMES}: Generating {GENERATE_BATCH} samples...")
    # Encode the prompt ONCE and let `num_return_sequences` fan it out.
    # Batching the prompt GENERATE_BATCH times *and* asking for
    # GENERATE_BATCH return sequences produced GENERATE_BATCH**2 samples.
    inputs = tokenizer(PROMPT, return_tensors="pt").to(model_device)
    outputs = model.generate(
        **inputs,
        # Same budget as the former `max_length = prompt_len + 100`.
        max_new_tokens=100,
        do_sample=True,
        top_p=0.9,
        top_k=50,
        temperature=0.7,
        num_return_sequences=GENERATE_BATCH,
        # `early_stopping` was dropped: it only affects beam-based decoding.
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=pad_token_id,
    )
    for out in outputs:
        # Keep special tokens: the regex needs the <startofex>/<endofex> markers.
        text = tokenizer.decode(out, skip_special_tokens=False)
        # Samples that never emit <endofex> yield no match and are skipped.
        all_expressions.extend(expr.strip() for expr in pattern.findall(text))

Setting `pad_token_id` to `eos_token_id`:50257 for open-end generation.


Run 1/1: Generating 10 samples...


In [4]:
# Look up the EOS id once, decode it back to text, then report both.
eos_id = tokenizer.eos_token_id
decoded_eos = tokenizer.decode([eos_id])

print("EOS Token ID:", eos_id)
print("Decoded EOS Token:", decoded_eos)

EOS Token ID: 50257
Decoded EOS Token: <endofex>


In [9]:
# Debug view: EOS details followed by every raw generation, special tokens included.
print("EOS Token ID:", tokenizer.eos_token_id)
print("Decoded EOS Token:", decoded_eos)

separator = "===" * 50
for sequence in outputs:
    # Decode without tokenization clean-up so spacing is shown exactly as generated.
    print(
        "Text:",
        tokenizer.decode(sequence, skip_special_tokens=False, clean_up_tokenization_spaces=False),
    )
    print(separator)

EOS Token ID: 50257
Decoded EOS Token: <endofex>
Text: x_1,x_2,x_3
*, **, +, -, asin, exp, sin, tan
C
<startofex>x_1 + sin(x_2**C) - Cicterx_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,x_10,x_11,x_12,x_13,x_14,x_15,x_16,x_17,x_18
*, **, +, -, /, abs, asin
Text: x_1,x_2,x_3
*, **, +, -, asin, exp, sin, tan
C
<startofex>C*exp(x_1 + asin(C*x_2)) + C*asin(x_1) + C*.asin(x_2) - C.Ire, cos, exp, log, sin, sqrt, tan
C
 glim(x_1 - C)*exp(tan(x_1))**C)**C/x_1**C**C**C**C**C**C**C**C**C**C
Text: x_1,x_2,x_3
*, **, +, -, asin, exp, sin, tan
C
<startofex>x_2 + exp(tan(x_2)) + tan(x_1) + Cometimesx_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,x_10,x_11,x_12,x_13,x_14,x_15,x_16,x_17
*, **, +, -, /, abs
Text: x_1,x_2,x_3
*, **, +, -, asin, exp, sin, tan
C
<startofex>x_1**C + x_1 + tan(x_1) - C - C**C - C�士x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,x_10,x_11,x_12,x_13,x_14
*, **, +, -, /, abs, asin, cos, exp, log
Text: x_1,x_2,x_3
*, **, +, -, asin, exp, sin, tan
C
<startofex>-x_1 + tan(x_2)**C - C glim(x_1) - C tissx_1,x_2,x_3,

In [12]:
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from peft import PeftModel
import torch

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_REPO)

# Load the base model and resize its token embeddings to the tokenizer's
# vocabulary size before attaching the adapter.
base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL)
base_model.resize_token_embeddings(len(tokenizer))

# Load LoRA adapter
model = PeftModel.from_pretrained(base_model, LORA_REPO)
model.eval()

# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Input prompt: reuse the configured PROMPT (identical text to the former
# inline literal) so the sampling and greedy cells cannot drift apart.
input_prompt = PROMPT

# Tokenize the input; keep the attention mask so generate() does not have to guess it.
encoded = tokenizer(input_prompt, return_tensors="pt").to(device)
input_ids = encoded.input_ids

# Id of the <endofex> token, used as the stopping criterion.
eos_token_id = tokenizer.convert_tokens_to_ids("<endofex>")

# generate() needs a concrete pad id; fall back to EOS if the tokenizer has none.
pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else eos_token_id

# Greedy (deterministic) generation.
# `temperature` / `top_p` were removed: they are sampling-only knobs that
# conflict with do_sample=False, and temperature=0.0 is not a valid value.
outputs = model.generate(
    input_ids=input_ids,
    attention_mask=encoded.attention_mask,
    max_new_tokens=100,
    do_sample=False,
    eos_token_id=eos_token_id,
    pad_token_id=pad_token_id,
)

# Decode and print
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

# Cut at the first <endofex> and keep what follows the last <startofex> before it.
expression = generated_text.split("<endofex>")[0].split("<startofex>")[-1].strip()
print("📌 Expressão gerada:", expression)




📌 Expressão gerada: x_1*(x_2 + C)*(x_2 + C)**C + C*x_2 + C*x_3 + C - C*x_2 + C - C*x_3 + C - C*x_2 + C*x_3 + C - C*x_2 + C*x_3 + C - C*x_2 + C*x_3 + C - C*x_4 + C


In [None]:

# Persist the raw expression list as indented JSON, then report how many were written.
with open(OUTPUT_EXPR_FILE, 'w') as expr_file:
    json.dump(all_expressions, expr_file, indent=2)

saved_count = len(all_expressions)
print(f"Saved {saved_count} expressions to {OUTPUT_EXPR_FILE}")

# Analysis
# Validity section: running count of parsed expressions plus per-error tallies.
syntactic_section = {
    'valid_equations': 0,
    'parse_errors': defaultdict(int),
}

# Distribution section: frequency counters and the averages derived from them later.
distribution_section = {
    'variable_freq': Counter(),
    'operator_freq': Counter(),
    'avg_operators_per_eq': 0.0,
    'avg_variables_per_eq': 0.0,
}

# Result skeleton; 'diversity_redundancy' is filled in after the main loop.
analysis = {
    'total_expressions': len(all_expressions),
    'syntactic_semantic': syntactic_section,
    'diversity_redundancy': {},
    'statistical_distributions': distribution_section,
}

# Helper to compute tree depth
def tree_depth(expr):
    """Return the number of levels in the expression tree rooted at `expr`.

    A node whose `args` is empty counts as depth 1; otherwise the depth is one
    more than that of its deepest argument. The tree is walked with an explicit
    stack instead of recursion.
    """
    deepest = 0
    pending = [(expr, 1)]
    while pending:
        node, level = pending.pop()
        if level > deepest:
            deepest = level
        children = node.args
        if children:
            pending.extend((child, level + 1) for child in children)
    return deepest

# Operators list. '**' is listed explicitly so a power is counted as one
# operator rather than as two '*'.
operators = ['+', '-', '*', '**', '/', '^', 'log', 'exp', 'cos', 'sqrt', 'asin', 'sin', 'pow', 'tan', 'abs']

# Lexer used for operator counting: '**' is tried before '*', and identifiers
# are matched whole, so 'asin' is never also counted as 'sin' and names such
# as 'x_1' never contribute to any operator.
_operator_token_re = re.compile(r"\*\*|[-+*/^]|[A-Za-z_]\w*")


def count_operator_tokens(expr_text, known_operators):
    """Count whole-token occurrences of each operator in an expression string.

    Args:
        expr_text: Raw expression text.
        known_operators: Iterable of operator symbols / function names to report.

    Returns:
        Dict mapping every entry of `known_operators` to its occurrence count
        (0 when absent).
    """
    token_counts = Counter(_operator_token_re.findall(expr_text))
    return {op: token_counts[op] for op in known_operators}


depths = []
operator_counts = []
variable_counts = []
unique_set = set()

for expr in all_expressions:
    # Parse with sympy.
    # SECURITY NOTE: sympify() evaluates the string with eval(); the input here
    # is model-generated text, so only run this in a trusted/sandboxed session.
    try:
        sympy_expr = sp.sympify(expr, evaluate=False)
        depth = tree_depth(sympy_expr)
    except Exception as e:
        err_msg = str(e)
        if 'could not parse' in err_msg:
            analysis['syntactic_semantic']['parse_errors']['parse_failure'] += 1
        else:
            analysis['syntactic_semantic']['parse_errors'][err_msg] += 1
        continue

    # Count the expression as valid only once both parsing and depth succeeded.
    analysis['syntactic_semantic']['valid_equations'] += 1
    depths.append(depth)

    # Variables
    # NOTE(review): free_symbols includes every symbol sympy sees, so the
    # placeholder 'C' is tallied as a variable too -- confirm this is intended.
    vars_in_expr = [str(v) for v in sympy_expr.free_symbols]
    for v in vars_in_expr:
        analysis['statistical_distributions']['variable_freq'][v] += 1
    variable_counts.append(len(vars_in_expr))

    # Operators (token-based; zero counts are kept so every operator gets a key)
    per_operator = count_operator_tokens(expr, operators)
    analysis['statistical_distributions']['operator_freq'].update(per_operator)
    operator_counts.append(sum(per_operator.values()))

    # Diversity
    unique_set.add(expr)

def _mean_or_zero(values):
    """Arithmetic mean of `values`, or 0 when the sequence is empty."""
    return sum(values) / len(values) if values else 0


# Populate diversity metrics
total = analysis['total_expressions']
unique_count = len(unique_set)

# Expressions seen more than once, mapped to their occurrence count.
repeated_expressions = {
    text: occurrences
    for text, occurrences in Counter(all_expressions).items()
    if occurrences > 1
}

# Depth statistics over the successfully parsed expressions.
depth_summary = {
    'avg_tree_depth': _mean_or_zero(depths),
    'min_tree_depth': min(depths) if depths else 0,
    'max_tree_depth': max(depths) if depths else 0,
}

analysis['diversity_redundancy'] = {
    'unique_expressions': unique_count,
    'unique_proportion': unique_count / total if total else 0,
    'duplicate_counts': repeated_expressions,
    'structural_diversity': depth_summary,
}

# Statistical distributions averages
stats = analysis['statistical_distributions']
stats['avg_operators_per_eq'] = _mean_or_zero(operator_counts)
stats['avg_variables_per_eq'] = _mean_or_zero(variable_counts)

# Counter / defaultdict -> plain dict so the JSON encoder sees ordinary mappings.
for freq_key in ('variable_freq', 'operator_freq'):
    stats[freq_key] = dict(stats[freq_key])
validity = analysis['syntactic_semantic']
validity['parse_errors'] = dict(validity['parse_errors'])

# Save analysis results
with open(OUTPUT_ANALYSIS_FILE, 'w') as analysis_file:
    json.dump(analysis, analysis_file, indent=2)
print(f"Saved analysis results to {OUTPUT_ANALYSIS_FILE}")
