<a href="https://colab.research.google.com/github/carolynw898/STAT946Proj/blob/main/test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from utils import lossFunc, relativeErr
from models import SymbolicDiffusion, PointNetConfig, SymbolicFlowMatching
import torch
from torch.utils.data import DataLoader

In [2]:
# --- Model / optimisation settings -------------------------------------
n_embd = 128          # embedding size handed to PointNetConfig and the model
timesteps = 1000
learning_rate = 1e-4
num_epochs = 10
batch_size = 1        # batch size of the test DataLoader

# --- Sequence lengths ----------------------------------------------------
blockSize = 32        # block size of the training CharDataset / model max_seq_len
testBlockSize = 400   # block size of the test CharDataset

# --- Dataset description (forwarded to CharDataset) ----------------------
numVars = 1
numYs = 1
numPoints = 250
target = 'Skeleton'
addVars = False
decimals = 8
const_range = [-2.1, 2.1]
trainRange = [-3.0, 3.0]  # passed to CharDataset as xRange

# --- Hardware ------------------------------------------------------------
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
import numpy as np
import glob
from utils import processDataFiles, CharDataset
import random

# Read the raw training text from the file(s) matched by the glob pattern.
files = glob.glob("1Var_RandSupport_FixedLength_-3to3_-5.0to-3.0-3.0to5.0_30Points/Train/0_1_0_14062021_193012.json")
text = processDataFiles(files)

# Vocabulary: every character seen in the raw text plus the extra symbols
# ('T' is for the test data). The appended symbols are not de-duplicated
# against the text, so the list may contain repeats; it is kept exactly as
# is because it determines the dataset's vocabulary.
extra_symbols = ['_', 'T', '<', '>', ':']
chars = sorted(list(set(text)) + extra_symbols)

# One example per line; drop the empty entry left by a trailing newline.
text = text.split('\n')
if len(text[-1]) == 0:
    trainText = text[:-1]
else:
    trainText = text

# Shuffle the examples (matters mostly when several variable counts are mixed).
random.shuffle(trainText)

train_dataset = CharDataset(
    trainText,
    blockSize,
    chars,
    numVars=numVars,
    numYs=numYs,
    numPoints=numPoints,
    target=target,
    addVars=addVars,
    const_range=const_range,
    xRange=trainRange,
    decimals=decimals,
)

data has 498795 examples, 49 unique.


In [4]:
# Pick one training example at random and show its decoded target string.
idx = np.random.randint(len(train_dataset))
inputs, outputs, points, variables = train_dataset[idx]

# Map token ids back to characters through the dataset's id -> char table.
inputs = ''.join(train_dataset.itos[int(token)] for token in inputs)
outputs = ''.join(train_dataset.itos[int(token)] for token in outputs)

summary_template = 'id:{}\noutputs:{}\nvariables:{}'
print(summary_template.format(idx, outputs, variables))

id:411307
outputs:C*cos(C*sin(C*cos(C*x1)))+C>____
variables:1


In [5]:
# Load the validation file and split it into one example per line.
files = glob.glob("1Var_RandSupport_FixedLength_-3to3_-5.0to-3.0-3.0to5.0_30Points/Val/0_1_0_13062021_173950.json")
textTest = processDataFiles(files).split('\n')

# Same vocabulary and settings as the training set, but with the longer
# test block size.
test_dataset = CharDataset(
    textTest,
    testBlockSize,
    chars,
    numVars=numVars,
    numYs=numYs,
    numPoints=numPoints,
    addVars=addVars,
    target=target,
    const_range=const_range,
    xRange=trainRange,
    decimals=decimals,
)

# Sanity check: print the value range of the points and the decoded target
# of one randomly chosen test example.
idx = np.random.randint(len(test_dataset))
inputs, outputs, points, variables = test_dataset[idx]
print(points.min(), points.max())
inputs = ''.join(train_dataset.itos[int(token)] for token in inputs)
outputs = ''.join(train_dataset.itos[int(token)] for token in outputs)
print('id:{}\noutputs:{}\nvariables:{}'.format(idx, outputs, variables))

# Sequential (unshuffled) loader over the test set.
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    pin_memory=True,
    num_workers=2,
)

data has 972 examples, 49 unique.
tensor(-2.9879) tensor(2.6331)
id:15
outputs:C*log(C*exp(C*x1))+C>___________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________
variables:1


In [6]:
import torch
import json
from scipy.optimize import minimize
from math import log

def _decode_equation(token_ids, itos, padding_token, skip_empty_head=False):
    """Decode token ids into the equation text that precedes the first '>'.

    Args:
        token_ids: iterable of integer token ids.
        itos: mapping from token id to character.
        padding_token: padding character stripped from both ends of the text.
        skip_empty_head: when True and the text before the first '>' is empty,
            use the segment after it instead (if there is one).

    Returns:
        The decoded equation string with surrounding '<' / '>' removed.
    """
    text = ''.join(itos[int(tok)] for tok in token_ids)
    segments = text.strip(padding_token).split('>')
    equation = segments[0]
    # Only fall back when a second segment really exists; indexing it
    # unconditionally raises IndexError for an all-padding sequence.
    if skip_empty_head and not equation and len(segments) > 1:
        equation = segments[1]
    return equation.strip('<').strip('>')


def _evaluate_equation(equation, xs):
    """Evaluate an equation string at one point.

    Each variable name ``x1``, ``x2``, ... is textually replaced by the
    matching entry of ``xs`` and the resulting expression is evaluated.

    Raises:
        ValueError: if the substituted expression contains a comma (it would
            evaluate to a tuple instead of a number).
        Exception: anything raised while evaluating the expression.
    """
    expr = equation.replace(' ', '').replace('\n', '')
    for var_pos, value in enumerate(xs):
        # NOTE(review): the value is substituted without parentheses, so a
        # negative value next to '**' binds as -(v**k); kept as in the
        # original so it stays consistent with however lossFunc substitutes.
        expr = expr.replace('x{}'.format(var_pos + 1), str(value))
    if ',' in expr:
        raise ValueError('There is a "," in the equation!')
    # SECURITY: eval executes arbitrary Python. The expression comes from the
    # dataset / model output; only run this on trusted data.
    return eval(expr)


@torch.no_grad()
def test_model(model, test_loader, textTest, train_dataset, device):
    """Sample equations from the model and score them on the test points.

    For every sample in ``test_loader`` the function decodes the target and
    the generated token sequences, fits the constants ``C`` of the predicted
    equation with ``scipy.optimize.minimize`` on the record's ``X``/``Y``
    data, evaluates both equations on the record's ``XT`` points and computes
    ``relativeErr`` between the two sets of values.

    Args:
        model: object exposing ``sample(points, variables, device)`` and
            ``tok_emb(tokens)``.
        test_loader: iterable yielding ``(_, tokens, points, variables)``
            batches; it must iterate in the same order as ``textTest``
            (i.e. no shuffling), because records are matched by position.
        textTest: list of JSON strings, one per test sample, with the keys
            ``X``, ``Y`` and ``XT``.
        train_dataset: dataset providing ``itos`` and ``paddingToken``.
        device: device the batch tensors are moved to.

    Returns:
        dict with the lists ``'target'``, ``'predicted'`` and ``'error'``,
        one entry per evaluated sample.

    Raises:
        ValueError / RuntimeError: if fitting the constants of a predicted
            equation fails.
    """
    results = {'target': [], 'predicted': [], 'error': []}
    itos = train_dataset.itos
    padding_token = train_dataset.paddingToken
    default_prediction = 100  # value used when the predicted equation cannot be evaluated
    sample_idx = 0            # running position in textTest across all batches

    for _, tokens, points, variables in test_loader:
        points = points.to(device)        # [B, 2, 250] per the original author's note
        tokens = tokens.to(device)        # [B, L]
        variables = variables.to(device)  # [B]

        generated_tokens = model.sample(points, variables, device)

        # Debug output: quantiles of the embeddings of the target tokens.
        true_embeds = model.tok_emb(tokens)
        levels = torch.tensor([0.1, 0.25, 0.5, 0.75, 0.9], device=true_embeds.device)
        print("True embeddings: ", torch.quantile(true_embeds, levels))

        # Use the real batch size so a short final batch is handled too.
        for row in range(tokens.shape[0]):
            raw_record = textTest[sample_idx]
            sample_idx += 1  # from here on this is the 1-based sample number
            if not raw_record.strip():
                # The test text is split on '\n' without dropping a trailing
                # empty line, so a blank record can occur; there are no
                # points to evaluate it on.
                print(f"Skipping sample {sample_idx}: empty record in textTest")
                continue
            # Load the record *before* fitting: the fit needs its X / Y data.
            record = json.loads(raw_record)

            # Ground truth and prediction, decoded from their token ids.
            eq = _decode_equation(tokens[row].tolist(), itos, padding_token)
            predicted = _decode_equation(
                generated_tokens[row].cpu().tolist(), itos, padding_token,
                skip_empty_head=True,
            )
            predicted = predicted.replace('Ce', 'C*e')

            # Fit the constants of the predicted skeleton (slow: this is the
            # bottleneck of the evaluation). Every constant starts at 1.
            n_constants = predicted.count('C')
            if n_constants:
                try:
                    fit = minimize(lossFunc, [1.0] * n_constants,
                                   args=(predicted, record['X'], record['Y']))
                    predicted = predicted.replace('C', '{}').format(*fit.x)
                except ValueError as exc:
                    raise ValueError('Err: Wrong Equation {}'.format(predicted)) from exc
                except Exception as exc:
                    raise RuntimeError(
                        'Err: Wrong Equation {}, Err: {}'.format(predicted, exc)
                    ) from exc

            # NOTE(review): eq is decoded from the target tokens; if it still
            # contains the placeholder 'C' (as the printed targets do), eval
            # fails unless a global C exists, and every point is skipped.
            # Function names such as sin/cos/exp must also be resolvable in
            # the notebook's globals. Confirm whether the full equation from
            # the record should be evaluated instead.
            Ys, Yhats = [], []
            for xs in record['XT']:
                try:
                    y_true = _evaluate_equation(eq, xs)
                except Exception:
                    # A point where the target itself cannot be evaluated is
                    # ignored for both lists.
                    continue
                try:
                    y_pred = _evaluate_equation(predicted, xs)
                except Exception:
                    # Keep Ys and Yhats aligned: penalise the prediction with
                    # a default value instead of dropping only its entry.
                    y_pred = default_prediction
                Ys.append(y_true)
                Yhats.append(y_pred)
            err = relativeErr(Ys, Yhats, info=True)

            results['target'].append(eq)
            results['predicted'].append(predicted)
            results['error'].append(err)

            print(f"\nSample {sample_idx}:")
            print(f"Target: {eq}")
            print(f"Predicted: {predicted}")
            print(f"Relative Error: {err:.6f}")
            print("-" * 50)

    return results

In [None]:
# Build the point-cloud encoder config from the notebook-level settings so it
# cannot drift from the values the datasets were built with.
pconfig = PointNetConfig(
    embeddingSize=n_embd,
    numberofPoints=numPoints,
    numberofVars=numVars,
    numberofYs=numYs,
)
model = SymbolicFlowMatching(
    pconfig=pconfig,
    vocab_size=train_dataset.vocab_size,
    max_seq_len=blockSize,
    padding_idx=train_dataset.paddingID,
    max_num_vars=9,
    n_layer=6,
    n_head=4,
    n_embd=n_embd,
).to(device)

# Show the id -> character table used to decode the model output.
print(train_dataset.itos)

# Restore the trained weights. weights_only=True restricts unpickling to
# tensors / plain containers, which is all a state dict needs, and avoids
# executing arbitrary pickled code from the checkpoint file.
model_path = "flow_matching.pth"
model.load_state_dict(torch.load(model_path, map_location=device, weights_only=True))
model.eval()

print("Testing SymbolicFlowMatching model...")
test_results = test_model(model, test_loader, textTest, train_dataset, device)

# Recap of every evaluated sample.
print("\nSummary:")
summary_rows = zip(test_results['target'], test_results['predicted'], test_results['error'])
for sample_no, (target_eq, predicted_eq, rel_err) in enumerate(summary_rows, start=1):
    print(f"Sample {sample_no}:")
    print(f"  Target: {target_eq}")
    print(f"  Predicted: {predicted_eq}")
    print(f"  Error: {rel_err:.6f}")

{0: '\n', 1: ' ', 2: '"', 3: '(', 4: ')', 5: '*', 6: '+', 7: ',', 8: '-', 9: '.', 10: '/', 11: '0', 12: '1', 13: '2', 14: '3', 15: '4', 16: '5', 17: '6', 18: '7', 19: '8', 20: '9', 21: ':', 22: ':', 23: '<', 24: '>', 25: 'C', 26: 'E', 27: 'Q', 28: 'S', 29: 'T', 30: 'X', 31: 'Y', 32: '[', 33: ']', 34: '_', 35: 'c', 36: 'e', 37: 'g', 38: 'i', 39: 'k', 40: 'l', 41: 'n', 42: 'o', 43: 'p', 44: 's', 45: 't', 46: 'x', 47: '{', 48: '}'}
Testing SymbolicDiffusion model...


  model.load_state_dict(torch.load(model_path, map_location=device))


True embeddings:  tensor([0., 0., 0., 0., 0.], device='cuda:0')

Sample 1:
Target: C*x1+C
Predicted: 0888030g080860800_036_0686308068
Relative Error: 100.000000
--------------------------------------------------
True embeddings:  tensor([0., 0., 0., 0., 0.], device='cuda:0')

Sample 2:
Target: C*log(C*sin(C*x1+C))+C
Predicted: 000_083_80_0_00__8066086_6_36080
Relative Error: 100.000000
--------------------------------------------------
True embeddings:  tensor([0., 0., 0., 0., 0.], device='cuda:0')

Sample 3:
Target: C*x1**4+C*x1**3+C*x1**2+C
Predicted: 880305_/03_8_0__60__88_8_0_88_08
Relative Error: 100.000000
--------------------------------------------------
True embeddings:  tensor([0., 0., 0., 0., 0.], device='cuda:0')

Sample 4:
Target: C*exp(C*x1)+C
Predicted: 6_g0_6633833
Relative Error: 100.000000
--------------------------------------------------
True embeddings:  tensor([0., 0., 0., 0., 0.], device='cuda:0')

Sample 5:
Target: C*x1+C*exp(C*x1)/C*log(C*x1+C)+C
Predicted: 068