In [1]:
import os
import json
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F

# Directory and file name of the first training sample
folder_path, file_name = "small_dataset/train", "f_0.json"

# Full path to that sample
file_path = os.path.join(folder_path, file_name)

Peek at the data.

In [2]:
# Load the first sample. On failure, report the problem and re-raise so the
# cell stops here instead of failing below with a NameError on `data`.
try:
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
except FileNotFoundError:
    print(f"{file_name} doesn't exist at {folder_path}.")
    raise
except json.JSONDecodeError:
    print(f"Couldn't decode {file_name}.")
    raise

# List the top-level fields of the sample (only the keys are needed here)
for key in data:
    print(key)

formula
formula_depth
points
n_vars
n_consts
n_points
var_bound_dict
const_value_dict
meta_list


In [3]:
# Raw points of the sample: a dict mapping each variable name to its list of values
print(data["points"])

{'var_0': [-2.79008, -1.43651, -1.39172, 0.24824, 3.63193, -1.02878, -3.51268, 0.03997, -4.64088, -1.57417, 0.13335, 0.17542, 3.20014, 3.0211, -2.15762, -1.9855, 4.81394, -1.10188, -3.58234, 1.55946, -2.54935, 1.04478, 3.21744, -3.25277, -0.4874, -1.18187, 4.95992, 2.10747, -4.20734, -2.70941, -3.44772, -4.85176, -1.84627, -1.79998, -2.6889, 0.54344, -2.81412, 4.43828, 2.33473, -3.79659, -2.3523, 4.84024, -2.96959, -3.87053, -2.67405, 2.13423, 0.13502, -4.34976, -1.37471, -4.10443, -2.09117, -3.2892, -2.79626, 2.79426, 2.82545, 2.32883, 1.91456, -2.57005, -0.5899, -0.40422, 0.34098, 3.55448, 3.88414, -2.8427, -0.69123, -3.50898, 1.47471, 0.70889, 3.49245, 0.20351, 2.78136, -1.81472, 1.95154, -4.64605, -3.50091, -0.5112, 3.90975, 3.83586, 0.96975, -3.71553, 1.82143, -3.2294, -1.49144, -1.84215, 3.87718, 1.89716, -4.27646, 3.45696, -0.68395, 2.54025, 2.79105, 4.97805, -1.52602, 3.62118, -2.13553, 4.98068, -0.87596, 1.61298, 4.99432, -2.97843], 'var_1': [1.23101, 4.99008, 4.44332, -2.8056

Read the files f_0.json, f_1.json, etc. Create a PyTorch tensor with batch size num_files containing each equation's set of points. Each set of input points is represented by a 4 x 100 matrix with rows [x_0, x_1, x_2, y], where x_0 is an array of 100 values and similarly for the rest, so the full tensor has shape [num_files, 4, 100].

In [4]:
# Number of json files to read
num_files = 16

# Directory holding f_0.json, f_1.json, ... (the same for every file, so set once)
folder_path = "small_dataset/train"

# Row order of each sample's matrix: the three input variables, then the target
point_keys = ["var_0", "var_1", "var_2", "target"]

# One [channels, sequence length] tensor per file, stacked once after the loop
point_matrices = []

for i in range(num_files):
    file_name = f"f_{i}.json"

    # File path
    file_path = os.path.join(folder_path, file_name)

    # Report the problem and re-raise: continuing would silently reuse the
    # previous file's `data` (or hit a NameError on the very first file).
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except FileNotFoundError:
        print(f"{file_name} doesn't exist at {folder_path}.")
        raise
    except json.JSONDecodeError:
        print(f"Couldn't decode {file_name}.")
        raise

    point_data = [data['points'][key] for key in point_keys]
    point_matrices.append(torch.tensor(point_data, dtype=torch.float32))

# A single stack adds the batch dimension without re-copying the batch on every iteration
train_points = torch.stack(point_matrices, dim=0)

print(train_points.shape) # [batch size, channels, sequence length]

torch.Size([16, 4, 100])


The embedding model defined below is the implementation from SymbolicGPT.

In [5]:
numVars = 3 # number of x variables
numYs = 1 # number of y variables
embeddingSize = 512
num_units = embeddingSize

# Define the embedding model
class EmbeddingModel(nn.Module):
    """Embed a set of points into one fixed-size vector per sample.

    Three kernel-size-1 convolutions are applied along the last (point) axis,
    the result is max-pooled over that axis, and two fully connected layers
    reduce it to ``embedding_size`` features. The input is batch-normalised
    first, and every layer is followed by batch normalisation and ReLU.

    Args:
        num_vars: number of x channels in the input (defaults to ``numVars``).
        num_ys: number of y channels in the input (defaults to ``numYs``).
        embedding_size: width of the returned embedding (defaults to
            ``embeddingSize``); the hidden widths are 2x and 4x this value.
        print_shapes: when True (the default, which keeps the original
            behaviour) ``forward`` prints the tensor shape after each stage.
    """

    def __init__(self, num_vars=numVars, num_ys=numYs,
                 embedding_size=embeddingSize, print_shapes=True):
        super().__init__()

        self.activation_func = F.relu
        # Single source of truth for every layer width below
        self.num_units = embedding_size
        self.print_shapes = print_shapes

        in_channels = num_vars + num_ys
        width = self.num_units

        self.conv1 = nn.Conv1d(in_channels, width, 1)
        self.conv2 = nn.Conv1d(width, 2 * width, 1)
        self.conv3 = nn.Conv1d(2 * width, 4 * width, 1)
        self.fc1 = nn.Linear(4 * width, 2 * width)
        self.fc2 = nn.Linear(2 * width, width)

        self.input_batch_norm = nn.BatchNorm1d(in_channels)

        self.bn1 = nn.BatchNorm1d(width)
        self.bn2 = nn.BatchNorm1d(2 * width)
        self.bn3 = nn.BatchNorm1d(4 * width)
        self.bn4 = nn.BatchNorm1d(2 * width)
        self.bn5 = nn.BatchNorm1d(width)

    def _show_shape(self, x):
        """Print ``x.shape`` if shape printing is enabled."""
        if self.print_shapes:
            print(x.shape)

    def forward(self, x):
        """Compute the embeddings for a batch of point sets.

        Args:
            x: tensor of shape [batch size, num_vars + num_ys, sequence length].

        Returns:
            Tensor of shape [batch size, embedding_size].
        """
        x = self.input_batch_norm(x)
        self._show_shape(x)
        x = self.activation_func(self.bn1(self.conv1(x)))
        self._show_shape(x)
        x = self.activation_func(self.bn2(self.conv2(x)))
        self._show_shape(x)
        x = self.activation_func(self.bn3(self.conv3(x)))
        self._show_shape(x)
        # Max over the sequence axis collapses the points into one feature vector
        x, _ = torch.max(x, dim=2)
        self._show_shape(x)

        x = self.activation_func(self.bn4(self.fc1(x)))
        self._show_shape(x)
        x = self.activation_func(self.bn5(self.fc2(x)))
        self._show_shape(x)

        return x

In [6]:
# Seed PyTorch's RNG so the randomly initialised weights (and therefore the
# embeddings) are the same on every Restart & Run All
torch.manual_seed(0)

# Instantiate the model
model = EmbeddingModel()

# Get embeddings
embeddings = model(train_points)

# One embedding row per input sample
assert embeddings.shape[0] == train_points.shape[0]

torch.Size([16, 4, 100])
torch.Size([16, 512, 100])
torch.Size([16, 1024, 100])
torch.Size([16, 2048, 100])
torch.Size([16, 2048])
torch.Size([16, 1024])
torch.Size([16, 512])


For each equation, the output is a vector of size 1 x e, where e is the embedding size. In this example, we use e = 512, so the batch of 16 equations produces a 16 x 512 tensor.