# Point Embedding

In [None]:
import os
import json
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F

# Folder names
folder_path = "small_dataset/train"  
file_name = "f_0.json"        

# Construct the full file path
file_path = os.path.join(folder_path, file_name)

Peek at the data.

In [2]:
try:
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
except FileNotFoundError:
    print(f"{file_name} doesn't exist at {folder_path}.")
except json.JSONDecodeError:
    print(f"Couldn't decode {file_name}.")

for key, value in data.items():
    print(key)

formula
formula_depth
points
n_vars
n_consts
n_points
var_bound_dict
const_value_dict
meta_list


In [3]:
print(data['points'])

{'var_0': [-2.79008, -1.43651, -1.39172, 0.24824, 3.63193, -1.02878, -3.51268, 0.03997, -4.64088, -1.57417, 0.13335, 0.17542, 3.20014, 3.0211, -2.15762, -1.9855, 4.81394, -1.10188, -3.58234, 1.55946, -2.54935, 1.04478, 3.21744, -3.25277, -0.4874, -1.18187, 4.95992, 2.10747, -4.20734, -2.70941, -3.44772, -4.85176, -1.84627, -1.79998, -2.6889, 0.54344, -2.81412, 4.43828, 2.33473, -3.79659, -2.3523, 4.84024, -2.96959, -3.87053, -2.67405, 2.13423, 0.13502, -4.34976, -1.37471, -4.10443, -2.09117, -3.2892, -2.79626, 2.79426, 2.82545, 2.32883, 1.91456, -2.57005, -0.5899, -0.40422, 0.34098, 3.55448, 3.88414, -2.8427, -0.69123, -3.50898, 1.47471, 0.70889, 3.49245, 0.20351, 2.78136, -1.81472, 1.95154, -4.64605, -3.50091, -0.5112, 3.90975, 3.83586, 0.96975, -3.71553, 1.82143, -3.2294, -1.49144, -1.84215, 3.87718, 1.89716, -4.27646, 3.45696, -0.68395, 2.54025, 2.79105, 4.97805, -1.52602, 3.62118, -2.13553, 4.98068, -0.87596, 1.61298, 4.99432, -2.97843], 'var_1': [1.23101, 4.99008, 4.44332, -2.8056

Create a pytorch tensor with batch size num_files containing each equation's set of points. Each set of input points is represented by a 100 x 4 matrix [x_0 x_1 x_2 y], where x_0 is an array of 100 values and similarly for the rest.

In [4]:
# Number of json files to read
num_files = 16

for i in range(num_files):
    folder_path = "small_dataset/train"
    file_name = f"f_{i}.json"

    # File path
    file_path = os.path.join(folder_path, file_name)

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except FileNotFoundError:
        print(f"{file_name} doesn't exist at {folder_path}.")
    except json.JSONDecodeError:
        print(f"Couldn't decode {file_name}.")
        
    point_data = [data['points']['var_0'], data['points']['var_1'], data['points']['var_2'], data['points']['target']]
    
    input_data = {
        "pointwise_data": point_data
    }
    
    if i == 0:
        train_points = torch.tensor(input_data['pointwise_data'], dtype=torch.float32).unsqueeze(0)

    else:
        points = torch.tensor(input_data['pointwise_data'], dtype=torch.float32).unsqueeze(0)
        train_points = torch.cat((train_points, points), dim=0)

print(train_points.shape) # [batch size, channels, sequence length]    

torch.Size([16, 4, 100])


This is the implementation from SymbolicGPT.

In [5]:
numVars = 3 # number of x variables
numYs = 1 # number of y variables
embeddingSize = 512
embeddingSize = 384
num_units = embeddingSize

# Define the embedding model
class EmbeddingModel(nn.Module):
    def __init__(self):
        super(EmbeddingModel, self).__init__()
        
        self.activation_func = F.relu
        self.num_units = embeddingSize

        self.conv1 = nn.Conv1d(numVars + numYs, num_units, 1)
        self.conv2 = nn.Conv1d(num_units, 2 * num_units, 1)
        self.conv3 = nn.Conv1d(2 * num_units, 4 * num_units, 1)
        self.fc1 = nn.Linear(4 * num_units, 2 * num_units)
        self.fc2 = nn.Linear(2 * num_units, num_units)

        self.input_batch_norm = nn.BatchNorm1d(numVars + numYs)

        self.bn1 = nn.BatchNorm1d(self.num_units)
        self.bn2 = nn.BatchNorm1d(2 * self.num_units)
        self.bn3 = nn.BatchNorm1d(4 * self.num_units)
        self.bn4 = nn.BatchNorm1d(2 * self.num_units)
        self.bn5 = nn.BatchNorm1d(self.num_units)

    
    def forward(self, x):
        x = self.input_batch_norm(x)
        print(x.shape)
        x = self.activation_func(self.bn1(self.conv1(x)))
        print(x.shape)
        x = self.activation_func(self.bn2(self.conv2(x)))
        print(x.shape)
        x = self.activation_func(self.bn3(self.conv3(x)))
        print(x.shape)
        x, _ = torch.max(x, dim=2)  
        print(x.shape)

        x = self.activation_func(self.bn4(self.fc1(x)))
        print(x.shape)
        x = self.activation_func(self.bn5(self.fc2(x)))
        print(x.shape)
        
        return x

In [6]:
# Instantiate the model
model = EmbeddingModel()

# Get embeddings
embeddings = model(train_points)

torch.Size([16, 4, 100])
torch.Size([16, 384, 100])
torch.Size([16, 768, 100])
torch.Size([16, 1536, 100])
torch.Size([16, 1536])
torch.Size([16, 768])
torch.Size([16, 384])


The output is a vector of size 1 x e, where e is the embedding size. In this example, we use e = 512. 

# Token Embedding

Next is the equation tokenization.

In [7]:
# Number of json files to read
num_files = 20

equations = ""

formulas = []

for i in range(num_files):
    folder_path = "small_dataset/train"
    file_name = f"f_{i}.json"

    # Fill path
    file_path = os.path.join(folder_path, file_name)

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except FileNotFoundError:
        print(f"{file_name} doesn't exist at {folder_path}.")
    except json.JSONDecodeError:
        print(f"Couldn't decode {file_name}.")
        
    eq = data['formula']
    
    equations = ''.join([eq,equations])
    
    formula_data = eq
    
    formulas.append(formula_data)

print(equations) 

add(exp(reverse(var_0, N), N(N, N)), gaussian(add(var_2, var_1), N(N, N)))mult(gaussian(add(var_1, var_0), N(N, N)), exp(mult(var_2, C_0), N(N, N)))add(sqrt(mult(var_1, var_2), N(N, N)), pow_2(sqrt(var_0, N), N(N, N)))add(mult(sinh(var_1, N), var_2(N, N)), reverse(tan(var_0, N), N(N, N)))log(add(mult(var_1, C_0), mult(var_2, var_0)), N(N(N, N), N(N, N)))mult(cosh(reverse(var_1, N), N(N, N)), reverse(mult(var_0, var_2), N(N, N)))add(sqrt(gaussian(var_0, N), N(N, N)), exp(add(var_1, var_2), N(N, N)))add(sqrt(add(var_2, C_0), N(N, N)), reverse(add(var_1, var_0), N(N, N)))mult(add(sin(var_1, N), reverse(var_2, N)), add(log(var_0, N), log(var_1, N)))mult(add(sinh(var_1, N), add(var_2, var_2)), sqrt(add(var_0, C_0), N(N, N)))sqrt(add(var_2(N, N), add(var_1, var_0)), N(N(N, N), N(N, N)))mult(add(pow_2(var_0, N), var_2(N, N)), exp(sin(var_1, N), N(N, N)))add(add(exp(var_1, N), mult(var_2, C_0)), tan(gaussian(var_0, N), N(N, N)))gaussian(mult(cos(var_0, N), mult(var_2, var_1)), N(N(N, N), N(N, 

In [8]:
print(formulas[0])

mult(log(add(var_2, var_0), N(N, N)), reverse(gaussian(var_1, N), N(N, N)))


Equations is just a big long string containing every equation.

In [9]:
chars = sorted(list(set(equations))+['_','T','<','>',':'])
print(chars)
print(len(chars))

[' ', '(', ')', ',', '0', '1', '2', ':', '<', '>', 'C', 'N', 'T', '_', '_', 'a', 'c', 'd', 'e', 'g', 'h', 'i', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x']
35


These dictionaries define the mapping from string to token and vice versa.

In [10]:
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }

print(stoi)

{' ': 0, '(': 1, ')': 2, ',': 3, '0': 4, '1': 5, '2': 6, ':': 7, '<': 8, '>': 9, 'C': 10, 'N': 11, 'T': 12, '_': 14, 'a': 15, 'c': 16, 'd': 17, 'e': 18, 'g': 19, 'h': 20, 'i': 21, 'l': 22, 'm': 23, 'n': 24, 'o': 25, 'p': 26, 'q': 27, 'r': 28, 's': 29, 't': 30, 'u': 31, 'v': 32, 'w': 33, 'x': 34}


In [11]:
print(itos)

{0: ' ', 1: '(', 2: ')', 3: ',', 4: '0', 5: '1', 6: '2', 7: ':', 8: '<', 9: '>', 10: 'C', 11: 'N', 12: 'T', 13: '_', 14: '_', 15: 'a', 16: 'c', 17: 'd', 18: 'e', 19: 'g', 20: 'h', 21: 'i', 22: 'l', 23: 'm', 24: 'n', 25: 'o', 26: 'p', 27: 'q', 28: 'r', 29: 's', 30: 't', 31: 'u', 32: 'v', 33: 'w', 34: 'x'}


In [12]:
dix = [stoi[s] for s in '<'+eq+'>']

In [13]:
print(dix)

[8, 15, 17, 17, 1, 18, 34, 26, 1, 28, 18, 32, 18, 28, 29, 18, 1, 32, 15, 28, 14, 4, 3, 0, 11, 2, 3, 0, 11, 1, 11, 3, 0, 11, 2, 2, 3, 0, 19, 15, 31, 29, 29, 21, 15, 24, 1, 15, 17, 17, 1, 32, 15, 28, 14, 6, 3, 0, 32, 15, 28, 14, 5, 2, 3, 0, 11, 1, 11, 3, 0, 11, 2, 2, 2, 9]


It seems like dix is the tokenized representation of an equation.

In [14]:
dummy = [s for s in '<'+eq+'>']
print(dummy)

['<', 'a', 'd', 'd', '(', 'e', 'x', 'p', '(', 'r', 'e', 'v', 'e', 'r', 's', 'e', '(', 'v', 'a', 'r', '_', '0', ',', ' ', 'N', ')', ',', ' ', 'N', '(', 'N', ',', ' ', 'N', ')', ')', ',', ' ', 'g', 'a', 'u', 's', 's', 'i', 'a', 'n', '(', 'a', 'd', 'd', '(', 'v', 'a', 'r', '_', '2', ',', ' ', 'v', 'a', 'r', '_', '1', ')', ',', ' ', 'N', '(', 'N', ',', ' ', 'N', ')', ')', ')', '>']


In [15]:
inputs = dix[:-1]
outputs = dix[1:]

print(inputs)
print(len(inputs))

[8, 15, 17, 17, 1, 18, 34, 26, 1, 28, 18, 32, 18, 28, 29, 18, 1, 32, 15, 28, 14, 4, 3, 0, 11, 2, 3, 0, 11, 1, 11, 3, 0, 11, 2, 2, 3, 0, 19, 15, 31, 29, 29, 21, 15, 24, 1, 15, 17, 17, 1, 32, 15, 28, 14, 6, 3, 0, 32, 15, 28, 14, 5, 2, 3, 0, 11, 1, 11, 3, 0, 11, 2, 2, 2]
75


In SymbolicGPT, the encoded equations look like this: tensor([23, 25,  5, 50, 13,  5,  5, 16,  6, 25,  5, 50, 13,  5,  5, 15,  6, 25,
         5, 50, 13,  5,  5, 14,  6, 25,  5, 50, 13,  5,  5, 13,  6, 25,  5, 50,
        13,  6, 25, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
        34, 34, 34, 34, 34, 34, 34, 34, 34, 34])

The way they get these equations is as follows: cap the length of the token sequence by the prespecified block size. For example, block size is 64. Before padding, the input length is 39, so the padding size is 25. Thus, the sequence is length 64. Similarly, truncate the sequence down if the length exceeds 64.

In [16]:
print(outputs)

[15, 17, 17, 1, 18, 34, 26, 1, 28, 18, 32, 18, 28, 29, 18, 1, 32, 15, 28, 14, 4, 3, 0, 11, 2, 3, 0, 11, 1, 11, 3, 0, 11, 2, 2, 3, 0, 19, 15, 31, 29, 29, 21, 15, 24, 1, 15, 17, 17, 1, 32, 15, 28, 14, 6, 3, 0, 32, 15, 28, 14, 5, 2, 3, 0, 11, 1, 11, 3, 0, 11, 2, 2, 2, 9]


It's not clear immediately what vocab_size is, but it appears to be len(chars), where chars is the list of unique characters found across all the equations. 

Formulas need to be tokenized, embedded, and then concatenated because their number of characters will initially differ.

The embedding stem takes a padding index as an argument, which is determined by 
self.paddingToken = '_', self.paddingID = self.stoi[self.paddingToken], where paddingID was assigned to the train dataset.


In [17]:
paddingToken = '_'
paddingID = stoi[paddingToken]
print(paddingID)

14


For now, we'll follow their convention and cap token length at 64.

In [18]:
formula_data = [eq]
max_length = 64
    
char_data = {
    "char_data": formula_data
}

for i in range(len(formulas)):
    dix = [stoi[s] for s in '<'+formulas[i]+'>']
    inputs = dix[:-1] # dunno why this is needed
    
    paddingSize = max(max_length-len(inputs),0)
    paddingList = [paddingID]*paddingSize
    
    # Pad or truncate as needed
    inputs += paddingList
    inputs = inputs[:max_length]
    
    char_data = {
        "char_data": inputs
    }
    
    if i == 0:
        train_chars = torch.tensor(char_data['char_data'], dtype=torch.int32).unsqueeze(0)

    else:
        chars = torch.tensor(char_data['char_data'], dtype=torch.int32).unsqueeze(0)
        train_chars = torch.cat((train_chars, chars), dim=0)

print(train_chars)
print(train_chars.shape)

tensor([[ 8, 23, 31,  ...,  3,  0, 11],
        [ 8, 23, 31,  ..., 14,  6,  3],
        [ 8, 15, 17,  ..., 11,  3,  0],
        ...,
        [ 8, 15, 17,  ...,  0, 11,  1],
        [ 8, 23, 31,  ..., 14,  4,  2],
        [ 8, 15, 17,  ..., 14,  5,  2]], dtype=torch.int32)
torch.Size([20, 64])


Presumably, self.tok_emb(idx) is passing the embedding layer on dix, the sequence of tokens. So it goes 
add(exp(reverse(var_0, N), N(N, N)) => [8, 15, ... , 2, 2] => [0.01, ... , 0.99]. At the end, the logits should yield tokens, which can then be decoded to an equation.

In [19]:
class GPT(nn.Module):
    """  the full GPT language model, with a context size of block_size """

    def __init__(self):
        super().__init__()

        vocabSize = 35
        blockSize = 64 # use block size 64 for now
        drop_out = 0.1

        # input embedding stem
        self.tok_emb = nn.Embedding(vocabSize, embeddingSize, padding_idx=paddingID) # padding ID = 14    
        self.pos_emb = nn.Parameter(torch.zeros(1, blockSize, embeddingSize))
        self.drop = nn.Dropout(drop_out)        
        
    def forward(self, idx):
        b, t = idx.size()

        # forward the GPT model
        token_embeddings = self.tok_emb(idx) # each index maps to a (learnable) vector -> b x length x embedding
        position_embeddings = self.pos_emb[:, :t, :] # each position maps to a (learnable) vector

        # summation
        input_embedding = token_embeddings + position_embeddings #+ points_embeddings
        
        return input_embedding

In [20]:
# Instantiate the model
gpt_model = GPT()

# Get embeddings
embeddings = gpt_model(train_chars)
print(embeddings.shape)

torch.Size([20, 64, 384])


In [22]:
first_channel_first_batch = embeddings[0, 0, :]

# Print the result
print(first_channel_first_batch)

tensor([ 6.4439e-02,  7.7788e-01,  1.4444e-01,  3.2463e-01,  3.5825e-01,
        -2.1916e+00,  3.1082e-01,  3.0723e-01, -1.7287e+00, -1.0041e+00,
        -1.2744e+00,  5.5562e-01,  1.2556e+00,  5.1207e-01, -5.4144e-03,
         2.8505e-01,  6.2066e-01,  3.9447e-01,  7.5440e-01, -3.3477e-01,
        -1.8021e-01, -3.1248e-02, -6.0185e-02, -1.2717e+00,  7.6769e-01,
         1.2364e+00, -7.5234e-01, -1.7104e-02,  4.5301e-01, -1.7334e-02,
        -1.9353e+00, -1.2685e+00, -8.0235e-01, -6.5134e-01, -3.2802e-01,
         1.7287e+00, -4.8262e-01, -1.3007e+00, -9.4437e-01, -1.6297e+00,
         9.3883e-01,  1.6431e-01,  1.0406e+00,  1.2632e+00,  1.8008e-02,
        -5.3831e-01,  5.7271e-01,  1.3792e+00,  1.0563e+00, -1.1543e+00,
        -8.8845e-01, -1.1591e+00, -1.2048e-01,  2.3298e+00,  3.9128e-01,
         4.8654e-01, -4.5392e-01,  1.8232e-01,  9.5347e-02,  4.4730e-01,
        -9.6029e-01, -5.3801e-01,  5.0961e-01, -4.8260e-01, -5.6868e-02,
        -3.5467e-01, -1.1205e+00,  1.1934e-01, -8.8

These dimensions should now be ready for use.