In [10]:
import multiprocessing as mp
from tqdm.auto import tqdm

import sys
import os
scripts_path = os.path.abspath(os.path.join("..", "scripts"))
if scripts_path not in sys.path:
    sys.path.append(scripts_path)

scripts_path = os.path.abspath(os.path.join("..", "classes"))
if scripts_path not in sys.path:
    sys.path.append(scripts_path)


    
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sympy import simplify, sympify
import data.data_cleaning as dc
from expression import Expression
#Load file data/raw/13k.csv


def apply_chunk(chunk, func):
    """Helper function to apply a function to a chunk of data."""
    return chunk.apply(func)

def parallel_apply(series, func, n_jobs=None):
    """Apply a function to a pandas Series in parallel."""
    n_jobs = mp.cpu_count() if n_jobs is None else n_jobs
    # Split into roughly equal chunks
    chunks = np.array_split(series, n_jobs)
    with mp.Pool(n_jobs) as pool:
        # Use the helper function instead of a lambda
        results = pool.starmap(apply_chunk, [(chunk, func) for chunk in chunks])
    # Concatenate the resulting Series
    return pd.concat(results)

# Load file data/raw/100k.csv
file_path = '../data/raw/1M.csv'
chunk_size = 100000  # Define the chunk size
processed_chunks = []

# Initialize the progress bar
total_rows = sum(1 for _ in open(file_path)) - 1  # Total rows (excluding header)
total_chunks = (total_rows // chunk_size) + 1  # Total number of chunks
with tqdm(total=total_chunks, desc="Processing chunks") as pbar:
    # Read the file in chunks and process each chunk
    for chunk in pd.read_csv(file_path, chunksize=chunk_size):
        # Keeping only the equation column
        chunk = chunk[['eq']]
        # Removing rows that didn't work with sympy
        print("Removing rows that didn't work with sympy")
        chunk = chunk[~chunk['eq'].str.contains('ERROR_simplify')]
        # Replacing the constants with letter C using parallel_apply
        print("Replacing the constants with letter C")
        chunk['eq'] = parallel_apply(chunk['eq'], dc.augment_expression)
        # Renaming the column
        print("Renaming the column")
        chunk.rename(columns={'eq': 'infix_expr'}, inplace=True)
        
        # Create a column with prefix expression using parallel_apply
        print("Creating a column with prefix expression")
        chunk['prefix_expr'] = parallel_apply(chunk['infix_expr'], Expression.infix_to_prefix)
        processed_chunks.append(chunk)
        # Update the progress bar
        pbar.update(1)

# Combine all processed chunks into a single DataFrame
temp_df = pd.concat(processed_chunks, ignore_index=True)

# remove duplicates
temp_df = temp_df.drop_duplicates(subset=['infix_expr'])



Processing chunks:   0%|          | 0/14 [00:00<?, ?it/s]

Removing rows that didn't work with sympy
Replacing the constants with letter C


Renaming the column
Creating a column with prefix expression
Removing rows that didn't work with sympy
Replacing the constants with letter C
Renaming the column
Creating a column with prefix expression
Removing rows that didn't work with sympy
Replacing the constants with letter C
Renaming the column
Creating a column with prefix expression
Removing rows that didn't work with sympy
Replacing the constants with letter C
Renaming the column
Creating a column with prefix expression
Removing rows that didn't work with sympy
Replacing the constants with letter C
Renaming the column
Creating a column with prefix expression
Removing rows that didn't work with sympy
Replacing the constants with letter C
Renaming the column
Creating a column with prefix expression
Removing rows that didn't work with sympy
Replacing the constants with letter C
Renaming the column
Creating a column with prefix expression
Removing rows that didn't work with sympy
Replacing the constants with letter C
Renaming the 

In [2]:
from data.parallel_utils import augment_dataframe_parallel

df_augmented = augment_dataframe_parallel(temp_df, expression_col="infix_expr", n_jobs=4)
df_augmented.rename(columns={'simple': 'i_simple'}, inplace=True)
df_augmented.rename(columns={'key_value': 'i_key_value'}, inplace=True)
df_augmented.rename(columns={'delimiter': 'i_delimiter'}, inplace=True)
df_augmented.rename(columns={'minimalist': 'i_minimalist'}, inplace=True)


df_augmented = augment_dataframe_parallel(df_augmented, expression_col="prefix_expr", n_jobs=4)
df_augmented.rename(columns={'simple': 'p_simple'}, inplace=True)
df_augmented.rename(columns={'key_value': 'p_key_value'}, inplace=True)
df_augmented.rename(columns={'delimiter': 'p_delimiter'}, inplace=True)
df_augmented.rename(columns={'minimalist': 'p_minimalist'}, inplace=True)


In [11]:
from data.parallel_utils import augment_dataframe_parallel
df_augmented = augment_dataframe_parallel(temp_df, expression_col="infix_expr", n_jobs=4)
df_augmented.rename(columns={'instruction': 'i_prompt'}, inplace=True)

df_augmented['instruction'] = augment_dataframe_parallel(df_augmented, expression_col="prefix_expr", n_jobs=4)['instruction']
df_augmented.rename(columns={'instruction': 'p_prompt'}, inplace=True)

df_augmented['infix_expr'] = "<startofex>" + df_augmented['infix_expr'] + "<endofex>"
df_augmented['prefix_expr'] = "<startofex>" + df_augmented['prefix_expr'] + "<endofex>"

In [None]:
for row in df_augmented['prefix_expr']:
    print(row)
    print("-"*50)
    

In [None]:
for row in df_augmented['infix_expr']:
    print(row)
    print("-"*50)

In [None]:
# Example: Load a prefix expression from a string
prefix_expression_str = "+ x_1 ** x_5 1/2 * -1 x_7" # Load the first prefix expression
# remove the <startofex> and <endofex> tags
prefix_expression_str = prefix_expression_str.replace("<startofex>", "").replace("<endofex>", "")
expression = Expression.infix_to_prefix(prefix_expression_str)  # Convert to prefix expressio
print(expression.sympy_expression)  # Convert to infix and print

In [12]:
# Split df_augmented into train, validation, and test sets
train_df, temp_df = train_test_split(df_augmented, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

file = os.path.basename(file_path)  # Extract the file name from file_path
train_file_path = f'../data/processed/{file.replace(".csv", "")}/train_{file}'
val_file_path = f'../data/processed/{file.replace(".csv", "")}/val_{file}'
test_file_path = f'../data/processed/{file.replace(".csv", "")}/test_{file}'

# Create directories if they don't exist
os.makedirs(os.path.dirname(train_file_path), exist_ok=True)
os.makedirs(os.path.dirname(val_file_path), exist_ok=True)
os.makedirs(os.path.dirname(test_file_path), exist_ok=True)

# Save the train, validation, and test sets
train_df.to_csv(train_file_path, index=False)
val_df.to_csv(val_file_path, index=False)
test_df.to_csv(test_file_path, index=False)

# Save the processed file
processed_file_path = f'../data/processed/{file.replace(".csv", "")}/{file}'
temp_df.to_csv(processed_file_path, index=False)

In [13]:
from huggingface_hub import HfApi

folder = "1M"
api = HfApi(token=os.getenv("HF_TOKEN"))
api.upload_folder(
    folder_path=f"../data/processed/{folder}",
    repo_id="augustocsc/sintetico_final",
    repo_type="dataset",
    path_in_repo= folder,
)

train_1M.csv:   0%|          | 0.00/240M [00:00<?, ?B/s]

1M.csv:   0%|          | 0.00/103M [00:00<?, ?B/s]

test_1M.csv:   0%|          | 0.00/51.4M [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

val_1M.csv:   0%|          | 0.00/51.4M [00:00<?, ?B/s]

RuntimeError: Error while uploading '1M/val_1M.csv' to the Hub.

In [50]:
from datasets import load_dataset

# Ensure you are logged in to Hugging Face
# Login using `huggingface-cli login` in the terminal if not already logged in
ds = load_dataset("augustocsc/sintetico", data_dir="10k")



In [60]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from peft import PeftModel

# Load tokenizer from adapter repo
tokenizer = AutoTokenizer.from_pretrained("augustocsc/Se124M10K")

# Load base GPT2 model
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Resize embeddings to match tokenizer
model.resize_token_embeddings(len(tokenizer))

# Load the LoRA adapter
model = PeftModel.from_pretrained(model, "augustocsc/Se124M10K")



adapter_model.safetensors:   0%|          | 0.00/310M [00:00<?, ?B/s]

In [72]:
prompt = "Instruction: Generate a mathematical expression using variables ['x_1', 'x_2', 'x_3', 'x_4', 'x_5', 'x_6', 'x_7', 'x_8'] and operands ['*', '+', '-', 'asin', 'cos', 'pow', 'sin', 'tan'] and ['C'] as constant.\nExpression: <|startofex|>"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Generate text
outputs = model.generate(
    **inputs,
    max_new_tokens=50,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.9,
    eos_token_id=tokenizer.eos_token_id
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Instruction: Generate a mathematical expression using variables ['x_1', 'x_2', 'x_3', 'x_4', 'x_5', 'x_6', 'x_7', 'x_8'] and operands ['*', '+', '-', 'asin', 'cos', 'pow', 'sin', 'tan'] and ['C'] as constant.
Expression: sin(x_7**C) - C * x_6*x_7 + C * x_1 - C

Expression: asin(exp(x_1))*x_3 - cos((x_1 -


In [74]:
from sympy import sympify

# Example: Load an expression from a string
expression_str = " sin(x_7**C) - C * x_6*x_7 + C * x_1 - C"
expression = sympify(expression_str)
print(expression)

C*x_1 - C*x_6*x_7 - C + sin(x_7**C)
