In [2]:
import sys
import re
from tqdm import tqdm

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
from collections import Counter
from joypy import joyplot

sys.path.append("../../.")
from calculus_path_mod.term_engine import *
from calculus_path_mod.reduction_strategy import *
from calculus_path_mod.terms import num_comparison, nat_numbers, arithm_ops, combinators, pairs, logic
from calculus_path_mod.terms.pseudonym import *

from calculus_path_mod.json_serialization import load_terms
from fitter import Fitter, get_common_distributions
from calculus_utils.drawing import draw_steps_displot

# Load the terms filtered by the LO & RI strategies (only the LO-filtered files are read below)

In [3]:
# Read the 20 serialized batches of LO-filtered terms (file indices 0..19),
# keeping one loaded result per batch file, in index order.
lists_terms_LO = list(
    map(
        load_terms,
        (
            f"../../tests_11_retests/collected_terms/terms_210_filtered_LO_{inx_}.dat"
            for inx_ in range(20)
        ),
    )
)

# Collect more terms: record every intermediate term of the normalization process (LO & LI strategies, terms_LO & terms_RI; only the LO strategy on terms_LO is run below)

In [4]:
def gen_norm_data(terms_list, strategy, max_vertices=3_000, max_steps=400):
    """Record the step-by-step reduction trace of every term in ``terms_list``.

    Each term is reduced one step at a time through
    ``one_step_normalize_visual(strategy)``; the ``simple_str()`` form of every
    term met along the way is stored in order.

    Args:
        terms_list: iterable of term objects exposing ``simple_str()`` and
            ``one_step_normalize_visual(strategy)``.
        strategy: reduction strategy, forwarded unchanged to every step.
        max_vertices: reduction of a term stops once the term that was just
            stepped reports ``vertices_number`` above this value.
        max_steps: reduction of a term stops once more than this many steps
            have been recorded for it.

    Returns:
        dict mapping the ``simple_str()`` of each input term to the list of
        ``simple_str()`` values of its trace. Input terms that share the same
        string overwrite each other (the last one wins).

    Note:
        A trace cut short by one of the limits presumably does not end in a
        normal form, so its length should not be read as the number of steps
        to normalization -- TODO confirm how callers want to treat such traces.
    """
    normalized_terms_dict = dict()
    for term in tqdm(terms_list):
        term_name = term.simple_str()
        trace = []
        normalized_terms_dict[term_name] = trace
        term_red_steps = 0

        (step_term, _, _), norm_term = term.one_step_normalize_visual(strategy)
        trace.append(step_term.simple_str())

        # A falsy ``norm_term`` ends the trace (no further term was produced).
        while norm_term:
            trace.append(norm_term.simple_str())
            # Count the recorded step; without this the step limit below
            # could never trigger and a diverging term would loop forever.
            term_red_steps += 1
            (step_term, _, _), norm_term = norm_term.one_step_normalize_visual(strategy)

            # computation limitation
            if (step_term.vertices_number > max_vertices) or (term_red_steps > max_steps):
                norm_term = None
    return normalized_terms_dict

In [5]:
# One trace dictionary per loaded batch; a fresh LOStrategy instance is
# created for every batch.
list_res_OO = [
    gen_norm_data(batch_terms, LOStrategy())
    for batch_terms in lists_terms_LO
]

100%|██████████| 231/231 [00:06<00:00, 33.91it/s]
100%|██████████| 228/228 [00:02<00:00, 86.67it/s] 
100%|██████████| 222/222 [00:02<00:00, 109.40it/s]
100%|██████████| 230/230 [00:04<00:00, 47.42it/s]
100%|██████████| 231/231 [00:03<00:00, 64.64it/s] 
100%|██████████| 228/228 [00:01<00:00, 134.73it/s]
100%|██████████| 233/233 [00:05<00:00, 44.18it/s]
100%|██████████| 229/229 [00:04<00:00, 47.71it/s]
100%|██████████| 227/227 [00:02<00:00, 112.02it/s]
100%|██████████| 230/230 [00:05<00:00, 39.63it/s]
100%|██████████| 220/220 [00:02<00:00, 74.35it/s] 
100%|██████████| 226/226 [00:05<00:00, 37.75it/s] 
100%|██████████| 224/224 [00:02<00:00, 75.41it/s]
100%|██████████| 219/219 [00:02<00:00, 79.07it/s] 
100%|██████████| 219/219 [00:07<00:00, 28.03it/s] 
100%|██████████| 224/224 [00:03<00:00, 60.09it/s] 
100%|██████████| 222/222 [00:10<00:00, 20.77it/s] 
100%|██████████| 228/228 [00:02<00:00, 86.69it/s] 
100%|██████████| 223/223 [00:03<00:00, 57.37it/s] 
100%|██████████| 222/222 [00:01<00:00

# Prepare the dataset

In [6]:
# Flatten all reduction traces into two parallel lists:
#   simple_terms[i] -- string form of a term seen in some trace
#   steps_lo[i]     -- number of entries that follow it in that trace
# Only the first occurrence of each term string is kept.
steps_lo = []
simple_terms = []
# Set mirror of `simple_terms`: membership tests are O(1) here, whereas
# scanning the growing list made the whole loop quadratic.
seen_terms = set()

for res_ in list_res_OO:
    for list_red_steps in res_.values():
        total_steps = len(list_red_steps) - 1
        for inx_, term_str in enumerate(list_red_steps):
            if term_str not in seen_terms:
                seen_terms.add(term_str)
                simple_terms.append(term_str)
                steps_lo.append(total_steps - inx_)

In [8]:
# Build the dataset frame and report its row count before and after
# removing rows that repeat a term string.
df = pd.DataFrame({"steps_num_lo": steps_lo, "simple_terms": simple_terms})
print(df.shape[0])
df = df.drop_duplicates(subset=["simple_terms"])
print(df.shape[0])

44568
44568


In [9]:
# Sanity check: number of distinct term strings left in the frame.
df["simple_terms"].nunique(dropna=False)

44568

In [10]:
# Persist the dataset as CSV; the DataFrame index is not written.
df.to_csv(path_or_buf="./data/steps_simple_term_str_v1.csv", index=False)