### Importación de Librerías

In [2]:
import pandas as pd
import numpy as np
import uuid
import random
from itertools import product

### Creación de Funciones Secundarias

In [3]:
def generate_unique_id():
    return str(uuid.uuid4())[:16]
def get_random_categories(categories, num_records):
    return np.random.choice(categories, num_records)
def get_random_numbers(min_value, max_value, num_records):
    return np.random.uniform(min_value, max_value, num_records)
def get_foreign_values(foreign_df, foreign_column, num_records):
    return np.random.choice(foreign_df[foreign_column].tolist(), num_records)

def get_categorical_dataset_simulated(simulation_extended, category_cols, num_records):
    category_values = [simulation_extended[col].unique() for col in category_cols]
    combinations = list(product(*category_values))
    simulated_data = np.random.choice(len(combinations), num_records)
    simulated_df = pd.DataFrame([combinations[i] for i in simulated_data], columns=category_cols)
    return simulated_df

def get_numeric_column_simulated(simulated_df, simulation_extended, category_cols, numeric_col):
    grouped_stats = simulation_extended.groupby(category_cols)[numeric_col].agg(['mean', 'std']).reset_index()
    simulated_values = pd.merge(simulated_df, grouped_stats, on=category_cols, how='left')
    
    simulated_values[numeric_col] = simulated_values.apply(
        lambda row: np.random.normal(row['mean'], row['std']), axis=1
    )
    
    return simulated_values[[numeric_col]]

### Creación de Función Build DataFrames

In [4]:
def build_dataframes(conf_list):
    dfs = {}
    pending = conf_list.copy()
    processed = set()
    
    while pending:
        for conf in pending:
            can_process = True
            for col in conf["columns"]:
                if col["type"] == "foreign":
                    foreign_dataset = col["values"].split('.')[0]
                    if foreign_dataset not in dfs:
                        can_process = False
                        break
            
            if can_process:
                data = {}
                num_records = conf.get("random_rows", 1000)

                for col in conf["columns"]:
                    col_type = col["type"]
                    col_name = col["name"]

                    if col_type == "category":
                        data[col_name] = get_random_categories(col["values"], num_records)
                    elif col_type == "numeric":
                        if "std" in col["values"]:
                            data[col_name] = np.random.normal(
                                col["values"]["mean"],
                                col["values"]["std"],
                                num_records
                            )
                            data[col_name] = np.clip(data[col_name], col["values"]["min"], col["values"]["max"])
                        else:
                            data[col_name] = get_random_numbers(
                                col["values"]["min"],
                                col["values"]["max"],
                                num_records
                            )
                    elif col_type == "unique":
                        data[col_name] = [generate_unique_id() for _ in range(num_records)]
                    elif col_type == "foreign":
                        foreign_dataset, foreign_column = col["values"].split('.')
                        data[col_name] = get_foreign_values(dfs[foreign_dataset], foreign_column, num_records)

                dfs[conf["ds"]] = pd.DataFrame(data)
                pending.remove(conf)
                processed.add(conf["ds"])

    return dfs

### Creación de Función de Análisis, Combinación y Extensión

In [5]:
def analyze_combinations_and_extend(dfs, conf_list):
    for conf in conf_list:
        df_name = conf["ds"]
        if df_name in dfs:
            df = dfs[df_name]
            for col in conf["columns"]:
                if col["type"] == "category" and "combinations" in col:
                    comb_columns = col["combinations"]
                    unique_combinations = pd.DataFrame(product(*(df[col].unique() for col in comb_columns)), columns=comb_columns)
                    dfs[f"{df_name}_combinations"] = unique_combinations
                    
                    num_records = len(unique_combinations)
                    extended_data = {}
                    for col in conf["columns"]:
                        col_type = col["type"]
                        col_name = col["name"]

                        if col_type == "category" and col_name not in comb_columns:
                            extended_data[col_name] = get_random_categories(col["values"], num_records)
                        elif col_type == "numeric":
                            if "std" in col["values"]:
                                extended_data[col_name] = np.random.normal(
                                    col["values"]["mean"],
                                    col["values"]["std"],
                                    num_records
                                )
                                extended_data[col_name] = np.clip(extended_data[col_name], col["values"]["min"], col["values"]["max"])
                            else:
                                extended_data[col_name] = get_random_numbers(
                                    col["values"]["min"],
                                    col["values"]["max"],
                                    num_records
                                )
                        elif col_type == "unique":
                            extended_data[col_name] = [generate_unique_id() for _ in range(num_records)]
                        elif col_type == "foreign":
                            foreign_dataset, foreign_column = col["values"].split('.')
                            extended_data[col_name] = get_foreign_values(dfs[foreign_dataset], foreign_column, num_records)
                    
                    extended_df = pd.DataFrame(extended_data)
                    dfs[f"{df_name}_extended"] = pd.concat([unique_combinations, extended_df], axis=1)
                    
    return dfs

### Configuración de Datasets

In [6]:
d1 = {
    "ds": "dataset1",
    "columns": [
        {
            "name": "area",
            "type": "category",
            "values": ["TI", "FIN", "HR"],
            "combinations": ["area"]
        },
        {
            "name": "id",
            "type": "unique"
        }
    ],
    "random": False
}

d2 = {
    "ds": "dataset2",
    "columns": [
        {
            "name": "id",
            "type": "unique"
        },
        {
            "name": "area",
            "type": "foreign",
            "values": "dataset1.area"
        },
        {
            "name": "subarea",
            "type": "category",
            "values": ["SA1", "SA2", "SA3", "SA4"],
            "combinations": ["area", "subarea"]
        }
    ],
    "random": False
}

d3 = {
    "ds": "dataset3",
    "columns": [
        {
            "name": "id",
            "type": "unique"
        },
        {
            "name": "subarea",
            "type": "foreign",
            "values": "dataset2.id"
        },
        {
            "name": "income",
            "type": "numeric",
            "values": {"min": 20000, "max": 50000}
        },
        {
            "name": "goal",
            "type": "numeric",
            "values": {"min": 1000, "max": 10000, "mean": 5000, "std": 2000}
        }
    ],
    "random": True,
    "random_rows": 1000
}

### Ejecución de Funciones 

In [7]:
conf_list = [d3, d1, d2]
dataframes = build_dataframes(conf_list)

In [8]:
simulation_extended = dataframes["dataset3"]
category_cols = ["subarea"]
numeric_cols = ["income", "goal"]

final_simulation = get_categorical_dataset_simulated(simulation_extended, category_cols, 100000)

for nc in numeric_cols:
    dfn = get_numeric_column_simulated(final_simulation, simulation_extended, category_cols, nc)
    final_simulation = pd.merge(final_simulation, dfn[[nc]], left_index=True, right_index=True)

print(final_simulation)

                subarea        income          goal
0      ef599f8f-cf3e-40  21586.243527   4276.901486
1      282f081c-94fe-49  26013.667986   9329.122836
2      de52ea12-b645-40  35126.067571   6789.263548
3      123abb81-e4d7-44  21500.475467  14004.189674
4      3d0c1004-fdf8-40           NaN           NaN
...                 ...           ...           ...
99995  92fca9ef-2e27-42  38903.366565   6350.042108
99996  ef599f8f-cf3e-40  21403.577665   3081.976671
99997  f54b78a2-6101-4b           NaN           NaN
99998  bbcc083d-a1cd-48           NaN           NaN
99999  4a3c458d-c30e-4d           NaN           NaN

[100000 rows x 3 columns]
