In [13]:
import pandas as pd
import numpy as np

# Define ranges and distributions.
# Each column spec is one of:
#   * discrete   -> {"values": [...], "weights": [...]}; the weights are
#                   normalized to sum to 1 before sampling.
#   * continuous -> {"range": (low, high), "distribution": "uniform"} or
#                   {"range": (low, high), "distribution": "normal",
#                    "mean": ..., "std": ...} (normal samples are clipped
#                   to the range).
data_config = {
    "Age": {"range": (20, 90), "distribution": "uniform"},  # Continuous
    "Age_normalized": {"values": [0, 1, 2, 3, 4], "weights": [0.1, 0.2, 0.4, 0.2, 0.1]},  # Discrete
    "Sex": {"values": [0, 1, 2], "weights": [0.8, 0.15, 0.05]},  # Discrete
    "Smoking": {"values": [0, 1, 2], "weights": [0.7, 0.2, 0.1]},  # Discrete
    "Survival": {"values": [0, 1], "weights": [0.6, 0.4]},  # Binary
    "Sur_time_mo": {"range": (1, 60), "distribution": "normal", "mean": 20, "std": 10},  # Continuous
    "Prog_time_mo": {"range": (1, 60), "distribution": "normal", "mean": 15, "std": 7},  # Continuous
    "VDW": {"range": (-60, -45), "distribution": "uniform"},  # Continuous
    "EEL": {"range": (-26.65, -11), "distribution": "uniform"},  # Continuous
    "EGB": {"range": (27, 40), "distribution": "uniform"},  # Continuous
    "EEL.1": {"range": (-26.65, -11), "distribution": "uniform"},  # Continuous
    "EPB": {"range": (27, 40), "distribution": "uniform"},  # Continuous
    "ENPOLAR": {"range": (-45, -1), "distribution": "uniform"},  # Continuous
    "TOTAL.1": {"range": (-76, -36), "distribution": "uniform"},  # Continuous
    "Matching_rates": {"range": (0, 17), "distribution": "uniform"},  # Continuous
    "Centroid_distance": {"range": (30, 39), "distribution": "uniform"},  # Continuous
    # The discrete weights below do not sum to 1; they are relative weights.
    "Connectivity": {"values": range(0, 24), "weights": [0.05]*10 + [0.15]*8 + [0.05]*6},  # Discrete
    "Convex_atoms": {"values": range(0, 44), "weights": [0.02]*10 + [0.3]*20 + [0.1]*14},  # Discrete
    # NOTE(review): configured as a uniform float range, so values are not
    # integers even though a bond count is presumably discrete — confirm.
    "Hydrogen_bonds": {"range": (775, 1650), "distribution": "uniform"},  # Continuous as configured
}

# NOTE: ages cannot be rounded at this point because the DataFrame does not
# exist yet, and rebinding a loop variable would not modify the column anyway.
# Round on the DataFrame after it is built if integer ages are needed.

# Define response classes and their probabilities
responses = ["stable", "no", "complete", "partial"]
response_weights = [0.3, 0.1, 0.2, 0.4]  # Weighted probabilities for responses

# Normalize weights function
def normalize_weights(weights):
    """Rescale `weights` so that they sum to 1.

    Returns a NumPy array with each weight divided by the total of all
    weights, which is the form expected by the `p` argument of
    `np.random.choice`. No guard is applied for a zero total.
    """
    total = np.sum(weights)
    return np.asarray(weights) / total

# Helper function for generating data
def generate_data(config, size):
    """Draw `size` synthetic samples for one column described by `config`.

    Parameters
    ----------
    config : dict
        Either a discrete spec with "values" and "weights", or a continuous
        spec with "range" (low, high) and "distribution" set to "uniform" or
        "normal". A "normal" spec also needs "mean" and "std".
    size : int
        Number of samples; passed straight through as NumPy's `size` argument.

    Returns
    -------
    numpy.ndarray
        Discrete specs: weighted draws from "values".
        Uniform specs: draws from the range, rounded to 3 decimals.
        Normal specs: draws rounded to 3 decimals, then clipped to the range.

    Raises
    ------
    KeyError
        If a non-discrete spec lacks "distribution" or another required key.
    ValueError
        If "distribution" is not one of the supported names.
    """
    if "values" in config:
        normalized_weights = normalize_weights(config["weights"])
        return np.random.choice(config["values"], size=size, p=normalized_weights)

    distribution = config["distribution"]
    if distribution == "uniform":
        return np.round(np.random.uniform(*config["range"], size), 3)
    if distribution == "normal":
        low, high = config["range"]
        samples = np.random.normal(config["mean"], config["std"], size)
        # Clipping keeps out-of-range draws by pinning them to the bounds.
        return np.clip(np.round(samples, 3), low, high)

    # Previously this fell through and returned None, which silently became
    # an all-None column in the resulting DataFrame.
    raise ValueError(
        f"Unsupported distribution {distribution!r}; expected 'uniform' or 'normal'"
    )

# Generate the dataset
num_rows = 1000

# Seed NumPy's global RNG so that Restart & Run All regenerates the same
# dataset and CSV; all sampling below draws from the global np.random state.
np.random.seed(42)

# Sample every configured column, in data_config order.
data = {}

for column, config in data_config.items():
    data[column] = generate_data(config, num_rows)

# Add the Response column
data["Response"] = np.random.choice(responses, size=num_rows, p=normalize_weights(response_weights))



# Create a DataFrame
df = pd.DataFrame(data)

# Save to CSV (optional)
df.to_csv("weighted_dataset.csv", index=False)

# Preview the dataset
print(df.head())


      Age  Age_normalized  Sex  Smoking  Survival  Sur_time_mo  Prog_time_mo  \
0  73.760               2    0        0         0       29.222        13.907   
1  32.030               0    0        0         0       24.747        17.218   
2  53.530               2    2        0         1       22.259        21.963   
3  77.935               2    0        0         0       22.788        18.525   
4  50.154               1    0        0         0       17.457        32.208   

      VDW     EEL     EGB   EEL.1     EPB  ENPOLAR  TOTAL.1  Matching_rates  \
0 -51.708 -12.515  28.659 -14.801  36.227  -32.927  -44.012           3.796   
1 -52.861 -14.895  38.632 -18.186  37.867  -18.550  -54.491           3.832   
2 -57.020 -12.777  34.613 -13.325  33.494  -41.389  -45.819          14.055   
3 -46.520 -14.001  38.270 -17.615  35.406   -1.833  -44.235           0.934   
4 -57.592 -22.244  29.151 -14.530  35.052  -30.630  -68.458           1.855   

   Centroid_distance  Connectivity  Convex_a

In [16]:
# prompt: make the age rounded

import pandas as pd
import numpy as np

# Define ranges and distributions
# Column name -> sampling spec. Discrete specs carry "values" and "weights";
# continuous specs carry "range" and "distribution" (plus "mean"/"std" for
# the normal distribution).
data_config = {
    # Demographic / clinical columns
    "Age": dict(range=(20, 90), distribution="uniform"),  # Continuous
    "Age_normalized": dict(values=[0, 1, 2, 3, 4], weights=[0.1, 0.2, 0.4, 0.2, 0.1]),  # Discrete
    "Sex": dict(values=[0, 1, 2], weights=[0.8, 0.15, 0.05]),  # Discrete
    "Smoking": dict(values=[0, 1, 2], weights=[0.7, 0.2, 0.1]),  # Discrete
    "Survival": dict(values=[0, 1], weights=[0.6, 0.4]),  # Binary
    "Sur_time_mo": dict(range=(1, 60), distribution="normal", mean=20, std=10),  # Continuous
    "Prog_time_mo": dict(range=(1, 60), distribution="normal", mean=15, std=7),  # Continuous
    # Uniformly sampled continuous columns
    "VDW": dict(range=(-60, -45), distribution="uniform"),
    "EEL": dict(range=(-26.65, -11), distribution="uniform"),
    "EGB": dict(range=(27, 40), distribution="uniform"),
    "EEL.1": dict(range=(-26.65, -11), distribution="uniform"),
    "EPB": dict(range=(27, 40), distribution="uniform"),
    "ENPOLAR": dict(range=(-45, -1), distribution="uniform"),
    "TOTAL.1": dict(range=(-76, -36), distribution="uniform"),
    "Matching_rates": dict(range=(0, 17), distribution="uniform"),
    "Centroid_distance": dict(range=(30, 39), distribution="uniform"),
    # Discrete columns with relative (not unit-sum) weights
    "Connectivity": dict(values=range(0, 24), weights=[0.05] * 10 + [0.15] * 8 + [0.05] * 6),
    "Convex_atoms": dict(values=range(0, 44), weights=[0.02] * 10 + [0.3] * 20 + [0.1] * 14),
    # NOTE(review): configured as a uniform float range, so values are not
    # integers even though a bond count is presumably discrete — confirm.
    "Hydrogen_bonds": dict(range=(775, 1650), distribution="uniform"),
}

# Normalize weights function
def normalize_weights(weights):
    """Return `weights` as a NumPy array scaled to sum to 1.

    Each entry is divided by the sum of all entries, so relative weights can
    be passed as the `p` argument of `np.random.choice`. A zero total is not
    guarded against.
    """
    weight_sum = np.sum(weights)
    scaled = np.asarray(weights) / weight_sum
    return scaled

# Helper function for generating data
def generate_data(config, size):
    """Generate `size` samples for a single column from its spec `config`.

    Parameters
    ----------
    config : dict
        A discrete spec ("values" + "weights") or a continuous spec
        ("range" + "distribution", where the distribution is "uniform" or
        "normal"; "normal" additionally requires "mean" and "std").
    size : int
        Number of samples; forwarded as NumPy's `size` argument.

    Returns
    -------
    numpy.ndarray
        Weighted draws from "values" for discrete specs. For continuous
        specs, draws rounded to 3 decimals; normal draws are also clipped to
        the configured range.

    Raises
    ------
    KeyError
        If a non-discrete spec lacks "distribution" or another required key.
    ValueError
        If "distribution" names an unsupported distribution.
    """
    if "values" in config:
        normalized_weights = normalize_weights(config["weights"])
        return np.random.choice(config["values"], size=size, p=normalized_weights)

    distribution = config["distribution"]
    if distribution == "uniform":
        return np.round(np.random.uniform(*config["range"], size), 3)
    if distribution == "normal":
        low, high = config["range"]
        samples = np.random.normal(config["mean"], config["std"], size)
        # Out-of-range draws are pinned to the bounds rather than resampled.
        return np.clip(np.round(samples, 3), low, high)

    # Fail loudly: an implicit None here would turn into an all-None column
    # once the result is placed in a DataFrame.
    raise ValueError(
        f"Unsupported distribution {distribution!r}; expected 'uniform' or 'normal'"
    )

# Generate the dataset
num_rows = 1000

# Seed NumPy's global RNG so that Restart & Run All regenerates the same
# dataset and CSV; all sampling below draws from the global np.random state.
np.random.seed(42)

# Sample every configured column, in data_config order.
data = {}

for column, config in data_config.items():
    data[column] = generate_data(config, num_rows)

# Add the Response column
responses = ["stable", "no", "complete", "partial"]
response_weights = [0.3, 0.1, 0.2, 0.4]  # Weighted probabilities for responses
data["Response"] = np.random.choice(responses, size=num_rows, p=normalize_weights(response_weights))

# Create a DataFrame
df = pd.DataFrame(data)

# Round the 'Age' column to the nearest whole year.
# astype(int) alone truncates (29.9 -> 29), so round first and then cast.
df["Age"] = df["Age"].round().astype(int)

# Save to CSV (optional)
df.to_csv("weighted_dataset.csv", index=False)

# Preview the dataset
print(df.head())

   Age  Age_normalized  Sex  Smoking  Survival  Sur_time_mo  Prog_time_mo  \
0   29               2    1        0         0       25.566        24.443   
1   44               1    0        0         1        2.764        18.864   
2   86               2    0        0         0       17.622        18.851   
3   68               2    0        1         0        3.521        17.452   
4   42               2    0        0         0       32.143        22.249   

      VDW     EEL     EGB   EEL.1     EPB  ENPOLAR  TOTAL.1  Matching_rates  \
0 -57.361 -12.770  38.419 -24.694  31.761  -15.450  -47.465          15.747   
1 -53.290 -18.819  33.655 -19.424  34.521  -26.418  -46.039          11.347   
2 -54.518 -22.903  27.344 -12.160  27.015  -34.226  -62.259           0.426   
3 -48.409 -11.154  27.485 -13.111  29.369   -7.791  -58.641           3.175   
4 -45.039 -17.939  32.444 -18.980  30.977  -17.998  -49.735          14.481   

   Centroid_distance  Connectivity  Convex_atoms  Hydrogen_bon