In [62]:
import numpy as np

# Seed NumPy's legacy global RNG so the np.random.* draws in later cells repeat.
np.random.seed(seed=42)

In [63]:
import pandas as pd

# Load the source table from the CSV that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer="./dataset_v1.csv")

In [64]:
# Discard the "Unnamed: 0" column (presumably a serialized index — confirm);
# `df` is rebound to the result instead of being mutated in place.
df = df.drop(columns=["Unnamed: 0"])

In [65]:
# Columns grouped as categorical features; all of them are removed from `df`.
categorical_features = [
    "bond_type", "dimensions",
    "linkerA", "linkerB",
    "net", "chemical_formula",
    "vertices", "edges", "genus",
]

# Rebind `df` to the frame without those columns.
df = df.drop(columns=categorical_features)

In [66]:
# Target-related columns that are removed from `df`.
# "highUptake_mol" is not listed here, so it stays in the frame; a later cell
# ranks rows by it.
target_variables = [
    # high-pressure desorption heat / uptake
    "heatDesorptionHigh", "heatDesorptionHigh_Error",
    "highUptake_molec", "highUptakeError_molec", "highUptakeError_mol",
    # low-pressure desorption heat / uptake
    "heatDesorptionLow", "heatDesorptionLow_error",
    "lowUptake_molec", "lowUptakeError_molec",
    "lowUptake_mol", "lowUptakeError_mol",
    # remaining targets
    "surface_area", "del_capacity",
]

# Rebind `df` to the frame without those columns.
df = df.drop(columns=target_variables)

In [67]:
# Build a 1000-row hold-out set: the 100 rows with the largest highUptake_mol
# plus 900 rows drawn without replacement from everything else.
best_rows = df.nlargest(n=100, columns="highUptake_mol").index
remaining_rows = df.index.difference(best_rows)
additional_rows = np.random.choice(remaining_rows, size=900, replace=False)
combined_indices = np.concatenate([best_rows, additional_rows])

# Persist the hold-out rows (index not written) ...
df_test = df.loc[combined_indices]
df_test.to_csv("./dataset_test.csv", index=False)

# ... and keep only the rows that were not held out.
df = df.drop(index=combined_indices)

In [68]:
# Inject synthetic missing values into roughly 12.5% of the rows of `df`.
#
# NaNs are assigned only to float64 columns. The column list is taken from
# select_dtypes so it follows the DataFrame's own column order: the previous
# list(set(...)) construction ordered the names by string hash, which Python
# randomizes per process, so the seeded np.random.choice below selected
# different columns after every kernel restart.
# NOTE(review): every float64 column is eligible, which would include
# highUptake_mol if it is float64 — confirm that is intended.
float64_columns = df.select_dtypes(include=["float64"]).columns.tolist()

# Select random indices (rows) to introduce NaNs
nan_indices = np.random.choice(df.index, size=int(0.125 * len(df)), replace=False)

# For each selected row, choose a random number of columns to assign NaNs
for idx in nan_indices:
    # Randomly choose the number of columns to assign NaNs, biased towards few.
    # The draw is 1 + Binomial(len(df.columns), 0.1); it is clamped to the number
    # of float64 columns because sampling more names than exist with
    # replace=False raises ValueError.
    num_nans = min(
        np.random.binomial(len(df.columns), 0.1) + 1,
        len(float64_columns),
    )

    # Randomly choose which columns to assign NaNs
    nan_columns = np.random.choice(float64_columns, size=num_nans, replace=False)

    # Assign NaNs to the selected columns of the current row
    df.loc[idx, nan_columns] = np.nan

# Display the first few rows of the DataFrame with NaNs
df.head()


Unnamed: 0,voidFraction,supercellVolume,density,highUptake_mol,cell_a,cell_b,cell_c,alpha_deg,beta_deg,gamma_deg,num_carbon,num_fluorine,num_hydrogen,num_nitrogen,num_oxygen,num_sulfur,num_silicon,largest_incl_sphere,largest_free_sphere,largest_incl_sphere_along_path
0,0.90012,49204.128057,260.213228,30.256085,43.4422,43.4422,30.1056,90.0,90.0,120.0,360,0,216,144,72,0,0,17.19014,15.64961,17.19004
1,0.879234,49390.074419,297.963387,28.27847,43.5205,43.5204,30.1108,90.0,90.0,120.0,360,0,216,144,144,0,0,17.34916,15.76943,17.34916
2,0.858269,50036.985281,289.397249,29.259086,43.7829,43.7829,30.1406,90.0,90.0,120.0,432,0,360,144,72,0,0,16.84032,15.61907,16.84024
3,0.857065,49135.924517,370.063633,23.500406,43.4319,43.4318,30.0782,90.0,90.0,120.0,360,0,144,216,216,0,0,13.93085,12.32167,13.93085
4,0.858016,49540.680132,367.040151,23.128289,43.6159,43.6159,30.0706,90.0,90.0,120.0,360,0,144,216,216,0,0,16.06923,13.48791,16.06921


In [69]:
# Write the remaining rows of `df` to disk; the index is not written.
df.to_csv(path_or_buf="./dataset.csv", index=False)