In [None]:
import numpy as np

# Seed NumPy's global RNG so every random draw made through np.random in the
# cells below is repeatable from a fresh kernel.
np.random.seed(seed=42)

In [None]:
import pandas as pd

# Load the raw dataset; every later cell works on (and mutates) this frame.
df = pd.read_csv(filepath_or_buffer="./data/dataset.csv")

In [None]:
# "Unnamed: 0" is the header pandas gives a CSV column that has no name
# (presumably a previously saved index); it is removed from df in place.
df.drop(columns=["Unnamed: 0"], inplace=True)

In [None]:
# Columns listed as categorical features; they are removed from df in place.
categorical_features = [
    "bond_type", "dimensions", "linkerA", "linkerB", "net",
    "chemical_formula", "vertices", "edges", "genus",
]

df.drop(columns=categorical_features, inplace=True)

In [None]:
# Outcome-type columns that are removed from df in place. "highUptake_mol" is
# deliberately not in this list: later cells use it as the prediction target.
target_variables = [
    "heatDesorptionHigh", "heatDesorptionHigh_Error",
    "highUptake_molec", "highUptakeError_molec", "highUptakeError_mol",
    "heatDesorptionLow", "heatDesorptionLow_error",
    "lowUptake_molec", "lowUptakeError_molec",
    "lowUptake_mol", "lowUptakeError_mol",
    "surface_area", "del_capacity",
]

df.drop(columns=target_variables, inplace=True)

In [None]:
# Remember the original row count so the split can be checked for lost rows.
df_len = len(df)

# Expose the row label as an explicit "id" column shared by all output files.
df["id"] = df.index

# Hand-picked high-uptake rows for the test set: the single best row plus five
# rows sampled from ranks 6-15 of "highUptake_mol".
# NOTE: the order of the random draws in this cell (sample -> choice -> uniform)
# determines the generated files; keep it unchanged.
best_row = df.nlargest(n=1, columns="highUptake_mol").index
top_fifteen = df.nlargest(n=15, columns="highUptake_mol")
great_rows = top_fifteen.iloc[5:].sample(n=5).index
best_rows = np.concatenate([best_row, great_rows])

# Fill the test set up to 100 rows with randomly drawn rows from the rest.
remaining_pool = df.index.difference(best_rows)
additional_rows = np.random.choice(remaining_pool, size=100 - len(best_rows), replace=False)
combined_indices = np.concatenate([best_rows, additional_rows])

df_test = df.loc[combined_indices]

# Solution file: id + true target, with every row marked "Public".
solution = df_test[["id", "highUptake_mol"]].assign(Usage="Public")
solution.to_csv("./data/solution.csv", index=False)

# Test file: same rows with the target column removed.
df_test.drop(columns="highUptake_mol").to_csv("./data/test.csv", index=False)

# Sample submission: uniform random predictions spanning the target's range
# (computed while df still contains the test rows).
target_column = df["highUptake_mol"]
minimum, maximum = target_column.min(), target_column.max()
sample_submission = df_test[["id"]].assign(
    highUptake_mol=np.random.uniform(minimum, maximum, len(df_test))
)
sample_submission.to_csv("./data/sample_submission.csv", index=False)

# From here on df holds only the training rows.
df.drop(combined_indices, axis=0, inplace=True)

# Sanity checks: no rows lost, ids unique within each split, splits disjoint.
assert len(df) + len(df_test) == df_len
assert df["id"].is_unique, "IDs in df are not unique"
assert df_test["id"].is_unique, "IDs in df_test are not unique"
assert set(df["id"]).isdisjoint(df_test["id"]), "IDs in df and df_test intersect"

In [None]:
# Only float64 columns may receive NaNs. select_dtypes returns them in the
# DataFrame's own column order, so the seeded draws below choose the same
# columns on every run (building the list from a set of strings made the order,
# and therefore the result, vary between interpreter sessions).
# NOTE(review): if the target "highUptake_mol" is float64 it is eligible too;
# confirm that NaN targets in train.csv are intended.
float64_columns = df.select_dtypes(include=["float64"]).columns.tolist()

# Select random indices (rows) to introduce NaNs: 12.5% of the remaining rows
nan_indices = np.random.choice(df.index, size=int(0.125 * len(df)), replace=False)

# For each selected row, choose a random number of columns to assign NaNs
for idx in nan_indices:
    # At least 1 column, biased towards few (binomial with p=0.1). The count is
    # capped at the number of float64 columns because sampling without
    # replacement raises ValueError when more items are requested than exist.
    num_nans = min(
        np.random.binomial(len(df.columns), 0.1) + 1,
        len(float64_columns),
    )

    # Randomly choose which columns to assign NaNs
    nan_columns = np.random.choice(float64_columns, size=num_nans, replace=False)

    # Assign NaNs to the selected columns of the current row
    df.loc[idx, nan_columns] = np.nan

# Display the first few rows of the DataFrame with NaNs
df.head()


In [None]:
# Persist the current state of df as the training file, without the index column.
df.to_csv(path_or_buf="./data/train.csv", index=False)