In [None]:
import pandas as pd
import numpy as np
from scipy.stats import skew

import matplotlib.pyplot as plt


def subset_sampling(df, column="Yield", mean_target=30.0001, n_sample=500, mu=8, nu=2, seed=43):
    """Draw a weighted random subset of ``df`` biased towards ``mean_target``.

    Every row receives the sampling weight
    ``|x - mean_target| ** (-mu) * x ** nu``, where ``x`` is the row's value in
    ``column``. The weights are normalised into probabilities and ``n_sample``
    distinct rows are drawn without replacement. The mean, skewness and median
    of the sampled column are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample rows from.
    column : str
        Name of the numeric column the weights are computed from.
    mean_target : float
        Value the sample should concentrate around (for ``mu > 0`` the weight
        grows as a row gets closer to it).
    n_sample : int
        Number of distinct rows to draw.
    mu : float
        Exponent applied (negated) to the distance from ``mean_target``.
    nu : float
        Exponent applied to the raw column value.
    seed : int
        Seed passed to ``np.random.seed`` before drawing.

    Returns
    -------
    tuple
        ``(df_sampled, mean_sampled)``: an independent copy of the selected
        rows, and the mean of ``column`` over those rows.

    Raises
    ------
    ValueError
        If the weights are NaN, infinite, negative or sum to zero, or if fewer
        than ``n_sample`` rows have a non-zero sampling probability.
    """
    # Work in floating point: integer data combined with an integer target
    # would make ``np.power(..., -mu)`` raise for negative integer exponents.
    values = df[column].to_numpy(dtype=float)

    # Invalid intermediate results are detected explicitly below, so silence
    # the corresponding NumPy runtime warnings here.
    with np.errstate(divide="ignore", invalid="ignore", over="ignore"):
        distance = np.abs(values - mean_target)

        # A value exactly equal to the target has distance 0, which would give
        # an infinite weight and NaN probabilities. Treat such rows as being as
        # close as the nearest non-matching row instead.
        positive = distance[distance > 0]
        floor = positive.min() if positive.size else 1.0
        distance = np.where(distance > 0, distance, floor)

        weights = np.power(distance, -mu) * np.power(values, nu)

    if not np.all(np.isfinite(weights)) or np.any(weights < 0):
        raise ValueError(
            f"Sampling weights for column {column!r} must be finite and non-negative; "
            "check for NaN, infinite or negative values."
        )

    total = np.sum(weights)
    if not (np.isfinite(total) and total > 0):
        raise ValueError(f"Sampling weights for column {column!r} do not sum to a positive finite number.")

    # Normalise the weights so they form a probability distribution.
    probabilities = weights / total

    n_available = int(np.count_nonzero(probabilities))
    if n_sample > n_available:
        raise ValueError(
            f"Cannot draw {n_sample} distinct rows: only {n_available} rows have a non-zero sampling probability."
        )

    # Seeding the global generator is kept so existing seeds keep their meaning.
    np.random.seed(seed)
    sampled_indices = np.random.choice(len(values), size=n_sample, replace=False, p=probabilities)

    # Return a copy so callers can add columns without touching ``df``.
    df_sampled = df.iloc[sampled_indices].copy()
    sampled_values = df_sampled[column]

    # Report summary statistics of the sampled column.
    mean_sampled = np.mean(sampled_values)
    print("Mean of Sampled Data: ", mean_sampled)

    skewness_sampled = skew(sampled_values)
    print("Skewness of Sampled Data: ", skewness_sampled)

    median_sampled = np.median(sampled_values)
    print("Median of Sampled Data: ", median_sampled)
    return df_sampled, mean_sampled


In [None]:
# Load the dataset from "bh_01.csv" (a relative path, resolved against the current working directory).
# Later cells read the 'Output', 'Ligand', 'Additive', 'Base' and 'Aryl_halide' columns of this frame.
df = pd.read_csv("bh_01.csv")
# Bare expression: the notebook renders the frame as this cell's output.
df

In [None]:
# Value of 'Output' that the weighted subset should concentrate around.
mean_target = 83

# Draw 500 rows weighted by their 'Output' value; the fixed seed makes the draw repeatable.
df_sampled, mean_sampled = subset_sampling(
    df,
    column="Output",
    mean_target=mean_target,
    n_sample=500,
    mu=1,
    nu=2,
    seed=46,
)

# Sampled target values, used for plotting below.
y_sampled = df_sampled["Output"]

In [None]:
# Histogram (normalised to a density) of the sampled values, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(y_sampled, bins=30, density=True, alpha=0.5, label='Sampled Data')
ax.set_xlabel('Values')
ax.set_ylabel('Density')
ax.set_title('Data Distribution - After Sampling')
ax.legend()
plt.show()

In [None]:
# Join the four component columns into one dot-separated string per row
# (presumably component SMILES strings forming a reaction SMILES; confirm against the data).
# `.assign` builds a new frame instead of writing into the sampled slice in place,
# which avoids SettingWithCopyWarning and keeps this cell safe to re-run.
df_sampled = df_sampled.assign(
    smiles=(
        df_sampled['Ligand']
        + '.' + df_sampled['Additive']
        + '.' + df_sampled['Base']
        + '.' + df_sampled['Aryl_halide']
    )
)

# Keep only the combined string and the target column for export.
df_sampled_ = df_sampled[['smiles', 'Output']]

In [None]:
# Write the two-column frame to CSV without the row index.
# `index=False` is the documented boolean form of the previous `index=None`.
df_sampled_.to_csv('fullcv_00.csv', index=False)