In [None]:
import numpy as np
import os
import pandas as pd

from experimental_data.utils import set_components
from neqsim.thermo import TPflash
from sklearn.model_selection import StratifiedKFold
from typing import Dict

In [None]:
# Load the thermodynamically processed experimental dataset.
# The path is assembled with os.path.join instead of hard-coded "\\" separators
# so the notebook also runs on non-Windows systems; on Windows the resulting
# path is the same as before. The file name itself is unchanged.
df = pd.read_csv(
    os.path.join("experimental_data", "thermodinamically_processed_data.csv"),
    index_col=False,
)

# Keep every column whose name contains "z".
# NOTE(review): presumably these are the feed-composition columns — the
# substring match also selects any other column with a "z" in its name; confirm.
composition_data = df.loc[:, df.columns.str.contains("z")]

print(f"Dataset: {df.shape[0]} samples")
df.head()

In [None]:
# Number of accepted (two-phase) P/T points to draw for every composition row.
samples_per_composition = 3

# Sampling window for the random pressure/temperature draws.
# P_min = 10 bara   T_min = 150 K
# P_max = 450 bara  T_max = 1125 K
P_MIN, P_MAX = 10, 450
T_MIN, T_MAX = 150, 1125

# Seeded generator so that "Restart Kernel -> Run All" reproduces the same
# P/T points (the draws were previously taken from the unseeded global state).
rng = np.random.default_rng(13)

# TODO: Add output column names gas fraction (nV) and equilibrium ratios (K)
# The literal `...` placeholder that used to sit in this list became a real,
# all-NaN column labelled Ellipsis in the exported CSVs, so it is dropped.
# "class" is listed explicitly because every record sets it; names already
# present in `df` are not repeated, which keeps the column labels unique.
sample_columns = df.columns.to_list() + [
    column for column in ("P", "T", "class") if column not in df.columns
]

# Records are collected in a plain list and turned into a DataFrame once at the
# end, instead of growing a DataFrame with pd.concat on every iteration.
records = []

for i in range(df.shape[0]):
    # Positional indexing on both frames so the fluid and the copied row always
    # come from the same row of the dataset.
    fluid1 = set_components(composition_data.iloc[i, :].to_dict())
    base_row = df.iloc[i, :].to_dict()

    accepted = 0

    # Rejection sampling: keep drawing until enough two-phase points are found.
    # NOTE(review): there is no attempt limit, so a composition that never
    # yields two phases inside the P/T window keeps this loop running forever.
    while accepted < samples_per_composition:
        P_sample = rng.uniform(P_MIN, P_MAX)
        T_sample = rng.uniform(T_MIN, T_MAX)

        # TODO: Run flash calculations and obtain gas fraction and equilibrium
        #       ratios to create a sample
        fluid1.setTemperature(T_sample, "K")
        fluid1.setPressure(P_sample, "bara")
        TPflash(fluid1)

        # Only points where the flash reports exactly two phases are kept.
        if fluid1.getNumberOfPhases() != 2:
            continue

        # TODO: Create sample with calculations results
        record = dict(base_row)
        record["T"] = T_sample
        record["P"] = P_sample
        record["class"] = "oil"
        records.append(record)
        accepted += 1

# Single construction; the result has a clean 0..n-1 index.
samples = pd.DataFrame(records, columns=sample_columns)

samples

In [None]:
# One seed for both the row shuffle and the fold assignment, so the exported
# folds are the same on every run for the same `samples` frame.
SPLIT_SEED = 13
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=SPLIT_SEED)

# Output folder, built with os.path.join instead of hard-coded "\\" separators.
# The final "" keeps a trailing separator on folder_path, as it had before.
folder_path = os.path.join(
    "data",
    "experimental",
    "regression",
    f"{samples_per_composition:03d}points",
    "",
)

# makedirs also creates the missing parent folders (os.mkdir fails when
# data/experimental/regression does not exist yet) and exist_ok=True makes the
# call a no-op when the folder is already there.
os.makedirs(folder_path, exist_ok=True)

# Shuffle the rows once before splitting; the validation set below is cut from
# the tail of each fold's training indices, so the row order matters.
samples = samples.sample(frac=1, ignore_index=True, random_state=SPLIT_SEED)

for i, (train_idx, test_idx) in enumerate(skf.split(samples, samples["class"])):
    # Carve a validation set of the same size as the test set off the end of
    # the training indices.
    train_size = train_idx.shape[0] - test_idx.shape[0]
    train_idx, valid_idx = train_idx[:train_size], train_idx[train_size:]

    train = samples.iloc[train_idx, :]
    valid = samples.iloc[valid_idx, :]
    test = samples.iloc[test_idx, :]
    subsets = (("train", train), ("valid", valid), ("test", test))

    print(f">>>> Fold {i+1} >>>>>>>>>>>>>>>>>>>>>>")
    print(", ".join(f"{name}: {subset.shape[0]}" for name, subset in subsets))

    # Per-class row counts of every subset (same text layout as before).
    for name, subset in subsets:
        print(f"{name} per class:")
        per_class = ", ".join(
            f'{(subset["class"] == label).sum()} ({label})'
            for label in ("oil", "gas", "mix")
        )
        print(f"    {per_class}")

    print()

    # One CSV per subset and fold; file names are unchanged.
    for name, subset in subsets:
        subset.to_csv(
            os.path.join(folder_path, f"{name}_data_fold={i+1:02d}.csv"),
            index=False,
        )