In [1]:
import numpy as np
import os
import pandas as pd

from experimental_data.utils import set_components, equilibrium_ratios
from neqsim.thermo import TPflash
from sklearn.model_selection import KFold
from tqdm.notebook import tqdm
from typing import Dict

In [2]:
# Load the processed experimental dataset. The path is assembled with
# os.path.join so the notebook resolves the file on any OS instead of relying
# on a Windows-only backslash separator.
df = pd.read_csv(
    os.path.join("experimental_data", "thermodinamically_processed_data.csv"),
    index_col=False,
)
# Keep only the columns whose name starts with "z" (zN2, zCO2, zC1, ...);
# presumably the feed composition of each fluid -- TODO confirm units.
composition_data = df.loc[:, df.columns.str.startswith("z")]

print(f"Dataset: {df.shape[0]} samples")
df.head()

Dataset: 463 samples


Unnamed: 0,Field,Id,Date,FluidKind,LastFluidMolecularWeight,LastFluidSpecificGravity,zN2,zCO2,zC1,zC2,...,zC12,zC13,zC14,zC15,zC16,zC17,zC18,zC19,zC20,LastFluidComponent
0,Albacora,200588838534569195818022639230611277018,28/11/2013,OLEO,513.0,0.9375,0.28,2.18,50.34,8.8,...,1.39,1.42,1.19,1.12,0.84,0.75,0.74,0.71,7.54,C20
1,Albacora,235655066780487493060135334531738441741,28/11/2013,OLEO,513.0,0.9375,0.28,2.18,50.34,8.8,...,1.39,1.42,1.19,1.12,0.84,0.75,0.74,0.71,7.54,C20
2,Albacora,2850024621782551530469011068396203900,11/12/2014,OLEO,447.0,0.9332,0.36,3.23,45.32,7.16,...,1.35,1.4,1.13,1.07,0.83,0.68,0.74,0.67,10.99,C20
3,Albacora,66820533634321543631617927163864636933,11/12/2014,OLEO,447.0,0.9332,0.36,3.23,45.32,7.16,...,1.35,1.4,1.13,1.07,0.83,0.68,0.74,0.67,10.99,C20
4,Albacora,321567418088616429351396831258924766066,01/08/2007,OLEO,428.0,0.9508,0.01,0.17,42.67,5.08,...,1.44,1.48,1.3,1.3,1.0,0.86,0.84,0.75,24.57,C20


In [3]:
# Preview the first five rows of the composition-only ("z*") columns.
composition_data.head(n=5)

Unnamed: 0,zN2,zCO2,zC1,zC2,zC3,zIC4,zNC4,zIC5,zNC5,zC6,...,zC11,zC12,zC13,zC14,zC15,zC16,zC17,zC18,zC19,zC20
0,0.28,2.18,50.34,8.8,5.71,0.96,2.47,0.75,1.25,1.56,...,1.56,1.39,1.42,1.19,1.12,0.84,0.75,0.74,0.71,7.54
1,0.28,2.18,50.34,8.8,5.71,0.96,2.47,0.75,1.25,1.56,...,1.56,1.39,1.42,1.19,1.12,0.84,0.75,0.74,0.71,7.54
2,0.36,3.23,45.32,7.16,6.03,1.0,2.73,0.92,1.55,1.98,...,1.49,1.35,1.4,1.13,1.07,0.83,0.68,0.74,0.67,10.99
3,0.36,3.23,45.32,7.16,6.03,1.0,2.73,0.92,1.55,1.98,...,1.49,1.35,1.4,1.13,1.07,0.83,0.68,0.74,0.67,10.99
4,0.01,0.17,42.67,5.08,3.49,0.76,1.64,0.4,0.84,1.08,...,1.5,1.44,1.48,1.3,1.3,1.0,0.86,0.84,0.75,24.57


In [10]:
# Number of accepted two-phase (P, T) points to generate for every fluid.
samples_per_composition = 300

# Sampling window for the flash conditions.
P_MIN_BARA, P_MAX_BARA = 10, 450  # pressure bounds, bara
T_MIN_K, T_MAX_K = 150, 1125  # temperature bounds, K

# Seeded generator: the (P, T) draws -- and therefore the generated dataset --
# are identical on every Restart & Run All.
rng = np.random.default_rng(13)

new_samples = []
for i in tqdm(range(df.shape[0])):
    # Positional access for both frames so the composition and the metadata
    # always come from the same row.
    fluid1 = set_components(composition_data.iloc[i, :].to_dict())
    # The source row does not change inside the rejection loop below, so it is
    # converted to a dict once per composition instead of once per accepted draw.
    base_record = df.iloc[i, :].to_dict()

    # Rejection sampling: draw (P, T) uniformly and keep only the points where
    # the flash reports exactly two phases.
    # NOTE(review): this loops forever if a composition never reaches two
    # phases inside the window; consider capping the number of attempts.
    composition_samples = []
    while len(composition_samples) < samples_per_composition:
        P_sample = rng.uniform(P_MIN_BARA, P_MAX_BARA)
        T_sample = rng.uniform(T_MIN_K, T_MAX_K)

        fluid1.setTemperature(T_sample, "K")
        fluid1.setPressure(P_sample, "bara")
        TPflash(fluid1)

        if fluid1.getNumberOfPhases() != 2:
            continue

        outputs = equilibrium_ratios(fluid1)
        # Discard draws where any returned value is below the threshold.
        # NOTE(review): 10e-15 equals 1e-14; confirm 1e-15 was not intended.
        if any(v < 10e-15 for v in outputs.values()):
            continue

        # Same key order as before: source row, then T, P, then flash outputs.
        composition_samples.append(
            {**base_record, "T": T_sample, "P": P_sample, **outputs}
        )
    new_samples.extend(composition_samples)

samples = pd.DataFrame.from_records(new_samples)
print(samples.shape)
samples.head()

  0%|          | 0/463 [00:00<?, ?it/s]

(138900, 58)


Unnamed: 0,Field,Id,Date,FluidKind,LastFluidMolecularWeight,LastFluidSpecificGravity,zN2,zCO2,zC1,zC2,...,K_C12,K_C13,K_C14,K_C15,K_C16,K_C17,K_C18,K_C19,K_C20,nV
0,Albacora,200588838534569195818022639230611277018,28/11/2013,OLEO,513.0,0.9375,0.28,2.18,50.34,8.8,...,0.116756,0.094722,0.076845,0.062526,0.052956,0.048944,0.034516,0.03514,0.023897,0.44278
1,Albacora,200588838534569195818022639230611277018,28/11/2013,OLEO,513.0,0.9375,0.28,2.18,50.34,8.8,...,0.410919,0.312461,0.235861,0.181226,0.1491,0.12277,0.084601,0.078114,0.050885,0.896454
2,Albacora,200588838534569195818022639230611277018,28/11/2013,OLEO,513.0,0.9375,0.28,2.18,50.34,8.8,...,0.114397,0.087294,0.066422,0.051047,0.04161,0.036037,0.023865,0.023274,0.014575,0.692022
3,Albacora,200588838534569195818022639230611277018,28/11/2013,OLEO,513.0,0.9375,0.28,2.18,50.34,8.8,...,0.270814,0.232219,0.19822,0.170787,0.152805,0.1379,0.111184,0.10724,0.084612,0.628105
4,Albacora,200588838534569195818022639230611277018,28/11/2013,OLEO,513.0,0.9375,0.28,2.18,50.34,8.8,...,0.165709,0.133418,0.106982,0.086664,0.073788,0.064937,0.047202,0.045665,0.031946,0.663461


In [11]:
# Select every column whose name starts with "K" and report the overall
# minimum and maximum across all of them (axis=None reduces over both axes).
is_k_column = samples.columns.str.startswith("K")
K = samples.iloc[:, is_k_column]
(K.min(axis=None), K.max(axis=None))

(1.934209461782465e-11, 52.84650849429642)

In [12]:
# 10-fold split of the generated samples; each fold is written to disk as
# train / valid / test CSV files.
kf = KFold(n_splits=10, shuffle=True, random_state=13)
# Built with os.path.join so the directory is valid on any OS. The trailing ""
# keeps the path ending in a separator, so code that concatenates a file name
# directly onto `folder_path` still works.
folder_path = os.path.join(
    "data", "experimental", "regression", f"{samples_per_composition:03d}points", ""
)

# makedirs also creates missing parent directories (os.mkdir would raise) and
# exist_ok makes the cell safe to re-run.
os.makedirs(folder_path, exist_ok=True)

# Shuffle rows with a fixed seed: the validation rows are taken from the tail
# of the sorted train indices, so the row order decides the train/valid
# partition and must be reproducible.
# NOTE: `samples` is overwritten, so re-running this cell alone reshuffles the
# already-shuffled frame; re-run from the sampling cell for identical files.
samples = samples.sample(frac=1, ignore_index=True, random_state=13)
for i, (train_idx, test_idx) in enumerate(kf.split(samples)):
    # Carve a validation set the same size as the test fold off the end of
    # the training indices.
    train_size = train_idx.shape[0] - test_idx.shape[0]
    train_idx, valid_idx = train_idx[:train_size], train_idx[train_size:]

    print(f">>>> Fold {i+1} >>>>>>>>>>>>>>>>>>>>>>")
    print("train: ", train_idx.shape[0], ", valid: ", valid_idx.shape[0], ", test: ", test_idx.shape[0], sep="")
    train = samples.iloc[train_idx, :]
    valid = samples.iloc[valid_idx, :]
    test = samples.iloc[test_idx, :]

    print()
    train.to_csv(os.path.join(folder_path, f"train_data_fold={i+1:02d}.csv"), index=False)
    valid.to_csv(os.path.join(folder_path, f"valid_data_fold={i+1:02d}.csv"), index=False)
    test.to_csv(os.path.join(folder_path, f"test_data_fold={i+1:02d}.csv"), index=False)

>>>> Fold 1 >>>>>>>>>>>>>>>>>>>>>>
train: 111120, valid: 13890, test: 13890

>>>> Fold 2 >>>>>>>>>>>>>>>>>>>>>>
train: 111120, valid: 13890, test: 13890

>>>> Fold 3 >>>>>>>>>>>>>>>>>>>>>>
train: 111120, valid: 13890, test: 13890

>>>> Fold 4 >>>>>>>>>>>>>>>>>>>>>>
train: 111120, valid: 13890, test: 13890

>>>> Fold 5 >>>>>>>>>>>>>>>>>>>>>>
train: 111120, valid: 13890, test: 13890

>>>> Fold 6 >>>>>>>>>>>>>>>>>>>>>>
train: 111120, valid: 13890, test: 13890

>>>> Fold 7 >>>>>>>>>>>>>>>>>>>>>>
train: 111120, valid: 13890, test: 13890

>>>> Fold 8 >>>>>>>>>>>>>>>>>>>>>>
train: 111120, valid: 13890, test: 13890

>>>> Fold 9 >>>>>>>>>>>>>>>>>>>>>>
train: 111120, valid: 13890, test: 13890

>>>> Fold 10 >>>>>>>>>>>>>>>>>>>>>>
train: 111120, valid: 13890, test: 13890

