In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, accuracy_score, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
import glob
import re

In [2]:
def get_Data(csv, iteration = False):
    """Read the merged experiment CSV and bucket its signal columns per group.

    Parameters
    ----------
    csv : str or file-like
        Passed straight to ``pandas.read_csv``.
    iteration : bool, default False
        If truthy, rows are grouped by ``(NaCl_ppm, MgSO4_ppm, Iteration)``;
        otherwise by ``(NaCl_ppm, MgSO4_ppm)`` only.

    Returns
    -------
    dict
        ``{group_key: {column_name: ndarray}}`` where ``group_key`` is the
        groupby tuple and each array is a copy of that column's values for
        the group.  The ``Iteration``, ``Time``, ``NaCl_ppm`` and
        ``MgSO4_ppm`` columns are removed before the arrays are collected.
    """
    frame = pd.read_csv(csv)

    # The concentration pair always identifies a group; the iteration number
    # is added as a third key component only on request.
    group_keys = ['NaCl_ppm', 'MgSO4_ppm']
    if iteration:
        group_keys.append("Iteration")

    bookkeeping_columns = ['Iteration', 'Time', 'NaCl_ppm', 'MgSO4_ppm']

    collected = {}
    for key, rows in frame.groupby(group_keys):
        signals = rows.drop(columns=bookkeeping_columns)
        # Copy so the returned arrays never alias the DataFrame's buffers.
        collected[key] = {name: signals[name].values.copy() for name in signals.columns}

    return collected



In [3]:
# Location of the merged experiment log, relative to this notebook.
path = '../experiment_logs_UPDT/'
file = 'merged_experiments_fixed_iter_removed_q.csv'
file_path = f"{path}{file}"

# One entry per (NaCl_ppm, MgSO4_ppm, Iteration) combination.
grouped_dict = get_Data(file_path, iteration=True)


In [4]:
# Column labels of the raw CSV; note this reads the file from disk a second time.
columns = pd.read_csv(file_path).columns


In [5]:
# Flattens the data
def arrangeData(dict):
    """Turn every group into a single flat, mean-centred feature row.

    Parameters
    ----------
    dict : mapping
        ``{group_key: {column_name: array}}``.  Each array must support
        ``.copy()`` and subtraction of a scalar (e.g. a NumPy array).

    Returns
    -------
    list of list
        One row per group, in the mapping's iteration order.  A row is the
        concatenation of all of that group's columns, each column having its
        own mean subtracted first.
    """
    return [
        [
            sample
            for series in group.values()
            # Centre each column on its own mean before concatenating.
            for sample in series.copy() - np.array(series).mean()
        ]
        for group in dict.values()
    ]



In [6]:
# One flat, per-column mean-centred feature row for every group in grouped_dict.
compArr = arrangeData(grouped_dict)

In [7]:
# Feature rows built from every signal column of every group.
X = compArr

# Targets: the first two components of each group key, i.e. the
# (NaCl_ppm, MgSO4_ppm) pair; the trailing Iteration component is discarded.
y = [key[:2] for key in grouped_dict]



In [8]:

# Seed NumPy's global RNG once. kfold_train draws the KFold and forest
# random_state values from this generator via np.random.randint, so the
# reported scores depend on the order in which the training cells are run
# after this seed is set.
np.random.seed(0)

def kfold_train(X, y, n_splits=5):
    """Train and score one two-target random forest per shuffled K-fold split.

    Parameters
    ----------
    X : ndarray
        Feature matrix, one row per sample (indexed with the fold index arrays).
    y : ndarray
        Two-column target matrix.  Column 0 is reported under the "Salt" keys
        and column 1 under the "MgSO4" keys.
    n_splits : int, default 5
        Number of folds.

    Returns
    -------
    tuple
        ``(models, scores)``: the list of fitted ``RandomForestRegressor``
        objects (one per fold) and a DataFrame with one row per fold holding
        "Mean Error" (per-target mean absolute error), "Salt MSE",
        "MgSO4 MSE", "R^2 Salt" and "R^2 MgSO4".

    Notes
    -----
    The shuffle seed and every forest seed are drawn from NumPy's global RNG,
    so results depend on the global RNG state at call time.
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=np.random.randint(0, 100))
    fitted_models = []
    fold_metrics = []

    for fold_no, (train_idx, test_idx) in enumerate(splitter.split(X)):
        print(f"Running iteration {fold_no}", end="\r")
        train_features, test_features = X[train_idx], X[test_idx]
        train_targets, actual = y[train_idx], y[test_idx]

        # A fresh seed per fold, drawn after the split seed, in fold order.
        forest = RandomForestRegressor(n_estimators=100, random_state=np.random.randint(0, 100))
        forest.fit(train_features, train_targets)
        predicted = forest.predict(test_features)

        # Split the two targets apart so each gets its own MSE / R^2.
        nacl_true, mgso4_true = actual[:, 0], actual[:, 1]
        nacl_pred, mgso4_pred = predicted[:, 0], predicted[:, 1]

        fold_metrics.append({
            "Mean Error": np.abs(actual - predicted).mean(axis=0),
            "Salt MSE": mean_squared_error(nacl_true, nacl_pred),
            "MgSO4 MSE": mean_squared_error(mgso4_true, mgso4_pred),
            "R^2 Salt": r2_score(nacl_true, nacl_pred),
            "R^2 MgSO4": r2_score(mgso4_true, mgso4_pred),
        })
        fitted_models.append(forest)

    score_table = pd.DataFrame(fold_metrics)
    # Trailing blanks overwrite the carriage-returned progress line.
    print(f"Kfold completed.                             ")
    return fitted_models, score_table



In [9]:
def kfold_train_single_y(X, y, n_splits=5):
    """Train and score one single-target random forest per shuffled K-fold split.

    Parameters
    ----------
    X : ndarray
        Feature matrix, one row per sample (indexed with the fold index arrays).
    y : ndarray
        Target values, one per sample.
    n_splits : int, default 5
        Number of folds.

    Returns
    -------
    tuple
        ``(models, scores)``: the list of fitted ``RandomForestRegressor``
        objects (one per fold) and a DataFrame with one row per fold holding
        "Mean Error" (mean absolute error), "MSE" and "R^2".

    Notes
    -----
    The shuffle seed and every forest seed are drawn from NumPy's global RNG,
    so results depend on the global RNG state at call time.
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=np.random.randint(0, 100))
    fitted_models = []
    fold_metrics = []

    for fold_no, (train_idx, test_idx) in enumerate(splitter.split(X)):
        print(f"Running iteration {fold_no}", end="\r")
        train_features, test_features = X[train_idx], X[test_idx]
        train_targets, actual = y[train_idx], y[test_idx]

        # A fresh seed per fold, drawn after the split seed, in fold order.
        forest = RandomForestRegressor(n_estimators=100, random_state=np.random.randint(0, 100))
        forest.fit(train_features, train_targets)
        predicted = forest.predict(test_features)

        fold_metrics.append({
            "Mean Error": np.abs(actual - predicted).mean(axis=0),
            "MSE": mean_squared_error(actual, predicted),
            "R^2": r2_score(actual, predicted),
        })
        fitted_models.append(forest)

    score_table = pd.DataFrame(fold_metrics)
    # Trailing blanks overwrite the carriage-returned progress line.
    print(f"Kfold completed.                             ")
    return fitted_models, score_table

In [10]:
def arrangeDataMultiColumn(dict,columns):
    """Build one flat, mean-centred feature row per group from chosen columns.

    Parameters
    ----------
    dict : mapping
        ``{group_key: {column_name: array}}``.  Each selected array must
        support ``.copy()`` and subtraction of a scalar (e.g. a NumPy array).
    columns : iterable of str
        Column names to include, in the order they should be concatenated.

    Returns
    -------
    list of list
        One row per group, in the mapping's iteration order.  A row is the
        concatenation of the selected columns, each with its own mean
        subtracted.

    Raises
    ------
    KeyError
        If a requested column is missing from a group.
    """
    return [
        [
            sample
            for name in columns
            # Centre each selected column on its own mean before concatenating.
            for sample in group[name].copy() - np.array(group[name]).mean()
        ]
        for group in dict.values()
    ]

In [11]:
# Joint model: predict NaCl and MgSO4 together (two-column target) from the
# mean-centred Ireal, Iimag, Vreal and Vimag signals.
# NOTE(review): np.array(selected_column_X) only yields a 2-D matrix if every
# group has the same number of samples per column - confirm for this dataset.
selected_column_X = arrangeDataMultiColumn(grouped_dict,["Ireal", "Iimag","Vreal", "Vimag"])
models_both ,score_both = kfold_train(np.array(selected_column_X), np.array(y), n_splits=5)

Kfold completed.                             


In [12]:
# Persist the per-fold scores of the joint model.
# NOTE(review): assumes a Results/ directory already exists next to the notebook.
score_both.to_csv("Results/bothV.csv")

In [22]:
# Fold-averaged results of the joint model. "Mean Error" holds one array per
# fold ([NaCl, MgSO4] mean absolute error), so its mean is a two-element array.
print(score_both["Mean Error"].mean())
print(score_both["R^2 Salt"].mean())
print(score_both["R^2 MgSO4"].mean())

[ 55.42257457 119.57492993]
0.9985207779947544
0.993187099959723


In [13]:
# NaCl only: single-target model for NaCl (first component of each y pair).
# MgSO4 is neither a target nor an input here. Features are the mean-centred
# Ireal, Iimag, Vreal and Vimag signals.
selected_column_X = arrangeDataMultiColumn(grouped_dict,["Ireal", "Iimag","Vreal", "Vimag"])
models_only_salt ,score_only_salt = kfold_train_single_y(np.array(selected_column_X), np.array([k[0] for k in y]), n_splits=5)

Kfold completed.                             


In [14]:
# Persist the per-fold scores of the NaCl-only model.
# NOTE(review): assumes a Results/ directory already exists next to the notebook.
score_only_salt.to_csv("Results/only_NaCl_V.csv")

In [25]:
# Average each metric of the NaCl-only model across the folds.
score_only_salt.mean()

Mean Error       63.746908
MSE           69186.744206
R^2               0.998154
dtype: float64

In [15]:
# MgSO4 only: single-target model for MgSO4 (second component of each y pair).
# NaCl is neither a target nor an input here. Features are the mean-centred
# Ireal, Iimag, Vreal and Vimag signals.
selected_column_X = arrangeDataMultiColumn(grouped_dict,["Ireal", "Iimag","Vreal", "Vimag"])
models_only_mg ,score_only_mg = kfold_train_single_y(np.array(selected_column_X), np.array([k[1] for k in y]), n_splits=5)

Kfold completed.                             


In [16]:
# Persist the per-fold scores of the MgSO4-only model.
# NOTE(review): assumes a Results/ directory already exists next to the notebook.
score_only_mg.to_csv("Results/only_MgSO4_V.csv")

In [26]:
# Average each metric of the MgSO4-only model across the folds.
score_only_mg.mean()

Mean Error       136.932213
MSE           499229.138777
R^2                0.985509
dtype: float64

In [17]:
# NaCl given MgSO4: single-target model for NaCl where the known MgSO4 value
# is supplied as an extra input. Features are the mean-centred Ireal, Iimag,
# Vreal and Vimag signals.
selected_column_X = arrangeDataMultiColumn(grouped_dict,["Ireal", "Iimag","Vreal", "Vimag"])
# Append each group's MgSO4 concentration as the last feature of its row.
for i in range(len(selected_column_X)):
    selected_column_X[i].append(y[i][1])
models_nacl ,score_nacl = kfold_train_single_y(np.array(selected_column_X), np.array([k[0] for k in y]), n_splits=5)

Kfold completed.                             


In [18]:
# Persist the per-fold scores of the NaCl-given-MgSO4 model.
# NOTE(review): assumes a Results/ directory already exists next to the notebook.
score_nacl.to_csv("Results/NaCl_V.csv")

In [23]:
# Average each metric of the NaCl-given-MgSO4 model across the folds.
score_nacl.mean()

Mean Error       44.832370
MSE           51175.831156
R^2               0.998632
dtype: float64

In [19]:
# MgSO4 given NaCl: single-target model for MgSO4 where the known NaCl value
# is supplied as an extra input. Features are the mean-centred Ireal, Iimag,
# Vreal and Vimag signals.
selected_column_X = arrangeDataMultiColumn(grouped_dict,["Ireal", "Iimag","Vreal", "Vimag"])
# Append each group's NaCl concentration as the last feature of its row.
for i in range(len(selected_column_X)):
    selected_column_X[i].append(y[i][0])
models_mg ,score_mg = kfold_train_single_y(np.array(selected_column_X), np.array([k[1] for k in y]), n_splits=5)

Kfold completed.                             


In [20]:
# Persist the per-fold scores of the MgSO4-given-NaCl model.
# NOTE(review): assumes a Results/ directory already exists next to the notebook.
score_mg.to_csv("Results/MgSO4_V.csv")

In [24]:
# Average each metric of the MgSO4-given-NaCl model across the folds.
score_mg.mean()

Mean Error       114.762813
MSE           421900.935401
R^2                0.987874
dtype: float64