In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, accuracy_score, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
import glob
import re

In [2]:
def get_Data(csv, iteration = False):
    """Read a measurement CSV and split it into per-group column vectors.

    Rows are grouped by ``NaCl_ppm`` and ``MgSO4_ppm``; when ``iteration`` is
    truthy, ``Iteration`` is added as a third grouping key.

    Parameters
    ----------
    csv : anything accepted by ``pd.read_csv`` (path or file-like object).
    iteration : bool, default False
        Whether each ``Iteration`` value forms its own group.

    Returns
    -------
    dict
        Maps each group key (a tuple of the grouping values) to a dict of
        ``{column name: NumPy array of that column's values in the group}``.
        The ``Iteration``, ``Time``, ``NaCl_ppm`` and ``MgSO4_ppm`` columns are
        removed from the per-group data, so all four must exist in the CSV.
    """
    frame = pd.read_csv(csv)

    group_keys = ['NaCl_ppm', 'MgSO4_ppm']
    if iteration:
        group_keys.append("Iteration")

    # Bookkeeping columns that are never exposed as features.
    non_feature_cols = ['Iteration', 'Time', 'NaCl_ppm', 'MgSO4_ppm']

    grouped = {}
    for key, rows in frame.groupby(group_keys):
        features = rows.drop(columns=non_feature_cols)
        # Copy so the returned arrays do not alias the DataFrame's buffers.
        grouped[key] = {name: features[name].values.copy() for name in features.columns}

    return grouped



In [3]:
# Location of the merged experiment log, relative to the notebook's working directory.
path = '../experiment_logs_UPDT/'
file = 'merged_experiments_fixed_iter_removed_q.csv'
file_path = path + file
# iteration=True makes every (NaCl_ppm, MgSO4_ppm, Iteration) combination its own group.
grouped_dict = get_Data(file_path, iteration = True)


In [4]:
# Only the header is needed here, so read zero data rows instead of
# parsing the whole CSV a second time.
columns = pd.read_csv(file_path, nrows=0).columns


In [5]:
# Flattens the data
def arrangeData(dict):
    """Flatten grouped column data into one mean-centred vector per group.

    ``dict`` maps group keys to ``{column name: array}``. For each group, every
    column has its own mean subtracted and the centred columns are concatenated
    in the inner dict's iteration order.

    Returns a list with one flat list of values per group, in the outer dict's
    iteration order.
    """
    return [
        [
            value
            for series in group_columns.values()
            for value in series.copy() - np.array(series).mean()
        ]
        for group_columns in dict.values()
    ]




In [6]:
# One flat, per-column mean-centred feature vector for each group in grouped_dict.
compArr = arrangeData(grouped_dict)

In [7]:
# Feature rows: the flattened per-group vectors held in compArr.
X = compArr
# X = np.array([[val for arr in list(vals.values()) for val in arr] ])

# Targets: the first two entries of each group key, i.e. (NaCl_ppm, MgSO4_ppm);
# the trailing Iteration entry of the key is dropped.
y = [k[:2] for k in grouped_dict.keys()]



In [8]:

# Seed NumPy's global RNG: kfold_train draws its KFold and forest random_state
# values from np.random.randint, so this makes those draws repeatable.
np.random.seed(0)

def kfold_train(X, y, n_splits=5):
    """Cross-validate a two-target random forest and collect per-fold metrics.

    Parameters
    ----------
    X : array indexable by integer index arrays (rows are samples).
    y : 2-D array indexable the same way; column 0 is treated as the NaCl
        target and column 1 as the MgSO4 target.
    n_splits : int, default 5
        Number of shuffled K-fold splits.

    Returns
    -------
    (models, scores)
        ``models`` is the list of fitted ``RandomForestRegressor`` objects, one
        per fold. ``scores`` is a DataFrame with one row per fold and the
        columns "Mean Error" (per-target mean absolute error), "Salt MSE",
        "MgSO4 MSE", "R^2 Salt" and "R^2 MgSO4".

    The split seed and every forest seed are drawn from NumPy's global RNG, so
    results are repeatable only if the caller seeds it beforehand.
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=np.random.randint(0, 100))
    models = []
    fold_metrics = []

    for fold_number, (train_index, test_index) in enumerate(splitter.split(X)):
        print(f"Running iteration {fold_number}", end="\r")
        X_fit, y_fit = X[train_index], y[train_index]
        X_eval, y_eval = X[test_index], y[test_index]

        forest = RandomForestRegressor(n_estimators=100, random_state=np.random.randint(0, 100))
        forest.fit(X_fit, y_fit)
        y_hat = forest.predict(X_eval)

        # Split the two targets once, then score each separately.
        nacl_true, mgso4_true = y_eval[:, 0], y_eval[:, 1]
        nacl_hat, mgso4_hat = y_hat[:, 0], y_hat[:, 1]

        fold_metrics.append({
            "Mean Error": np.abs(y_eval - y_hat).mean(axis=0),
            "Salt MSE": mean_squared_error(nacl_true, nacl_hat),
            "MgSO4 MSE": mean_squared_error(mgso4_true, mgso4_hat),
            "R^2 Salt": r2_score(nacl_true, nacl_hat),
            "R^2 MgSO4": r2_score(mgso4_true, mgso4_hat),
        })
        models.append(forest)

    scores = pd.DataFrame(fold_metrics)
    print(f"Kfold completed.                             ")
    return models, scores



In [9]:
def kfold_train_single_y(X, y, n_splits=5):
    """Cross-validate a random forest and collect per-fold metrics.

    Parameters
    ----------
    X : array indexable by integer index arrays (rows are samples).
    y : array of targets indexable the same way.
    n_splits : int, default 5
        Number of shuffled K-fold splits.

    Returns
    -------
    (models, scores)
        ``models`` is the list of fitted ``RandomForestRegressor`` objects, one
        per fold. ``scores`` is a DataFrame with one row per fold and the
        columns "Mean Error" (mean absolute error along axis 0), "MSE" and
        "R^2".

    The split seed and every forest seed are drawn from NumPy's global RNG, so
    results are repeatable only if the caller seeds it beforehand.
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=np.random.randint(0, 100))
    models = []
    fold_metrics = []

    for fold_number, (train_index, test_index) in enumerate(splitter.split(X)):
        print(f"Running iteration {fold_number}", end="\r")
        X_fit, y_fit = X[train_index], y[train_index]
        X_eval, y_eval = X[test_index], y[test_index]

        forest = RandomForestRegressor(n_estimators=100, random_state=np.random.randint(0, 100))
        forest.fit(X_fit, y_fit)
        y_hat = forest.predict(X_eval)

        fold_metrics.append({
            "Mean Error": np.abs(y_eval - y_hat).mean(axis=0),
            "MSE": mean_squared_error(y_eval, y_hat),
            "R^2": r2_score(y_eval, y_hat),
        })
        models.append(forest)

    scores = pd.DataFrame(fold_metrics)
    print(f"Kfold completed.                             ")
    return models, scores

In [10]:
def arrangeDataSingleColumn(dict,column):
    """Extract one mean-centred column from every group.

    ``dict`` maps group keys to ``{column name: array}``. For each group, the
    array stored under ``column`` has its own mean subtracted.

    Returns a list with one centred array per group, in the outer dict's
    iteration order.
    """
    return [
        group_columns[column] - np.array(group_columns[column]).mean()
        for group_columns in dict.values()
    ]

In [11]:
# Fit the two-target forest on each measured column separately and keep the
# per-fold models and scores, keyed by column name.
resultDict = {}

for i, column in enumerate(columns):
    print(f"Processing {column} -- {i}/{len(columns)}                    ", end = '\r')
    # Target and bookkeeping columns are not features.
    if column in ('NaCl_ppm', 'MgSO4_ppm', 'Iteration', 'Time'):
        continue

    # Re-seed before every run so each column gets the same split and forest seeds.
    np.random.seed(0)
    selected_column_X = arrangeDataSingleColumn(grouped_dict, column)
    models, score = kfold_train(np.array(selected_column_X), np.array(y), n_splits=5)

    resultDict[column] = dict(models=models, score=score)

# Rows become 'models' / 'score'; columns are the measured quantities.
resultDict = pd.DataFrame(resultDict)
resultDict.to_csv("single_columns.csv")

print(f"-- Process Complete --                                             ", end = '\r')

Kfold completed.                                
Kfold completed.                                
Kfold completed.                                     
Kfold completed.                                 
Kfold completed.                                
Kfold completed.                                 
Kfold completed.                                      
Kfold completed.                                  
Kfold completed.                                   
Kfold completed.                                         
Kfold completed.                                           
Kfold completed.                                        
Kfold completed.                                          
Kfold completed.                                   
Kfold completed.                             
Kfold completed.                             
Kfold completed.                             
Kfold completed.                             
Kfold completed.                             
Kfold completed.         

In [None]:
# Convert the results DataFrame back into a plain nested dict
# ({column: {'models': ..., 'score': ...}}). The original line was missing the
# '=' and could not be parsed. The isinstance guard keeps the cell safe to
# re-run once resultDict is already a dict.
if isinstance(resultDict, pd.DataFrame):
    resultDict = resultDict.to_dict()

In [23]:
# For each measured column, print the "Mean Error" averaged over the folds.
# Each fold's entry is a two-element array ordered (NaCl, MgSO4), so the
# printed value is a two-element array as well.
for key in resultDict.keys():
    df = pd.DataFrame(resultDict[key]['score'])
    # Single quotes inside the f-string: reusing the outer double quote is a
    # SyntaxError on Python versions before 3.12.
    print(f"{key}: {df['Mean Error'].mean()}")

Hz: [5390.37800837 5096.56124719]
Impedance: [122.76573401 272.20993879]
ImpedancePhase: [194.19722668 340.40992859]
Resistance: [147.96691661 273.33666038]
Reactance: [117.57432235 239.14411288]
Admittance: [136.86848576 329.33434891]
AdmittancePhase: [193.98554162 342.78450431]
Conductance: [133.27552012 324.13849558]
Susceptance: [191.53790457 577.68555302]
SeriesCapacitance: [ 885.66419246 1363.10980881]
ParallelCapacitance: [5390.37800837 5096.56124719]
SeriesInductance: [156.88953854 255.03203756]
ParallelInductance: [ 508.46146465 1209.0362339 ]
Dissipation: [1144.78314796 2018.29590235]
Vrms: [157.47094017 344.03157293]
Vreal: [183.53145477 396.7782606 ]
Vimag: [ 95.39773971 229.00717319]
Irms: [1518.05786509 2773.33871976]
Ireal: [ 77.58177868 176.03360676]
Iimag: [193.95332784 355.97422545]
