In [30]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVR


In [31]:
def get_Data(csv, iteration = False):
    """Load the merged experiment CSV and split it into per-group column arrays.

    Parameters
    ----------
    csv : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``.
    iteration : bool, default False
        When true, rows are grouped by ``(NaCl_ppm, MgSO4_ppm, Iteration)``;
        otherwise by ``(NaCl_ppm, MgSO4_ppm)`` only.

    Returns
    -------
    dict
        Maps each group key (a tuple, in the column order given above) to a
        dict ``{column_name: numpy array}`` holding that group's values for
        every column except ``Iteration``, ``Time``, ``NaCl_ppm`` and
        ``MgSO4_ppm``.

    Raises
    ------
    KeyError
        If one of the grouping columns is missing from the CSV.
    """
    merged_dataset = pd.read_csv(csv)

    group_keys = ['NaCl_ppm', 'MgSO4_ppm']
    if iteration:
        group_keys.append("Iteration")

    # Bookkeeping columns that must not end up in the per-group arrays.
    bookkeeping_columns = ['Iteration', 'Time', 'NaCl_ppm', 'MgSO4_ppm']

    grouped_dict = {}
    for group_name, group_data in merged_dataset.groupby(group_keys):
        # errors='ignore': a CSV without e.g. an 'Iteration' column (the usual
        # situation for iteration=False) would otherwise raise KeyError here.
        signals = group_data.drop(columns=bookkeeping_columns, errors='ignore')
        # Copy so the returned arrays never alias the DataFrame's buffers.
        grouped_dict[group_name] = {
            col: signals[col].values.copy() for col in signals.columns
        }

    return grouped_dict

In [32]:
# Flattens the data
def arrangeData(dict):
    """Turn every group into one flat, mean-centred feature row.

    ``dict`` maps a group key to a mapping of column name -> array. For each
    group, every column has its own mean subtracted and the centred columns
    are concatenated in the mapping's iteration order.

    Returns a list with one list of values per group, in ``dict`` order.
    """
    rows = []
    for signals in dict.values():
        row = []
        for series in signals.values():
            centred = series.copy() - np.mean(np.array(series))
            row += list(centred)
        rows.append(row)

    return rows


In [33]:
def arrangeDataMultiColumn(dict,columns):
    """Build one flat, mean-centred feature row per group from selected columns.

    ``dict`` maps a group key to a mapping of column name -> array. For each
    group, the arrays named in ``columns`` are taken in the given order, each
    has its own mean subtracted, and the results are concatenated.

    Returns a list with one list of values per group, in ``dict`` order.
    """
    rows = []
    for signals in dict.values():
        row = []
        for name in columns:
            series = signals[name]
            row += list(series.copy() - np.mean(np.array(series)))
        rows.append(row)

    return rows

In [34]:
# Merged experiment log, addressed relative to the current working directory.
path = '../experiment_logs_UPDT/'
file = 'merged_experiments_fixed_iter.csv'
file_path = f"{path}{file}"

# iteration=True -> one group per (NaCl_ppm, MgSO4_ppm, Iteration) combination.
grouped_dict = get_Data(file_path, iteration=True)



In [35]:
# Build the feature rows: one row per group, made of the mean-centred
# Ireal, Iimag, Vreal and Vimag arrays concatenated in that order.
X_flattened = arrangeDataMultiColumn(
    grouped_dict,
    ["Ireal", "Iimag", "Vreal", "Vimag"],
)

# Report the number of samples, then the number of features in the first sample.
print(len(X_flattened), len(X_flattened[0]), sep="\n")

6354
804


In [36]:
# Feature rows, one per group, in grouped_dict order.
X = X_flattened

# Labels: the first two components of each group key, i.e. (NaCl_ppm, MgSO4_ppm);
# a trailing Iteration component, if present, is discarded. Iterating the same
# dict keeps the labels aligned row-for-row with X.
y = [key[:2] for key in grouped_dict]



In [37]:
# Hold out 10% of the samples for testing; the fixed random_state keeps the
# split reproducible across runs. train_test_split comes from the notebook's
# import cell, so it is not re-imported here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [38]:
# Train separate SVR models for each target
svr_nacl = SVR(kernel='rbf')  # Radial Basis Function kernel, commonly used for SVR
svr_mgso4 = SVR(kernel='rbf')

# Train the models
svr_nacl.fit(np.array(X_train), np.array([y[0] for y in y_train]))  # Training for NaCl ppm
svr_mgso4.fit(np.array(X_train), np.array([y[1] for y in y_train]))  # Training for MgSO4 ppm


In [43]:
# Hyper-parameter grid shared by both targets: 4 * 3 * 2 = 24 candidates,
# each scored by 5-fold cross-validation on the training split using
# negative mean squared error (higher is better).
param_grid = {
    'C': [0.1, 1, 10, 100],
    'epsilon': [0.01, 0.1, 1],
    'gamma': ['scale', 'auto']
}

# Grid Search for NaCl (element 0 of every label pair)
svr_nacl = SVR(kernel='rbf')
grid_search_nacl = GridSearchCV(svr_nacl, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search_nacl.fit(X_train, np.array([pair[0] for pair in y_train]))
# best_estimator_ is the best candidate refit on the whole training split.
best_svr_nacl = grid_search_nacl.best_estimator_

# Grid Search for MgSO4 (element 1 of every label pair)
svr_mgso4 = SVR(kernel='rbf')
grid_search_mgso4 = GridSearchCV(svr_mgso4, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search_mgso4.fit(X_train, np.array([pair[1] for pair in y_train]))
best_svr_mgso4 = grid_search_mgso4.best_estimator_

In [45]:
# Predict on the test set
y_pred_nacl = best_svr_nacl.predict(X_test)
y_pred_mgso4 = best_svr_mgso4.predict(X_test)

# Combine predictions into a single array
y_pred = np.vstack((y_pred_nacl, y_pred_mgso4)).T

In [40]:
# Number of held-out test samples (displayed as the cell's output).
len(y_test)

636

In [46]:
# Compute performance metrics for NaCl
mse_nacl = mean_squared_error(np.array([y[0] for y in y_test]), np.array(y_pred_nacl))
mae_nacl = mean_absolute_error(np.array([y[0] for y in y_test]), np.array(y_pred_nacl))

# Compute performance metrics for MgSO4
mse_mgso4 = mean_squared_error(np.array([y[1] for y in y_test]), np.array(y_pred_mgso4))
mae_mgso4 = mean_absolute_error(np.array([y[1] for y in y_test]), np.array(y_pred_mgso4))

print(f"NaCl - Mean Squared Error: {mse_nacl}, Mean Absolute Error: {mae_nacl}")
print(f"MgSO4 - Mean Squared Error: {mse_mgso4}, Mean Absolute Error: {mae_mgso4}")


NaCl - Mean Squared Error: 6226594.974878078, Mean Absolute Error: 1704.636217123388
MgSO4 - Mean Squared Error: 27112207.290460117, Mean Absolute Error: 3651.572724713985
