In [None]:
import numpy as np
import pandas as pd
import time
import pickle
from tqdm import tqdm
from scipy.io import arff
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from catboost import CatBoostRegressor

In [None]:
# Number of target (label) columns for each supported dataset
d = {"enb": 2, "edm": 2, "slump": 3, "scm1d": 16}

# Parallel lists derived from the mapping (same order as the mapping)
data_names = list(d.keys())
n_labels = list(d.values())

In [None]:
# Mention dataset name here
name = "scm1d"
# loadarff returns a tuple; only its first element is kept and wrapped
# in a DataFrame
data = pd.DataFrame(arff.loadarff("./datasets/" + name + ".arff")[0])

In [None]:
# Convert the loaded frame to a plain NumPy array. np.asarray leaves an
# existing ndarray untouched, so re-running this cell is harmless
# (calling `.values` a second time would raise AttributeError).
data = np.asarray(data)

In [None]:
# 80:20 train/test split with a fixed seed; the trailing `num_targets`
# columns of `data` are the labels, everything before them the features
num_targets = d[name]
train_data, test_data, train_labels, test_labels = train_test_split(
    data[:, :-num_targets],
    data[:, -num_targets:],
    random_state=0,
    test_size=0.2,
)

In [None]:
# Per-target MSE and coefficient of determination
def performance(test_labels, test_labels_pred):
    """Score predictions one target at a time.

    Both arguments are transposed and then indexed with ``[i]`` for every
    ``i`` in ``range(test_labels.shape[1])``.

    Returns
    -------
    (err, cod) : two lists with one entry per target --
        ``mean_squared_error`` values and ``r2_score`` values.
    """
    truth_t = test_labels.T
    pred_t = test_labels_pred.T
    target_ids = range(test_labels.shape[1])

    err = [mean_squared_error(truth_t[k], pred_t[k]) for k in target_ids]
    cod = [r2_score(truth_t[k], pred_t[k]) for k in target_ids]

    return err, cod

In [None]:
# Function to build model
def MyModel(train_data, test_data, train_label, mod, random_state = None):
    """Fit the regressor selected by ``mod`` and predict on ``test_data``.

    Parameters
    ----------
    train_data
        Features passed to ``model.fit``.
    test_data
        Features passed to ``model.predict``.
    train_label
        Target passed to ``model.fit``.
    mod : str
        Model selector, matched by substring in this order:
        ``"cat"`` -> CatBoostRegressor, ``"rf"`` -> RandomForestRegressor,
        ``"gb"`` -> GradientBoostingRegressor.
    random_state : int or None, optional
        Seed forwarded to the chosen regressor (``random_seed`` for
        CatBoost, ``random_state`` for the scikit-learn models). The
        default ``None`` keeps each library's own default seeding.

    Returns
    -------
    The value returned by ``model.predict(test_data)``.

    Raises
    ------
    ValueError
        If ``mod`` contains none of the known selectors.
    """
    if "cat" in mod:
        model = CatBoostRegressor(silent = True, random_seed = random_state)
    elif "rf" in mod:
        model = RandomForestRegressor(random_state = random_state)
    elif "gb" in mod:
        model = GradientBoostingRegressor(random_state = random_state)
    else:
        # Without this branch `model` would be unbound and the failure would
        # surface later as a confusing error at `model.fit`.
        raise ValueError(
            "Unknown model selector %r: expected it to contain "
            "'cat', 'rf' or 'gb'" % (mod,)
        )

    # Fitting model
    model.fit(train_data, train_label)

    # Making prediction
    pred = model.predict(test_data)

    return pred

In [None]:
# Function implementing proposed methodology
def PCA_Method(train_data, test_data, train_labels, test_labels, mod, threshold = 95):
    """Predict targets through the principal components of the label space.

    The eigenvectors of the training-label covariance matrix define a
    rotation of the labels. One model (see ``MyModel``) is fitted per
    retained component, the discarded components are set to 0, and the
    predictions are rotated back to the original label space.

    Parameters
    ----------
    train_data, test_data
        Feature matrices forwarded to ``MyModel``.
    train_labels, test_labels
        2-D label arrays, one column per target.
    mod : str
        Model selector forwarded to ``MyModel``.
    threshold : float, optional
        Cumulative explained variance (in percent) that the retained
        components must reach. Defaults to 95.

    Returns
    -------
    err, cod : per-target lists produced by ``performance``.
    n : int
        Zero-based index of the last retained component, i.e. ``n + 1``
        components are modelled.
    eig_val : list
        Eigenvalues as percentages of their sum, sorted in descending order.
    end : float
        Elapsed seconds for decomposition, fitting and prediction
        (scoring is excluded).
    """
    # Start time
    start = time.time()

    # Obtain variance-covariance matrix (atleast_2d keeps the single-target
    # case, where np.cov returns a 0-d array, usable by the eigen solver)
    cov_train_labels = np.atleast_2d(np.cov(train_labels.T))
    # A covariance matrix is symmetric, so use the symmetric solver: it
    # guarantees real eigenvalues and orthonormal eigenvectors, whereas the
    # general solver may return complex values for ill-conditioned input.
    eig_val_train_labels, eig_vec_train_labels = np.linalg.eigh(cov_train_labels)
    # Express eigenvalues as percentage of total variance
    eig_val_train_labels = (eig_val_train_labels / eig_val_train_labels.sum()) * 100

    # Sort eigenvalues and corresponding eigenvectors (descending)
    idx = eig_val_train_labels.argsort()[::-1]
    eig_val_train_labels = eig_val_train_labels[idx]
    eig_vec_train_labels = eig_vec_train_labels[:, idx]

    n_targets = eig_val_train_labels.shape[0]

    # Index of the first component at which the cumulative variance reaches
    # the threshold. Clamp it: rounding can leave the final cumulative sum
    # marginally below the threshold, which would yield an out-of-range index.
    n = int(eig_val_train_labels.cumsum().searchsorted(threshold))
    n = min(n, n_targets - 1)

    # Develop principal components for targets
    # NOTE(review): labels are projected without mean-centering, so leaving
    # the discarded components at 0 assumes their projected mean is ~0 --
    # confirm this is intended.
    PC_train_labels = np.dot(train_labels, eig_vec_train_labels)

    # Predictions in component space; discarded components stay 0
    PC_test_pred = np.zeros((test_labels.shape[0], n_targets))

    # Predict the retained components, one model each
    for i in range(n + 1):
        PC_test_pred[:, i] = MyModel(train_data, test_data, PC_train_labels.T[i], mod)

    # Rotate back to label space; the eigenvector matrix is orthonormal, so
    # its inverse is its transpose
    test_labels_pred = np.dot(PC_test_pred, eig_vec_train_labels.T)

    # End time
    end = time.time() - start

    # Obtain performance metrics - MSE and CoD
    err, cod = performance(test_labels, test_labels_pred)

    return err, cod, n, eig_val_train_labels.tolist(), end

In [None]:
# Baseline: one independent model per target
def Each_Method(train_data, test_data, train_labels, test_labels, mod):
    """Fit a separate ``MyModel`` for every target column and score the result.

    Returns
    -------
    err, cod : per-target lists produced by ``performance``.
    end : float
        Elapsed seconds for fitting and prediction (scoring is excluded).
    """
    t0 = time.time()

    n_targets = train_labels.shape[1]

    # Column j of the frame holds the predictions for target j
    per_target = {
        j: MyModel(train_data, test_data, train_labels.T[j], mod)
        for j in range(n_targets)
    }
    test_labels_pred = pd.DataFrame(per_target, columns = range(n_targets))

    end = time.time() - t0

    # The frame is handed over transposed, as `performance` transposes its
    # arguments again before selecting by label
    err, cod = performance(test_labels, test_labels_pred.T)

    return err, cod, end

In [None]:
# Function to implement Multivariate Multiple Linear Regression
def MvLR_Method(train_data, test_data, train_labels, test_labels):
    """Fit ``Y = XB + E`` by least squares and score predictions on the test set.

    A column of ones is appended to the feature matrices so that the last
    row of ``B`` acts as the intercept.

    Parameters
    ----------
    train_data, test_data
        2-D feature arrays.
    train_labels, test_labels
        2-D label arrays, one column per target.

    Returns
    -------
    err, cod : per-target lists produced by ``performance``.
    end : float
        Elapsed seconds for fitting and prediction (scoring is excluded).
    """
    # Start time
    start = time.time()

    # Function to introduce a column of 1s
    # (Additional degree of freedom accounting for intercept)
    def MakeCol(M):
        return np.hstack((M, np.ones((M.shape[0], 1))))

    X = MakeCol(train_data)
    Y = train_labels

    # Solve for the B minimising ||XB - Y||. lstsq avoids forming and
    # inverting X^T X explicitly, which is numerically fragile and raises
    # when features are collinear; for rank-deficient X it returns the
    # minimum-norm solution instead.
    B = np.linalg.lstsq(X, Y, rcond = None)[0]

    # Transform test data like train data
    D = MakeCol(test_data)

    # Make prediction on test data
    test_labels_pred = np.dot(D, B)

    # End time
    end = time.time() - start

    # Obtain performance metrics - MSE and CoD
    err, cod = performance(test_labels, test_labels_pred)

    return err, cod, end

In [None]:
# Container for the results of every method/model combination
summary = dict()

# Selector codes of the regressors to evaluate
models = ["cat", "rf", "gb"]

In [None]:
# Proposed methodology: one run per regressor, stored under "PC_<model>"
for mod in tqdm(models):
    err, cod, n, eig_val_train_labels, end = PCA_Method(
        train_data, test_data, train_labels, test_labels, "PC_" + mod
    )
    temp = {
        "MSE": err,
        "CoD": cod,
        "n": n,
        "eig_val": eig_val_train_labels,
        "Time": end,
    }
    summary["PC_" + mod] = temp

In [None]:
# Multivariate multiple linear regression baseline, stored under "MvLR"
err, cod, end = MvLR_Method(train_data, test_data, train_labels, test_labels)
temp = {
    "MSE": err,
    "CoD": cod,
    "Time": end,
}
summary["MvLR"] = temp

In [None]:
# One-model-per-target baseline: one run per regressor, stored under "Each_<model>"
for mod in tqdm(models):
    err, cod, end = Each_Method(
        train_data, test_data, train_labels, test_labels, "Each_" + mod
    )
    temp = {
        "MSE": err,
        "CoD": cod,
        "Time": end,
    }
    summary["Each_" + mod] = temp

In [None]:
# Saving results
# The context manager guarantees the file is flushed and closed once the
# dump finishes (a bare open() call would leave the handle dangling).
with open("summary_" + name + ".pickle", "wb") as summary_file:
    pickle.dump(summary, summary_file)