In [1]:
import numpy as np
import pandas as pd
from os import listdir
from os.path import isfile, join
import scipy.stats as sp

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error

from sklearn.svm import SVR
import itertools
from tqdm import tqdm

In [2]:
PATH = "../RS/time_series"

SVR_params = {
    "kernel" : ("linear", "rbf", "poly"),
    "C" :  [1e-4,1e-3,1e-2,1e-1,1,10,50,100,200,500,1000],
    "epsilon" : [1e-4,1e-3,1e-2,1e-2,1,5],
    "tol" : [1e-4,1e-3,1e-2,1e-2,1,10,50,100,200,500,1000],
    "gamma" : [1e-9, 1e-8, 1e-7,1e-6,1e-5,1e-4,1e-3,1e-2,1e-1, 1]
}

RFParams = {
    "max_depth" : [2,5,10,15,30,35,40,50,70,90,100,200,500,700,1000,1500,2100],
    # "criterion" : (\"mse\", \"mae\", \"poisson\"),\n",
    "max_features" : ["log2", "sqrt", 0.1, 0.2, 0.3],
    # "min_samples_split\" : [2,3,4,5,7,10,15]\n",
}

In [3]:
class getData:
    def __init__(self, settings):
        self.path = PATH
        self.augment = settings["Augment"]
        self.path = PATH
        self.init()

    def sortFiles(self) -> None:
        # READING THE DATA,
        self.column_names = pd.read_csv(self.path + "/" + "ROIs.csv", header = None)
        self.files = [f for f in listdir(self.path) if isfile(join(self.path, f))]
        self.files.remove("ROIs.csv")
        self.files = np.sort(self.files) # file names for the data"

        # READING THE FILE TO GET PATIENT SCORES",
        real_df = pd.read_excel("../RS_bivariate_AQ_continuous.xlsx")
        rdf = real_df[["participant", "AQ"]]

        # EXTRACTING THE WAB SCORES OF THE PATIENTS
        parts = list(rdf["participant"])
        self.participants= parts[:30] + parts[31:32] + parts[35:51] + parts[53:60] + parts[61:63] + parts[30:31] # participant IDs"
        self.wabaq = list(rdf["AQ"])  # Participant scores"


    def noAugment(self) -> None:

        self.correlation_data = []
        self.outputs = []

        def modify_data():
            for index,data in enumerate(self.correlation_data):
                self.correlation_data[index] = data.values.flatten()
            self.correlation_data = np.array(self.correlation_data)
            self.outputs =  np.array(self.outputs)

        for patient_index, patient_file in enumerate(self.files):

            # READING THE PATIENT TIME SERIES FMRI DATA
            file = pd.read_csv(self.path + "/" + self.files[patient_index], header = None)
            file.columns = list(self.column_names[0])
            file = file.reindex(sorted(file.columns), axis = 1)

            # CALCULATE THE CORRELATION MATRIX\n",
            self.correlation_data.append(file.corr())
            self.outputs.append(self.wabaq[patient_index])

        modify_data()


    def data(self):
        return self.correlation_data, self.outputs

    def params(self, model):
        if model == "SVR":
            return


    def init(self):
        # GETTING THINGS READY TO GET THE DATA
        self.sortFiles()

        # GETTING THE DATA
        if self.augment:
            self.augment()
        else:
            self.noAugment()

    def describe(self):
        print('data shape = ', self.correlation_data.shape)
        print("output shape = ", self.outputs.shape)

In [15]:
class Model:
    def __init__(self, settings):
        self.cv = settings["Cross Validation Type"]
        self.metric = settings["metric"]
        self.m = settings["model"]
        if self.cv != "kTkV":
            self.model = self.get_model(settings["model"])
        self.pca = PCA(n_components=2)
        self.verbose = settings["verbose"]
        self.init(settings)

    def init(self, augmentation_settings): # gets the data to train and test
        GET = getData(augmentation_settings)
        self.data, self.outputs = GET.data()
        self.forward()


    def CV_(self):
        if self.cv == "LOO":
            return 55
        elif self.cv == "LFiveO":
            return 11
        elif self.cv == "5Fold":
            return 5
        elif self.cv == "11Fold":
            return 11


    def get_model(self, model, params=None):
        if model == "SVR":
            if self.cv != "kTkV":
                return GridSearchCV( SVR(), SVR_params, cv= self.CV_(), scoring = self.metric , return_train_score = True)
            else:
                return SVR(kernel=params[0], C= params[1], epsilon= params[2], tol=params[3], gamma=params[4])
            # return SVR()\n",

        if model == "RF":
            if self.cv != "kTkV":
                return GridSearchCV( RandomForestRegressor() , RFParams, verbose=3, cv = self.CV_(), scoring = self.metric, return_train_score = True)
            else:
                return RandomForestRegressor(max_depth=params[0], max_features=params[1])


    def train(self, data, outputs):
        self.model.fit(data, outputs)

    def predict(self, data):
        return self.model.predict(data)

    def get_score(self,data,outputs, return_ = False): #gives out R2 scores
        print("R2 fit score = ", self.model.score(data, outputs), "\n" )

    def CrossValResults(self):
        print(self.model.cv_results_)
        print('\n')

    def modify_data(self, data):
        self.pca.fit(data)
        raise NotImplementedError()

    def get_data(self):
        X_train, X_test, y_train, y_test = train_test_split( self.data, self.outputs, test_size=0.09, random_state=42)

        if self.cv != "kTkV":
            X_train, y_train = self.data, self.outputs

        print("train data shape = ", X_train.shape)
        print("train labels shape = ", y_train.shape, "\n")

        print("test data shape = ", X_test.shape)
        print("test labels shape = ", y_test.shape, "\n")

        return X_train, X_test, y_train, y_test

    def normal_cv(self):
        X_train, X_test, y_train, y_test = self.get_data()
        self.train(X_train, y_train)

        print("training performance")
        self.get_score(X_train, y_train)

        # print("test data performance")
        # self.get_score(X_test, y_test)

        self.train_predictions_ = self.predict(X_train)
        # self.test_predictions_ = self.predict(X_test)

    def validate(self, data, outputs):
        kf2 = KFold(n_splits=10)
        train_performance = []
        validate_performance = []
        models = []
        self.train_mean = 0

        for train_index, validate_index in kf2.split(data):
            self.model = self.get_model(self.m, self.curr_param)
            X_train, X_validate = data[train_index], data[validate_index]
            y_train, y_validate = outputs[train_index], outputs[validate_index]

            self.train_mean = np.mean(y_train)
            # mean_ones_ = np.ones(len(y_train))


            self.model.fit(X_train, y_train)
            train_predictions = self.model.predict(X_train)
            validate_predictions = self.model.predict(X_validate)

            models.append(self.model)
            train_performance.append(mean_absolute_error(train_predictions, y_train))
            validate_performance.append(mean_absolute_error(validate_predictions,y_validate))

        model = models[np.argmin(validate_performance)]
        return np.mean(train_performance), np.mean(validate_performance), model



    def train_kTkV(self):
        kf = KFold(n_splits=11)
        train_performances = []
        validate_performances = []
        test_performances = []
        models = []

        for train_index, test_index in kf.split(self.data):
            X_train, X_test = self.data[train_index], self.data[test_index]
            y_train, y_test = self.outputs[train_index], self.outputs[test_index]

            train_performance, validate_performance, model = self.validate(X_train, y_train)

            models.append(model)
            test_predictions = model.predict(X_test)
            test_performance = mean_absolute_error(test_predictions, y_test)

            train_performances.append(train_performance)
            validate_performances.append(validate_performance)
            test_performances.append(test_performance)

        model = models[np.argmin(test_performances)]
        return np.mean(train_performances), np.mean(validate_performances), np.mean(test_performances), model



    def kTkV(self):

        validate_performances = []
        test_performances = []
        train_performances = []
        models = []

        if self.m == "SVR":
            param_list = list(itertools.product(*SVR_params.values()))
        elif self.m == "RF":
            param_list = list(itertools.product(*RFParams.values()))

        for self.curr_param in tqdm(param_list, "params"):
            # self.model = self.get_model(self.m, param)
            train_performance, validate_performance, test_performance, model = self.train_kTkV()

            train_performances.append(train_performance)
            validate_performances.append(validate_performance)
            test_performances.append(test_performance)
            models.append(model)

        train_performance_ranking = sp.rankdata(train_performances)
        val_performance_ranking = sp.rankdata(validate_performances)
        test_performance_ranking = sp.rankdata(test_performances)

        self.dataframe = {
            "train performance" : train_performances,
            "test performance" : test_performances,
            "validate performance" : validate_performances,
            "train rank" : train_performance_ranking,
            "validate rank" : val_performance_ranking,
            "test rank" : test_performance_ranking,
            "params" : param_list
        }

        self.cv_results_ = pd.DataFrame.from_dict(self.dataframe)

    def forward(self):

        if self.cv != "kTkV":
            self.normal_cv()
        else:
            self.kTkV()


In [16]:
if __name__ == "__main__":

    Augmentation_settings = {
        "Cross Validation Type": "kTkV",
        "model" : "SVR",
        "metric" : "neg_mean_absolute_error",

        "Augment" : False,
        "transform" : True,

        "verbose" : "no"
    }

    model = Model(Augmentation_settings)

params: 100%|██████████| 21780/21780 [2:43:28<00:00,  2.22it/s]   


In [29]:
# cvr = pd.read_csv("kTkV_SVR_MAE.csv")
cvr = model.cv_results_
# np.min(cvr["test performance"])
cvr[cvr["test performance"] == 18.502649343653655]
# cvr[cvr["test rank"]==1]
# cvr[cvr["params"]==("poly", 1, 5, 0.0001, 0.1)]
# cvr.to_csv("kTkV_RF_MAE.csv")

Unnamed: 0,train performance,test performance,validate performance,train rank,validate rank,test rank,params
2020,13.453114,18.502649,19.470552,5077.5,25.5,5.5,"(linear, 0.1, 0.0001, 1, 1e-09)"
2021,13.453114,18.502649,19.470552,5077.5,25.5,5.5,"(linear, 0.1, 0.0001, 1, 1e-08)"
2022,13.453114,18.502649,19.470552,5077.5,25.5,5.5,"(linear, 0.1, 0.0001, 1, 1e-07)"
2023,13.453114,18.502649,19.470552,5077.5,25.5,5.5,"(linear, 0.1, 0.0001, 1, 1e-06)"
2024,13.453114,18.502649,19.470552,5077.5,25.5,5.5,"(linear, 0.1, 0.0001, 1, 1e-05)"
2025,13.453114,18.502649,19.470552,5077.5,25.5,5.5,"(linear, 0.1, 0.0001, 1, 0.0001)"
2026,13.453114,18.502649,19.470552,5077.5,25.5,5.5,"(linear, 0.1, 0.0001, 1, 0.001)"
2027,13.453114,18.502649,19.470552,5077.5,25.5,5.5,"(linear, 0.1, 0.0001, 1, 0.01)"
2028,13.453114,18.502649,19.470552,5077.5,25.5,5.5,"(linear, 0.1, 0.0001, 1, 0.1)"
2029,13.453114,18.502649,19.470552,5077.5,25.5,5.5,"(linear, 0.1, 0.0001, 1, 1)"


In [10]:
# cvr