In [53]:
%matplotlib inline
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from ELLA import ELLA
from sklearn.linear_model import Ridge, LinearRegression, LogisticRegression
from scipy.linalg import norm
from sklearn.metrics import mean_squared_error
import numpy as np
import os
import pandas as pd
from datetime import datetime

def multi_task_train_test_split(Xs,Ys,train_size=0.5):
    Xs_train = []
    Ys_train = []
    Xs_test = []
    Ys_test = []
    for t in range(len(Xs)):
        X_train, X_test, y_train, y_test = train_test_split(Xs[t], np.squeeze(Ys[t]), train_size=train_size)
        Xs_train.append(X_train)
        Xs_test.append(X_test)
        Ys_train.append(y_train)
        Ys_test.append(y_test)
    return Xs_train, Xs_test, Ys_train, Ys_test

def multi_task_KFold(Xs,Ys,n_splits=5):
    Xs_train = {}
    Ys_train = {}
    Xs_test = {}
    Ys_test = {}
    kf = KFold(n_splits=n_splits)
    for t in range(len(Xs)):
        for train_index, test_index in kf.split(Xs[t]):
            Xs_train.append(Xs[t][train_index])
            Xs_test.append(Xs[t][test_index])
            Ys_train.append(Ys[t][train_index])
            Ys_test.append(Ys[t][test_index])
    return Xs_train, Xs_test, Ys_train, Ys_test

def read_data(dataPath,dirName,module_channel,method):
    df_samples = pd.read_csv(os.path.join(dataPath,dir_name,dir_name+'_'+module_channel+'.csv'),sep=' ')
    df_method = pd.read_csv(os.path.join(dataPath,dir_name,method+'_'+module_channel+'.csv'),sep=' ')
    


In [59]:
fileDir = ['LBAmu30','LBAmu50','LBAmu90']
method = 'outOF'
module_channel = 'm_7_c_46'
dataPath = 'D:\\Scripts\\LPS\\data\\'
filter_param = 'amp'
n_splits = 10
tasks = {0}

# get the array with the lowest size
lower_size = np.inf
for dir_name in fileDir:
    df_samples = pd.read_csv(os.path.join(dataPath,dir_name,dir_name+'_'+module_channel+'.csv'),sep=' ')
    df_method = pd.read_csv(os.path.join(dataPath,dir_name,method+'_'+module_channel+'.csv'),sep=' ')
    if lower_size > df_samples.shape[0]:
        lower_size = df_samples.shape[0]
lower_size

input_samples = np.array([])
# read data
for dir_name in fileDir:
    df_samples = pd.read_csv(os.path.join(dataPath,dir_name,dir_name+'_'+module_channel+'.csv'),sep=' ', nrows=lower_size)
    df_method = pd.read_csv(os.path.join(dataPath,dir_name,method+'_'+module_channel+'.csv'),sep=' ', nrows=lower_size)
    print(dir_name)
    if input_samples.size == 0:
        input_samples = np.expand_dims(df_samples.values, axis=0)
        target_amp = np.expand_dims(df_method[filter_param].values, axis=0)
    else:
        input_samples = np.append(input_samples,np.expand_dims(df_samples.values, axis=0),axis=0)
        target_amp = np.append(target_amp,np.expand_dims(df_method[filter_param].values, axis=0),axis=0)

# split in train and test
Xs_train, Xs_test, Ys_train, Ys_test = multi_task_KFold(input_samples,target_amp,n_splits=n_splits)

# set up ELLA
T = len(fileDir)
d = df_samples.shape[1]
k = d # same latent as features
base_learner = Ridge(random_state=0) # LinearRegression, Ridge, or LogisticRegression

# create ELLA
model = ELLA(d,k,Ridge,mu=1,lam=10**-5) 

# train ELLA
for x in range(n_splits):
    for t in range(T):
        model.fit(Xs_train[t], Ys_train[t], t)
    
    # test ELLA    
    print("Average explained variance score", np.mean([model.score(Xs_test[t], Ys_test[t], t) for t in range(T)]))
    RMSE_error = [mean_squared_error(Ys_test[t], model.predict(Xs_test[t], t)) for t in range(T)]
    print("RMSE: ",RMSE_error)

LBAmu30
LBAmu50
LBAmu90
Average explained variance score 0.9999994663484998
RMSE:  [4.7196189206053006e-05, 5.0539496138606485e-06, 4.541935039586966e-06]
Average explained variance score 0.9999982881929966
RMSE:  [0.00018739304873389065, 9.487332849274088e-10, 7.282944607323084e-10]
Average explained variance score 0.999996158762499
RMSE:  [0.00041962288438899447, 1.0094752546362694e-09, 7.775121743908014e-10]


In [None]:
fileDir = ['LBAmu30','LBAmu50','LBAmu90']
taskName = fileDir
method = 'outOF'
module_channel = 'm_7_c_46'
dataPath = 'D:\\Scripts\\LPS\\data\\'
filter_param = 'amp'
n_splits = 10
version = '1'
input_data = {}
target_data = {}

# read data
lower_size = np.inf
for dir_name in fileDir:
    df_samples = pd.read_csv(os.path.join(dataPath,dir_name,dir_name+'_'+module_channel+'.csv'),sep=' ')
    df_method = pd.read_csv(os.path.join(dataPath,dir_name,method+'_'+module_channel+'.csv'),sep=' ')
    input_data[dir_name] = df_samples.values
    target_data[dir_name] = df_method.values

# set up ELLA
T = len(fileDir)
d = df_samples.shape[1]
k = d # same latent as features

kf = KFold(n_splits=n_splits)
kfold_nr=0
for train_index, test_index in kf.split(Xs[t]):
    model = ELLA(d,k,Ridge,mu=1,lam=10**-5)  # LinearRegression, Ridge, or LogisticRegression

    for t in taskName:
        Xs_train, Xs_test, Ys_train, Ys_test = train_test_split(input_data[t],target_data[t])
        model.fit(Xs_train[t], Ys_train[t], t)
    
    print("Average explained variance score", np.mean([model.score(Xs_test[t], Ys_test[t], t) for t in range(T)]))
    RMSE_error = [mean_squared_error(Ys_test[t], model.predict(Xs_test[t], t)) for t in range(T)]
    print("RMSE: ",RMSE_error)
    
    print('model_{}_t_{}_kfold_{}.pkl'.format(dt,task,kfold_nr))
        
    kfold_nr+=1
# create ELLA
model = ELLA(d,k,Ridge,mu=1,lam=10**-5) 

# train ELLA
for x in range(n_splits):
    for t in range(T):
        model.fit(Xs_train[t], Ys_train[t], t)
    
    # test ELLA    
    print("Average explained variance score", np.mean([model.score(Xs_test[t], Ys_test[t], t) for t in range(T)]))
    RMSE_error = [mean_squared_error(Ys_test[t], model.predict(Xs_test[t], t)) for t in range(T)]
    print("RMSE: ",RMSE_error)
    
#     dt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
#     with open('model_{}_t_{}_kfold_{}.pkl'.format(dt,task,kfold_nr), 'wb') as f:  # Python 3: open(..., 'wb')
#         pickle.dump([model, obj1, obj2], f)

In [60]:
for x in range(n_splits):
    print(x)

0
1
2
3
4
5
6
7
8
9


In [65]:
from datetime import datetime
dt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
dt

'2021-01-16 22:13:02'

In [34]:
np.append(np.expand_dims(df_samples.values, axis=0))

TypeError: _append_dispatcher() missing 1 required positional argument: 'values'

In [62]:
model.save()

AttributeError: 'ELLA' object has no attribute 'save'

In [57]:
input_samples = np.expand_dims(df_samples.values, axis=0)
input_samples = np.append(input_samples,np.expand_dims(df_samples.values, axis=0),axis=0)
input_samples = np.append(input_samples,np.expand_dims(df_samples.values, axis=0),axis=0)

In [74]:
Xs_train, Xs_test, Ys_train, Ys_test = multi_task_KFold(input_samples,target_amp)

In [69]:
input_samples[0].shape

(10422, 7)

array([-5.38918069, -2.64799956,  0.49014303, ..., -0.80053477,
        1.16354503,  7.57075783])