In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import argparse
import pandas as pd
from sklearn.model_selection import train_test_split
import sys

from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler as mm_scaler
from sklearn.preprocessing import StandardScaler as std_scaler
from torchsummary import summary
import numpy as np
import copy
import glob
from collections import OrderedDict
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
class CompData(Dataset):
    def __init__(self, X, y, train=True, scaler=True, task_num=3, num_sets=100, meta_train_batch=10, meta_test_batch=10, test_batch=32):
        self.task_num = task_num       
        self.scaler = scaler
        if train:
            self.meta_train_batch = meta_train_batch
            self.meta_test_batch = meta_test_batch
        else:
            self.test_batch = test_batch
        
        if scaler:
            X = std_scaler().fit_transform(X)
        
        ### create sets. support_x has 10000 sets of 5/ 25 images each. Total ~1000 sets (for 1000 iterations)
        ### create set with 32 rows (#meta_train_number) and 32 rows (#meta_test_number) and append to train and test lists
        #convert pandas dataframe to numpy array
        # print(type(X), type(y))
        #X = X.to_numpy()
        y = y.to_numpy()
        print(type(X), type(y))
        entire_data = np.column_stack((X,y))
        print(len(entire_data), print(len(entire_data[0])))
        #print('All data points:',entire_data)
        #num_rows = len(X) # for index sampling
        #self.selected_sample_indices = np.random.choice(num_rows,\
        #                                           size=task_num*(meta_train_batch+meta_test_batch),\
        #                                           replace=False)
        
        total_rows_required = num_sets*(meta_train_batch+meta_test_batch)
        # we get shuffled data so directly pick total rows required
        total_train_rows = entire_data[:total_rows_required,:]
        
        train_rows = total_train_rows[:num_sets*meta_train_batch,:]
        test_rows = total_train_rows[num_sets*meta_train_batch:,:]
        
        #train_rows = np.hsplit(train_rows, num_sets)
        #test_rows = np.hsplit(test_rows, num_sets)
        
        num_features = len(train_rows[0]) #### DEFINE
        print('**********************************************')
        ### final np arrays with data and runtimes for num_set rows 
        train_rows_data = train_rows[:, :num_features-1]
        train_rows_runtime = train_rows[:,num_features-1:]
        
        test_rows_data = test_rows[:,:num_features-1]
        test_rows_runtime = test_rows[:,num_features-1:]
        print(len(train_rows_data), len(train_rows_runtime))
        print(train_rows_data.shape, train_rows_runtime.shape)
        #print(train_rows_data, train_rows_runtime)
        print(len(test_rows_data), len(test_rows_runtime))
        print(test_rows_data.shape, test_rows_runtime.shape)
        #print(test_rows_data, test_rows_runtime)
        ## check
        train_rows_data = np.vsplit(train_rows_data, num_sets)
        train_rows_runtime = np.vsplit(train_rows_runtime, num_sets)
        test_rows_data = np.vsplit(test_rows_data, num_sets)
        test_rows_runtime = np.vsplit(test_rows_runtime, num_sets)
        
        #create sets here:
        final_sets = [] ## list of list. each list row will have train_rows_data/runtime, test_data/runtime
        for i in range(num_sets):
            temp = [train_rows_data[i]]+[train_rows_runtime[i]]+[test_rows_data[i]]+[test_rows_runtime[i]]
            final_sets.append(temp)
        self.final_sets = final_sets

    def __len__(self):
        return len(self.final_sets)

    def __getitem__(self,index):
                
        #zip sample without replacement from X
        temp_store = self.final_sets[index]
        train_row_data = temp_store[0]
        train_row_runtime = temp_store[1]
        test_row_data = temp_store[2]
        test_row_runtime = temp_store[3]
        
        if np.asarray(train_row_data) is train_row_data:
            train_row_data = np.asarray(train_row_data)
            train_row_runtime = np.asarray(train_row_runtime)
            test_row_data = np.asarray(test_row_data)
            test_row_runtime = np.asarray(test_row_runtime)
        
        ## convert numpy array to torch tensor
        #if not torch.is_tensor(X):
        train_row_data = train_row_data.astype(np.float32)
        train_row_runtime = train_row_runtime.astype(np.float32)
        test_row_data = test_row_data.astype(np.float32)
        test_row_runtime = test_row_runtime.astype(np.float32)
        
        train_row_data = torch.from_numpy(train_row_data)
        train_row_runtime = torch.from_numpy(train_row_runtime)
        test_row_data = torch.from_numpy(test_row_data)
        test_row_runtime = torch.from_numpy(test_row_runtime)
        #if not torch.is_tensor(y):
        #    self.train_y = torch.from_numpy(y)               
        
        
        return train_row_data, train_row_runtime, test_row_data, test_row_runtime
        

In [3]:
dr_columns = ['kernel','Compiler','Cluster','gpu_name','outer','inner','var_decl','ref_expr','int_literal','float_literal','mem_to',\
            'mem_from','add_sub_int','add_sub_double','mul_int','mul_double','div_int','div_double','assign_int','assign_double']
 #            'max_threads_per_sm','max_thread_blocks_per_sm','threads_per_wrap','max_wraps_per_sm','max_32-bit_registers_per_sm',\
 #             'max_registers_per_block','max_registers_per_thread','max_thread_block_size','l1','l2','l3']

dataset_root=""
df = pd.read_csv(dataset_root+"matrix_multiplication.csv")   
df = df.drop(columns=dr_columns)

#sys.exit("please check the dataset path and file names")
print(list(df.columns))
X = df.iloc[:, 0:-1]
y = df.iloc[:, -1]

train_eval_split=0.8
split_seed=43

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_eval_split, random_state=split_seed, shuffle=True)

train_sets = CompData(X_train,y_train, scaler=True, train=True, task_num=3, num_sets=6, meta_train_batch=12, meta_test_batch=10)

# print(train_sets.__getitem__(0))
#test_sets = CompData(X_test, y_test, train=False, test_batch=32)

['clang', 'gcc', 'nvc', 'intel', 'thread_per_core', 'core_per_socket', 'num_sockets', 'cpu_clock', 'l1', 'l2', 'l3', 'connector_bandwidth', 'num_memory_bus', ' memory_clock', 'memory_bandwidth', ' memory_total', ' sm_clock', 'num_cores', 'compute_capability', 'threads_per_wrap', 'max_wraps_per_sm', 'max_threads_per_sm', 'max_thread_blocks_per_sm', 'max_32-bit_registers_per_sm', 'max_registers_per_block', 'max_registers_per_thread', 'max_thread_block_size', 'fp32_cores_per_sm', 'sm_registers_per_fp32_cores', 'shared_memory_size_per_sm', 'collapse', 'collapse_swap', 'combined', 'combined_swap', 'split', 'split_swap', 'reduction', 'log_outer', 'log_inner', 'log_var_decl', 'log_ref_expr', 'log_int_literal', 'log_float_literal', 'log_mem_to', 'log_mem_from', 'log_add_sub_int', 'log_add_sub_double', 'log_mul_int', 'log_mul_double', 'log_div_int', 'log_div_double', 'log_assign_int', 'log_assign_double', 'runtimes']


NameError: name 'CompData' is not defined

In [None]:
########### write parameter initialization in script:: kaiming or xavier; check c.b.finn's work
class OffloadModel(torch.nn.Module):
    def __init__(self, ip_features, num_hidden, op_features=1):
        super(OffloadModel, self).__init__()
        
        self.mod1 = nn.Sequential(OrderedDict([
            ('lin1', nn.Linear(ip_features, num_hidden)),
            ('relu1', nn.ReLU())
        ]))
        self.mod2 = nn.Sequential(OrderedDict([
            ('lin2', nn.Linear(num_hidden,num_hidden*2)),
            ('relu2', nn.ReLU()),
            ('drop1', nn.Dropout(p=0.25)),          
            ('lin3', nn.Linear(num_hidden*2, num_hidden)),
            ('relu3', nn.ReLU())
           
        ]))
        self.mod3 = nn.Sequential(OrderedDict([
            ('lin4', nn.Linear(num_hidden, num_hidden)),
            ('relu4', nn.ReLU()),
            ('drop2', nn.Dropout(p=0.25)),
            ('lin5', nn.Linear(num_hidden,op_features)),
            ('relu5', nn.ReLU())             
        ]))
    
    def forward(self, x):
        op = self.mod1(x)
        x = self.mod2(op)
        x += op
        x = self.mod3(x)
        return x
    
    def var_forward(self, x, weights):
        op = F.linear(x, weights[0], weights[1])
        op = F.relu(op)
        x = F.relu(F.linear(op, weights[2], weights[3]))
        x = F.dropout(x, p=0.25)
        x = F.relu(F.linear(x, weights[4], weights[5]))
        x += op
        x = F.relu(F.linear(x, weights[6], weights[7]))
        x = F.dropout(x, p=0.25)
        #x = F.linear(x, weights[8], weights[9])
        x = F.relu(F.linear(x, weights[6], weights[7]))
        return x

In [None]:
#dls = DataLoader(train_sets, batch_size=4)

In [None]:
update_factor = 0.1
def train(model, x_train, y_train, x_test, y_test):
    original_model_copy = copy.deepcopy(model)
    loss_tasks = 0
    for k in range(task_num):
        print(k)
        temp_weights=[w.clone() for w in list(original_model_copy.parameters())]
        
        outputs = original_model_copy.var_forward(x_train[k], temp_weights)
        loss = criterion(outputs, y_train[k])
        #print(type(y_train[k]))
        #print(loss, type(loss))
        #print(type(loss), type(temp_weights))
        grad = torch.autograd.grad(loss, temp_weights)
        # temporary update weights 
        temp_weights = [w - update_factor*g for w,g in zip(temp_weights, grad)]
        
        ## run updated weights on meta-test batch
        new_outputs = original_model_copy.var_forward(x_test[k], temp_weights)
        new_loss = criterion(new_outputs, y_test[k])
        
        loss_tasks += new_loss
    
    return loss_tasks

In [None]:
outer_epochs = 10
task_num = 3
global_model = OffloadModel(53,106)
meta_optim = torch.optim.Adam(global_model.parameters(), lr=1e-3)
#### add some lr decay: cosine or step or lambda
global_model = global_model.to(device)
criterion = nn.MSELoss()

for idx in range(outer_epochs):
    train_set_loader = DataLoader(train_sets, batch_size=task_num, drop_last=True)
    for i, (x_train, y_train, x_test, y_test) in enumerate(train_set_loader):
        # print((x_train[0]))
        task_num_, set_size, cols = x_train.shape #<--verify
        #print(task_num_, set_size, cols)
        x_train, y_train, x_test, y_test = x_train.to(device), y_train.to(device), x_test.to(device), y_test.to(device)
        
        # print(type(x_train))
        ### train should return and accuracies???
        total_loss = train(global_model, x_train, y_train, x_test, y_test) #<-- returns loss
        
        meta_optim.zero_grad()
        total_loss.backward()
        meta_optim.step()
    
        ### do some validation or call the actual test function? (diff data set & loader)
        ### no grad calculation in whatever that happens next

In [None]:
update_factor = 0.01
update_steps = 5
def new_train(model, x_train, y_train, x_test, y_test):
    original_model_copy = copy.deepcopy(model)
    #loss_tasks = 0
    losses_q = [0 for _ in range(update_steps + 1)]
    
    for k in range(task_num):
        print('task_num:', k)
        temp_weights=[w.clone() for w in list(original_model_copy.parameters())]
        
        outputs = original_model_copy.var_forward(x_train[k], temp_weights)
        loss = criterion(outputs, y_train[k])

        grad = torch.autograd.grad(loss, temp_weights)
        
        #fast_weights = list(map(lambda p: p[1]-update_factor*p[0], zip(grad, original_model_copy.parameters())))
        fast_weights = [w-update_factor*g for w,g in zip(temp_weights, grad)]
        
        # temp_weights = [w - update_factor*g for w,g in zip(temp_weights, grad)]
        #with torch.no_grad():
        new_output = original_model_copy.var_forward(x_test[k], fast_weights)
        new_loss = criterion(new_output, y_test[k])
        losses_q[0] += new_loss
        
        for j in range(update_steps):
            ops = original_model_copy.var_forward(x_train[k], fast_weights)
            loss = criterion(ops, y_train[k])
            
            grad = torch.autograd.grad(loss, fast_weights, create_graph=True, allow_unused=True)
            #print(type(fast_weights), len(fast_weights))
            #print(type(grad), len(grad))
            #print(fast_weights)
            print(any(elem is None for elem in fast_weights))
            #for h in range(len(fast_weights)):
            #    if fast_weights[h] is None:
            #        print('found none type here', h)
            fast_weights = list(map(lambda p: p[1]-update_factor*p[0], zip(grad, fast_weights)))
            
            new_ops = offload_model_copy.var_forward(x_test[k], fast_weights)
            loss_q = criterion(new_ops, y_test[k])
            losses_q[j+1] += loss_q
        
    loss_last = losses_q[-1] / task_num
    

    
    return loss_last 

In [None]:
outer_epochs = 5
task_num = 3
global_model = OffloadModel(42,30)

#def init_weights(m):
#    if isinstance(m, nn.Linear):
#        torch.nn.init.kaiming_uniform_(m.weight)
#        m.bias.data.fill_(0.01)

#global_model.apply(init_weights)
#
#global_model

In [None]:
meta_optim = torch.optim.Adam(global_model.parameters(), lr=1e-3)
#### add some lr decay: cosine or step or lambda
global_model = global_model.to(device)
criterion = nn.MSELoss()

for idx in range(100):
    episode_loss = 0
    train_set_loader = DataLoader(train_sets, batch_size=task_num, drop_last=True)
    # print(len(train_set_loader))
    for i, (x_train, y_train, x_test, y_test) in enumerate(train_set_loader):
        # print((x_train[0]))
        # task_num_, set_size, cols = x_train.shape #<--verify
        #print(task_num_, set_size, cols)
        x_train, y_train, x_test, y_test = x_train.to(device), y_train.to(device), x_test.to(device), y_test.to(device)
        # print(type(x_train))
        
        meta_loss = train(global_model, x_train, y_train, x_test, y_test) #<-- returns loss
        #print(meta_loss)
        meta_optim.zero_grad()
        meta_loss.backward()
        meta_optim.step()
        episode_loss += meta_loss.item()
    print('Loss:', np.sqrt(episode_loss))
    ### do some validation or call the actual test function? (diff data set & loader)
    ### no grad calculation in whatever that happens next

In [None]:
'''
#### Added to dataset_other on origin-local
import torch
from torch.utils.data import Dataset
from sklearn.preprocessing import MinMaxScaler as mm_scaler
import numpy as np



class CompData(Dataset):
    def __init__(self, X, y, train=True, scaler=True, task_num=3, num_sets=100. meta_train_batch=10, meta_test_batch=10, test_batch=32):
        self.task_num = task_num
        self.scaler = scaler
        if train:
            self.meta_train_batch = meta_train_batch
            self.meta_test_batch = meta_test_batch
        else:
            self.test_batch = test_batch

        if scaler:
            X = mm_scaler().fit_transform(X)

        ### create sets. support_x has 10000 sets of 5/ 25 images each. Total ~1000 sets (for 1000 iterations)
        ### create set with 32 rows (#meta_train_number) and 32 rows (#meta_test_number) and append to train and test lists
        #convert pandas dataframe to numpy array
        X = X.to_numpy()
        y = y.to_numpy()

        entire_data = np.hstack((X,y))

        #num_rows = len(X) # for index sampling
        #self.selected_sample_indices = np.random.choice(num_rows,\
        #                                           size=task_num*(meta_train_batch+meta_test_batch),\
        #                                           replace=False)

        total_rows_required = num_sets*(meta_train_batch+meta_test_batch)
        # we get shuffled data so directly pick total rows required
        total_train_rows = entire_data[:total_rows_required]

        train_rows = total_train_rows[:num_sets*meta_train_batch]
        test_rows = total_train_rows[num_threats*meta_train_batch:]

        train_rows = np.hsplit(train_rows, num_sets)
        test_rows = np.hsplit(test_rows, num_sets)

        num_features = len(train_rows[0])#### DEFINE

        ### final np arrays with data and runtimes for num_set rows
        train_rows_data = train_rows[:, :num_features-1]
        train_rows_runtime = train_rows[:,num_features-1:]

        test_rows_data = test_rows[:,:num_features-1]
        test_rows_runtime = test_rows[:,num_features-1:]
        #create sets here:
        final_sets = [] ## list of list. each list row will have train_rows_data/runtime, test_data/runtime
        for j in range(num_tasks):
            tr_row_data = list()
            tr_row_run = list()
            te_row_data = list()
            te_row_run - list()
            for i in range(num_sets):
                tr_row_data.append(train_rows_data[i])
                tr_row_run.append(train_rows_runtime[i])
                te_row_data.append(test_rows_data[i])
                te_row_run.append(test_rows_runtime[i])

            temp = [tr_row_data]+[tr_row_run]+[te_row_data]]+[te_row_run]
            final_sets.append(temp)

        self.final_sets = final_sets

    def __len__(self):
        return len(self.final_sets)

    def __getitem__(self,index):

        #zip sample without replacement from X
        #for i in range(self.num_tasks):
        train_row_data = self.final_sets[index,:,0]
        train_row_runtime = self.final_sets[index,:,1]
        test_row_data = self.final_sets[index,:,2]
        test_row_runtime = self.final_set[index,:,3]

        #for i in range(self.num_tasks):
        #if np.asarray(train_row_data) is train_row_data:
        train_row_data = np.asarray(train_row_data)
        train_row_runtime = np.asarray(train_row_runtime)
        test_row_data = np.asarray(test_row_data)
        test_row_runtime = np.asarray(test_row_runtime)

        ## convert numpy array to torch tensor
        #if not torch.is_tensor(X):
        train_row_data = torch.from_numpy(train_row_data)
        train_row_runtime = torch.from_numpy(train_row_runtime)
        test_row_data = torch.from_numpy(test_row_data)
        test_row_runtime = torch.from_numpy(test_row_runtime)
        #if not torch.is_tensor(y):
        #    self.train_y = torch.from_numpy(y)


        return train_row_data, train_row_runtime, test_row_data, test_row_runtime



'''

In [3]:
class PrepareData(Dataset):

    def __init__(self, X, y, scale_X=True):
        if not torch.is_tensor(X):
            if scale_X:
                X = std_scaler().fit_transform(X)
                self.X = torch.from_numpy(X)
        if not torch.is_tensor(y):
            self.y = torch.from_numpy(y.to_numpy())

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [4]:
dr_columns = ['kernel','Compiler','Cluster','gpu_name','outer','inner','var_decl','ref_expr','int_literal','float_literal','mem_to',\
            'mem_from','add_sub_int','add_sub_double','mul_int','mul_double','div_int','div_double','assign_int','assign_double']
#             'max_threads_per_sm','max_thread_blocks_per_sm','threads_per_wrap','max_wraps_per_sm','max_32-bit_registers_per_sm',\
#              'max_registers_per_block','max_registers_per_thread','max_thread_block_size','l1','l2','l3']

dataset_root=""
df = pd.read_csv(dataset_root+"jacobi.csv")
df2 = pd.read_csv(dataset_root+"bfs.csv")
df3 = pd.read_csv(dataset_root+"gauss.csv")
df4 = pd.read_csv(dataset_root+"particle_filter.csv")
#all_files = glob.glob(dataset_root + "*.csv")
#all_files.remove('dataset.csv') ## removing non-app-specific file
#all_files.remove('jacobi.csv') ## test with jacobi
#temp_list = list()
#for file_n in all_files:
#    df = pd.read_csv(file_n)
#    temp_list.append(df)

single = pd.concat([df,df2,df3], axis=0)
single = single.drop(columns=dr_columns)
#sys.exit("please check the dataset path and file names")

print(list(single.columns))
print(len(single))
print(len(list(single.columns)))
X = single.iloc[:, 0:-1]
y = single.iloc[:, -1]

train_eval_split=0.8
split_seed=43

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_eval_split, random_state=split_seed, shuffle=True)

#train_sets = CompData(X_train,y_train, scaler=True,train=True, task_num=3, num_sets=6, meta_train_batch=12, meta_test_batch=10)
train_sets = PrepareData(X_train, y_train)
#

['clang', 'gcc', 'nvc', 'intel', 'thread_per_core', 'core_per_socket', 'num_sockets', 'cpu_clock', 'l1', 'l2', 'l3', 'connector_bandwidth', 'num_memory_bus', ' memory_clock', 'memory_bandwidth', ' memory_total', ' sm_clock', 'num_cores', 'compute_capability', 'threads_per_wrap', 'max_wraps_per_sm', 'max_threads_per_sm', 'max_thread_blocks_per_sm', 'max_32-bit_registers_per_sm', 'max_registers_per_block', 'max_registers_per_thread', 'max_thread_block_size', 'fp32_cores_per_sm', 'sm_registers_per_fp32_cores', 'shared_memory_size_per_sm', 'collapse', 'collapse_swap', 'combined', 'combined_swap', 'split', 'split_swap', 'reduction', 'log_outer', 'log_inner', 'log_var_decl', 'log_ref_expr', 'log_int_literal', 'log_float_literal', 'log_mem_to', 'log_mem_from', 'log_add_sub_int', 'log_add_sub_double', 'log_mul_int', 'log_mul_double', 'log_div_int', 'log_div_double', 'log_assign_int', 'log_assign_double', 'runtimes']
3403
54


In [5]:
import torch
import torch.nn
import torch.nn.functional as F

class KernelRunModel(torch.nn.Module):
    def __init__(self, ip_features, num_hidden, op_features=1):
        super(KernelRunModel, self).__init__()

        self.hidden1 = torch.nn.Linear(ip_features, num_hidden)
        self.hidden2 = torch.nn.Linear(num_hidden, num_hidden*2)
        self.hidden3 = torch.nn.Linear(num_hidden*2, num_hidden)
        self.hidden4 = torch.nn.Linear(num_hidden, num_hidden)
        self.op_run = torch.nn.Linear(num_hidden, op_features)
        self.dropout = nn.Dropout(p=0.2)
    
    def forward(self, x):
        op = F.relu(self.hidden1(x))
        residual = op
        x = F.relu(self.hidden2(op))
        x = self.dropout(x)
        x = F.relu(self.hidden3(x))
        x += residual
        x = F.relu(self.hidden4(x))
        x = self.dropout(x)
        x = F.relu(self.op_run(x))
        return x

In [None]:
mod = KernelRunModel(53,106).to(device)
summary(mod, (10,53))

In [7]:
from torch.autograd import Variable

mod = KernelRunModel(53,106).to(device)
criterion = nn.MSELoss()
#criterion2 = nn.L1Loss()
opt = torch.optim.Adam(mod.parameters(), lr=2e-5)
for e in range(100):
    batch_losses = []

    for ix, (Xb, yb) in enumerate(train_sets):

        _X = Variable(Xb).float()
        _y = Variable(yb).float()

        #==========Forward pass===============
        _X = _X.to(device)
        _y = _y.to(device)
        preds = mod(_X)
        loss = criterion(preds, _y)
        #loss2 = criterion2(preds, _y)
        total_loss = loss/0.1 # + loss2
        #==========backward pass==============

        opt.zero_grad()
        total_loss.backward()
        opt.step()

        batch_losses.append(loss.item())
        #all_losses.append(loss.data[0])

    mbl = np.mean(np.sqrt(batch_losses)).round(3)

    if e % 1 == 0:
        print("Epoch [{}/{}], Batch loss: {}".format(e, 50, mbl))

Epoch [0/50], Batch loss: 3.23
Epoch [1/50], Batch loss: 2.859
Epoch [2/50], Batch loss: 2.713
Epoch [3/50], Batch loss: 2.705
Epoch [4/50], Batch loss: 2.723
Epoch [5/50], Batch loss: 2.687
Epoch [6/50], Batch loss: 2.633
Epoch [7/50], Batch loss: 2.622
Epoch [8/50], Batch loss: 2.645
Epoch [9/50], Batch loss: 2.582
Epoch [10/50], Batch loss: 2.601
Epoch [11/50], Batch loss: 2.58
Epoch [12/50], Batch loss: 2.525
Epoch [13/50], Batch loss: 2.498
Epoch [14/50], Batch loss: 2.522
Epoch [15/50], Batch loss: 2.545
Epoch [16/50], Batch loss: 2.525
Epoch [17/50], Batch loss: 2.514
Epoch [18/50], Batch loss: 2.462
Epoch [19/50], Batch loss: 2.494
Epoch [20/50], Batch loss: 2.461
Epoch [21/50], Batch loss: 2.502
Epoch [22/50], Batch loss: 2.41
Epoch [23/50], Batch loss: 2.441
Epoch [24/50], Batch loss: 2.394
Epoch [25/50], Batch loss: 2.414
Epoch [26/50], Batch loss: 2.392
Epoch [27/50], Batch loss: 2.44
Epoch [28/50], Batch loss: 2.386
Epoch [29/50], Batch loss: 2.405
Epoch [30/50], Batch los

In [8]:
test_sets = PrepareData(X_test, y_test)
df3 = pd.read_csv(dataset_root+"matrix_multiplication.csv")
df3 = df3.drop(columns=dr_columns)
X_t = df3.iloc[:, 0:-1]
y_t = df3.iloc[:, -1]
test_sets_ = PrepareData(X_t, y_t)
mod.eval()
with torch.no_grad():
    total_loss = 0
    gt_ = list()
    preds_ = list()
    for index, (xt, yt) in enumerate(test_sets_):
        gt_.append(yt.cpu().data.numpy())
        
        _xt = Variable(xt).float()
        _yt = Variable(yt).float()
        
        _xt = _xt.to(device)
        _yt = _yt.to(device)
        
        predictions = mod(_xt)
        loss1 = criterion(predictions, _yt)
        preds_.append(predictions.cpu().data.numpy())
        print(predictions, _yt)
        total_loss += loss1
    
    mape = mean_absolute_percentage_error(gt_, preds_)
    rmse = np.sqrt(mean_squared_error(gt_, preds_))
    print('Test Loss: ', np.mean(np.sqrt(total_loss.item())))
    print('RMSE: ', rmse, ' MAPE:', mape)

  return F.mse_loss(input, target, reduction=self.reduction)


tensor([0.], device='cuda:0') tensor(1.0422, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.0578, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.1055, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.1819, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.2232, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.2639, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.3103, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.3505, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.4004, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.4490, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.5115, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.5465, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.5880, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.6132, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.6600, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.7447, device='cuda:0')
tensor([

tensor([0.], device='cuda:0') tensor(0.5275, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.5342, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.5527, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.5679, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.5799, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.5899, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.6047, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.6220, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.6456, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.6626, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.6912, device='cuda:0')
tensor([1.0304], device='cuda:0') tensor(0.7223, device='cuda:0')
tensor([1.7438], device='cuda:0') tensor(0.7506, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.7704, device='cuda:0')
tensor([1.0483], device='cuda:0') tensor(0.7918, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.8187, device='cuda:

tensor([33.6549], device='cuda:0') tensor(147.5256, device='cuda:0')
tensor([33.0996], device='cuda:0') tensor(148.1429, device='cuda:0')
tensor([35.4450], device='cuda:0') tensor(160.8964, device='cuda:0')
tensor([35.0529], device='cuda:0') tensor(161.7510, device='cuda:0')
tensor([34.1815], device='cuda:0') tensor(162.2216, device='cuda:0')
tensor([39.3558], device='cuda:0') tensor(244.2849, device='cuda:0')
tensor([40.7279], device='cuda:0') tensor(263.0056, device='cuda:0')
tensor([41.8853], device='cuda:0') tensor(281.8054, device='cuda:0')
tensor([40.6966], device='cuda:0') tensor(282.5639, device='cuda:0')
tensor([41.2013], device='cuda:0') tensor(283.3093, device='cuda:0')
tensor([41.3541], device='cuda:0') tensor(284.0239, device='cuda:0')
tensor([41.2419], device='cuda:0') tensor(300.5657, device='cuda:0')
tensor([42.9832], device='cuda:0') tensor(300.7433, device='cuda:0')
tensor([42.3562], device='cuda:0') tensor(303.4563, device='cuda:0')
tensor([41.9371], device='cuda:0')

tensor([0.], device='cuda:0') tensor(1.1181, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.1981, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.3300, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.4646, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.5966, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.8085, device='cuda:0')
tensor([0.], device='cuda:0') tensor(2.0457, device='cuda:0')
tensor([0.], device='cuda:0') tensor(2.3518, device='cuda:0')
tensor([0.], device='cuda:0') tensor(3.4875, device='cuda:0')
tensor([0.], device='cuda:0') tensor(4.2777, device='cuda:0')
tensor([0.], device='cuda:0') tensor(4.5073, device='cuda:0')
tensor([0.], device='cuda:0') tensor(4.8142, device='cuda:0')
tensor([0.], device='cuda:0') tensor(5.0246, device='cuda:0')
tensor([0.], device='cuda:0') tensor(5.2107, device='cuda:0')
tensor([0.], device='cuda:0') tensor(5.3829, device='cuda:0')
tensor([0.], device='cuda:0') tensor(5.5126, device='cuda:0')
tensor([

tensor([0.], device='cuda:0') tensor(2.1605, device='cuda:0')
tensor([0.], device='cuda:0') tensor(2.3387, device='cuda:0')
tensor([0.], device='cuda:0') tensor(2.4224, device='cuda:0')
tensor([0.], device='cuda:0') tensor(2.5200, device='cuda:0')
tensor([0.], device='cuda:0') tensor(2.6122, device='cuda:0')
tensor([0.], device='cuda:0') tensor(2.7154, device='cuda:0')
tensor([0.], device='cuda:0') tensor(2.7945, device='cuda:0')
tensor([0.], device='cuda:0') tensor(2.8545, device='cuda:0')
tensor([0.], device='cuda:0') tensor(2.9617, device='cuda:0')
tensor([0.], device='cuda:0') tensor(3.0538, device='cuda:0')
tensor([0.], device='cuda:0') tensor(3.1399, device='cuda:0')
tensor([0.], device='cuda:0') tensor(3.1826, device='cuda:0')
tensor([0.], device='cuda:0') tensor(3.2475, device='cuda:0')
tensor([0.], device='cuda:0') tensor(3.3000, device='cuda:0')
tensor([0.], device='cuda:0') tensor(3.3534, device='cuda:0')
tensor([0.], device='cuda:0') tensor(3.4076, device='cuda:0')
tensor([

tensor([29.7460], device='cuda:0') tensor(106.8093, device='cuda:0')
tensor([25.0352], device='cuda:0') tensor(106.8794, device='cuda:0')
tensor([31.1807], device='cuda:0') tensor(107.0970, device='cuda:0')
tensor([29.2737], device='cuda:0') tensor(107.9148, device='cuda:0')
tensor([28.0172], device='cuda:0') tensor(113.6216, device='cuda:0')
tensor([30.1953], device='cuda:0') tensor(117.1856, device='cuda:0')
tensor([35.2892], device='cuda:0') tensor(151.6696, device='cuda:0')
tensor([39.7101], device='cuda:0') tensor(159.5508, device='cuda:0')
tensor([36.8640], device='cuda:0') tensor(160.5494, device='cuda:0')
tensor([38.3332], device='cuda:0') tensor(161.0938, device='cuda:0')
tensor([35.6434], device='cuda:0') tensor(163.3420, device='cuda:0')
tensor([36.0159], device='cuda:0') tensor(164.1594, device='cuda:0')
tensor([40.0552], device='cuda:0') tensor(171.8425, device='cuda:0')
tensor([40.4369], device='cuda:0') tensor(171.8547, device='cuda:0')
tensor([37.2149], device='cuda:0')

tensor([0.], device='cuda:0') tensor(0.2184, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.2238, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.2452, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.2547, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.2745, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.2898, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.3064, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.3271, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.3346, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.3679, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.3833, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.4119, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.4398, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.4790, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.5021, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.5491, device='cuda:0')
tensor([

tensor([0.], device='cuda:0') tensor(28.9140, device='cuda:0')
tensor([0.], device='cuda:0') tensor(29.3939, device='cuda:0')
tensor([0.], device='cuda:0') tensor(30.2045, device='cuda:0')
tensor([0.], device='cuda:0') tensor(30.9033, device='cuda:0')
tensor([0.], device='cuda:0') tensor(31.5204, device='cuda:0')
tensor([0.], device='cuda:0') tensor(32.5611, device='cuda:0')
tensor([0.], device='cuda:0') tensor(33.2934, device='cuda:0')
tensor([0.], device='cuda:0') tensor(33.8671, device='cuda:0')
tensor([0.], device='cuda:0') tensor(34.5710, device='cuda:0')
tensor([0.], device='cuda:0') tensor(35.0442, device='cuda:0')
tensor([0.], device='cuda:0') tensor(35.6867, device='cuda:0')
tensor([0.], device='cuda:0') tensor(36.9928, device='cuda:0')
tensor([0.], device='cuda:0') tensor(37.7904, device='cuda:0')
tensor([0.], device='cuda:0') tensor(38.6204, device='cuda:0')
tensor([0.], device='cuda:0') tensor(39.5935, device='cuda:0')
tensor([0.], device='cuda:0') tensor(41.0732, device='c

tensor([0.], device='cuda:0') tensor(2.5119, device='cuda:0')
tensor([0.], device='cuda:0') tensor(2.5506, device='cuda:0')
tensor([0.], device='cuda:0') tensor(2.5995, device='cuda:0')
tensor([0.], device='cuda:0') tensor(2.6802, device='cuda:0')
tensor([0.], device='cuda:0') tensor(2.7596, device='cuda:0')
tensor([0.], device='cuda:0') tensor(2.8472, device='cuda:0')
tensor([0.], device='cuda:0') tensor(2.9007, device='cuda:0')
tensor([0.], device='cuda:0') tensor(2.9350, device='cuda:0')
tensor([0.], device='cuda:0') tensor(2.9715, device='cuda:0')
tensor([0.], device='cuda:0') tensor(3.0448, device='cuda:0')
tensor([0.], device='cuda:0') tensor(3.0982, device='cuda:0')
tensor([0.], device='cuda:0') tensor(3.1908, device='cuda:0')
tensor([0.], device='cuda:0') tensor(3.2274, device='cuda:0')
tensor([0.], device='cuda:0') tensor(3.2643, device='cuda:0')
tensor([0.], device='cuda:0') tensor(3.3140, device='cuda:0')
tensor([0.], device='cuda:0') tensor(3.3606, device='cuda:0')
tensor([

tensor([0.], device='cuda:0') tensor(38.3233, device='cuda:0')
tensor([0.], device='cuda:0') tensor(39.2624, device='cuda:0')
tensor([0.], device='cuda:0') tensor(40.6467, device='cuda:0')
tensor([0.], device='cuda:0') tensor(42.8638, device='cuda:0')
tensor([0.], device='cuda:0') tensor(44.0272, device='cuda:0')
tensor([0.], device='cuda:0') tensor(45.7726, device='cuda:0')
tensor([0.], device='cuda:0') tensor(49.3870, device='cuda:0')
tensor([0.], device='cuda:0') tensor(52.6147, device='cuda:0')
tensor([0.], device='cuda:0') tensor(56.0653, device='cuda:0')
tensor([0.], device='cuda:0') tensor(61.7377, device='cuda:0')
tensor([0.], device='cuda:0') tensor(69.3846, device='cuda:0')
tensor([0.], device='cuda:0') tensor(82.8295, device='cuda:0')
tensor([0.], device='cuda:0') tensor(84.8563, device='cuda:0')
tensor([0.], device='cuda:0') tensor(88.9272, device='cuda:0')
tensor([0.], device='cuda:0') tensor(94.6404, device='cuda:0')
tensor([0.], device='cuda:0') tensor(95.9162, device='c

tensor([0.], device='cuda:0') tensor(0.3086, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.3324, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.3468, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.3691, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.3827, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.4021, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.4198, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.4510, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.4780, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.5001, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.5261, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.5655, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.5997, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.6328, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.6755, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.7290, device='cuda:0')
tensor([

tensor([0.], device='cuda:0') tensor(6.7577, device='cuda:0')
tensor([0.], device='cuda:0') tensor(7.0920, device='cuda:0')
tensor([0.], device='cuda:0') tensor(7.2057, device='cuda:0')
tensor([0.], device='cuda:0') tensor(7.5407, device='cuda:0')
tensor([0.], device='cuda:0') tensor(7.7171, device='cuda:0')
tensor([0.], device='cuda:0') tensor(8.3197, device='cuda:0')
tensor([0.], device='cuda:0') tensor(8.5876, device='cuda:0')
tensor([0.], device='cuda:0') tensor(9.1911, device='cuda:0')
tensor([0.], device='cuda:0') tensor(9.7699, device='cuda:0')
tensor([0.], device='cuda:0') tensor(11.0630, device='cuda:0')
tensor([0.8779], device='cuda:0') tensor(13.8113, device='cuda:0')
tensor([23.6388], device='cuda:0') tensor(124.8869, device='cuda:0')
tensor([24.2284], device='cuda:0') tensor(144.4246, device='cuda:0')
tensor([24.7507], device='cuda:0') tensor(145.1247, device='cuda:0')
tensor([24.8335], device='cuda:0') tensor(145.3397, device='cuda:0')
tensor([22.0739], device='cuda:0') t

tensor([0.], device='cuda:0') tensor(1.1403, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.2147, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.2826, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.3896, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.4478, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.4653, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.5047, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.5438, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.6167, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.7340, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.7729, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.8457, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.9062, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.9668, device='cuda:0')
tensor([0.], device='cuda:0') tensor(2.0098, device='cuda:0')
tensor([0.], device='cuda:0') tensor(2.0508, device='cuda:0')
tensor([

tensor([0.], device='cuda:0') tensor(21.0243, device='cuda:0')
tensor([0.], device='cuda:0') tensor(21.4863, device='cuda:0')
tensor([0.], device='cuda:0') tensor(21.9501, device='cuda:0')
tensor([0.], device='cuda:0') tensor(22.8307, device='cuda:0')
tensor([0.], device='cuda:0') tensor(23.7556, device='cuda:0')
tensor([0.], device='cuda:0') tensor(24.3248, device='cuda:0')
tensor([0.], device='cuda:0') tensor(24.7502, device='cuda:0')
tensor([0.], device='cuda:0') tensor(25.2582, device='cuda:0')
tensor([0.], device='cuda:0') tensor(25.9004, device='cuda:0')
tensor([0.], device='cuda:0') tensor(26.7610, device='cuda:0')
tensor([0.], device='cuda:0') tensor(26.9983, device='cuda:0')
tensor([0.], device='cuda:0') tensor(27.7028, device='cuda:0')
tensor([0.], device='cuda:0') tensor(28.1498, device='cuda:0')
tensor([0.], device='cuda:0') tensor(28.5686, device='cuda:0')
tensor([0.], device='cuda:0') tensor(29.2891, device='cuda:0')
tensor([0.], device='cuda:0') tensor(30.0062, device='c

tensor([27.4758], device='cuda:0') tensor(102.1307, device='cuda:0')
tensor([28.9054], device='cuda:0') tensor(102.6415, device='cuda:0')
tensor([30.4315], device='cuda:0') tensor(104.3461, device='cuda:0')
tensor([30.5043], device='cuda:0') tensor(104.6905, device='cuda:0')
tensor([29.4186], device='cuda:0') tensor(109.8573, device='cuda:0')
tensor([29.3281], device='cuda:0') tensor(110.1441, device='cuda:0')
tensor([30.8661], device='cuda:0') tensor(112.0167, device='cuda:0')
tensor([29.7597], device='cuda:0') tensor(117.7723, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.0643, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.0959, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.1474, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.1919, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.2541, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.2992, device='cuda:0')
tensor([0.], device='cuda:0') tensor(1.3377, device='cuda:0')
tensor([0.], d

tensor([0.], device='cuda:0') tensor(0.2282, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.2324, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.2375, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.2437, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.2522, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.2616, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.2675, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.2783, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.2886, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.2979, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.3088, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.3149, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.3242, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.3341, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.3447, device='cuda:0')
tensor([0.], device='cuda:0') tensor(0.3528, device='cuda:0')
tensor([

tensor([0.], device='cuda:0') tensor(27.4658, device='cuda:0')
tensor([1.2917], device='cuda:0') tensor(32.6484, device='cuda:0')
tensor([20.3119], device='cuda:0') tensor(93.7605, device='cuda:0')
tensor([29.3725], device='cuda:0') tensor(114.1282, device='cuda:0')
tensor([39.0006], device='cuda:0') tensor(243.9217, device='cuda:0')
tensor([38.7424], device='cuda:0') tensor(244.4858, device='cuda:0')
tensor([39.6974], device='cuda:0') tensor(244.7702, device='cuda:0')
tensor([38.0925], device='cuda:0') tensor(244.9629, device='cuda:0')
tensor([39.4759], device='cuda:0') tensor(258.3909, device='cuda:0')
tensor([41.9953], device='cuda:0') tensor(259.1558, device='cuda:0')
tensor([40.7435], device='cuda:0') tensor(261.1582, device='cuda:0')
tensor([41.3207], device='cuda:0') tensor(263.0711, device='cuda:0')
tensor([40.4825], device='cuda:0') tensor(263.3926, device='cuda:0')
tensor([41.2169], device='cuda:0') tensor(278.3797, device='cuda:0')
tensor([42.2622], device='cuda:0') tensor(2

In [None]:
MAPE = mean_absolute_percentage_error(, runtimes_pred)
RMSE = np.sqrt(mean_squared_error(runtimes, runtimes_pred))

In [13]:
_yt.cpu().data

tensor(5.6456)

In [15]:
predictions.cpu().data.numpy()

array([5.524684], dtype=float32)