In [1]:
import csv
import os
import pandas as pd
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Show the current working directory before it is changed
pwd = os.getcwd()
print(pwd)

/Users/bhargobdeka/Desktop/cuTAGI/python_examples


In [3]:
# Go one step out of the notebook's directory so that the `python_examples`
# package is importable and the relative `data/...` paths resolve.
# The move is guarded on the directory name: without the guard, every re-run
# of this cell would climb one more level up the tree.
if os.path.basename(os.getcwd()) == 'python_examples':
    os.chdir('..')
pwd = os.getcwd()
print(pwd)

/Users/bhargobdeka/Desktop/cuTAGI


In [4]:
from python_examples.data_loader import RegressionDataLoader
from python_examples.regression import Regression
from pytagi import NetProp

In [5]:
# Dataset folder, given relative to the current working directory
data_name = 'data/UCI/Boston_housing'

# Raw table read from the dataset's text file
data = np.loadtxt(f'{data_name}/data/data.txt')

In [6]:
# Sanity check: dimensions of the loaded table
print(data.shape)

(506, 14)


In [7]:
# Column indices, within `data`, of the features and of the target
index_features = np.loadtxt(f'{data_name}/data/index_features.txt').astype(int)
index_target = np.loadtxt(f'{data_name}/data/index_target.txt').astype(int)

In [8]:
## User-defined parameters
# Number of splits
# NOTE(review): the split-loading cell further down reads split 0 only
n_splits  = 1

# User input
num_inputs = 13     # 13 explanatory variables (X has 13 feature columns)
num_outputs = 1     # 1 predicted output
num_epochs = 50     # run for 50 epochs
BATCH_SIZE = 1      # batch size

In [9]:
## classes
class HeterosUCIMLP(NetProp):
    """Multi-layer perceptron for a regression task where the output's
    noise is not constant (``noise_type`` is set to ``"heteros"``)."""

    def __init__(self) -> None:
        super().__init__()
        # Three-entry lists, presumably one entry per layer: 13 input nodes,
        # 50 hidden nodes and 2 output nodes.
        # NOTE(review): the integer codes in `layers` and `activations` are
        # defined by pytagi - confirm their meaning against its documentation.
        self.layers =       [1, 1, 1]
        self.nodes =        [13, 50, 2]  # output layer = [mean, std]
        self.activations =  [0, 4, 0]
        # Taken from the module-level BATCH_SIZE constant
        self.batch_size =   BATCH_SIZE
        # NOTE(review): presumably observation-noise settings; their exact
        # semantics come from NetProp - confirm before changing.
        self.sigma_v =      0
        self.sigma_v_min =  0
        self.noise_gain =   1
        self.noise_type =   "heteros"
        self.init_method =  "He"
        self.device =       "cpu"

## Functions
def create_data_loader(raw_input: np.ndarray, raw_output: np.ndarray, batch_size) -> list:
    """Group paired inputs and outputs into fixed-size batches.

    Samples are batched in the order given. When the number of samples is not
    a multiple of ``batch_size``, one extra batch is appended that holds the
    left-over samples padded with randomly drawn ones (see ``split_reminder``),
    so every batch has exactly ``batch_size`` rows.

    Args:
        raw_input: Input array; axis 0 indexes the samples.
        raw_output: Output array with the same length along axis 0.
        batch_size: Number of samples per batch; must be at least 1.

    Returns:
        A list of ``(x_batch, y_batch)`` tuples. The list is empty when there
        are no samples.

    Raises:
        AssertionError: If the two arrays differ in length along axis 0.
        ValueError: If ``batch_size`` is smaller than 1.
    """
    num_input_data = raw_input.shape[0]
    num_output_data = raw_output.shape[0]
    assert num_input_data == num_output_data, (
        "raw_input and raw_output must hold the same number of samples"
    )

    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Nothing to batch: return early instead of failing on a zero-way split.
    if num_input_data == 0:
        return []

    # Index arrays of the full batches. When not even one full batch fits,
    # start from an empty list rather than asking for zero chunks.
    if num_input_data >= batch_size:
        batch_indices = split_evenly(num_input_data, batch_size)
    else:
        batch_indices = []

    # Left-over samples go into one extra, padded batch.
    if num_input_data % batch_size != 0:
        batch_indices.append(split_reminder(num_input_data, batch_size))

    # Shape (num_batches, batch_size): indexing with it batches both arrays.
    indices = np.stack(batch_indices)
    return list(zip(raw_input[indices], raw_output[indices]))


def split_evenly(num_data, chunk_size: int):
    """Split ``range(num_data)`` into consecutive full chunks of ``chunk_size``.

    The trailing ``num_data % chunk_size`` indices, which do not fill a
    complete chunk, are left out.

    Args:
        num_data: Total number of samples.
        chunk_size: Number of indices per chunk; must be at least 1.

    Returns:
        A list of 1-D integer index arrays, each of length ``chunk_size``.
        The list is empty when ``num_data`` is smaller than ``chunk_size``.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    num_chunks = int(num_data // chunk_size)
    if num_chunks <= 0:
        # np.split cannot split into zero sections, so short-circuit here.
        return []

    indices = np.arange(num_chunks * chunk_size)
    return np.split(indices, num_chunks)

def split_reminder(num_data: int, chunk_size: int):
    """Build the index array of the last, padded chunk.

    The indices that do not fit into a full chunk (the final
    ``num_data % chunk_size`` ones) are kept at the end of the result and are
    preceded by randomly drawn indices from ``range(num_data)`` so that the
    result has length ``chunk_size``.

    Args:
        num_data: Total number of samples.
        chunk_size: Required length of the returned index array.

    Returns:
        1-D integer array of length ``chunk_size``: the random padding
        followed by the left-over indices.
    """
    indices = np.arange(num_data)
    reminder_start = int(num_data - np.mod(num_data, chunk_size))
    reminder_idx = indices[reminder_start:]
    num_samples = chunk_size - (num_data - reminder_start)

    # Drawing without replacement needs at least `num_samples` candidates.
    # Fall back to drawing with replacement only when the dataset is smaller
    # than the padding, which would otherwise raise a ValueError.
    needs_replacement = bool(num_samples > num_data)
    random_idx = np.random.choice(
        indices, size=num_samples, replace=needs_replacement
    )

    return np.concatenate((random_idx, reminder_idx))



In [10]:
# Separate the raw table column-wise into inputs (X) and output (Y)
feature_columns = index_features.tolist()
target_column = index_target.tolist()
X = data[:, feature_columns]
Y = data[:, target_column]
input_dim = X.shape[1]

In [11]:
# Shapes of the input matrix and of the output vector
X.shape, Y.shape

((506, 13), (506,))

In [12]:
# Row indices of the train and test partitions for split 0
split_id = 0
index_train = np.loadtxt(f"{data_name}/data/index_train_{split_id}.txt").astype(int)
index_test = np.loadtxt(f"{data_name}/data/index_test_{split_id}.txt").astype(int)

# Pick the rows of each partition; the targets are turned into column vectors
train_rows = index_train.tolist()
test_rows = index_test.tolist()

x_train = X[train_rows]
y_train = Y[train_rows]
y_train = y_train.reshape(len(y_train), 1)

x_test = X[test_rows]
y_test = Y[test_rows]
y_test = y_test.reshape(len(y_test), 1)

In [13]:
# Shapes of the train/test inputs and targets for the loaded split
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((455, 13), (455, 1), (51, 13), (51, 1))

In [None]:
# First five rows of the test inputs
print(list(x_test[:5,]))

In [14]:
from pytagi import Normalizer, Utils

# Normalizer
normalizer: Normalizer = Normalizer()

# The statistics are computed from the training split only, so nothing from
# the test split enters the scaling.
x_mean, x_std = normalizer.compute_mean_std(x_train)
y_mean, y_std = normalizer.compute_mean_std(y_train)

# Alternative kept for reference (not used): statistics over train + test.
# x_mean, x_std = normalizer.compute_mean_std(
#     np.concatenate((x_train, x_test))
# )
# y_mean, y_std = normalizer.compute_mean_std(
#     np.concatenate((y_train, y_test))
# )

# Standardize both splits with the training statistics.
# NOTE(review): this overwrites x_train / y_train / x_test / y_test, so
# re-running this cell recomputes the statistics from already-standardized
# data and changes the x_mean / x_std / y_mean / y_std values that a later
# cell stores in `data_loader`. Re-run the train/test split cell first.
x_train = normalizer.standardize(data=x_train, mu=x_mean, std=x_std)
y_train = normalizer.standardize(data=y_train, mu=y_mean, std=y_std)
x_test = normalizer.standardize(data=x_test, mu=x_mean, std=x_std)
y_test = normalizer.standardize(data=y_test, mu=y_mean, std=y_std)

In [15]:
# Training-split statistics of the target that were used to standardize y
print(y_mean, y_std)

[22.77846154] [9.32785371]


In [16]:
# Bundle the data and the normalization statistics for the regression task:
# the training split is passed as one (inputs, outputs) pair, the test split
# as a list of batches.
data_loader = {
    "train": (x_train, y_train),
    "test": create_data_loader(
        raw_input=x_test,
        raw_output=y_test,
        batch_size=BATCH_SIZE,
    ),
    "x_norm_param_1": x_mean,
    "x_norm_param_2": x_std,
    "y_norm_param_1": y_mean,
    "y_norm_param_2": y_std,
}

In [17]:
# Network configuration
net_prop = HeterosUCIMLP()

# Regression data loader, sized from the user parameters and the network's batch size
reg_data_loader = RegressionDataLoader(
    num_inputs=num_inputs,
    num_outputs=num_outputs,
    batch_size=net_prop.batch_size,
)


In [18]:
# Regression task built from the epoch count, the data bundle and the network configuration
reg_task = Regression(
    num_epochs=num_epochs,
    data_loader=data_loader,
    net_prop=net_prop,
)

In [None]:
# Train the model (the task was configured with `num_epochs` epochs)
reg_task.train()


In [None]:
# Predict with the trained model
# NOTE(review): presumably evaluated on data_loader["test"] - confirm in
# python_examples/regression.py
reg_task.predict()