In [None]:
# importing all the required libraries
import dask.array as da
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from dask.distributed import Client, LocalCluster
import dask

In [None]:
# Load the feature matrix and the targets from the .npy files with numpy.
X_data, y_data = (
    np.load('/home/pzaspel/data/md17_X.npy'),
    np.load('/home/pzaspel/data/md17_Y.npy'),
)

# Standardize the features: fit the scaler on X_data, then transform X_data.
scaler = StandardScaler().fit(X_data)
X = scaler.transform(X_data)

# Center the targets by subtracting their average taken along axis 0.
y = y_data - np.average(y_data, axis=0)

# NOTE(review): the scaler and the target average are computed on the full
# data set before the split below, so test-set statistics influence the
# training data — confirm this is acceptable for the experiment.

# Hold out 30% of the samples as the test set; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Show the shapes of the train and test targets as a quick sanity check.
for split_target in (y_train, y_test):
    print(split_target.shape)

In [None]:
# Start a local dask cluster (scheduler + workers) and connect a client to it.
# n_workers is the number of worker processes to start with, and each worker
# runs a single thread; port 0 is passed for both the scheduler and the
# dashboard address.
cluster_options = dict(
    host="titlis.clamv.jacobs-university.de",
    scheduler_port=0,
    dashboard_address="titlis.clamv.jacobs-university.de:0",
    n_workers=1,
    threads_per_worker=1,
    memory_limit='500GB',
)
cluster = LocalCluster(**cluster_options)
client = Client(cluster)

# Last expression of the cell: display the client summary in the notebook.
client

In [None]:
# This is the main forward stagewise regression function.
# no_of_iterations is the maximum number of iterations the regression runs for.

def forward_stagewise(X_train, X_test, y_train, y_test, no_of_iterations,
                      learning_rate=0.05, patience_limit=50):
    """Run forward stagewise regression on dask arrays and time it.

    In every iteration the variable whose column has the largest absolute
    dot product with the current residual is selected, and its coefficient
    is moved by ``learning_rate`` in the direction of that dot product's
    sign. The test mean squared error is recorded every time the number of
    selected variables changes.

    The function uses the module-level dask ``client`` to persist
    intermediate arrays on the cluster.

    Parameters
    ----------
    X_train, X_test : dask arrays
        Feature matrices; ``X_train.shape`` must unpack into two values.
    y_train, y_test : dask arrays
        Targets. The coefficient vector has shape ``(n, 1)``, so the
        targets are expected to be column vectors — TODO confirm with caller.
    no_of_iterations : int
        Maximum number of iterations.
    learning_rate : float, optional
        Step added to (or subtracted from) the selected coefficient.
    patience_limit : int, optional
        Stop after this many consecutive iterations in which the number of
        selected variables did not change.

    Returns
    -------
    execution_time : float
        Wall-clock seconds spent in this function.
    mse_test : list
        Test MSE, one entry per change in the number of selected variables.
    no_variables : list of int
        Number of selected variables matching each ``mse_test`` entry.
    iterations : int
        Number of executed iterations plus one, so that
        ``range(1, iterations)`` has the same length as
        ``variables_iterations``.
    variables_iterations : list of int
        Number of selected variables after each executed iteration.
    """
    start = time.time()

    # number of columns = number of candidate variables
    n = X_train.shape[1]
    beta = np.zeros(shape=(n, 1))
    residual = client.persist(y_train)
    X_train_transpose = client.persist(X_train.transpose())

    # indices of the variables selected so far (empty at the beginning)
    active_variable = set()

    # test error and variable count, recorded whenever the count changes
    mse_test = []
    no_variables = []

    # variable count after every iteration (for plotting)
    variables_iterations = []

    # variable count seen in the previous iteration
    prev_variables = 0

    # consecutive iterations without a newly selected variable
    patience = 0

    for j in range(no_of_iterations):

        # dot product of every variable with the current residual
        corr = client.persist(da.dot(X_train_transpose, residual))

        # index of the variable with the largest absolute dot product
        index = int(da.argmax(da.fabs(corr)).compute())
        active_variable.add(index)

        # Fetch the selected entry explicitly instead of relying on the
        # implicit truthiness of a lazy dask expression; np.ravel copes with
        # both an (n, 1) and an (n,) shaped corr.
        corr_at_index = np.ravel(corr[index].compute())[0]
        sign = -1 if corr_at_index < 0 else 1

        # move the coefficient by the learning rate in the direction of sign
        beta[index, 0] = beta[index, 0] + (learning_rate * sign)

        # wrap beta as a dask array so the products below run on the cluster
        beta_in_dask = client.persist(da.from_array(beta))

        # training prediction and the new residual
        y_hat = client.persist(da.dot(X_train, beta_in_dask))
        residual = client.persist(y_train - y_hat)

        total_var = len(active_variable)
        variables_iterations.append(total_var)

        if prev_variables != total_var:
            # a new variable was selected: record the test error
            y_pred = client.persist(da.dot(X_test, beta_in_dask))
            test_residual = client.persist(y_test - y_pred)
            mse_test.append((da.square(test_residual).mean()).compute())
            prev_variables = total_var
            no_variables.append(prev_variables)
            patience = 0
        else:
            # nothing new was selected in this iteration
            patience += 1

        # stop when every variable is selected or patience has run out
        if prev_variables == n or patience >= patience_limit:
            break

        # report the size of the active set every 100 iterations
        if (j + 1) % 100 == 0:
            print(total_var)

    # Derive the counter from the recorded history so it is also correct
    # when the loop stops early (the break used to skip the increment and
    # left range(1, iterations) one element short).
    iterations = len(variables_iterations) + 1

    end = time.time()
    execution_time = (end - start)

    return execution_time, mse_test, no_variables, iterations, variables_iterations

    

In [None]:
# if you want to make the variable plots, set this to True
make_varibale_plots = False

# lists for the execution time vs. number of cores plot
no_of_cores = []
total_execution_times = []

# set the number of iterations as you want
no_of_iterations = 50

# rows per chunk used when the data is spread over more than one worker
ROWS_PER_CHUNK = 7000


def _as_persisted_dask_array(future, array, rows_per_chunk=None):
    """Wrap a scattered numpy array as a persisted, rebalanced dask array.

    ``future`` is the result of ``client.scatter(array)``. The array is
    chunked along its first axis only: ``rows_per_chunk`` rows per chunk, or
    a single chunk holding all rows when ``rows_per_chunk`` is None. The
    remaining axes are never split.
    """
    rows = array.shape[0] if rows_per_chunk is None else rows_per_chunk
    dask_array = da.from_delayed(future, shape=array.shape, dtype=array.dtype)
    dask_array = dask_array.rechunk((rows,) + tuple(array.shape[1:]))
    dask_array = dask_array.persist()
    client.rebalance(dask_array)
    return dask_array


# scale the cluster from 1 to 40 workers and time the regression each time
for k in range(1, 41):
    client.restart()
    client.cluster.scale(k)
    # block until all k workers have joined, so that the data is spread over
    # (and the timing reflects) the requested number of workers
    client.wait_for_workers(k)

    # load the data into the cluster
    future1 = client.scatter(X_train)
    future2 = client.scatter(X_test)
    future3 = client.scatter(y_train)
    future4 = client.scatter(y_test)

    # with a single worker the data is kept as one chunk per array;
    # with more workers it is cut into row chunks spread over the cluster
    rows_per_chunk = None if k == 1 else ROWS_PER_CHUNK

    X_t = _as_persisted_dask_array(future1, X_train, rows_per_chunk)
    X_te = _as_persisted_dask_array(future2, X_test, rows_per_chunk)
    y_t = _as_persisted_dask_array(future3, y_train, rows_per_chunk)
    y_te = _as_persisted_dask_array(future4, y_test, rows_per_chunk)

    execution_time, mse_test, no_variables, iterations, variables_iterations = forward_stagewise(
        X_t,
        X_te,
        y_t,
        y_te,
        no_of_iterations)

    if make_varibale_plots:
        plt.plot(no_variables, mse_test, '-b', label='Validation Error')
        plt.legend(loc='upper right')
        plt.title('Validation Error vs No of Variables')
        plt.xlabel('No of Variables')
        plt.ylabel('Mean Squared Error')
        plt.show()

        # build the x axis from the recorded history itself, so both axes
        # always have the same length even if the regression stopped early
        iteration_axis = list(range(1, len(variables_iterations) + 1))
        plt.plot(iteration_axis, variables_iterations, color='orange')
        plt.title('No of variables vs Iterations')
        plt.xlabel('No of iterations')
        plt.ylabel('No of variables')
        plt.show()

    total_execution_times.append(execution_time)
    no_of_cores.append(k)



In [None]:
# Plot the speedup against the number of cores. The speedup of a run is the
# execution time of the first recorded run divided by that run's time.
total_execution_times = np.array(total_execution_times)
speed_up = total_execution_times[0] / total_execution_times

fig, ax = plt.subplots()
ax.plot(no_of_cores, speed_up, color='orange', label='speedup')
ax.legend(loc='best')
ax.set_title('Speedup vs No of Cores')
ax.set_xlabel('No of Cores')
ax.set_ylabel('Speedup')
plt.show()