# Creating Data Subsets

In [2]:
import cudf
import sys
import pandas as pd
import time

from cuml import make_regression
from cuml.linear_model import LinearRegression as LinearRegression_rapids
from cuml.neighbors import KNeighborsRegressor as KNeighborsRegressor_rapids
from sklearn.linear_model import LinearRegression as LinearRegression_sklearn
from sklearn.neighbors import KNeighborsRegressor as KNeighborsRegressor_sklearn

## Creating Tests

In [3]:
# Upper bound of the dataset loop: the benchmark cell reads
# ml_data/0/X.csv.gz ... ml_data/<NUM_DSIZE_DOUBLINGS - 1>/X.csv.gz.
# NOTE(review): the name suggests each directory doubles the previous dataset
# size -- confirm against the notebook that generated ml_data/.
NUM_DSIZE_DOUBLINGS = 11

# How many times each estimator's fit() is repeated inside one timed test;
# the reported average is total time divided by this value.
NUM_EXECUTIONS_PER_TEST = 3

In [4]:
def linear_regression_rapids():
    """Time repeated cuML linear-regression fits on the current dataset.

    Reads the module-level ``X`` and ``y`` (passed to ``fit`` as-is), so the
    calling cell must assign both before invoking this function. A single
    ``LinearRegression_rapids`` estimator is constructed outside the timed
    region and then fitted ``NUM_EXECUTIONS_PER_TEST`` times.

    Returns:
        tuple[float, float]: ``(total_time, avg_time)`` in seconds, where
        ``avg_time`` is ``total_time / NUM_EXECUTIONS_PER_TEST``.
    """
    ols = LinearRegression_rapids(fit_intercept=True,
                                  normalize=True,
                                  algorithm='eig')

    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() follows the system clock, which can be
    # adjusted mid-run and would skew the benchmark.
    t0 = time.perf_counter()

    for _ in range(NUM_EXECUTIONS_PER_TEST):
        ols.fit(X, y)

    # Stop the clock and derive the per-fit average
    total_time = time.perf_counter() - t0
    avg_time = total_time / NUM_EXECUTIONS_PER_TEST

    # Drop the estimator reference before returning
    del ols

    return total_time, avg_time

def k_nearest_neighbour_regression_rapids():
    """Time repeated cuML k-nearest-neighbours regressor fits on the current dataset.

    Reads the module-level ``X`` and ``y`` (passed to ``fit`` as-is), so the
    calling cell must assign both before invoking this function. A single
    ``KNeighborsRegressor_rapids`` estimator with ``n_neighbors=5`` is
    constructed outside the timed region and then fitted
    ``NUM_EXECUTIONS_PER_TEST`` times. Only ``fit`` is timed; no prediction
    is performed.

    Returns:
        tuple[float, float]: ``(total_time, avg_time)`` in seconds, where
        ``avg_time`` is ``total_time / NUM_EXECUTIONS_PER_TEST``.
    """
    knn = KNeighborsRegressor_rapids(n_neighbors=5)

    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() follows the system clock, which can be
    # adjusted mid-run and would skew the benchmark.
    t0 = time.perf_counter()

    for _ in range(NUM_EXECUTIONS_PER_TEST):
        knn.fit(X, y)

    # Stop the clock and derive the per-fit average
    total_time = time.perf_counter() - t0
    avg_time = total_time / NUM_EXECUTIONS_PER_TEST

    # Drop the estimator reference before returning
    del knn

    return total_time, avg_time

## Executing Tests

### RAPIDS

In [11]:
# (label, timing function) pairs, executed in this order for every dataset
rapids_benchmarks = (
    ('Linear Regression', linear_regression_rapids),
    ('K Nearest Neighbour', k_nearest_neighbour_regression_rapids),
)

results = []

for i in range(NUM_DSIZE_DOUBLINGS):
    # The timing functions read the globals X and y, so both names must be
    # rebound here for each dataset before any benchmark is called.
    X = cudf.read_csv(f"ml_data/{i}/X.csv.gz")
    y = X.iloc[:, -1]      # last column is used as the regression target
    X = X.iloc[:, 0:399]   # first 399 columns are used as the features

    for test_name, run_benchmark in rapids_benchmarks:
        total_time, avg_time = run_benchmark()
        test = {'Test': test_name, 'Total': total_time, 'Average': avg_time}
        results.append(test)

# One row per (dataset, test), in execution order; persisted for later comparison
results_df_rapids = pd.DataFrame(results)
results_df_rapids.to_csv('results_df_rapids_ml.csv')