# Creating Data Subsets

In [None]:
import cudf
import sys
import pandas as pd
import time

from cuml import make_regression
from cuml.linear_model import LinearRegression as LinearRegression_rapids
from cuml.neighbors import KNeighborsRegressor as KNeighborsRegressor_rapids
from sklearn.linear_model import LinearRegression as LinearRegression_sklearn
from sklearn.neighbors import KNeighborsRegressor as KNeighborsRegressor_sklearn

## Creating Tests

In [None]:
# --- Benchmark configuration -------------------------------------------------

# Sample count of the first (smallest) dataset; later datasets double it.
NUM_START_SAMPLES = 2_500
# How many times each estimator is fitted inside one timed test.
NUM_EXECUTIONS_PER_TEST = 3
# Number of dataset sizes benchmarked (the size doubles between iterations).
NUM_DSIZE_DOUBLINGS = 11
# Seed passed to make_regression as random_state.
RANDOM_STATE = 23
# Number of feature columns requested from make_regression.
NUM_FEATURES = 399

In [None]:
def linear_regression_rapids():
    """Time repeated fits of the cuML linear regression.

    Fits ``LinearRegression_rapids`` on the module-level ``X`` and ``y``
    ``NUM_EXECUTIONS_PER_TEST`` times. Only the fit loop is timed; building
    the estimator happens before the clock starts.

    Returns:
        A ``(total_time, avg_time)`` tuple in seconds, where ``avg_time`` is
        ``total_time / NUM_EXECUTIONS_PER_TEST``.
    """
    ols = LinearRegression_rapids(fit_intercept=True,
                                  normalize=True,
                                  algorithm='eig')

    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() can jump when the system clock is
    # adjusted, which would corrupt the benchmark.
    t0 = time.perf_counter()

    for _ in range(NUM_EXECUTIONS_PER_TEST):
        ols.fit(X, y)

    t1 = time.perf_counter()

    total_time = t1 - t0
    avg_time = total_time / NUM_EXECUTIONS_PER_TEST

    # Drop the estimator reference explicitly before returning
    # (presumably to release its resources promptly — TODO confirm).
    del ols

    return total_time, avg_time

def k_nearest_neighbour_regression_rapids():
    """Time repeated fits of the cuML k-nearest-neighbours regressor.

    Fits ``KNeighborsRegressor_rapids`` (``n_neighbors=5``) on the
    module-level ``X`` and ``y`` ``NUM_EXECUTIONS_PER_TEST`` times. Only the
    fit loop is timed; building the estimator happens before the clock starts.

    Returns:
        A ``(total_time, avg_time)`` tuple in seconds, where ``avg_time`` is
        ``total_time / NUM_EXECUTIONS_PER_TEST``.
    """
    knn = KNeighborsRegressor_rapids(n_neighbors=5)

    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() can jump when the system clock is
    # adjusted, which would corrupt the benchmark.
    t0 = time.perf_counter()

    for _ in range(NUM_EXECUTIONS_PER_TEST):
        knn.fit(X, y)

    t1 = time.perf_counter()

    total_time = t1 - t0
    avg_time = total_time / NUM_EXECUTIONS_PER_TEST

    # Drop the estimator reference explicitly before returning
    # (presumably to release its resources promptly — TODO confirm).
    del knn

    return total_time, avg_time

def linear_regression_sklearn():
    """Time repeated fits of the scikit-learn linear regression.

    Fits ``LinearRegression_sklearn`` on the module-level ``X`` and ``y``
    ``NUM_EXECUTIONS_PER_TEST`` times. Only the fit loop is timed; building
    the estimator happens before the clock starts.

    Returns:
        A ``(total_time, avg_time)`` tuple in seconds, where ``avg_time`` is
        ``total_time / NUM_EXECUTIONS_PER_TEST``.
    """
    # NOTE(review): newer scikit-learn releases no longer accept `normalize`;
    # confirm against the installed version before upgrading.
    ols = LinearRegression_sklearn(fit_intercept=True,
                                   normalize=True,
                                   n_jobs=-1)

    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() can jump when the system clock is
    # adjusted, which would corrupt the benchmark.
    t0 = time.perf_counter()

    for _ in range(NUM_EXECUTIONS_PER_TEST):
        ols.fit(X, y)

    t1 = time.perf_counter()

    total_time = t1 - t0
    avg_time = total_time / NUM_EXECUTIONS_PER_TEST

    return total_time, avg_time

def k_nearest_neighbour_sklearn():
    """Time repeated fits of the scikit-learn k-nearest-neighbours regressor.

    Fits ``KNeighborsRegressor_sklearn`` (``n_neighbors=5``) on the
    module-level ``X`` and ``y`` ``NUM_EXECUTIONS_PER_TEST`` times. Only the
    fit loop is timed; building the estimator happens before the clock starts.

    Returns:
        A ``(total_time, avg_time)`` tuple in seconds, where ``avg_time`` is
        ``total_time / NUM_EXECUTIONS_PER_TEST``.
    """
    knn = KNeighborsRegressor_sklearn(n_neighbors=5)

    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() can jump when the system clock is
    # adjusted, which would corrupt the benchmark.
    t0 = time.perf_counter()

    for _ in range(NUM_EXECUTIONS_PER_TEST):
        knn.fit(X, y)

    t1 = time.perf_counter()

    total_time = t1 - t0
    avg_time = total_time / NUM_EXECUTIONS_PER_TEST

    return total_time, avg_time


## Executing Tests

### RAPIDS

In [None]:
results = []

for i in range(NUM_DSIZE_DOUBLINGS):
    print('Test:', i)

    # The sample count starts at NUM_START_SAMPLES and doubles every iteration.
    n_samples = NUM_START_SAMPLES * 2 ** i

    # Generate the regression problem that the timing helpers read as X and y.
    X, y = make_regression(
        n_samples=n_samples,
        n_features=NUM_FEATURES,
        random_state=RANDOM_STATE,
    )

    # Wrapping the data in cudf objects (X = cudf.DataFrame(X),
    # y = cudf.DataFrame(y)[0]) caused a big slowdown, so the output of
    # make_regression is passed to the estimators as-is.

    # Run each RAPIDS benchmark on this dataset and record its timings.
    for label, benchmark in (
        ('Linear Regression', linear_regression_rapids),
        ('K Nearest Neighbour', k_nearest_neighbour_regression_rapids),
    ):
        total_time, avg_time = benchmark()
        test = {'Test': label, 'Total': total_time, 'Average': avg_time}
        results.append(test)

results_df_rapids = pd.DataFrame(results)
results_df_rapids.head()

### Scikit-Learn (pandas)

In [None]:
results = []

for i in range(NUM_DSIZE_DOUBLINGS):
    print('Test:', i)

    # The sample count starts at NUM_START_SAMPLES and doubles every iteration.
    n_samples = NUM_START_SAMPLES * 2 ** i

    # Generate the regression problem, then convert it to pandas objects,
    # which is what the scikit-learn timing helpers read as X and y.
    X, y = make_regression(
        n_samples=n_samples,
        n_features=NUM_FEATURES,
        random_state=RANDOM_STATE,
    )
    X = cudf.DataFrame(X).to_pandas()
    y = cudf.DataFrame(y).to_pandas()

    # Size reported by sys.getsizeof for features + target, scaled by 1e-9.
    # The concatenated frame is deliberately not bound to a name so it is not
    # kept alive while the estimators are being fitted.
    dfSize_GB = sys.getsizeof(pd.concat([X, y], axis=1)) * 10 ** (-9)

    # Run each scikit-learn benchmark on this dataset and record its timings.
    for label, benchmark in (
        ('Linear Regression', linear_regression_sklearn),
        ('K Nearest Neighbour', k_nearest_neighbour_sklearn),
    ):
        total_time, avg_time = benchmark()
        test = {
            'Test Size': dfSize_GB,
            'Test': label,
            'Total': total_time,
            'Average': avg_time,
        }
        results.append(test)

results_df_sklearn = pd.DataFrame(results)
results_df_sklearn

## Results

In [None]:
# Put each result table under its own top-level column label so the two
# libraries sit side by side in one frame.
sklearn_block = pd.concat({"Scikit-Learn": results_df_sklearn}, axis=1)
rapids_block = pd.concat({"Rapids": results_df_rapids}, axis=1)
results_df = pd.concat([sklearn_block, rapids_block], axis=1)

# Persist the combined table, then show its first rows.
results_df.to_csv('sklearn_v_rapids_results.csv')
results_df.head()