# Creating Data Subsets

In [4]:
import cudf
import sys
import pandas as pd
import time

## Reading in Datasets

In [5]:
# Reading in the carbon monoxide (CO) dataset
co_df = cudf.read_csv("1980-2008-CO.csv")
# Cast each measurement column to float from its own source column.
# (Previously ROUNDED_VALUE was overwritten with a copy of RAW_VALUE.)
co_df["RAW_VALUE"] = co_df.RAW_VALUE.astype(float)
co_df["ROUNDED_VALUE"] = co_df.ROUNDED_VALUE.astype(float)

In [7]:
# Reading in the CO station information.
# This table is the right-hand side of the left join on STATION_NAME
# performed by the merge benchmark.
station_co_df = cudf.read_csv("bc_air_monitoring_stations.csv")

## Creating Tests

In [8]:
def mean(co_subset):
    """Return the average of the ``RAW_VALUE`` column of ``co_subset``."""
    raw_values = co_subset['RAW_VALUE']
    return raw_values.mean()

def sort(co_subset):
    """Return a copy of ``co_subset`` ordered by ascending ``RAW_VALUE``."""
    sort_keys = ['RAW_VALUE']
    return co_subset.sort_values(by=sort_keys, ascending=True)

def merge(co_subset, station_co_df):
    """Left-join ``co_subset`` with ``station_co_df`` on ``STATION_NAME``.

    Every row of ``co_subset`` is kept; the station columns are attached
    wherever the station name matches.
    """
    join_options = {'how': 'left', 'on': 'STATION_NAME'}
    return co_subset.merge(station_co_df, **join_options)

def filters(co_subset):
    """Return only the rows of ``co_subset`` measured at "Victoria Topaz".

    The filtered frame is returned so the function matches ``mean``,
    ``sort`` and ``merge``, which all hand their result back to the caller.
    Previously the result was assigned to a local and dropped.
    """
    # Boolean mask selecting the "Victoria Topaz" station
    is_victoria_topaz = co_subset['STATION_NAME'] == 'Victoria Topaz'
    return co_subset[is_victoria_topaz]
    
def all_tests(co_subset, station_co_df):
    """Run the mean, sort, merge and filter workloads back to back.

    Each intermediate result is bound to a local and then dropped; the
    function returns ``None``. It exists only so the combined work can be
    timed as one unit.
    """
    # Average of the raw CO values
    raw_average = co_subset['RAW_VALUE'].mean()

    # Rows ordered by ascending raw CO value
    ordered_df = co_subset.sort_values(by=['RAW_VALUE'], ascending=True)

    # Left join against the station table on the station name
    join_options = {'how': 'left', 'on': 'STATION_NAME'}
    joined_df = co_subset.merge(station_co_df, **join_options)

    # Rows recorded at the "Victoria Topaz" station
    is_victoria_topaz = co_subset['STATION_NAME'] == 'Victoria Topaz'
    topaz_df = co_subset[is_victoria_topaz]
    
def tester(test, co_subset, station_co_df, NUM_EXECUTIONS_PER_TEST):
    """Time one benchmark over several repetitions.

    Parameters
    ----------
    test : str
        Which workload to run: ``'mean'``, ``'sort'``, ``'merge'`` or
        ``'filter'``. Any other value (e.g. ``'all'``) runs ``all_tests``.
    co_subset : DataFrame
        Measurement rows handed to the workload.
    station_co_df : DataFrame
        Station table; only used by the merge and combined workloads.
    NUM_EXECUTIONS_PER_TEST : int
        Number of times the workload is executed inside the timed region.

    Returns
    -------
    tuple of (float, float)
        ``(total_time, avg_time)`` in seconds, where ``avg_time`` is
        ``total_time / NUM_EXECUTIONS_PER_TEST``.

    Raises
    ------
    ZeroDivisionError
        If ``NUM_EXECUTIONS_PER_TEST`` is 0.
    """
    # One zero-argument callable per named workload. This replaces five
    # copies of the same timing code, plus a trailing copy that could never
    # be reached because every branch returned first.
    workloads = {
        'mean': lambda: mean(co_subset),
        'sort': lambda: sort(co_subset),
        'merge': lambda: merge(co_subset, station_co_df),
        'filter': lambda: filters(co_subset),
    }

    # Unrecognised names fall back to the combined workload, as before.
    run_once = workloads.get(
        test, lambda: all_tests(co_subset, station_co_df)
    )

    # perf_counter is monotonic and high resolution, so the measured
    # interval is not affected by system clock adjustments.
    t0 = time.perf_counter()
    for _ in range(NUM_EXECUTIONS_PER_TEST):
        run_once()
    t1 = time.perf_counter()

    # Recording results
    total_time = t1 - t0
    avg_time = total_time / NUM_EXECUTIONS_PER_TEST

    return total_time, avg_time
        

In [9]:
# Starting row count for the benchmark. The driver loop doubles this
# before its first measurement, so the first subset has twice this many rows.
NUM_START_ROWS = 2500

# How many times each workload is repeated inside one timed measurement.
NUM_EXECUTIONS_PER_TEST = 3

# How many times the driver loop doubles the subset size (one round of
# benchmarks per doubling).
NUM_DSIZE_DOUBLINGS = 12

## Executing Tests

### RAPIDS

In [10]:
numRows = NUM_START_ROWS

# Seed the subset with the full dataset so that the "dataset exhausted"
# branch below has something to double even if the very first requested
# size already exceeds len(co_df). Previously that case hit an unbound name.
co_subset = co_df

# (label stored in the results, name understood by tester), in the order
# the records are written for every dataset size.
test_cases = [
    ('Mean', 'mean'),
    ('Sort', 'sort'),
    ('Merge', 'merge'),
    ('Filter', 'filter'),
    ('All', 'all'),
]

rapids = []
for i in range(0, NUM_DSIZE_DOUBLINGS):
    print('Test:', i)

    # The size is doubled before measuring, so the first round runs on
    # 2 * NUM_START_ROWS rows.
    numRows = numRows * 2

    if numRows > len(co_df):
        # Not enough source rows left: double the previous subset instead.
        # cudf.concat replaces DataFrame.append, which pandas 2.0 removed
        # and newer cuDF releases no longer provide.
        co_subset = cudf.concat([co_subset, co_subset])
    else:
        co_subset = co_df.iloc[0:numRows]

    # Run every benchmark on this subset and record one row per benchmark.
    for label, test_name in test_cases:
        total_time, avg_time = tester(
            test_name, co_subset, station_co_df, NUM_EXECUTIONS_PER_TEST
        )
        rapids.append({'Test': label, 'Total': total_time, 'Average': avg_time})

results_df_rapids = pd.DataFrame(rapids)

Test: 0


In [12]:
# Persist the timing table (columns Test, Total, Average) to disk.
results_df_rapids.to_csv('results_df_rapids_etl.csv')