In [2]:
import cudf
import os
import time
import datetime
import pandas

In [3]:
# Build the dataset location from the current user's home directory ("~")
# so the path is not tied to one particular account.
home_dir = os.path.expanduser("~")
data_dir = os.path.join(home_dir, "datasets")
# Echo the resolved path so the reader can see which directory will be scanned.
print("data directory: {}".format(data_dir))

data directory: /home/badscooter23/datasets


In [None]:
def get_csv_files(directory):
    """Recursively collect the ``.csv`` files found beneath ``directory``.

    Args:
        directory: Path of the directory to search (``str`` or any path-like
            object accepted by ``os.scandir``).

    Returns:
        list of ``os.DirEntry``: one entry for every path ending in ``.csv``
        (case-sensitive match). Matches from a sub-directory are inserted at
        the point where that sub-directory is encountered during the scan.

    Raises:
        OSError: propagated from ``os.scandir`` (e.g. the directory does not
            exist or cannot be read).
    """
    csv_files = []
    # The context manager closes the scandir iterator, and the directory
    # handle behind it, even when the walk is interrupted by an exception.
    with os.scandir(directory) as entries:
        for entry in entries:
            # follow_symlinks=False: symlinked directories are not descended
            # into, which also rules out symlink cycles.
            if entry.is_dir(follow_symlinks=False):
                csv_files.extend(get_csv_files(entry.path))
            elif entry.path.endswith(".csv"):
                csv_files.append(entry)

    return csv_files
 

In [None]:
# Walk every CSV under data_dir, report each one, and remember the largest.
biggest_size = 0
# Reset explicitly: a stale value left in the kernel by an earlier run must not
# be reused, and an empty search has to be detectable after the loop.
biggest_file = None

for csv_file in get_csv_files(data_dir):
    csv_file_name = csv_file.path
    # follow_symlinks=False: a symlinked CSV reports the size of the link itself.
    csv_file_size = csv_file.stat(follow_symlinks=False).st_size
    print("file: {}, size: {} (bytes)".format(csv_file_name, csv_file_size))
    # The "is None" test lets a zero-byte file be chosen when nothing larger exists.
    if biggest_file is None or csv_file_size > biggest_size:
        biggest_size = csv_file_size
        biggest_file = csv_file

if biggest_file is None:
    # Fail with an explicit message instead of a NameError on the prints below.
    raise FileNotFoundError("no .csv files found under {}".format(data_dir))

print("\nbiggest file: {}".format(biggest_file.path))
print("size: {} (bytes)".format(biggest_size))
        
        
        

   
    

In [None]:
# Time how long cuDF takes to load the selected CSV.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
gpu_df = cudf.read_csv(biggest_file.path)
gpu_read_time = time.perf_counter() - start_time  # elapsed seconds

In [None]:
# Time how long pandas takes to load the same CSV, for comparison with cuDF.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
pd_df = pandas.read_csv(biggest_file.path)
pd_read_time = time.perf_counter() - start_time  # elapsed seconds

In [None]:
def ratio(gpu_time, pandas_time):
    """Return the pandas timing expressed as a multiple of the GPU timing.

    A result above 1 means the GPU timing was the smaller of the two; a result
    below 1 means the pandas timing was smaller.

    Raises:
        ZeroDivisionError: if ``gpu_time`` is zero.
    """
    speedup = pandas_time / gpu_time
    return speedup


In [None]:
# CSV read comparison: pandas read time divided by cuDF read time, rounded to
# two places. Above 1 means the cuDF read took less time; below 1 means pandas did.
print("GPU is {}x faster (or slower)...".format(round(ratio(gpu_read_time, pd_read_time), 2)))

In [None]:
def equivalent_to_n(gpu_calculated_value, pd_calculated_value):
    """Count the decimal places to which two values agree after rounding.

    Both values are rounded to 1, 2, ... decimal places in turn and compared
    at each precision until they first disagree.

    Args:
        gpu_calculated_value: first number to compare.
        pd_calculated_value: second number to compare.

    Returns:
        int: the last precision at which the rounded values were still equal.
        0 means they already differ at one decimal place; ``MAX_N - 1`` (29)
        means they matched at every precision that was checked.
    """
    MAX_N = 30
    for digits in range(1, MAX_N):
        if round(gpu_calculated_value, digits) != round(pd_calculated_value, digits):
            # First disagreement: the previous precision was the last match.
            return digits - 1

    # No disagreement at any checked precision (1 .. MAX_N - 1). Returning
    # digits - 1 here would under-report the agreement by one place.
    return MAX_N - 1


In [None]:
# Quick check of equivalent_to_n: these two values round to the same number at
# 1 through 5 decimal places and first differ at 6, so this prints 5.
print(equivalent_to_n(3.141516, 3.141520))

In [None]:
# Preview the leading rows of the cuDF frame to eyeball what was loaded.
gpu_df.head()

In [None]:
# Show the column labels; the per-column sum cells iterate over these and
# special-case the 'image_id' column.
gpu_df.columns

In [None]:
# Time a column-by-column sum over the cuDF frame.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
gpu_sum = []
for c in gpu_df.columns:
    if c == 'image_id':
        # 'image_id' is not summed (presumably a non-numeric identifier --
        # TODO confirm); the 0 placeholder keeps one entry per column.
        gpu_sum.append(0)
    else:
        gpu_sum.append(gpu_df[c].sum())
gpu_sum_time = time.perf_counter() - start_time  # elapsed seconds
    

In [None]:
# Time a column-by-column sum over the pandas frame.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
pd_sum = []
for c in pd_df.columns:
    if c == 'image_id':
        # 'image_id' is not summed (presumably a non-numeric identifier --
        # TODO confirm); the 0 placeholder keeps one entry per column.
        pd_sum.append(0)
    else:
        pd_sum.append(pd_df[c].sum())
pd_sum_time = time.perf_counter() - start_time  # elapsed seconds


In [None]:
# Column-sum comparison: pandas sum time divided by cuDF sum time, rounded to
# two places. Above 1 means cuDF took less time; below 1 means pandas did.
print("GPU is {}x faster (or slower)...".format(round(ratio(gpu_sum_time, pd_sum_time), 2)))

In [None]:
# Sanity checks on the two result lists. Each comparison prints True or False.
# Both lists should be the same length...
print(len(pd_sum)==len(gpu_sum))
# ...and each list should hold one entry per column (shape[1]) of its frame.
print(len(pd_sum)==pd_df.shape[1])
print(pd_df.shape)
print(len(gpu_sum)==gpu_df.shape[1])
print(gpu_df.shape)

In [None]:
# Line up the cuDF and pandas totals column by column and report whether each
# pair compares exactly equal.
for i, gpu_total in enumerate(gpu_sum):
    pd_total = pd_sum[i]
    print("equal: {}, cudf sum: {} pandas sum: {}".format(gpu_total == pd_total, gpu_total, pd_total))