In [1]:
import cudf
import os
import time
import datetime
import pandas

In [2]:
# Datasets are looked up in a "datasets" folder under the current user's
# home directory ("~" is expanded by os.path.expanduser).
home_dir = os.path.expanduser("~")
data_dir = os.path.join(home_dir, "datasets")

In [3]:
def get_csv_files(directory):
    """Recursively collect the ``.csv`` entries found under ``directory``.

    Args:
        directory: Directory to scan; anything ``os.scandir`` accepts
            (``str``, ``bytes`` or an ``os.PathLike`` such as ``os.DirEntry``).

    Returns:
        list of ``os.DirEntry``: one entry per path ending in ``.csv``,
        including entries found in nested sub-directories. Symlinked
        directories are not descended into. Order follows ``os.scandir``
        and is therefore arbitrary.

    Raises:
        OSError: if ``directory`` or one of its sub-directories cannot be
            scanned.
    """
    csv_files = []
    # The context manager closes the scandir iterator even when the walk is
    # interrupted by an exception (e.g. an unreadable sub-directory).
    with os.scandir(directory) as entries:
        for entry in entries:
            if entry.is_dir(follow_symlinks=False):
                csv_files.extend(get_csv_files(entry.path))
            elif entry.path.endswith(".csv"):
                csv_files.append(entry)

    return csv_files
 

In [5]:
# Find the largest CSV under data_dir; later cells load biggest_file.path.
biggest_size = 0
# Initialised explicitly so that a stale value from an earlier kernel run
# (or an undefined name when nothing is found) cannot leak into later cells.
biggest_file = None

for csv_file in get_csv_files(data_dir):
    csv_file_name = csv_file.path
    csv_file_size = csv_file.stat(follow_symlinks=False).st_size
    print("file: {}, size: {} (bytes)".format(csv_file_name, csv_file_size))
    # "is None" lets the first file win even when every file is empty (size 0).
    if biggest_file is None or csv_file_size > biggest_size:
        biggest_size = csv_file_size
        biggest_file = csv_file

# Fail here with a clear message instead of a NameError a few lines later.
if biggest_file is None:
    raise FileNotFoundError("no .csv files found under {}".format(data_dir))

print("\nbiggest file: {}".format(biggest_file.path))
print("size: {} (bytes)".format(biggest_size))

file: /home/badscooter23/datasets/airlines/DelayedFlights.csv, size: 247963212 (bytes)
file: /home/badscooter23/datasets/celeba-dataset/list_eval_partition.csv, size: 2836404 (bytes)
file: /home/badscooter23/datasets/celeba-dataset/list_landmarks_align_celeba.csv, size: 9932092 (bytes)
file: /home/badscooter23/datasets/celeba-dataset/list_bbox_celeba.csv, size: 5390926 (bytes)
file: /home/badscooter23/datasets/celeba-dataset/list_attr_celeba.csv, size: 24913339 (bytes)
file: /home/badscooter23/datasets/celeba-dataset/customer.csv, size: 779474 (bytes)

biggest file: /home/badscooter23/datasets/airlines/DelayedFlights.csv
size: 247963212 (bytes)


In [6]:
# Time how long cudf takes to read the largest CSV.
# perf_counter() is a monotonic, high-resolution clock, so the interval is not
# distorted by system clock adjustments the way time.time() can be.
start_time = time.perf_counter()
gpu_df = cudf.read_csv(biggest_file.path)
gpu_read_time = time.perf_counter() - start_time

In [7]:
# Time how long pandas takes to read the same CSV.
# perf_counter() is a monotonic, high-resolution clock, so the interval is not
# distorted by system clock adjustments the way time.time() can be.
start_time = time.perf_counter()
pd_df = pandas.read_csv(biggest_file.path)
pd_read_time = time.perf_counter() - start_time

In [8]:
def ratio(gpu_time, pandas_time):
    """Return how many times longer the pandas timing is than the GPU timing.

    A result above 1 means the GPU timing was the smaller of the two; a
    result below 1 means the pandas timing was smaller.
    """
    speedup = pandas_time / gpu_time
    return speedup


In [9]:
# ratio() returns pandas_time / gpu_time, so a value above 1 means the cudf
# read was faster and a value below 1 means it was slower.
print("GPU is {}x faster (or slower)...".format(round(ratio(gpu_read_time, pd_read_time), 2)))

GPU is 3.49x faster (or slower)...


In [10]:
def equivalent_to_n(gpu_calculated_value, pd_calculated_value, max_n=30):
    """Count the decimal places to which two values agree after rounding.

    Both values are rounded to 1, 2, ... ``max_n - 1`` decimal places in
    turn, and the scan stops at the first precision where the rounded
    values differ.

    Args:
        gpu_calculated_value: First number to compare.
        pd_calculated_value: Second number to compare.
        max_n: Exclusive upper bound on the precisions that are checked
            (defaults to 30, the previously hard-coded limit).

    Returns:
        int: The last precision checked before the rounded values first
        differ; 0 when they already differ at one decimal place; and
        ``max_n - 1`` when they agree at every checked precision.
    """
    for digits in range(1, max_n):
        if round(gpu_calculated_value, digits) != round(pd_calculated_value, digits):
            # The previous precision was the last one that matched.
            return digits - 1

    # No mismatch was found: every precision up to max_n - 1 agreed. (Falling
    # through to "digits - 1" here would under-report the agreement by one.)
    return max(max_n - 1, 0)


In [11]:
# Sanity check: these two values round identically up to 5 decimal places and
# first differ at 6, so 5 is the expected result.
print(equivalent_to_n(3.141516, 3.141520))

5


In [12]:
# Preview the first rows of the frame loaded with cudf.
gpu_df.head()

Unnamed: 0.1,Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,0,2008,1,3,4,2003.0,1955,2211.0,2225,WN,...,4.0,8.0,0,N,0,,,,,
1,1,2008,1,3,4,754.0,735,1002.0,1000,WN,...,5.0,10.0,0,N,0,,,,,
2,2,2008,1,3,4,628.0,620,804.0,750,WN,...,3.0,17.0,0,N,0,,,,,
3,4,2008,1,3,4,1829.0,1755,1959.0,1925,WN,...,3.0,10.0,0,N,0,2.0,0.0,0.0,0.0,32.0
4,5,2008,1,3,4,1940.0,1915,2121.0,2110,WN,...,4.0,10.0,0,N,0,,,,,


In [13]:
# List the column labels of the frame loaded with cudf.
gpu_df.columns

Index(['Unnamed: 0', 'Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime',
       'CRSDepTime', 'ArrTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum',
       'TailNum', 'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay',
       'DepDelay', 'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut',
       'Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay',
       'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay'],
      dtype='object')

In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) per column, as
# shown in the recorded output below.
gpu_df.describe()

Unnamed: 0.1,Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,FlightNum,...,Distance,TaxiIn,TaxiOut,Cancelled,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
count,1936758.0,1936758.0,1936758.0,1936758.0,1936758.0,1936758.0,1936758.0,1929648.0,1936758.0,1936758.0,...,1936758.0,1929648.0,1936303.0,1936758.0,1936758.0,1247488.0,1247488.0,1247488.0,1247488.0,1247488.0
mean,3341651.0,2008.0,6.111106,15.75347,3.984827,1518.534,1467.473,1610.141,1634.225,2184.263,...,765.6862,6.812975,18.2322,0.000327,0.004004,19.1794,3.703571,15.02163,0.090137,25.29647
std,2066065.0,0.0,3.482546,8.776272,1.995966,450.4853,424.7668,548.1781,464.6347,1944.702,...,574.4797,5.273595,14.33853,0.018076,0.063147,43.54621,21.4929,33.83305,2.022714,42.05486
min,0.0,2008.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,...,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1517452.0,2008.0,3.0,8.0,2.0,1203.0,1135.0,1316.0,1325.0,610.0,...,338.0,4.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3242558.0,2008.0,6.0,16.0,4.0,1545.0,1510.0,1715.0,1705.0,1543.0,...,606.0,6.0,14.0,0.0,0.0,2.0,0.0,2.0,0.0,8.0
75%,4972467.0,2008.0,9.0,23.0,6.0,1900.0,1815.0,2030.0,2014.0,3422.0,...,998.0,8.0,21.0,0.0,0.0,21.0,0.0,15.0,0.0,33.0
max,7009727.0,2008.0,12.0,31.0,7.0,2400.0,2359.0,2400.0,2400.0,9742.0,...,4962.0,240.0,422.0,1.0,1.0,2436.0,1352.0,1357.0,392.0,1316.0


In [18]:
# NOTE(review): this call fails on the cudf DataFrame; the recorded output is
# "AttributeError: 'DataFrame' object has no attribute 'profile_report'".
# Presumably profile_report comes from a pandas-only profiling extension that
# is not imported here - confirm, then remove this cell or run it on pd_df.
gpu_df.profile_report()

AttributeError: 'DataFrame' object has no attribute 'profile_report'

In [87]:
# Time a per-column sum over the cudf frame. perf_counter() is used because
# it is monotonic and high-resolution, unlike the wall clock time.time().
start_time = time.perf_counter()
# 'image_id' gets a placeholder 0 instead of being summed.
# NOTE(review): presumably because it is a non-numeric identifier column in
# one of the datasets - confirm; other non-numeric columns are still summed.
gpu_sum = [0 if c == 'image_id' else gpu_df[c].sum() for c in gpu_df.columns]
gpu_sum_time = time.perf_counter() - start_time
    

In [88]:
# Time a per-column sum over the pandas frame. perf_counter() is used because
# it is monotonic and high-resolution, unlike the wall clock time.time().
start_time = time.perf_counter()
# 'image_id' gets a placeholder 0 instead of being summed.
# NOTE(review): presumably because it is a non-numeric identifier column in
# one of the datasets - confirm; other non-numeric columns are still summed.
pd_sum = [0 if c == 'image_id' else pd_df[c].sum() for c in pd_df.columns]
pd_sum_time = time.perf_counter() - start_time


In [89]:
# ratio() returns pandas_time / gpu_time, so a value below 1 means the cudf
# column sums took longer than the pandas ones.
print("GPU is {}x faster (or slower)...".format(round(ratio(gpu_sum_time, pd_sum_time), 2)))

GPU is 0.79x faster (or slower)...


In [94]:
# Consistency checks, one result per output line: both sum lists have the same
# length, and each list has one entry per column of its source frame.
print(
    len(pd_sum) == len(gpu_sum),
    len(pd_sum) == pd_df.shape[1],
    pd_df.shape,
    len(gpu_sum) == gpu_df.shape[1],
    gpu_df.shape,
    sep="\n",
)

True
True
(202599, 41)
True
(202599, 41)


In [98]:
# Report, position by position, whether the cudf and pandas sums are exactly equal.
for i, gpu_value in enumerate(gpu_sum):
    pd_value = pd_sum[i]
    print("equal: {}, cudf sum: {} pandas sum: {}".format(gpu_value == pd_value, gpu_value, pd_value))

equal: True, cudf sum: 0 pandas sum: 0
equal: True, cudf sum: -157567 pandas sum: -157567
equal: True, cudf sum: -94419 pandas sum: -94419
equal: True, cudf sum: 5067 pandas sum: 5067
equal: True, cudf sum: -119707 pandas sum: -119707
equal: True, cudf sum: -193505 pandas sum: -193505
equal: True, cudf sum: -141181 pandas sum: -141181
equal: True, cudf sum: -105029 pandas sum: -105029
equal: True, cudf sum: -107567 pandas sum: -107567
equal: True, cudf sum: -105655 pandas sum: -105655
equal: True, cudf sum: -142633 pandas sum: -142633
equal: True, cudf sum: -181975 pandas sum: -181975
equal: True, cudf sum: -119455 pandas sum: -119455
equal: True, cudf sum: -144993 pandas sum: -144993
equal: True, cudf sum: -179273 pandas sum: -179273
equal: True, cudf sum: -183681 pandas sum: -183681
equal: True, cudf sum: -176213 pandas sum: -176213
equal: True, cudf sum: -177167 pandas sum: -177167
equal: True, cudf sum: -185601 pandas sum: -185601
equal: True, cudf sum: -45819 pandas sum: -45819
eq