In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
file_name = "/content/drive/My Drive/data/data.csv"

In [0]:
import os

In [5]:
print ("Size of file is {} MB".format(os.path.getsize(file_name) >> 20))

Size of file is 271 MB


In [6]:
!nvidia-smi 

Sun Jan 19 18:13:52 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.44       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   48C    P8    10W /  70W |      0MiB / 15079MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [7]:
!nvcc -V

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Sat_Aug_25_21:08:01_CDT_2018
Cuda compilation tools, release 10.0, V10.0.130


In [8]:
!python -V; pip -V

Python 3.6.9
pip 19.3.1 from /usr/local/lib/python3.6/dist-packages/pip (python 3.6)


In [9]:
!pip install numba



In [0]:
import os
os.environ['NUMBAPRO_NVVM']='/usr/local/cuda/nvvm/lib64/libnvvm.so'
os.environ['NUMBAPRO_LIBDEVICE']='/usr/local/cuda/nvvm/libdevice/'

In [11]:
!pip install cudf-cuda100



In [0]:
!cp /usr/local/lib/python3.6/dist-packages/librmm.so .

In [0]:
# setting up the required environments
import os
os.environ['NUMBAPRO_NVVM']='/usr/local/cuda-10.0/nvvm/lib64/libnvvm.so'
os.environ['NUMBAPRO_LIBDEVICE']='/usr/local/cuda-10.0/nvvm/libdevice'

In [0]:
import time
import numpy as np
import pandas as pd
import cudf

# Read Time Comparision

In [15]:
s = time.time()
df_pandas = pd.read_csv(file_name)
e = time.time()
pd_time = e - s
print("Pandas Loading Time = {}".format(pd_time))

Pandas Loading Time = 4.076458692550659


In [16]:
s = time.time()
df_cudf = cudf.read_csv(file_name)
e = time.time()
cuDF_time = e - s 
print("cuDF Loading Time = {}".format(cuDF_time))

cuDF Loading Time = 0.9165005683898926


In [17]:
pd_time > cuDF_time

True

In [18]:
pd_time / cuDF_time

4.447851788801592

# Concat Time Comparision

In [19]:
s = time.time()
df_pandas_concat = pd.concat([df_pandas for _ in range(5)])
e = time.time()
pd_time = e - s
print("Pandas Concat Time = {}".format(pd_time))

Pandas Concat Time = 2.448513984680176


In [20]:
s = time.time()
df_cudf_concat = cudf.concat([df_cudf for _ in range(5)])
e = time.time()
cuDF_time = e - s
print("cuDF Concat Time = {}".format(cuDF_time))

cuDF Concat Time = 0.24041986465454102


In [21]:
pd_time > cuDF_time

True

In [22]:
pd_time / cuDF_time

10.184324777815021

# Mean Value Time Comparision

In [23]:
s = time.time()
mean_pandas = df_pandas["event_time"].mean()
e = time.time()
pd_time = e - s
print ("Mean value in Pandas Dataframe = {}".format(mean_pandas))
print("Pandas Mean Value Calculation Time = {}".format(pd_time))

Mean value in Pandas Dataframe = 501.0634608268819
Pandas Mean Value Calculation Time = 0.01664280891418457


In [24]:
s = time.time()
max_cudf = df_cudf["event_time"].mean()
e = time.time()
cuDF_time = e-s
print ("Mean value in DataTable Dataframe = {}".format(max_cudf))
print("DataTable Mean Value Calculation Time = {}".format(cuDF_time))

Mean value in DataTable Dataframe = 501.0634608268819
DataTable Mean Value Calculation Time = 0.003912210464477539


In [25]:
pd_time > cuDF_time

True

In [26]:
pd_time / cuDF_time

4.254067889572795

# Functions of cuDF 

In [29]:
dir(df_cudf["event_time"])

['__add__',
 '__array_ufunc__',
 '__bool__',
 '__class__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__div__',
 '__doc__',
 '__eq__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__new__',
 '__pow__',
 '__radd__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rfloordiv__',
 '__rmod__',
 '__rmul__',
 '__rsub__',
 '__rtruediv__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__sub__',
 '__subclasshook__',
 '__truediv__',
 '__weakref__',
 '_binaryop',
 '_column',
 '_concat',
 '_copy_construct',
 '_copy_construct_defaults',
 '_get_mask_as_series',
 '_index',
 '_n_largest_or_smallest',
 '_normalize_binop_value',
 '_ordered_compare',
 '_rbinaryop',
 '_sort',
 '_unaryop',
 '_unordered_compare',
 'append',
 'applymap',
 'argsort',
 'as_index',
 'as_mask',
 'astype',
 'cat',
 'ceil',


# Sorting

In [30]:
s = time.time()
df_pandas.sort_values(by='event_time')
e = time.time()
pd_time = e - s
print("Pandas Sorting Time = {}".format(pd_time))

Pandas Sorting Time = 3.7633862495422363


In [31]:
s = time.time()
df_cudf['event_time'].sort_values()
e = time.time()
cuDF_time = e-s
print("cuDF Sorting Time = {}".format(cuDF_time))

cuDF Sorting Time = 0.30028295516967773


In [32]:
pd_time > cuDF_time

True

In [33]:
pd_time / cuDF_time

12.532800096547936