## Demo easy groupby and sorting with hyperplane 

In [1]:
# !pip install s3fs

In [1]:
import warnings
import os
import sys
import pandas as pd
import numpy as np
import dask
import dask.dataframe as dd
from dask.distributed import Client
from typing import List, Set, Dict, Tuple, Optional
import types
from google.cloud import storage
from tqdm.notebook import tqdm

# Plotting setup: select the 'Agg' backend before pyplot is imported, then
# enable inline figure rendering for the notebook.
# NOTE(review): the `%matplotlib inline` magic presumably replaces the 'Agg'
# selection inside a notebook kernel — confirm whether both lines are needed.
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt 
%matplotlib inline

# Let pandas render up to 999 rows before truncating a displayed frame.
pd.options.display.max_rows = 999
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

from hyperplane import notebook_common as nc

#### Below is the magic cell that sets parameters which can be passed in through pipeline jobs
Later, when this notebook is used in a production pipeline, the variables set in this cell can be overridden by adding a line like this to the GraphQL query:

`
parameters: {create: {key: "data_url", value: "some_other_data_url"}}
`


In [3]:
# Root location of the input CSV files (read below as f"{data_url}/*.csv").
# This is the notebook parameter that a pipeline job can override.
data_url = "s3://dask-data/airline-data"

#### Below is the one-liner that scales the job up to Kubernetes

In [4]:
# Sizing of the remote dask cluster, gathered in one place so it is easy to tune.
cluster_options = dict(
    nprocs=1,
    nthreads=15,
    ram_gb_per_proc=12,
    cores_per_worker=15,
    scheduler_deploy_mode="remote",
    num_workers=3,
)

# Start the cluster and keep both handles: `client` is used to run code on the
# workers, `cluster` is used to shut everything down at the end of the notebook.
client, cluster = nc.initialize_cluster(**cluster_options)


👉 Hyperplane: selecting worker node pool
👉 Hyperplane: selecting scheduler node pool
Creating scheduler pod on cluster. This may take some time.
👉 Hyperplane: spinning up a dask cluster with a scheduler as a standalone container.
👉 Hyperplane: In a few minutes you'll be able to access the dashboard at https://ds.hyperplane.dev/dask-cluster-abaf4d2e-0314-40e3-a134-e20c5d42357a/status
👉 Hyperplane: to get logs from all workers, do `cluster.get_logs()`


In [5]:
## install any necessary custom packages on the remote node image
def install_package_on_remote():
    """Install s3fs into the Python environment of the process running this function.

    Returns:
        int: exit code of the pip process (0 means success).
    """
    # Imports are local so the function is self-contained when shipped to a worker.
    import subprocess
    import sys

    # Use `python -m pip` with the running interpreter so the package lands in
    # the environment that imports it, rather than whichever `pip` executable
    # happens to be first on PATH.
    return subprocess.call([sys.executable, "-m", "pip", "install", "s3fs"])


# Run the installer on every worker; the result maps worker address -> exit code.
client.run(install_package_on_remote)

{'tcp://10.1.101.10:46191': 0,
 'tcp://10.1.102.9:39851': 0,
 'tcp://10.1.103.7:38527': 0}

In [6]:
%%time
df = dd.read_csv(f"{data_url}/*.csv", 
#                  blocksize = 25e6, 
                 storage_options = {'anon': True},
                usecols = ['DepTime','FlightNum','DepDelay','Origin', 'Dest','Distance'],
                dtype={'Distance': 'float64',
                      'DepTime':'float64',
                      'FlightNum':'int64',
                      'DepDelay':'float64',
                      'Dest':'object',
                      'Origin':'object'}, 
                encoding = "ISO-8859-1")

print(f"number of rows, {df.map_partitions(len).compute().sum()}")
print(f"total size {df.memory_usage_per_partition().compute().sum()/1024./1024./1024.} G")
df.head(2)


number of rows, 123534969
total size 5.522466823458672 G
CPU times: user 771 ms, sys: 96.4 ms, total: 867 ms
Wall time: 1min 37s


Unnamed: 0,DepTime,FlightNum,DepDelay,Origin,Dest,Distance
0,741.0,1451,11.0,SAN,SFO,447.0
1,729.0,1451,-1.0,SAN,SFO,447.0


In [7]:
%%time
# lazy groupby and sorting to get the 10 largest trade per ticker
df_sort = df.groupby('Origin').apply(lambda x : x.nlargest(n = 10, columns = 'Distance'))
df_sort

CPU times: user 21.1 ms, sys: 1.27 ms, total: 22.4 ms
Wall time: 20.9 ms


Unnamed: 0_level_0,DepTime,FlightNum,DepDelay,Origin,Dest,Distance
npartitions=196,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
,float64,int64,float64,object,object,float64
,...,...,...,...,...,...
...,...,...,...,...,...,...
,...,...,...,...,...,...
,...,...,...,...,...,...


In [8]:
%%time
# actual compute of the groupby sorting result
# .compute() runs the lazy pipeline held in `df_sort`; the materialised result
# is kept in `df_sort_local`.
df_sort_local = df_sort.compute()
# Display the materialised result.
df_sort_local

CPU times: user 1.21 s, sys: 51.1 ms, total: 1.26 s
Wall time: 2min 19s


Unnamed: 0_level_0,Unnamed: 1_level_0,DepTime,FlightNum,DepDelay,Origin,Dest,Distance
Origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
FAT,187185,630.0,6800,17.0,FAT,MEM,1666.0
FAT,187189,1420.0,6802,0.0,FAT,MEM,1666.0
FAT,188229,634.0,6800,21.0,FAT,MEM,1666.0
FAT,188233,1418.0,6802,0.0,FAT,MEM,1666.0
FAT,189301,636.0,6800,23.0,FAT,MEM,1666.0
...,...,...,...,...,...,...,...
ORD,37335,1026.0,1,-4.0,ORD,HNL,4243.0
ORD,37336,1032.0,1,2.0,ORD,HNL,4243.0
ORD,37337,1028.0,1,-2.0,ORD,HNL,4243.0
ORD,37338,1028.0,1,-2.0,ORD,HNL,4243.0


## Compare with pandas 
Note: the code below will crash the kernel or kill the instance with an out-of-memory error.

In [None]:
%%time
# NOTE(review): pandas is already imported in the first cell; this re-import is redundant.
import pandas as pd
# Materialise the entire dask DataFrame in the notebook process.
# WARNING: expect this to exhaust memory on a small instance.
df_pd = df.compute()

In [None]:
## crushed the kernel
%%time
df_sort_pd = df_pd.groupby('Origin').apply(lambda x : x.nlargest(n = 10, columns = 'Distance'))

## Close the cluster when done
It's a good idea to close the cluster after use. If you forget to add this cell, don't worry :) Hyperplane will automatically garbage-collect the node once it detects that it has been idle for a while.


In [9]:
# Shut down the remote dask cluster that was started earlier in this notebook.
# NOTE(review): `client` is left open here — consider calling client.close()
# first; confirm against Hyperplane's recommended teardown.
cluster.close()