# Arkouda Lightning Tutorial

## Connect to the Arkouda server

In [None]:
import arkouda as ak
# Open the client session to the Arkouda server on this machine.
ak.connect(server='localhost', port=5555)

In [None]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import time
import gc

## Read Parquet Taxi Cab Data

In [None]:
# Time how long the server takes to load every file matched by the glob.
start = time.time()
data = ak.read(
    '/lus/scratch/mcdonald/data20-24/*',
    has_non_float_nulls=True,
)
stop = time.time()

elapsed = stop - start
print(f"Read Parquet files execution time: {elapsed:.2f} seconds")

In [None]:
# Display what ak.read returned (the notebook renders the last expression).
data

In [None]:
# Wrap the columns returned by ak.read in an Arkouda DataFrame.
data = ak.DataFrame(data)
# Rough size estimate at an assumed 4 bytes per element
# (assumes data.size is an element count -- TODO confirm).
# Bytes must be divided by 1024**3, not 1024**2, to match the "GBs" label.
print(f"Total amount of data: {(data.size * 4)/(1024**3):.2f} GBs")

In [None]:
# Display the taxi data now that it is wrapped in an ak.DataFrame.
data

## Data Exploration

More information on the Arkouda API can be found at: https://bears-r-us.github.io/arkouda/

### Descriptive statistics

In [None]:
def describe(x):
    """Print the mean, standard deviation, minimum and maximum of ``x``.

    ``x`` must provide ``dtype``, ``mean()``, ``std()``, ``min()`` and
    ``max()``. When its dtype equals ``ak.float64`` each statistic is printed
    with two decimal places; otherwise default formatting is used.
    """
    labels = ('mean', 'std', 'min', 'max')
    spec = '{:.2f}' if x.dtype == ak.float64 else '{}'
    template = '\n'.join(f'{label}: {spec}' for label in labels)
    print(template.format(x.mean(), x.std(), x.min(), x.max()))

In [None]:
# Summary statistics for the fare column.
describe(data['fare_amount'])

#### Histogram

Arkouda supports transferring distributed arrays back to the client through ZMQ (a messaging layer, similar to TCP sockets) and converting them to NumPy ndarrays.

This can be useful for taking a portion of an Arkouda array to operate on at a smaller scale and interoperate with existing Python tools, like PySpark. In this example, the histogram of a single column of our Arkouda DataFrame is converted to NumPy arrays so it can be plotted with Matplotlib.

In [None]:
import numpy as np
from matplotlib import pyplot as plt

def hist(x, bins, log=True):
    """Plot a histogram of an Arkouda array with Matplotlib.

    ``ak.histogram`` does the binning; only its two results (per-bin counts
    and bin edges) are converted to NumPy arrays for plotting.

    Parameters
    ----------
    x
        Array handed to ``ak.histogram``.
    bins : int
        Number of bins; must be positive.
    log : bool, default True
        Draw the y axis on a logarithmic scale.
    """
    assert bins > 0, "bins must be a positive integer"
    counts, edges = ak.histogram(x, bins)
    left_edges = edges[:-1].to_ndarray()
    # The x positions are left bin edges, so anchor each bar on its left
    # edge. The default align='center' would shift every bar half a bin left.
    plt.bar(left_edges, counts.to_ndarray(), width=edges[1] - edges[0], align='edge')
    if log:
        plt.yscale('log')

In [None]:
# Histogram of fares in 100 bins (log-scaled y axis by default).
hist(data['fare_amount'], 100)

# Taxi Zone Lookup Table

### Use method 1: CSV -> Pandas -> Arkouda
Arkouda supports working with Pandas DataFrames, which can be used to augment large Arkouda pdarrays or DataFrames, or converted to Arkouda pdarrays to get better performance.

In [None]:
import pandas as pd

def cvt_to_string(v):
    """Return ``v`` as a string, mapping empty or unconvertible values to 'N/A'.

    Used as a ``pd.read_csv`` converter so text columns never contain empty
    cells.
    """
    try:
        return 'N/A' if v == '' else str(v)
    except Exception:
        # Best effort: a value whose comparison or str() fails becomes 'N/A'.
        # Unlike a bare ``except:``, KeyboardInterrupt/SystemExit still propagate.
        return 'N/A'

# Run the text columns through cvt_to_string so empty cells become 'N/A'.
# The key must be 'service_zone' (underscore) to match the column name used
# for the padding row below; 'service zone' matched no such column.
cvt = {'Borough': cvt_to_string, 'Zone': cvt_to_string, 'service_zone': cvt_to_string}
tzlut = pd.read_csv('./taxi_zone_lookup.csv', converters=cvt)

# Prepend a LocationID 0 row (presumably the CSV starts at 1 -- verify) so a
# LocationID can serve as a row index. 'Zone' is filled in as well; omitting
# it makes concat insert a NaN into an otherwise all-string column.
top_row = pd.DataFrame(
    {'LocationID': [0], 'Borough': ['N/A'], 'Zone': ['N/A'], 'service_zone': ['N/A']}
)
tzlut = pd.concat([top_row, tzlut]).reset_index(drop=True)

In [None]:
# Display the pandas lookup table, including the prepended LocationID 0 row.
tzlut

#### Convert Pandas DF to Arkouda DF

In [None]:
def ak_create_from_df(df):
    """Build an Arkouda DataFrame from the columns of a pandas DataFrame.

    Columns whose pandas dtype is ``object`` are converted with ``dtype=str``;
    all other columns go through ``ak.from_series`` with its defaults.
    """
    def convert(series):
        if series.dtype.name == 'object':
            return ak.from_series(series, dtype=str)
        return ak.from_series(series)

    return ak.DataFrame({name: convert(df[name]) for name in df.keys()})

In [None]:
# Convert the pandas lookup table to an Arkouda DataFrame.
aktzlut = ak_create_from_df(tzlut)

In [None]:
# Display the Arkouda version of the lookup table.
aktzlut

#### Apply Lookup Table

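After ensuring that the lookup table's `LocationID` column is zero-based and contiguous (so that a location ID can be used directly as a row index), we can enrich our Arkouda DataFrame by broadcasting the lookup values onto every ride.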

In [None]:
# True only when row i of the lookup table holds LocationID i, which is what
# allows a LocationID to be used directly as a row index.
location_ids = aktzlut['LocationID']
expected_ids = ak.arange(location_ids.size)
(location_ids == expected_ids).all()

In [None]:
# Label every ride with its pickup (PU) and drop-off (DO) borough by indexing
# the Borough column with the ride's location ids.
borough_by_id = aktzlut['Borough']
pickup_ids, dropoff_ids = data['PULocationID'], data['DOLocationID']
data['PUBorough'] = borough_by_id[pickup_ids]
data['DOBorough'] = borough_by_id[dropoff_ids]

In [None]:
# Label every ride with its pickup (PU) and drop-off (DO) zone by indexing
# the Zone column with the ride's location ids.
zone_by_id = aktzlut['Zone']
pickup_ids, dropoff_ids = data['PULocationID'], data['DOLocationID']
data['PUZone'] = zone_by_id[pickup_ids]
data['DOZone'] = zone_by_id[dropoff_ids]

In [None]:
# Display the rides with the borough and zone columns attached.
data

#### GroupBy: Construct a Graph

In [None]:
import time

start = time.time()
# Group rides by their (pickup, drop-off) location pair.
byloc = ak.GroupBy([data['PULocationID'], data['DOLocationID']])
stop = time.time()
# Two key columns at an assumed 4 bytes per element. Bytes are divided by
# 1024**3 (not 1024**2) so the figure matches the "GBs" label.
print(f"Total amount of data: {(data['PULocationID'].size*4*2)/(1024**3):.2f} GBs")
# Same formatting and units as the other timing messages in this notebook.
print(f"GroupBy execution time: {stop - start:.2f} seconds")

#### Broadcast: Find Rides with Anomalous Fares
Compute mean and stddev of fare by (pickup, dropoff)

In [None]:
start = time.time()

# Mean fare per (pickup, drop-off) group; the group keys returned first are discarded.
_, mf = byloc.mean(data['fare_amount'])

stop = time.time()
# Two key columns at an assumed 4 bytes per element. Bytes are divided by
# 1024**3 (not 1024**2) so the figure matches the "GBs" label.
print(f"Total amount of data: {(data['PULocationID'].size*4*2)/(1024**3):.2f} GBs")
print(f"Mean execution time: {stop - start:.2f} seconds")

In [None]:
start = time.time()

# u, v: the pickup and drop-off key of each group; w: number of rides in it.
(u, v), w = byloc.size()

stop = time.time()
# This cell times size(), not mean(); the label was a copy-paste leftover.
print(f"Size execution time: {stop - start:.2f} seconds")

In [None]:
start = time.time()

# Per-group variance via E[x^2] - E[x]^2.
fare_var = (byloc.sum(data['fare_amount']**2)[1] / w) - mf**2
# The standard deviation is the square root of the variance. Rounding can
# leave a tiny negative variance, so clamp at zero first to avoid NaN.
sf = ak.where(fare_var > 0, fare_var, 0.0) ** 0.5

stop = time.time()
print(f"Standard dev execution time: {stop - start:.2f} seconds")

Broadcast group values back to ride dataframe to compute z-score of rides

In [None]:
# Expand each per-group statistic to one value per ride. permute=True asks
# GroupBy to return the values in the order of the original rows.
per_ride_mean = byloc.broadcast(mf, permute=True)
per_ride_std = byloc.broadcast(sf, permute=True)
data['fare_mean'] = per_ride_mean
data['fare_std'] = per_ride_std

In [None]:
# Display the rides with the per-group fare_mean and fare_std columns added.
data

In [None]:
# Score each fare against the statistics of its (pickup, drop-off) group.
# The +1 in the denominator presumably guards against groups with zero spread.
deviation = data['fare_amount'] - data['fare_mean']
scale = data['fare_std'] + 1
data['fare_z'] = deviation / scale

In [None]:
# Histogram of the fare z-scores in 100 bins.
hist(data['fare_z'], 100)

#### Bring Small Result Set Back to Pandas
Now, we've explored our data, we've discovered insights, and now we know what we need to look at: rides that are extraordinarily expensive! 

To work with that subset as you would in any other Python code, a pdarray that lives on the server can be transferred to the client as a NumPy ndarray or a Pandas DataFrame.

In [None]:
# Boolean mask selecting rides whose fare z-score exceeds 2.
exorbitant = (data['fare_z'] > 2)
# Filter each column on the server, then transfer only the matching rows.
exdf = pd.DataFrame({k: v[exorbitant].to_ndarray() for k, v in data.items()})
# len(exdf) is the row count. The previous exdf.size/27 was only correct for
# exactly 27 columns and printed a float.
print(f"Number of exorbitant rows: {len(exdf)}")

In [None]:
# Display the first five exorbitant rides from the client-side pandas DataFrame.
exdf.head(5)

In [None]:
# Row position of the largest fare z-score.
worst = data['fare_z'].argmax()
# Show every column's value for that single ride.
{column: values[worst] for column, values in data.items()}