# Arkouda Lightning Tutorial

## Connect to the Arkouda server

In [None]:
import arkouda as ak
# Open the client session to the Arkouda server at localhost:5555.
ak.connect(server='localhost', port=5555)

In [None]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import time
import gc

## Read Parquet Taxi Cab Data

In [None]:
# Time how long ak.read_parquet takes over every file matching the glob.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# interval cannot be skewed by system clock adjustments the way time.time() can.
start = time.perf_counter()

data = ak.read_parquet('/lus/scratch/mcdonald/data20-24/*', has_non_float_nulls=True)

stop = time.perf_counter()
print(f"Read Parquet files execution time: {stop - start:.2f} seconds")

In [None]:
# Wrap the result of read_parquet in an Arkouda DataFrame.
data = ak.DataFrame(data)
# NOTE(review): the estimate assumes data.size is an element count and that
# each element occupies 4 bytes -- confirm against the actual column dtypes.
# Bytes -> GiB requires dividing by 1024**3; the previous 1024*1024 divisor
# produced MiB while the label says "GBs".
print(f"Total amount of data: {(data.size*4)/(1024**3):.2f} GBs")

In [None]:
# Bare expression: the notebook renders the DataFrame as this cell's output.
data

## Data Exploration

More information on the Arkouda API can be found at: https://bears-r-us.github.io/arkouda/

### Descriptive statistics

In [None]:
def describe(x):
    """Print the mean, standard deviation, minimum and maximum of ``x``.

    Each statistic is printed on its own line as ``name: value``. When
    ``x.dtype`` equals ``ak.float64`` the values are rendered with two
    decimal places; otherwise default formatting is used.

    Parameters
    ----------
    x
        Object exposing ``dtype``, ``mean()``, ``std()``, ``min()`` and
        ``max()``.
    """
    # Pick the per-value format spec once, then build the four-line template.
    spec = '{:.2f}' if x.dtype == ak.float64 else '{}'
    labels = ('mean', 'std', 'min', 'max')
    template = '\n'.join(f'{label}: {spec}' for label in labels)
    print(template.format(x.mean(), x.std(), x.min(), x.max()))

In [None]:
# Print summary statistics for the 'fare_amount' column.
describe(data['fare_amount'])

#### Histogram

In [None]:
import numpy as np
from matplotlib import pyplot as plt

def hist(x, bins, log=True):
    """Draw a histogram of ``x`` as a bar chart on the current matplotlib axes.

    The binning is done by ``ak.histogram``; only the resulting counts and
    bin edges are converted to NumPy arrays for plotting.

    Parameters
    ----------
    x
        Array passed to ``ak.histogram``.
    bins : int
        Number of bins; must be positive.
    log : bool, default True
        If true, the y-axis is switched to a logarithmic scale.

    Raises
    ------
    ValueError
        If ``bins`` is not positive.
    """
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not bins > 0:
        raise ValueError(f"bins must be positive, got {bins!r}")
    # Keep the edges under their own name rather than shadowing the parameter.
    counts, edges = ak.histogram(x, bins)
    # edges[:-1] are the left bin edges, so anchor each bar at its left edge;
    # the default align='center' would shift every bar left by half a bin.
    plt.bar(
        edges[:-1].to_ndarray(),
        counts.to_ndarray(),
        width=edges[1] - edges[0],
        align='edge',
    )
    if log:
        plt.yscale('log')

In [None]:
# Plot the fare distribution using 100 bins (log-scaled y-axis by default).
hist(data['fare_amount'], bins=100)

#### GroupBy: Construct a Graph

In [None]:
# Time the construction of a GroupBy keyed on the
# (PULocationID, DOLocationID) column pair.
# perf_counter() is the monotonic clock intended for measuring intervals.
start = time.perf_counter()

byloc = ak.GroupBy([data['PULocationID'], data['DOLocationID']])

stop = time.perf_counter()
# NOTE(review): two key columns at an assumed 4 bytes per element -- confirm
# against the column dtypes. Bytes -> GiB requires 1024**3; the previous
# 1024*1024 divisor produced MiB under a "GBs" label.
print(f"Total amount of data: {(data['PULocationID'].size*4*2)/(1024**3):.2f} GBs")
# Same "<n.nn> seconds" format as the other timing cells.
print(f"GroupBy execution time: {stop - start:.2f} seconds")

#### Broadcast: Find Rides with Anomalous Fares

In [None]:
# byloc groups on two columns, so the first element unpacks into two arrays
# (u, v); w is the second element returned by size().
# Presumably u/v are the unique key pairs and w the row count per pair --
# confirm against the GroupBy.size documentation.
(u, v), w = byloc.size()

In [None]:
# Time the per-group mean of 'fare_amount'; only the second element of the
# returned pair (the aggregated values) is kept as `mf`.
# perf_counter() is the monotonic clock intended for measuring intervals.
start = time.perf_counter()

_, mf = byloc.mean(data['fare_amount'])

stop = time.perf_counter()
print(f"Mean execution time: {stop - start:.2f} seconds")

In [None]:
# Time the per-group standard deviation of 'fare_amount'.
# perf_counter() is the monotonic clock intended for measuring intervals.
start = time.perf_counter()

# E[x^2] - E[x]^2 is the per-group *variance*.
sf = (byloc.sum(data['fare_amount']**2)[1] / w) - mf**2
# Floating-point cancellation can leave tiny negative variances; clamp them to
# zero so the square root below never yields NaN, then take the root to get
# the standard deviation that the name, the message and later cells expect.
sf = ak.where(sf > 0, sf, 0.0) ** 0.5

stop = time.perf_counter()
print(f"Standard dev execution time: {stop - start:.2f} seconds")

In [None]:
# Attach the per-group statistics (mf, sf) to the DataFrame as new columns by
# broadcasting each group's value through the GroupBy.
# NOTE(review): permute=True is expected to return the values in the original
# row order so they line up with the other columns -- confirm against the
# GroupBy.broadcast documentation.
data['fare_mean'] = byloc.broadcast(mf, permute=True)
data['fare_std'] = byloc.broadcast(sf, permute=True)

In [None]:
# Z-score-style deviation of each fare from its group's mean.
# The +1 in the denominator presumably guards against division by zero for
# groups with no spread; note that it also damps every score.
data['fare_z'] = (data['fare_amount'] - data['fare_mean']) / (data['fare_std'] + 1)

In [None]:
# Bare expression: the notebook renders the DataFrame, now including the
# fare_mean, fare_std and fare_z columns, as this cell's output.
data

In [None]:
# Plot the distribution of fare z-scores using 100 bins.
hist(data['fare_z'], bins=100)

In [None]:
# Row index holding the largest fare z-score.
worst = data['fare_z'].argmax()
# Show that row as a {column name: value} mapping (rendered as the cell output).
{column: values[worst] for column, values in data.items()}