# A billion stars in the Jupyter Notebook
## École Polytechnique - Paris  2018

In [None]:
# Smoke test: the cell's last expression (value 2) is echoed as its output.
1+1

In [None]:
# Display the local image file logo_polytechnique.png. The Image object is the
# cell's last expression, so the notebook renders it as the cell output.
import IPython.display
IPython.display.Image(filename="logo_polytechnique.png")

$ E \approx mc^2$

In [None]:
import vaex
import numpy as np
import matplotlib.pylab as plt
np.warnings.filterwarnings('ignore')

plt.style.use('bigfont')
%matplotlib inline

# Step 0: reading in data
vaex reads 'anything':
 * `ds = vaex.open('super_fast.hdf5')`
 * `ds = vaex.open('gadget_is_fine.hdf5')`
 * `ds = vaex.from_pandas(df)`
 * `ds = vaex.from_astropy_table(table)`
 * `ds = vaex.from_ascii('takes_hours.asc')`
 * `ds = vaex.from_csv('this_may_be_slow.csv')`
 * `ds = vaex.from_arrays(x=x, y=y)`

In [None]:
%%time
ds = vaex.open("/Users/maartenbreddels/datasets/nytaxi/nyc_taxi2015.hdf5")
!ls -lh /Users/maartenbreddels/datasets/nytaxi/nyc_taxi2015.hdf5

In [None]:
# Bare expression: display the notebook representation of the opened dataset.
ds

In [None]:
# Attribute access on the dataset gives the trip_distance column object;
# as the last expression it is displayed.
ds.trip_distance

In [None]:
# numpy functions can be applied directly to a column; the result is displayed.
# NOTE(review): presumably this builds a lazy expression rather than computing
# an array for every row — confirm against the vaex documentation.
np.log10(ds.trip_distance)

## 0 dimensional

In [None]:
# Count with no arguments (presumably the total number of rows — confirm).
ds.count()

In [None]:
# Count for a single column.
# NOTE(review): presumably this excludes missing/NaN values — confirm.
ds.count(ds.pickup_latitude)

In [None]:
# Mean of the pickup_latitude column, reduced to a single number.
ds.mean(ds.pickup_latitude)

## 1 dimensional

In [None]:
ds.count(binby=ds.pickup_latitude, limits=[40.5, 41])

In [None]:
plt.plot(_)

## 2 dimensional

In [None]:
# Counts binned by two columns at once (longitude, latitude) with shape=128,
# then print the shape of the resulting array.
counts2d = ds.count(binby=[ds.pickup_longitude, ds.pickup_latitude], shape=128)
print(counts2d.shape)

In [None]:
# Show the 2-d counts on a log scale. The +1 keeps empty bins finite
# (log10(1) == 0); .T transposes the grid before display and origin='lower'
# puts the first row at the bottom of the image.
plt.imshow(np.log10(counts2d+1).T, origin='lower')

In [None]:
# Ask vaex for limits of both columns using the "98%" specification
# (presumably the range containing 98% of the data — confirm in the vaex docs).
# `limits` is reused by the plotting cells further down.
limits = ds.limits([ds.pickup_longitude, ds.pickup_latitude], "98%")
limits

In [None]:
%%time
# Timed 2-d plot of longitude vs latitude within `limits`, with f="log1p"
# applied to the binned values, on a 512 grid with the viridis colormap.
ds.plot(ds.pickup_longitude, ds.pickup_latitude, f="log1p",
        limits=limits, figsize=(10,8), shape=512, colormap="viridis")

In [None]:
# %%timeit
# counts2d = ds.count(binby=["pickup_longitude", "pickup_latitude"], shape=128, limits=limits)#, limits=[[-90, 90], [-180, 180]])

## Where to pick up customers?

In [None]:
# Same map, but the plotted statistic is the mean of total_amount instead of a
# count; vmin/vmax fix the colour range to 0–50.
ds.plot(ds.pickup_longitude, ds.pickup_latitude, what=vaex.stat.mean(ds.total_amount),
        vmin=0, vmax=50, shape=512, figsize=(10,8), limits=limits, colormap="Greys")

In [None]:
# Minimum and maximum of trip_distance, to see the range before filtering.
ds.trip_distance.minmax()

In [None]:
# 1-d plot of trip_distance restricted to the range 0–50.
ds.plot1d(ds.trip_distance, limits=[0, 50])

In [None]:
# Keep only rows with 0 < trip_distance < 40. Note that `ds` is rebound here,
# so every later cell works on the filtered dataset.
# Author's note: this makes no memory copy and does not waste 46 GB of memory.
ds = ds[(ds.trip_distance > 0) & (ds.trip_distance < 40)]

In [None]:
# Map of the mean of total_amount divided by trip_distance per bin; the colour
# range is fixed to 0–15.
ds.plot(ds.pickup_longitude, ds.pickup_latitude,
         what=vaex.stat.mean(ds.total_amount/ds.trip_distance),
         vmin=0, vmax=15,
         shape=512, figsize=(10,8), limits=limits, colormap="Greys")

## Lazy expressions and virtual columns

In [None]:
# Do NOT do this — dividing the raw arrays would materialise a result for
# every row:
#ratio = ds.data.total_amount/ds.data.trip_distance
# Size such an array would take, at 8 bytes per value, in units of 1024**3 bytes.
# NOTE(review): 8 bytes per value assumes a float64 result — confirm.
print(len(ds.data.total_amount) * 8 / 1024**3, "GB")

In [None]:
# The same ratio written with dataset columns instead of ds.data arrays; the
# resulting object is displayed.
ds.total_amount/ds.trip_distance

In [None]:
# Register the ratio under the column name 'ratio' so it can be used as
# ds.ratio below. String-based alternative spelling, kept for reference:
#ds.add_virtual_column("ratio", "total_amount/trip_distance")
ds['ratio'] = ds.total_amount / ds.trip_distance

In [None]:
# Two spellings of the mean of the new column, displayed together as a tuple.
ds.mean(ds.ratio), ds.ratio.mean()

In [None]:
def arc_distance(theta_1, phi_1, theta_2, phi_2, radius=6400):
    """Great-circle distance between two points on a sphere (haversine form).

    All angles are in degrees; each one is converted to radians with
    ``* np.pi / 180`` before the trigonometric call.

    Parameters
    ----------
    theta_1, theta_2 : number or array-like
        Latitude-like angles of the two points. These are the angles that
        enter the ``cos(theta_1) * cos(theta_2)`` factor of the formula.
    phi_1, phi_2 : number or array-like
        Longitude-like angles of the two points.
    radius : number, optional
        Radius of the sphere. The result is in the same unit. Defaults to
        6400, the value that was previously hard-coded (a rounded Earth
        radius in km).

    Returns
    -------
    The central angle between the two points (in radians) multiplied by
    ``radius``.
    """
    # Haversine term: sin^2(dtheta/2) + cos(theta_1) cos(theta_2) sin^2(dphi/2)
    temp = (np.sin((theta_2-theta_1)/2*np.pi/180)**2
           + np.cos(theta_1*np.pi/180)*np.cos(theta_2*np.pi/180) * np.sin((phi_2-phi_1)/2*np.pi/180)**2)
    # Central angle in radians. arctan2 of the two square roots is used rather
    # than arcsin(sqrt(temp)).
    distance = 2 * np.arctan2(np.sqrt(temp), np.sqrt(1-temp))
    return distance * radius

In [None]:
# Sanity check: both theta angles are 0 and phi differs by 180 degrees, so the
# points are half a great circle apart and the result should be pi * 6400.
arc_distance(0, 0, 0, 180)

In [None]:
# arc_distance uses its 1st and 3rd arguments (theta) inside the
# cos(theta_1)*cos(theta_2) factor, i.e. as latitudes, and its 2nd and 4th
# arguments (phi) as longitudes. Latitude therefore has to be passed first;
# passing longitude first distorts every distance.
ds["arc_distance"] = arc_distance(ds.pickup_latitude,  ds.pickup_longitude,
                                ds.dropoff_latitude, ds.dropoff_longitude)

In [None]:
%%time
# Time the mean of the arc_distance column as a baseline for the JIT version.
ds.arc_distance.mean()

In [None]:
# Add a second column built from arc_distance via jit_numba()
# (presumably a numba-compiled version of the same expression — confirm).
ds['arc_distance_jit'] = ds.arc_distance.jit_numba()
# Alternative backend, kept for reference:
# ds['arc_distance_jit'] = ds.arc_distance.jit_pythran()

In [None]:
%%time
# Same reduction as the earlier timed cell, now on the JIT column, for comparison.
ds.arc_distance_jit.mean()

In [None]:
# Difference between the recorded trip distance (scaled by 1.6) and the
# great-circle distance.
# NOTE(review): the factor 1.6 presumably converts miles to km, to match
# arc_distance's radius of 6400 — confirm the unit of trip_distance.
ds['extra'] = (ds.trip_distance*1.6 - ds.arc_distance_jit)

In [None]:
# Select rows whose pickup and dropoff longitudes differ. The cells below pass
# selection=True, presumably to restrict their statistics to this selection.
ds.select(ds.pickup_longitude != ds.dropoff_longitude)

In [None]:
# Mean and (min, max) of 'extra' with selection=True, displayed as a tuple.
ds.extra.mean(selection=True), ds.extra.minmax(selection=True)

In [None]:
# 1-d plot of 'extra' with selection=True, over the range -5 to 10.
ds.plot1d(ds.extra, selection=True, limits=[-5, 10])

In [None]:
# Map of the mean of 'extra' per pickup location with selection=True; the
# colour range is fixed to 0–3.
ds.plot(ds.pickup_longitude, ds.pickup_latitude, what=vaex.stat.mean(ds.extra),
       selection=True, vmin=0, vmax=3,
       shape=512, figsize=(10,8), limits=limits, colormap="Greys")

## A billion stars in the notebook

In [None]:
# Alternative: open the dataset from a remote vaex server instead of a local file.
# gaia = vaex.open("ws://gaia:9000/gaia-dr1")
# vaex and the inline backend were already set up in the first code cell; they
# are repeated here, presumably so this section can be run on its own.
import vaex
%matplotlib inline
gaia = vaex.open('/Users/maartenbreddels/datasets/gaia/gaia-dr1-minimal_f4.hdf5')

In [None]:
# Number of rows in the Gaia dataset, rendered with thousands separators.
format(len(gaia), ",")

In [None]:
%%time
# Timed plot over the explicit ranges ra 0–360 and dec -90–90, with f="log"
# applied to the binned values on a 400 grid.
ra_dec_limits = [[0, 360], [-90, 90]]
gaia.plot("ra", "dec", limits=ra_dec_limits, f="log", figsize=(10,8), shape=400)
# Uncomment to keep a static copy of the figure:
#plt.savefig('gaia-backup.png')