In [2]:
# == recnn ==
# Put the directory two levels above this notebook on the import path so the
# `import recnn` below can be resolved from there (presumably the repository
# checkout rather than an installed package -- confirm).
import sys
sys.path.append("../../")
import recnn

## RecNN supports different types of pandas backends
### for faster loading/processing in and out of core


![here be pandas logo](https://dev.pandas.io/static/img/pandas.svg "Pandas")

#### Pandas is your default backend.
 (no need to set it like that)

In [8]:
# Sizes handed to every FrameEnv construction further down in the notebook.
frame_size = 10
batch_size = 25

# pandas is already the default backend; selecting it here just makes the
# choice explicit.
recnn.pd.set("pandas")

# File locations for the environment: a base directory plus the embeddings,
# ratings and cache files (presumably resolved relative to `base` -- confirm).
dirs = recnn.data.env.DataPath(
    base="../../data/",
    embeddings="embeddings/ml20_pca128.pkl",
    ratings="ml-20m/ratings.csv",
    cache="cache/frame_env.pkl",  # the cache file is generated after a run
    use_cache=False,
)

In [3]:
%%time
# Build the frame environment from the paths and sizes configured above.
# %%time reports the CPU and wall time of this cell; with the pandas backend
# selected earlier this is the baseline the other backends are compared to.
env = recnn.data.env.FrameEnv(dirs, frame_size, batch_size)

100%|██████████| 20000263/20000263 [00:13<00:00, 1469488.15it/s]
100%|██████████| 20000263/20000263 [00:15<00:00, 1265183.17it/s]
100%|██████████| 138493/138493 [00:06<00:00, 19935.53it/s]
CPU times: user 41.6 s, sys: 1.89 s, total: 43.5 s
Wall time: 43.5 s


<recnn.data.env.FrameEnv at 0x7f28bd9fe7c0>

![here be modin logo](https://modin.readthedocs.io/en/latest/_images/MODIN_ver2_hrz.png "Modin") 

Modin uses Ray or Dask to provide an effortless way to speed up your pandas notebooks, scripts, and libraries. Unlike other distributed DataFrame libraries, Modin provides seamless integration and compatibility with existing pandas code. Even using the DataFrame constructor is identical.

![here be Ray logo](https://github.com/ray-project/ray/raw/master/doc/source/images/ray_header_logo.png "Ray") 

A fast and simple framework for building and running distributed applications. Ray is packaged with RLlib, a scalable reinforcement learning library, and Tune, a scalable hyperparameter tuning library.

In [9]:
import os
import ray

# If a Ray runtime is already up in this kernel (e.g. this cell was run
# before), shut it down first so that ray.init() below starts a fresh one.
if ray.is_initialized():
    ray.shutdown()
os.environ["MODIN_ENGINE"] = "ray"  # Modin will use Ray
ray.init(num_cpus=10) # CPUs made available to Ray; adjust to your liking
# Switch recnn's dataframe backend to Modin for the environment built next.
recnn.pd.set("modin")

2020-08-09 16:55:54,693	INFO resource_spec.py:204 -- Starting Ray with 4.98 GiB memory available for workers and up to 2.51 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-08-09 16:55:55,069	INFO services.py:1163 -- View the Ray dashboard at [1m[32mlocalhost:8265[39m[22m


In [10]:
%%time
# Same FrameEnv construction as in the pandas run, timed again after the
# backend was switched to Modin with the Ray engine.
env = recnn.data.env.FrameEnv(dirs, frame_size, batch_size)

100%|██████████| 138493/138493 [00:07<00:00, 18503.97it/s]
CPU times: user 12 s, sys: 2.06 s, total: 14 s
Wall time: 21.4 s


![here be Dask logo](https://dask.org/_images/dask_horizontal_white_no_pad.svg "Dask")  

## Dask is a flexible library for parallel computing in Python.

In [3]:
### dask
import os
# NOTE(review): the engine variable is presumably only read when Modin is
# first loaded, so this cell likely needs a fresh kernel if the Ray cell has
# already run in this session -- confirm.
os.environ["MODIN_ENGINE"] = "dask"  # Modin will use Dask
# Select the Modin backend in recnn for the environment built next.
recnn.pd.set("modin")

In [4]:
%%time
# Same FrameEnv construction as in the earlier runs, timed with the Modin
# backend after MODIN_ENGINE was set to "dask".
env = recnn.data.env.FrameEnv(dirs, frame_size, batch_size)

100%|██████████| 138493/138493 [00:06<00:00, 19785.99it/s]
CPU times: user 14.2 s, sys: 2.13 s, total: 16.3 s
Wall time: 22 s


<recnn.data.env.FrameEnv at 0x7f623fb30250>

# Free 2x increase in load speed!

### Pandas Wall time: 43.5 s
### Modin/Ray Wall time: 21.4 s
### Modin/Dask Wall time: 22 s
