In [1]:
# =============================================================
# Copyright © 2020 Intel Corporation
# 
# SPDX-License-Identifier: MIT
# =============================================================

# Modin Getting Started Example for Distributed Pandas

## Importing and Organizing Data

In this example we will be generating a **synthetic dataset** and **demonstrating stock Pandas operations running with Modin**.

Let's start by **importing** all the necessary packages and modules

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import time

## How to Use Modin

We will also be importing **stock Pandas as `pandas`** and **Modin as `pd`** to show the differentiation. You can see importing Modin is simple and **does not require any additional steps.**

In [3]:
import pandas

In [4]:
import os
# Select "ray" via the MODIN_ENGINE environment variable. This is set before
# `modin.pandas` is imported in a later cell.
# NOTE(review): presumably Modin reads this variable at import time - confirm.
os.environ["MODIN_ENGINE"] = "ray" 

In [5]:
import modin.pandas as pd

We will now **generate a synthetic dataset** using NumPy to use with Modin and save it to a CSV.

In [6]:
# Synthetic data: a 2**18 x 2**8 grid of random integers drawn from [100, 10000).
array = np.random.randint(100, 10000, size=(2 ** 18, 2 ** 8))

# Persist the grid as comma-separated text; the read_csv cells below load this file.
np.savetxt("foo.csv", array, delimiter=",")

Now we will read the CSV file into a dataframe and display the first five rows.
For **stock pandas, the dataframe is being stored as `pandas_df`** and for **Modin, the same dataframe is being stored as `modin_df`**.
Let's try running the following cell with Pandas first.

In [7]:
%%time
for i in range(10):
    pandas_df = pandas.read_csv("foo.csv", names=["col{}".format(i) for i in range(256)])

    pandas_df.head()

CPU times: user 1min 50s, sys: 4.7 s, total: 1min 55s
Wall time: 1min 55s


Now let's run the same code, but use **Modin instead of stock Pandas.**

**Note the speedup!**

In [8]:
# Start a local Ray runtime (MODIN_ENGINE was set to "ray" in an earlier cell).
import ray

# ignore_reinit_error=True keeps this cell re-runnable: a second call in the
# same kernel no longer raises because Ray is already initialised.
ray.init(ignore_reinit_error=True)

2021-06-16 16:45:32,463	INFO services.py:1274 -- View the Ray dashboard at http://127.0.0.1:8265


{'node_ip_address': '192.168.180.87',
 'raylet_ip_address': '192.168.180.87',
 'redis_address': '192.168.180.87:42727',
 'object_store_address': '/tmp/ray/session_2021-06-16_16-45-31_229172_8967/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-06-16_16-45-31_229172_8967/sockets/raylet',
 'webui_url': '127.0.0.1:8265',
 'session_dir': '/tmp/ray/session_2021-06-16_16-45-31_229172_8967',
 'metrics_export_port': 61071,
 'node_id': '6089c3c8d32dee41c10cdcc9c25f83fe8baf3a7b6ead64602ae571ac'}

In [9]:
%%time
for i in range(10):
    modin_df=pd.read_csv("foo.csv", names=["col{}".format(i) for i in range(256)])
    #modin_df=pd.read_csv("foo.csv", names=["col{}".format(i) for i in range(256)])

modin_df.head()

CPU times: user 6.17 s, sys: 9.96 s, total: 16.1 s
Wall time: 28 s


Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,...,col246,col247,col248,col249,col250,col251,col252,col253,col254,col255
0,4068.0,8808.0,4894.0,4925.0,5842.0,722.0,5087.0,7172.0,467.0,7242.0,...,6548.0,4643.0,9058.0,6611.0,342.0,1556.0,8405.0,6051.0,7088.0,5109.0
1,6752.0,9652.0,965.0,2561.0,5383.0,4483.0,4521.0,4815.0,7208.0,1533.0,...,966.0,5532.0,8372.0,3270.0,1764.0,5596.0,3427.0,9925.0,9134.0,3550.0
2,5744.0,5465.0,515.0,1149.0,8833.0,5546.0,8316.0,1818.0,3207.0,8936.0,...,8648.0,9204.0,6252.0,1010.0,7662.0,4781.0,9881.0,4948.0,9694.0,1366.0
3,5114.0,6279.0,4959.0,1327.0,1633.0,9836.0,6443.0,7534.0,7467.0,1911.0,...,9360.0,5262.0,5964.0,1841.0,6683.0,6336.0,2252.0,4073.0,6386.0,1108.0
4,4877.0,1947.0,2776.0,4447.0,9094.0,6878.0,9952.0,7397.0,5090.0,1312.0,...,1850.0,2000.0,843.0,7139.0,8241.0,9485.0,9090.0,972.0,9027.0,1795.0


Let's now **visualize** this speedup from Modin with a plot!

In [None]:
def plotter(outputdict, title=None):
    """Draw a bar chart of runtimes keyed by package name.

    Parameters
    ----------
    outputdict : dict
        Maps a label (shown on the x-axis) to a runtime in seconds (bar height).
    title : str, optional
        Figure title; no title is drawn when ``None`` (the default).

    Returns
    -------
    None
        The chart is rendered with ``plt.show()``.
    """
    # Materialise the dict views as lists so bar() receives plain sequences.
    labels = list(outputdict.keys())
    runtimes = list(outputdict.values())

    # Explicit figure/axes interface instead of the implicit pyplot state machine.
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.bar(labels, runtimes, color='blue', width=0.4)
    ax.set_xlabel("Python Package")
    ax.set_ylabel("Runtime(seconds)")
    if title is not None:
        ax.set_title(title)
    plt.show()

In [None]:
# Time a single CSV read with each library and plot the two durations.
# The column labels are built once, outside both timed regions, so only the
# read itself is measured.
column_names = ["col{}".format(i) for i in range(256)]

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted.
t0 = time.perf_counter()
pandas_df = pandas.read_csv("foo.csv", names=column_names)
pandas_time = time.perf_counter() - t0

# NOTE(review): Modin may execute asynchronously, in which case this interval
# may not cover the whole read - confirm against the Modin benchmarking docs.
t1 = time.perf_counter()
modin_df = pd.read_csv("foo.csv", names=column_names)
modin_time = time.perf_counter() - t1

print("Pandas Time(seconds):",pandas_time,"\nModin Time(seconds):",modin_time)
outputDict={"Pandas":pandas_time,"Modin":modin_time}
plotter(outputDict)

## Other DataFrame Function Performance Example
We will now show the speedup in performance from Modin compared to stock Pandas with a few common functions.

Like before, **`pandas_df` is for stock Pandas**, **`modin_df` is for Modin**.

### `df.mean()`

In [None]:
# Column-wise mean with stock pandas, timed with the monotonic
# perf_counter() clock (not affected by system clock adjustments).
t2 = time.perf_counter()
pandas_df.mean(axis=0)
pandas_time = time.perf_counter() - t2
print(" stock Pandas wall time for completion in seconds:",pandas_time)

In [None]:
# Column-wise mean with Modin, timed with the monotonic perf_counter() clock
# (not affected by system clock adjustments).
# NOTE(review): the result is never printed or displayed here; if Modin
# evaluates asynchronously this may time only task submission - confirm.
t3 = time.perf_counter()
modin_df.mean(axis=0)
modin_time = time.perf_counter() - t3
print("Modin wall time for completion in seconds:",modin_time)

In [None]:
# Ratio of the two mean() timings measured above, rounded to two decimals.
speedup = round(pandas_time / modin_time, 2)
print("Modin was {}X faster than stock Pandas!".format(speedup))

### `df.applymap`

In [None]:
# Element-wise "+ 1" over the whole frame with stock pandas.
# print() sits inside the timed region, so formatting the result is part of
# the measurement. perf_counter() is used because it is monotonic.
t6 = time.perf_counter()
print(pandas_df.applymap(lambda x: x + 1))
pandas_time = time.perf_counter() - t6
print(" stock Pandas wall time for completion in seconds:",pandas_time)

In [None]:
# Element-wise "+ 1" over the whole frame with Modin.
# print() sits inside the timed region, so formatting the result is part of
# the measurement. perf_counter() is used because it is monotonic.
t7 = time.perf_counter()
print(modin_df.applymap(lambda x: x + 1))
modin_time = time.perf_counter() - t7
print("Modin wall time for completion in seconds:",modin_time)

In [None]:
# Ratio of the two applymap() timings measured above, rounded to two decimals.
speedup = round(pandas_time / modin_time, 2)
print("Modin was {}X faster than stock Pandas!".format(speedup))

### `pd.concat([df, df])`

In [None]:
# Stack the stock pandas frame on top of itself (row-wise concat).
# print() sits inside the timed region, so formatting the result is part of
# the measurement. perf_counter() is used because it is monotonic.
t8 = time.perf_counter()
print(pandas.concat([pandas_df, pandas_df], axis=0))
pandas_time = time.perf_counter() - t8
print("stock Pandas wall time for completion in seconds:",pandas_time)

In [None]:
# Stack the Modin frame on top of itself (row-wise concat).
# print() sits inside the timed region, so formatting the result is part of
# the measurement. perf_counter() is used because it is monotonic.
t9 = time.perf_counter()
print(pd.concat([modin_df, modin_df], axis=0))
modin_time = time.perf_counter() - t9
print("Modin wall time for completion in seconds:",modin_time)

In [None]:
# Ratio of the two concat() timings measured above, rounded to two decimals.
speedup = round(pandas_time / modin_time, 2)
print("Modin was {}X faster than stock Pandas!".format(speedup))

## Modin Coverage Examples 
The Modin package supports a large variety of Pandas functions.
Here are some examples:

### Count

In [None]:
# Call count() on the Modin frame; the result is shown as the cell output.
# NOTE(review): in pandas this is the number of non-NA cells per column - confirm Modin matches.
modin_df.count()

### Filter

In [None]:
# Select along axis=1 (columns) the labels matching the regex '0$',
# i.e. column names that end in the digit 0.
modin_df.filter(regex='0$', axis=1)

### iloc

In [None]:
# Only the final expression of a cell is echoed automatically, so the first
# three selections go through display() to make every result visible.
display(modin_df.iloc[0])      # first row
display(modin_df.iloc[-1])     # last row
display(modin_df.iloc[:, 0])   # first column
modin_df.iloc[:, -1]           # last column (echoed as the cell output)

### Series

In [None]:
# Build a five-element Series of random normal values labelled 'a'..'e'.
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])

# Echo the Series so the cell shows what was created.
s

### DataFrame to NumPy Array

In [None]:
# Convert the whole Modin frame with to_numpy(); the result is the cell output.
modin_df.to_numpy()

### Series to NumPy Array

In [None]:
# Two timezone-aware (CET) timestamps starting in 2000.
ser = pd.Series(pd.date_range('2000', periods=2, tz="CET"))

# Convert with dtype=object. This result was previously discarded because only
# the last expression of a cell is echoed, so show it explicitly.
display(ser.to_numpy(dtype=object))

# Convert with dtype="datetime64[ns]"; echoed as the cell output.
ser.to_numpy(dtype="datetime64[ns]")

### Set Options

In [None]:
# Switch both compute-acceleration options off through the pandas-style
# options API exposed on `pd`.
for option_name in ('compute.use_bottleneck', 'compute.use_numexpr'):
    pd.set_option(option_name, False)

### Unique Function for Series

In [None]:
# Distinct values of a small Series that contains one repeated element.
sample_values = pd.Series([2, 1, 3, 3])
pd.unique(sample_values)

In [None]:
# Final cell: prints a completion marker once every cell above has run.
# NOTE(review): the marker text (including its spelling) looks like a token that
# external tooling may match verbatim, so it is left unchanged - confirm.
print("[CODE_SAMPLE_COMPLETED_SUCCESFULLY]")