# JIT-compiling scikit-learn functions inside Bodo

Bodo supports a subset of the functions inside scikit-learn. This allows us to scale out our analysis for larger datasets and larger models.

In [1]:
import bodo
import numpy as np
import pandas as pd
import ipyparallel as ipp
c = ipp.Client(profile="mpi")
view = c[:]
view.activate()
view.block = True
import os
view["cwd"] = os.getcwd()
%px cd $cwd

[stdout:0] /home/dale/Documents/bodo-benchmarks
[stdout:1] /home/dale/Documents/bodo-benchmarks
[stdout:2] /home/dale/Documents/bodo-benchmarks
[stdout:3] /home/dale/Documents/bodo-benchmarks


# Classification

Here we'll make a model classification dataset and train an SVM.

In [2]:
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=10000, n_features=100, random_state=0)

In [3]:
y[:8]

array([0, 0, 0, 0, 1, 0, 1, 1])

In [4]:
from sklearn.svm import LinearSVC

In [5]:
%%time
estimator = LinearSVC(random_state=0, max_iter=2000)
estimator.fit(X, y)

CPU times: user 5.43 s, sys: 15.5 ms, total: 5.44 s
Wall time: 5.44 s




LinearSVC(max_iter=2000, random_state=0)

In [6]:
estimator.score(X, y)

0.8266

In [7]:
%%time

@bodo.jit
def bodo_svc(X,y):
    estimator = LinearSVC(random_state=0, max_iter=2000)
    estimator.fit(X, y)
bodo_svc(X,y)



CPU times: user 6.24 s, sys: 32 ms, total: 6.27 s
Wall time: 6.36 s




# Clustering

We can do a similar example with KMeans. Here we'll make a test dataset and apply KMeans.

In [8]:
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

In [9]:
%%time 
clf = KMeans()
n_centers = 5000
n_features = 5000

X_small, y_small = make_blobs(n_samples=1000, centers=n_centers, n_features=n_features, random_state=0)

centers = np.zeros((n_centers, n_features))

for i in range(n_centers):
    centers[i] = X_small[y_small == i].mean(0)

centers = np.nan_to_num(centers)
clf.fit(centers)

  ret = um.true_divide(


CPU times: user 6.33 s, sys: 136 ms, total: 6.47 s
Wall time: 6.49 s


KMeans()

In [10]:
%%px 
%%time
import bodo
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
import numpy as np

n_centers = 5000
n_features = 5000
X_small, y_small = make_blobs(n_samples=1000, centers=n_centers, n_features=n_features, random_state=0)
centers = np.zeros((n_centers, n_features))
for i in range(n_centers):
    centers[i] = X_small[y_small == i].mean(0)
centers = np.nan_to_num(centers)

@bodo.jit(distributed=['centers'])
def bodo_kmeans(centers):
    clf = KMeans()
    clf.fit(centers)
    return clf
model = bodo_kmeans(centers)

[stdout:0] 
CPU times: user 48.4 s, sys: 28.2 s, total: 1min 16s
Wall time: 27.7 s
[stdout:1] 
CPU times: user 47.9 s, sys: 26.5 s, total: 1min 14s
Wall time: 27 s
[stdout:2] 
CPU times: user 49.4 s, sys: 29.3 s, total: 1min 18s
Wall time: 28.2 s
[stdout:3] 
CPU times: user 53.2 s, sys: 32.8 s, total: 1min 26s
Wall time: 28.2 s


[stderr:0] 
  ret = um.true_divide(
[stderr:1] 
  ret = um.true_divide(
[stderr:2] 
  ret = um.true_divide(
[stderr:3] 
  ret = um.true_divide(
