# Comparison to sklearn estimators

Scikit-learn is a popular machine learning library that has been tested extensively in practical applications. If the current library can match scikit-learn's performance, this is a good indicator of the quality of the library. The benchmarks in this notebook provide such a comparison.

In [1]:
from subprocess import call, DEVNULL
import numpy as np
import pmlb
import json
from time import time
import os
import random

## Execute estimator in JavaScript

As the names of the parameters and estimators mirror those of sklearn, it is relatively easy to execute the estimators in JavaScript. The implementation is given below.

In [2]:
def compare_estimator(X, y, X_test, y_test, estimator, params):
    """Fit the same estimator in JavaScript (via Node.js) and in Python and compare them.

    A Node.js script is generated that instantiates
    ``ai.io.base_estimators[<estimator class name>]`` with ``params``, fits it
    on ``X, y`` and scores it on ``X_test, y_test``. The same ``params`` are
    then applied to ``estimator``, which is fitted and scored in Python.

    Parameters
    ----------
    X, y, X_test, y_test : objects with a ``.tolist()`` method
        Training and test data (e.g. numpy arrays); they are exchanged with
        the Node.js process through a JSON file.
    estimator : object with ``set_params``, ``fit`` and ``score``
        Its class name selects the JavaScript estimator. It is modified in
        place (``set_params`` + ``fit``).
    params : dict
        JSON-serializable parameters, passed both to the JavaScript
        constructor and to ``estimator.set_params``.

    Returns
    -------
    dict
        ``python_score``, ``python_runtime``, ``javascript_score`` and
        ``javascript_runtime``; runtimes are fit times in milliseconds.

    Raises
    ------
    RuntimeError
        If the Node.js process exits with a non-zero status.

    Notes
    -----
    Temporarily creates ``script.js``, ``data.json`` and ``result.json`` in
    the current working directory and removes them again, also on failure.
    The script requires ``../src/automljs`` relative to that directory and a
    ``node`` executable on the PATH.
    """
    node_code = """
const ai = require('../src/automljs')
const fs = require('fs')
var data = JSON.parse(fs.readFileSync('data.json', 'utf8'));

async function main(){
    var X = data['X'];
    var y = data['y'];
    var X_test = data['X_test'];
    var y_test = data['y_test'];

    // read estimator from the serialization module
    var model = new ai.io.base_estimators['%s'](%s)

    var fit_start = process.hrtime();
    await model.fit(X, y)
    // hrtime returns [seconds, nanoseconds]; both parts are needed,
    // otherwise every fit of one second or longer wraps around
    var diff = process.hrtime(fit_start);
    var elapsed = diff[0] * 1000 + diff[1] / 1000000; // total milliseconds

    var score = await model.score(X_test, y_test)
    var res = {
        'score': score, 'runtime': elapsed
    }
    fs.writeFileSync('result.json', JSON.stringify(res), 'utf8')
}

// make failures visible to the caller through the exit status
main().catch(function(err){
    console.error(err)
    process.exit(1)
})
    """ % (
        estimator.__class__.__name__,
        json.dumps(params)
        )

    script_path = './script.js'
    data_path = './data.json'
    result_path = './result.json'

    # a leftover result from an earlier crashed run must never be read as fresh
    if os.path.exists(result_path):
        os.remove(result_path)

    try:
        with open(script_path, 'w') as s:
            s.write(node_code)

        with open(data_path, 'w') as d:
            json.dump({
                'X': X.tolist(),
                'y': y.tolist(),
                'X_test': X_test.tolist(),
                'y_test': y_test.tolist(),
            }, d)

        # argument list without a shell: portable and no quoting pitfalls
        exit_code = call(['node', 'script.js'])
        if exit_code != 0:
            raise RuntimeError(
                'node script.js exited with status %s' % exit_code
            )

        with open(result_path, 'r') as js:
            javascript = json.load(js)
    finally:
        # clean up, also when node or the JSON round trip failed
        for path in (script_path, result_path, data_path):
            if os.path.exists(path):
                os.remove(path)

    estimator.set_params(**params)

    start = time()
    estimator.fit(X, y)
    elapsed = (time() - start)*1000.0 # milliseconds

    return {
        'python_score': estimator.score(X_test, y_test),
        'python_runtime': elapsed,
        'javascript_score': javascript['score'],
        'javascript_runtime': javascript['runtime']
    }

## Benchmarks!

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.linear_model import SGDClassifier, SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22; `SimpleImputer` is its replacement (default strategy is
# also the column mean). Fall back to the old class on old installations.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer

from tqdm import tqdm
from itertools import product
from pprint import pprint

# One record (dict returned by compare_estimator) per benchmarked dataset.
results = []

# Seed the Python estimator so repeated runs of this cell give the same
# scores. The seed is deliberately not put into `params`, because `params`
# is also forwarded verbatim to the JavaScript estimator.
estimator = SGDClassifier(random_state=0)
params = {'max_iter':250, 'l1_ratio': 0.0, 'alpha': 1.0}
datasets = pmlb.classification_dataset_names

# Upper bound on the number of rows used from every dataset.
max_size = 1000

#datasets = ['4544_GeographicalOriginalofMusic', '505_tecator']
# Names of the datasets that actually produced a record in `results`.
use_datas = []
for name in datasets:
    # these datasets are excluded from the benchmark
    if name in {'1191_BNG_pbc', '1595_poker'}:
        continue
    X, y = pmlb.fetch_data(name, return_X_y=True)
    print(name, X.shape)

    if len(y) > max_size:
        X = X[:max_size]
        y = y[:max_size]

    # a classifier needs at least two classes in the (truncated) data
    if(len(set(y)) < 2):
        continue

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # the split itself can leave a single class in the training part,
    # which would make the fit fail; skip such datasets as well
    if len(set(y_train)) < 2:
        continue

    # preprocessing is fitted on the training part only
    feats = make_pipeline(
        Imputer(),
        StandardScaler()
    )
    X_train = feats.fit_transform(X_train)
    X_test = feats.transform(X_test)

    result = compare_estimator(X_train, y_train, X_test, y_test, estimator, params)
    pprint(result, width=40)
    results.append(result)
    use_datas.append(name)

GAMETES_Epistasis_2-Way_1000atts_0.4H_EDM-1_EDM-1_1 (1600, 1000)
{'javascript_runtime': 73.982037,
 'javascript_score': 0.784,
 'python_runtime': 776.3168811798096,
 'python_score': 0.8}
GAMETES_Epistasis_2-Way_20atts_0.1H_EDM-1_1 (1600, 20)
{'javascript_runtime': 539.087968,
 'javascript_score': 0.8,
 'python_runtime': 30.742168426513672,
 'python_score': 0.8}
GAMETES_Epistasis_2-Way_20atts_0.4H_EDM-1_1 (1600, 20)
{'javascript_runtime': 532.397127,
 'javascript_score': 0.8,
 'python_runtime': 30.88831901550293,
 'python_score': 0.8}
GAMETES_Epistasis_3-Way_20atts_0.2H_EDM-1_1 (1600, 20)
{'javascript_runtime': 566.301032,
 'javascript_score': 0.8,
 'python_runtime': 29.201030731201172,
 'python_score': 0.8}
GAMETES_Heterogeneity_20atts_1600_Het_0.4_0.2_50_EDM-2_001 (1600, 20)
{'javascript_runtime': 541.109707,
 'javascript_score': 0.8,
 'python_runtime': 30.64894676208496,
 'python_score': 0.8}
GAMETES_Heterogeneity_20atts_1600_Het_0.4_0.2_75_EDM-2_001 (1600, 20)
{'javascript_runtime':

KeyboardInterrupt: 

In [4]:
import pandas as pd  
from IPython.display import Image, display

# Gather the per-dataset benchmark records into a single table.
df = pd.DataFrame(results)

js_score = df['javascript_score']
py_score = df['python_score']
# Per-dataset score difference, JavaScript minus Python.
score_gap = js_score - py_score

print('JS / PY score ratio:')

score_ratio = js_score / py_score
display(score_ratio.mean())
display(df[['javascript_score', 'python_score']].mean())

print('JS / PY runtime ratio:')
runtime_ratio = df['javascript_runtime'] / df['python_runtime']
display(runtime_ratio.mean())

# Extremes of the score difference: best and worst case for JavaScript.
print(np.max(score_gap))
print(np.min(score_gap))

JS / PY score ratio:


1.0043370648860004

javascript_score    0.375215
python_score        0.375460
dtype: float64

JS / PY runtime ratio:


6.576450644056651

0.007291142679533791
-0.009394125528433106
