In [1]:
from pathlib import Path
import json
from sklearn_porter import Porter
import json
from json import encoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer, FeatureHasher
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [2]:
data = {}
p = Path('data')
for datafile in p.iterdir():
    with datafile.open('r') as f:
        d = json.load(f)
        data[d['name']] = d['dataPoints']

In [9]:
def get_pipeline(clf=None):
    if clf is None:
        clf = RandomForestClassifier(n_estimators=200,
                                     oob_score=True,
                                     max_depth=None,
                                     min_samples_split=2,
                                     max_features='auto')
    return make_pipeline(DictVectorizer(sparse=False), clf)

def format_training_data(data):
    X = []
    y = []
    for name, room in data.items():
        X.extend([aps_to_dict(data_point) for data_point in room])
        y.extend([name] * len(room))
    return X, y

def aps_to_dict(aps):
    return {ap['mac']: ap['rssi'] for ap in aps}

In [10]:
# pipeline = get_pipeline(clf=MLPClassifier(alpha=1, solver='lbfgs'))
# pipeline = get_pipeline(clf=AdaBoostClassifier(base_estimator=RandomForestClassifier(n_estimators=100,
#                                                                                      oob_score=True,
#                                                                                      max_depth=None,
#                                                                                      min_samples_split=2,
#                                                                                      max_features='sqrt'),
#                                                                                      n_estimators=200))
pipeline = get_pipeline()
pipeline.steps

[('dictvectorizer',
  DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
          sparse=False)),
 ('randomforestclassifier',
  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
              max_depth=None, max_features='auto', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
              oob_score=True, random_state=None, verbose=0, warm_start=False))]

In [11]:
%%time

X, y = format_training_data(data)
pipeline.fit(X, y)

Wall time: 401 ms


In [12]:
%%time

X, y = format_training_data(data)
scores = cross_val_score(pipeline, X, y)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.96 (+/- 0.09)
Wall time: 1.07 s


In [7]:
%%time

param_grid = { 
    'randomforestclassifier__n_estimators': [20, 30, 40, 50, 75, 100, 150, 200],
    'randomforestclassifier__max_features': ['auto', 'sqrt', 'log2']
}
cv = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=20, n_jobs=-1)
X, y = format_training_data(data)
cv.fit(X, y)
best = cv.best_estimator_
print(cv.best_params_)

{'randomforestclassifier__max_features': 'auto', 'randomforestclassifier__n_estimators': 20}
Wall time: 32.5 s


In [8]:
scores = cross_val_score(best, X, y)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.91 (+/- 0.09)


In [13]:
%%time

porter = Porter(pipeline, language='java')
output = porter.export(method='predict', export_data=True)
with open('data.json', 'r') as f:
    out = json.load(f)
out = {
    'features': pipeline.steps[0][1].get_feature_names(),
    'classes': list(pipeline._final_estimator.classes_),
    'forest': out
}
with open('data.json', 'w') as f:
    json.dump(out, f, separators=(',', ':'))
# print(output) # Show java source code

Wall time: 1.11 s
