In [0]:
# -*- coding: utf-8 -*-
import os
import dataiku
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.externals import joblib

In [0]:
# Recipe inputs
# Managed folder that holds the pickled artifacts loaded later in the notebook.
artifact_folder = dataiku.Folder("SLD4V7Uq")
folder_path = artifact_folder.get_path()

# Records to score, read in full as a pandas DataFrame.
input_dataset = dataiku.Dataset("test")
df = input_dataset.get_dataframe()

In [0]:
#-----------------------------------------------------------------
# Scikit objects
#-----------------------------------------------------------------
def _load_artifact(filename):
    """Load one joblib-serialized object stored under `folder_path`.

    NOTE: joblib.load unpickles the file, so only trusted artifacts
    should ever be placed in this folder.
    """
    artifact_path = os.path.join(folder_path, filename)
    return joblib.load(artifact_path)


# Feature lists, the two preprocessing transformers, and the fitted classifier.
schema = _load_artifact('schema.pkl')
trf_num = _load_artifact('trf_num.pkl')
trf_cat = _load_artifact('trf_cat.pkl')
clf = _load_artifact('model.pkl')

In [0]:
#-----------------------------------------------------------------
# Transform and score
#-----------------------------------------------------------------

# Preprocess numerical features
x_num = trf_num.transform(df[schema['features_num']])

# Preprocess categorical features.
# Work on an explicit copy: the column selection is derived from `df`, and
# assigning columns into it directly triggers pandas' SettingWithCopyWarning.
# The copy makes it unambiguous that `df` itself is left untouched.
df_cat = df[schema['features_cat']].copy()
features = df_cat.columns

# Cast every non-object column to str so that all categorical values reach the
# transformer as strings (object columns are passed through unchanged).
for feature in features:
    if df_cat[feature].dtype != 'object':
        df_cat[feature] = df_cat[feature].astype(str)

# One {column: value} dict per row.
# NOTE(review): presumably trf_cat is a DictVectorizer-style transformer -- confirm.
data = df_cat.to_dict(orient='records')

x_cat = trf_cat.transform(data)

In [0]:
# Concatenate
# Place the categorical block first and the numerical block second, column-wise.
# NOTE(review): presumably this must match the column order used at training time -- confirm.
feature_blocks = (x_cat, x_num)
X = np.concatenate(feature_blocks, axis=1)

# Actually score the new records: one row of per-class probabilities per record
scores = clf.predict_proba(X)

In [0]:
#-----------------------------------------------------------------
# Reshape
#-----------------------------------------------------------------
# Labels for the first two probability columns.
# NOTE(review): assumes column 0 is the False class and column 1 the True
# class -- confirm against the classifier's class order.
proba_labels = {0: 'proba_False', 1: 'proba_True'}

# Reuse the index of `df` so the join below aligns row for row.
preds = pd.DataFrame(scores, index=df.index)
preds = preds.rename(columns=proba_labels)

# Input records with the probability columns appended.
all_preds = df.join(preds)

In [0]:
# Write recipe outputs
# Persist the scored records; write_with_schema is handed the full
# `all_preds` DataFrame for the output dataset.
output_dataset_name = "scikit_scored"
scikit_scored = dataiku.Dataset(output_dataset_name)
scikit_scored.write_with_schema(all_preds)