In [0]:
# -*- coding: utf-8 -*-
import os
import dataiku
import numpy as np
import pandas as pd
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import GridSearchCV
#from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [0]:
# Recipe inputs: read the Dataiku dataset named "train" into `df`.
df = (
    dataiku.Dataset("train")
    .get_dataframe()
)

# Preview the first two rows.
df.head(n=2)

In [0]:
# Display the column labels of the loaded frame (compare with SCHEMA below).
df.columns

In [0]:
#-----------------------------------------------------------------
# Settings
#
# SCHEMA names the label column ('target') and the input columns,
# split into numerical ('features_num') and categorical
# ('features_cat') groups.
#-----------------------------------------------------------------
SCHEMA = dict(
    target='Churn',
    features_num=[
        'Account Length',
        'VMail Message',
        'Day Mins',
        'Day Calls',
        'Day Charge',
        'Eve Mins',
        'Eve Calls',
        'Eve Charge',
        'Night Mins',
        'Night Calls',
        'Night Charge',
        'Intl Mins',
        'Intl Calls',
        'Intl Charge',
        'CustServ Calls',
    ],
    features_cat=['State', 'Area Code', 'Intl Plan', 'VMail Plan'],
)

In [0]:
#-----------------------------------------------------------------
# Preprocessing
#
# Fits two transformers (trf_num, trf_cat) on the training frame and
# builds the design matrix X and the label vector Y.
#-----------------------------------------------------------------

# Numerical variables: fill missing values with the column median, then
# pass the result through StandardScaler.
df_num = df[SCHEMA['features_num']]

trf_num = Pipeline([
    ('imp', Imputer(strategy='median')),
    ('sts', StandardScaler()),
])

x_num = trf_num.fit_transform(df_num)

# Categorical variables.
# Take an explicit copy: the loop below assigns columns into df_cat, and
# assigning into a selection of `df` raises pandas' SettingWithCopyWarning.
df_cat = df[SCHEMA['features_cat']].copy()
features = df_cat.columns

# DictVectorizer one-hot encodes string values but treats numbers as plain
# numeric features, so every non-object column is cast to str first.
for feature in features:
    if df_cat[feature].dtype != 'object':
        df_cat[feature] = df_cat[feature].astype(str)

# One dict per row, which is the input format DictVectorizer expects.
data = df_cat.to_dict(orient='records')

trf_cat = DictVectorizer(sparse=False)
x_cat = trf_cat.fit_transform(data)

# Concat: categorical columns first, numerical columns after.
X = np.concatenate((x_cat, x_num), axis=1)
Y = df[SCHEMA['target']].values

In [0]:
#-------------------------------------------------------------------------
# TRAINING
#
# Exhaustive grid search over random-forest hyper-parameters, scored by
# ROC AUC. The fitted search object is kept in `gs`.
#-------------------------------------------------------------------------

# Fixed seed for the forest so that re-running the notebook on the same data
# does not select (and later persist) a different model each time.
RANDOM_STATE = 42

# 2 * 3 * 3 * 3 * 2 * 2 = 216 parameter combinations.
param_grid = {
    "max_depth"        : [3, None],
    "max_features"     : [1, 3, 10],
    "min_samples_split": [2, 3, 10],
    "min_samples_leaf" : [1, 3, 10],
    "bootstrap"        : [True, False],
    "criterion"        : ["gini", "entropy"]
}

clf = RandomForestClassifier(random_state=RANDOM_STATE)
# n_jobs=-1 lets the search run on all available processors.
gs = GridSearchCV(clf, param_grid=param_grid, n_jobs=-1, scoring='roc_auc')
gs.fit(X, Y)

In [0]:
# Recipe outputs: filesystem path of the Dataiku folder that receives the model.
model_scikit = dataiku.Folder("SLD4V7Uq").get_path()

# Empty the folder before new artefacts are written.
# os.listdir() returns bare names, so each one is joined back onto the folder
# path; passing the bare name to os.remove() would resolve it against the
# current working directory and never touch the folder's contents.
for entry in os.listdir(model_scikit):
    entry_path = os.path.join(model_scikit, entry)
    try:
        os.remove(entry_path)
    except OSError as err:
        # Best effort: an entry that cannot be removed (e.g. a sub-directory)
        # is reported and skipped rather than aborting the recipe.
        print("Could not remove %s: %s" % (entry_path, err))

In [0]:
# Objects to persist: 'pkl' is the target file name, 'obj' the object to dump.
serials = [
    {'pkl': pkl_name, 'obj': artefact}
    for pkl_name, artefact in (
        ('schema.pkl', SCHEMA),
        ('trf_num.pkl', trf_num),
        ('trf_cat.pkl', trf_cat),
        ('model.pkl', gs.best_estimator_),
    )
]

In [0]:
# Write each listed object to its file inside the output folder.
for entry in serials:
    joblib.dump(entry['obj'], os.path.join(model_scikit, entry['pkl']))