In [None]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pathlib import Path

import gc
import time

from utils import *

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import pickle
# Fixed seed; score() passes it as random_state to the random forest.
seed = 42

# Data directory, relative to the notebook's working directory.
PATH = Path('data')

# Last expression of the cell: the entries found directly under PATH.
[entry for entry in PATH.iterdir()]

In [None]:
def metric(rf,X_val,y_val):
    """Score a fitted classifier on a validation set with ROC AUC.

    Parameters
    ----------
    rf : object exposing ``predict_proba(X)`` that returns a 2-D array.
    X_val : validation features handed to ``rf.predict_proba``.
    y_val : true validation labels.

    Returns
    -------
    The value of ``roc_auc_score(y_val, scores)``, where ``scores`` is
    column 1 of the ``predict_proba`` output.
    """
    # Column 1 is used as the score; presumably the positive class of a
    # binary classifier -- confirm against rf.classes_.
    probabilities = rf.predict_proba(X_val)
    scores = probabilities[:, 1]
    return roc_auc_score(y_val, scores)

def permutation_importances(rf,X_val,y_val,metric):
    """Permutation importance of every column of ``X_val``.

    For each column, the column is shuffled in place, the model is
    re-scored, and the original column is put back. The importance of a
    column is ``baseline - score_with_that_column_shuffled``.

    Parameters
    ----------
    rf : fitted model, passed through to ``metric`` untouched.
    X_val : pandas DataFrame of validation features. It is modified
        temporarily, but is restored before the function returns or
        raises.
    y_val : validation labels, passed through to ``metric``.
    metric : callable ``metric(rf, X_val, y_val) -> float``.

    Returns
    -------
    numpy.ndarray with one entry per column, in ``X_val.columns`` order.

    Notes
    -----
    Shuffling uses the global ``np.random`` state, so results are only
    reproducible if that state is seeded by the caller.
    """
    baseline = metric(rf,X_val,y_val)
    imp=[]
    for col in X_val.columns:
        save = X_val[col].copy()
        try:
            X_val[col] = np.random.permutation(X_val[col])
            m = metric(rf,X_val,y_val)
        finally:
            # Always put the original column back, even when the metric
            # raises; otherwise the caller's validation frame is left
            # with a shuffled column.
            X_val[col] = save
        print(f'Score after {col} perm: {m:.5f}')
        imp.append(baseline-m)
    return np.array(imp)

def get_sample(filename,sz):
    """Load a frame with ``get_feather`` and return a random sample of rows.

    Parameters
    ----------
    filename : passed unchanged to ``get_feather`` (defined outside this
        cell -- presumably in ``utils``; confirm).
    sz : maximum number of rows to keep. If the frame has fewer rows,
        all of them are returned.

    Returns
    -------
    DataFrame holding the sampled rows in their original relative order,
    with a fresh 0..n-1 index.

    Notes
    -----
    Sampling draws from the global ``np.random`` state.
    """
    df = get_feather(filename)
    sample_idx = np.random.permutation(df.shape[0])
    sample_idx = np.sort(sample_idx[:sz])
    # The sampled values are row positions, so select with .iloc (a
    # label-based .loc is only right for a default RangeIndex).
    # reset_index(drop=True) discards the old index without ever adding
    # it as a column, so a real data column called 'index' is not lost.
    df = df.iloc[sample_idx].reset_index(drop=True)
    gc.collect()
    return df

def prediction_score(rf,train_df,y_train,val_df,y_val):
    """Print train and validation ROC AUC and return the validation AUC.

    Parameters
    ----------
    rf : fitted model exposing ``predict_proba``; column 1 of its output
        is used as the score.
    train_df, y_train : training features and labels.
    val_df, y_val : validation features and labels.

    Returns
    -------
    The validation ROC AUC as returned by ``roc_auc_score``.
    """
    y_train_pred = rf.predict_proba(train_df)[:,1]
    print(f'Train AUC: {roc_auc_score(y_train, y_train_pred)}')
    y_val_pred = rf.predict_proba(val_df)[:,1]
    # Compute the validation AUC once and reuse it for both the printed
    # line and the return value.
    val_auc = roc_auc_score(y_val, y_val_pred)
    print(f'Val AUC: {val_auc}')
    return val_auc

In [None]:
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

# One entry per call to score(): the validation AUC of that trial.
# NOTE: despite the name, the values stored here are AUCs (higher is
# better), not the loss handed back to hyperopt.
val_losses=[]
def score(params):
    """hyperopt objective: fit a random forest and report its loss.

    Fits a ``RandomForestClassifier`` on the module-level ``train_df`` /
    ``y_train`` and evaluates it on ``val_df`` / ``y_val`` (all defined
    outside this cell).

    Parameters
    ----------
    params : dict of keyword arguments forwarded to
        ``RandomForestClassifier``.

    Returns
    -------
    dict with ``'loss'`` set to ``1 - validation AUC`` and ``'status'``
    set to ``STATUS_OK``.

    Side effects
    ------------
    Prints the parameters and the train/validation AUC, and appends the
    validation AUC to ``val_losses``.
    """
    print("Training with params: ")
    print(params)

    rf = RandomForestClassifier(oob_score=False,n_jobs=-1,class_weight={1:100},random_state=seed,**params)
    rf.fit(train_df,y_train)
    val_auc=prediction_score(rf,train_df,y_train,val_df,y_val)
    val_losses.append(val_auc)
    # Release the forest before the next trial is built.
    del rf
    gc.collect()
    # fmin() minimises 'loss', while AUC is better when larger. Returning
    # the raw AUC would drive the search towards the worst model, so hand
    # back its complement.
    return {'loss': 1 - val_auc, 'status': STATUS_OK}

def optimize(space,max_evals=5,trials=None):
    """Run a TPE hyperparameter search over ``space`` with ``score``.

    Parameters
    ----------
    space : hyperopt search space handed to ``fmin``.
    max_evals : number of evaluations ``fmin`` is allowed (default 5).
    trials : optional trials object forwarded to ``fmin`` so the caller
        can inspect the search history afterwards. ``None`` (the
        default) is forwarded unchanged, which is also ``fmin``'s own
        default.

    Returns
    -------
    Whatever ``fmin`` returns for the best trial.
    """
    best = fmin(
        score,
        space,
        algo=tpe.suggest,
        trials=trials,
        max_evals=max_evals,
    )
    return best