# Kolla om CatBoost klarar predict på en sparad modell, utan Pool och utan cat_features


In [247]:
import numpy as np
import pickle
import sys
from IPython.display import display
from catboost import CatBoostClassifier
import pandas as pd
pd.set_option('display.width', 200)

In [248]:
# Number of horses per race leg (one leg = one (datum, avd) pair).
def lägg_in_antal_hästar(df_):
    """Return a copy of ``df_`` extended with an ``ant_per_lopp`` column.

    For each row the new column holds the count of non-null ``avd`` values
    among the rows that share the same ``(datum, avd)`` pair.  The frame
    passed in is not modified.
    """
    rows_per_race = df_.groupby(['datum', 'avd'])['avd'].transform('count')
    return df_.assign(ant_per_lopp=rows_per_race)

In [249]:
def remove_features(df_, remove_mer=None):
    """Return a copy of ``df_`` with a fixed set of columns removed.

    The columns ``startnr``, ``vodds``, ``podds``, ``bins`` and
    ``h1_dat`` .. ``h5_dat`` are always dropped; ``remove_mer`` names any
    additional columns to drop.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Input frame.  It is not modified.
    remove_mer : label or list-like of labels, optional
        Extra column label(s) to drop.  ``None`` or an empty collection
        means "nothing extra".  Lists, tuples, ``pd.Index`` and arrays are
        treated as collections of labels.

    Returns
    -------
    pandas.DataFrame
        A new frame without the dropped columns.

    Raises
    ------
    KeyError
        If any of the columns to drop is missing from the frame.
    """
    always_dropped = ['startnr', 'vodds', 'podds', 'bins', 'h1_dat',
                      'h2_dat', 'h3_dat', 'h4_dat', 'h5_dat']

    # Normalise the argument to a plain list.  Testing an array-like such as
    # pd.Index for truthiness raises ValueError, and a mutable default list
    # would be shared between calls, hence the None default.
    if remove_mer is None:
        extra = []
    elif pd.api.types.is_list_like(remove_mer):
        extra = list(remove_mer)
    else:
        # A single label; falsy scalars keep meaning "nothing extra".
        extra = [remove_mer] if remove_mer else []

    # drop() returns a new frame, so the caller's frame stays untouched.
    df = df_.drop(columns=always_dropped)
    if extra:
        df = df.drop(columns=extra)

    return df

In [250]:
def prepare_for_catboost(X_):
    """Build the feature frame for CatBoost and list its categorical columns.

    Works on a copy of ``X_``: drops the columns removed by
    ``remove_features`` plus ``avd`` and ``datum``, prints how many numeric
    and object-dtype columns remain, and replaces NaN in the object-dtype
    columns with the string ``'missing'``.

    Returns
    -------
    tuple
        ``(X, cat_features)`` - the prepared frame and the list of its
        object-dtype column names.
    """
    X = remove_features(X_.copy(), remove_mer=['avd', 'datum'])

    # Split the remaining columns by dtype: numeric vs. object.
    num_features = X.select_dtypes(include=[np.number]).columns.tolist()
    cat_features = X.select_dtypes(include=['object']).columns.tolist()

    n_num, n_cat, n_cols = len(num_features), len(cat_features), X.shape[1]
    n_total = n_num + n_cat
    if n_total != n_cols:
        # At least one column is neither numeric nor object dtype.
        display('NÅGOT ÄR FEL I num+cat')

    print(
        f'num_len={n_num}, cat_len={n_cat}, sum={n_total}, len_all={n_cols}')

    # Total number of missing values across the categorical columns.
    nan_per_column = X[cat_features].isna().sum()
    print('NaN in cat before:', nan_per_column.sum())

    # Give missing categorical values an explicit 'missing' category.
    X[cat_features] = X[cat_features].fillna('missing')
    print('NaN in cat after:', X[cat_features].isna().sum().sum())

    return X, cat_features


In [251]:
def prepare_for_model(X_):
    """Return a copy of ``X_`` with the ``ant_per_lopp`` column added.

    Prints a progress message and delegates to ``lägg_in_antal_hästar``.
    """
    frame = X_.copy()
    print('Lägg in ant_hästar')
    return lägg_in_antal_hästar(frame)

## Ladda in data för learn

In [252]:
# Training data: every row except those carrying the maximum `datum` value.
df_all = pd.read_csv('all_data.csv')
sista_datum = df_all['datum'].max()
X = df_all.loc[df_all['datum'] != sista_datum].reset_index(drop=True)
# Binary target: 1 where `plac` equals 1, otherwise 0.
y = X['plac'].eq(1).astype(int)
X = X.drop(columns='plac')
X.shape

(43978, 78)

In [253]:
### prepare data ###
# X,cat_features=prepare_for_catboost(X.copy())
# print('after prepare_for_catboost')
# print(list(X.columns))
# print(cat_features)

## Learn

In [254]:
def save_model(model, model_name='kolla.model'):
    """Pickle ``model`` to the file ``model_name``.

    Parameters
    ----------
    model : object
        Any picklable object (here: the fitted classifier).
    model_name : str or path-like, optional
        Destination file.  Defaults to ``'kolla.model'``, the file name that
        was previously hard-coded, so existing ``save_model(model)`` calls
        behave as before.  The parameter mirrors ``load_model(model_name)``.
    """
    with open(model_name, 'wb') as f:
        pickle.dump(model, f)

def learn(X_, y, iterations=1000, save=True, verbose=False):
    """Fit a CatBoost classifier on ``X_`` / ``y`` and optionally save it.

    ``X_`` is passed through ``prepare_for_model`` and
    ``prepare_for_catboost`` before fitting.  The shape of the prepared frame
    and the model's ``best_score_`` are printed.  When ``save`` is true the
    fitted model is handed to ``save_model``.  Returns ``None``.
    """
    model = CatBoostClassifier(
        iterations=iterations,
        loss_function='Logloss',
        eval_metric='AUC',
        verbose=verbose,
    )

    features, cat_features = prepare_for_catboost(prepare_for_model(X_))
    print(features.shape)

    model.fit(features, y, cat_features=cat_features, use_best_model=False)
    print('best score', model.best_score_)

    if not save:
        return
    save_model(model)


In [255]:
#### test 
# print(X.shape)
# X = prepare_for_model(X.copy())
# X, cat_features = prepare_for_catboost(X)
# X.shape


In [256]:
# Short training run (100 iterations); save=True makes learn() pickle the model.
learn(X, y, iterations=100, save=True, verbose=False)


Lägg in ant_hästar
num_len=54, cat_len=14, sum=68, len_all=68
NaN in cat before: 246
NaN in cat after: 0
(43978, 68)
best score {'learn': {'Logloss': 0.191328639930562}}


## Ladda in data för predict

In [278]:
# Read the saved scrape and keep exactly the training frame's columns, in its order.
X_scrape = pd.read_csv('sparad_scrape.csv')[X.columns]
print(np.array(X_scrape.columns), '\n', X_scrape.shape)

['datum' 'avd' 'bana' 'häst' 'kusk' 'streck' 'vodds' 'podds' 'kr' 'spår'
 'dist' 'lopp_dist' 'start' 'ålder' 'kön' 'pris' 'h1_dat' 'h1_kusk'
 'h1_bana' 'h1_spår' 'h1_plac' 'h1_pris' 'h1_odds' 'h1_kmtid' 'h2_dat'
 'h2_kusk' 'h2_bana' 'h2_spår' 'h2_plac' 'h2_pris' 'h2_odds' 'h2_kmtid'
 'h3_dat' 'h3_kusk' 'h3_bana' 'h3_spår' 'h3_plac' 'h3_pris' 'h3_odds'
 'h3_kmtid' 'h4_dat' 'h4_kusk' 'h4_bana' 'h4_spår' 'h4_plac' 'h4_pris'
 'h4_odds' 'h4_kmtid' 'h5_dat' 'h5_kusk' 'h5_bana' 'h5_spår' 'h5_plac'
 'h5_pris' 'h5_odds' 'h5_kmtid' 'h1_dist' 'h2_dist' 'h3_dist' 'h4_dist'
 'h5_dist' 'bins' 'h1_auto' 'h2_auto' 'h3_auto' 'h4_auto' 'h5_auto'
 'h1_perf' 'h2_perf' 'h3_perf' 'h4_perf' 'h5_perf' 'senast' 'delta1'
 'delta2' 'delta3' 'delta4' 'startnr'] 
 (82, 78)


## predict

In [279]:
def load_model(model_name):
    """Load and return the pickled object stored in the file ``model_name``.

    ``model_name`` is handed to ``open`` unchanged, so ``str``, ``bytes`` and
    ``os.PathLike`` paths are all accepted.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. ``FileNotFoundError``).
    """
    # SECURITY: unpickling can execute arbitrary code embedded in the file.
    # Only load model files you created yourself or otherwise trust.
    with open(model_name, 'rb') as f:
        return pickle.load(f)

In [280]:
def predict(model, X_):
    """Return the model's class-1 probability for every row of ``X_``.

    ``X_`` goes through ``prepare_for_model`` and ``prepare_for_catboost``
    first; the prepared frame's column names and their count are printed.
    The result is column 1 of ``model.predict_proba``.
    """
    # The categorical column list is not needed at prediction time.
    features, _ = prepare_for_catboost(prepare_for_model(X_))

    column_names = list(features.columns)
    print(column_names, len(features.columns))

    probabilities = model.predict_proba(features)
    return probabilities[:, 1]

In [281]:
# Reload the classifier from the pickle file written during training.
model = load_model('kolla.model')

In [282]:
# Run both frames through the same preparation so their columns can be
# compared against the feature names stored in the loaded model.
X_learn, cat_features = prepare_for_catboost(prepare_for_model(X))
X_pred, cat_features2 = prepare_for_catboost(prepare_for_model(X_scrape))

for label, names in (('model', model.feature_names_),
                     ('learn', X_learn.columns),
                     ('scrape', X_pred.columns)):
    print(f'\n {label}')
    print(list(names), len(names))


Lägg in ant_hästar
num_len=54, cat_len=14, sum=68, len_all=68
NaN in cat before: 246
NaN in cat after: 0
Lägg in ant_hästar
num_len=54, cat_len=14, sum=68, len_all=68
NaN in cat before: 0
NaN in cat after: 0

 model
['bana', 'häst', 'kusk', 'streck', 'kr', 'spår', 'dist', 'lopp_dist', 'start', 'ålder', 'kön', 'pris', 'h1_kusk', 'h1_bana', 'h1_spår', 'h1_plac', 'h1_pris', 'h1_odds', 'h1_kmtid', 'h2_kusk', 'h2_bana', 'h2_spår', 'h2_plac', 'h2_pris', 'h2_odds', 'h2_kmtid', 'h3_kusk', 'h3_bana', 'h3_spår', 'h3_plac', 'h3_pris', 'h3_odds', 'h3_kmtid', 'h4_kusk', 'h4_bana', 'h4_spår', 'h4_plac', 'h4_pris', 'h4_odds', 'h4_kmtid', 'h5_kusk', 'h5_bana', 'h5_spår', 'h5_plac', 'h5_pris', 'h5_odds', 'h5_kmtid', 'h1_dist', 'h2_dist', 'h3_dist', 'h4_dist', 'h5_dist', 'h1_auto', 'h2_auto', 'h3_auto', 'h4_auto', 'h5_auto', 'h1_perf', 'h2_perf', 'h3_perf', 'h4_perf', 'h5_perf', 'senast', 'delta1', 'delta2', 'delta3', 'delta4', 'ant_per_lopp'] 68

 learn
['bana', 'häst', 'kusk', 'streck', 'kr', 'spår', 

In [283]:
# Class-1 probabilities for the scraped rows, from the reloaded model.
predict(model, X_scrape)

Lägg in ant_hästar
num_len=54, cat_len=14, sum=68, len_all=68
NaN in cat before: 0
NaN in cat after: 0
['bana', 'häst', 'kusk', 'streck', 'kr', 'spår', 'dist', 'lopp_dist', 'start', 'ålder', 'kön', 'pris', 'h1_kusk', 'h1_bana', 'h1_spår', 'h1_plac', 'h1_pris', 'h1_odds', 'h1_kmtid', 'h2_kusk', 'h2_bana', 'h2_spår', 'h2_plac', 'h2_pris', 'h2_odds', 'h2_kmtid', 'h3_kusk', 'h3_bana', 'h3_spår', 'h3_plac', 'h3_pris', 'h3_odds', 'h3_kmtid', 'h4_kusk', 'h4_bana', 'h4_spår', 'h4_plac', 'h4_pris', 'h4_odds', 'h4_kmtid', 'h5_kusk', 'h5_bana', 'h5_spår', 'h5_plac', 'h5_pris', 'h5_odds', 'h5_kmtid', 'h1_dist', 'h2_dist', 'h3_dist', 'h4_dist', 'h5_dist', 'h1_auto', 'h2_auto', 'h3_auto', 'h4_auto', 'h5_auto', 'h1_perf', 'h2_perf', 'h3_perf', 'h4_perf', 'h5_perf', 'senast', 'delta1', 'delta2', 'delta3', 'delta4', 'ant_per_lopp'] 68


array([0.07860229, 0.03851647, 0.00204686, 0.00474674, 0.15177168,
       0.67336002, 0.02211902, 0.00436347, 0.01010822, 0.01685384,
       0.21205528, 0.0028905 , 0.00410205, 0.08856582, 0.08921163,
       0.19882238, 0.1039676 , 0.14136768, 0.00745322, 0.01672703,
       0.06013517, 0.00129172, 0.00196802, 0.02352489, 0.03334662,
       0.00715742, 0.05330213, 0.13714664, 0.00142833, 0.02274787,
       0.2722359 , 0.00529377, 0.03891716, 0.01248859, 0.26446571,
       0.01768455, 0.03815149, 0.00419669, 0.01341794, 0.06196097,
       0.12064037, 0.01983666, 0.03236538, 0.18333242, 0.02552686,
       0.00372983, 0.01585166, 0.00388272, 0.04330413, 0.00573601,
       0.37484997, 0.00171293, 0.0068203 , 0.0138856 , 0.01033289,
       0.04917606, 0.05021417, 0.10756276, 0.00860758, 0.02424248,
       0.25770757, 0.05886781, 0.0049369 , 0.05772366, 0.01226566,
       0.019222  , 0.06668854, 0.01228357, 0.0550522 , 0.01140287,
       0.00764675, 0.0032765 , 0.07435706, 0.00597178, 0.00762