# Impact of null values and new values when using CatBoost
Tests whether it is possible to fit and predict (using Catboost) when there are nulls in the data.  
Findings should be clear from the markdown text, so it should be possible to ignore the code, if desired.

In [38]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
import catboost as cb

# Settings
pd.options.display.max_columns=150

In [110]:
bunch = load_breast_cancer()

df = pd.DataFrame(bunch.data, columns=bunch.feature_names)
df.columns = [c.replace(' ', '_') for c in list(df)]
df['y'] = bunch.target

#### Create test data

# Test fields
df['cat_no_nulls'] = pd.cut(df['worst_concave_points'], [-1, 0.06, 0.1, 2], False, ['Low', 'Medium', 'High'])
df['str_no_nulls'] = pd.cut(df['worst_concave_points'], [-1, 0.06, 0.1, 2], False, ['Low', 'Medium', 'High']).astype(str)
df['float_no_nulls'] = df['worst_concave_points']

df['samp'] = 'train'
df.loc[df.index > 300, 'samp'] = 'test'

# Test fields with nulls
df['cat_with_nulls'] = pd.cut(df['worst_concave_points'], [-1, 0.06, 0.1, 2], False, ['Low', 'Medium', 'High'])
df['str_with_nulls'] = pd.cut(df['worst_concave_points'], [-1, 0.06, 0.1, 2], False, ['Low', 'Medium', 'High']).astype(str)
df['float_with_nulls'] = df['worst_concave_points']

df.loc[(df['samp'] == 'test') & (df.index > 560),
       ['cat_with_nulls', 'str_with_nulls', 'float_with_nulls']] = np.NaN

train = df.query('samp == "train"')
test = df.query('samp == "test"')

y = 'y'

## Tests

### Without nulls

#### Float
Runs just fine.

In [111]:
# Float
X = ['float_no_nulls']
clf = cb.CatBoostClassifier(logging_level='Silent')
clf.fit(train[X], train[y])
temp = clf.predict_proba(train[X])
# Runs with no error

#### Str
String features can only be used if the variable is specified as a categorical in the cb fit.

In [112]:
# Str
try:
    X = ['str_no_nulls']
    clf = cb.CatBoostClassifier(logging_level='Silent')
    clf.fit(train[X], train[y])
    temp = clf.predict_proba(train[X])
except:
    print('Fails.')
#     raise
# The fit fails with a type exception

Fails.


In [113]:
# Str, marked as a categorical in the cb fit.
try:
    X = ['str_no_nulls']
    clf = cb.CatBoostClassifier(logging_level='Silent')
    clf.fit(train[X], train[y], cat_features=[0])
    temp = clf.predict_proba(train[X])
except:
    print('Fails.')
# Runs successfully

#### Cat
Cat features (i.e. features that are explicitly set as Categorical dtype) can only be used if the variable is specified as a categorical in the cb fit.

In [114]:
try:
    X = ['cat_no_nulls']
    clf = cb.CatBoostClassifier(logging_level='Silent')
    clf.fit(train[X], train[y])
    temp = clf.predict_proba(train[X])
except:
    print('Fails.')
#     raise
# The fit fails with a type exception

Fails.


In [115]:
# Cat, marked as a categorical in the cb fit.
try:
    X = ['cat_no_nulls']
    clf = cb.CatBoostClassifier(logging_level='Silent')
    clf.fit(train[X], train[y], cat_features=[0])
    temp = clf.predict_proba(train[X])
except:
    print('Fails.')
# Runs successfully

### With nulls

#### Float
Able to train and test even if there are null values.  

The NaNs might be getting handled as categoricals. If there were nulls in the training set, the prediction seems to reflect the relationship between the nulls (in the independent) and the dependent variable. If there are no nulls in the training set, the prediction seems to be quite negative (i.e. high probability of event class).

In [116]:
# Float
X = ['float_with_nulls']
clf = cb.CatBoostClassifier(logging_level='Silent')
clf.fit(test[X], test[y])
df['pred_prob'] = clf.predict_proba(df[X])[:, 1]
# Runs with no error

In [121]:
df[X + [y, 'pred_prob']].tail(15)

Unnamed: 0,float_with_nulls,y,pred_prob
554,0.06493,1,0.990017
555,0.09127,1,0.971069
556,0.02232,1,0.993857
557,0.0,1,0.980513
558,0.1105,1,0.931677
559,0.09653,1,0.974008
560,0.1048,1,0.928922
561,,1,0.263606
562,,0,0.263606
563,,0,0.263606


In [122]:
# Float
X = ['float_with_nulls']
clf = cb.CatBoostClassifier(logging_level='Silent')
clf.fit(train[X], train[y])
df['pred_prob'] = clf.predict_proba(df[X])[:, 1]
# Runs with no error

In [123]:
df[X + [y, 'pred_prob']].tail(15)

Unnamed: 0,float_with_nulls,y,pred_prob
554,0.06493,1,0.980924
555,0.09127,1,0.87224
556,0.02232,1,0.967813
557,0.0,1,0.986941
558,0.1105,1,0.553797
559,0.09653,1,0.86291
560,0.1048,1,0.889119
561,,1,0.986941
562,,0,0.986941
563,,0,0.986941


In [125]:
df['pred_prob'].describe()

count    569.000000
mean       0.597156
std        0.411943
min        0.000973
25%        0.113664
50%        0.869972
75%        0.967813
max        0.991313
Name: pred_prob, dtype: float64

#### Str
For strings, NaN values are not allowed for either the fit, or the predict.

In [129]:
# Str, marked as a categorical in the cb fit.
try:
    X = ['str_with_nulls']
    clf = cb.CatBoostClassifier(logging_level='Silent')
    clf.fit(test[X], test[y], cat_features=[0])
    temp = clf.predict_proba(test[X])
except:
    print('Fails.')
#     raise
# Fails with cb error
# "Invalid type for cat_feature[1,0]=nan : cat_features must be integer or string, real number values and NaN values should be converted to string."

Fails.


In [130]:
# Str, marked as a categorical in the cb fit.
try:
    X = ['str_with_nulls']
    clf = cb.CatBoostClassifier(logging_level='Silent')
    clf.fit(train[X], train[y], cat_features=[0])
    temp = clf.predict_proba(test[X])
except:
    print('Fails.')
#     raise
# Fails with cb error
# "Invalid type for cat_feature[1,0]=nan : cat_features must be integer or string, real number values and NaN values should be converted to string."

Fails.


#### Cat
Same results as str. NaN values are not allowed for either the fit, or the predict.

In [139]:
try:
    X = ['cat_with_nulls']
    clf = cb.CatBoostClassifier(logging_level='Silent')
    clf.fit(test[X], test[y], cat_features=[0])
    df['pred_prob'] = clf.predict_proba(df[X])
except:
    print('Fails.')
#     raise
# Fails with cb error
# "Invalid type for cat_feature[1,0]=nan : cat_features must be integer or string, real number values and NaN values should be converted to string."

Fails.


In [142]:
try:
    X = ['cat_with_nulls']
    clf = cb.CatBoostClassifier(logging_level='Silent')
    clf.fit(train[X], train[y], cat_features=[0])
    df['pred_prob'] = clf.predict_proba(df[X])
except:
    print('Fails.')
#     raise
# Fails with cb error
# "Invalid type for cat_feature[1,0]=nan : cat_features must be integer or string, real number values and NaN values should be converted to string."

Fails.


In [89]:
# Cat, marked as a categorical in the cb fit.
try:
    X = ['cat_no_nulls']
    clf = cb.CatBoostClassifier(logging_level='Silent')
    clf.fit(train[X], train[y], cat_features=[0])
    temp = clf.predict_proba(train[X])
except:
    print('Fails.')
# Runs successfully

### With new values in the predict
Similar to the behaviour for float where there were no nulls in the fit, but there were nulls in the fit.  
When there is a new categorical value during predict, cb is still able to generate a prediction. The predicted value differs from the other predictions - it would seem that the generated model has some "view" on what to do with new values.

In [145]:
df.loc[df.index > 560, 'str_no_nulls'] = 'new'

# Str, marked as a categorical in the cb fit.
try:
    X = ['str_no_nulls']
    clf = cb.CatBoostClassifier(logging_level='Silent')
    clf.fit(train[X], train[y], cat_features=[0])
    df['pred_prob'] = clf.predict_proba(df[X])
except:
    print('Fails.')
# Runs successfully

In [147]:
df[['samp'] + X + [y, 'pred_prob']].tail(15)

Unnamed: 0,samp,str_no_nulls,y,pred_prob
554,test,Medium,1,0.09429
555,test,Medium,1,0.09429
556,test,Low,1,0.02813
557,test,Low,1,0.02813
558,test,High,1,0.774501
559,test,Medium,1,0.09429
560,test,High,1,0.774501
561,test,new,1,0.030199
562,test,new,0,0.030199
563,test,new,0,0.030199
