In [32]:
!pip3 install sklearn_pandas==2.0.4
!pip3 install catboost==0.24.4

In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.datasets import make_classification
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn_pandas import DataFrameMapper

In [2]:
categorical_features = ['feat_5', 'feat_6', 'feat_7', 'feat_8']
numerical_features = ['feat_1', 'feat_2', 'feat_3', 'feat_4']

# One seeded generator drives every random draw in this cell so that
# "Restart Kernel -> Run All" rebuilds exactly the same dataset.
# (make_classification is seeded separately through its random_state.)
rng = np.random.RandomState(42)

# Synthetic binary-classification problem with four numeric columns.
X, y = make_classification(n_samples=10000,
                           n_features=4,
                           n_redundant=0,
                           random_state=42,
                           weights=[0.5])

# Add categorical columns: each one gets a random cardinality in [2, 10) and
# uniformly drawn integer codes (they become floats once stacked next to X).
for col in range(4):
    num_classes = rng.randint(2, 10)
    cat_col = rng.randint(num_classes, size=X.shape[0]).reshape(-1, 1)
    X = np.hstack((X, cat_col))

# To DataFrame
columns = [f'feat_{i+1}' for i in range(X.shape[1])]
X = pd.DataFrame(X, columns=columns)
y = pd.DataFrame(y, columns=['label'])

# Scale regressors: shift/stretch every numeric column by a random mean/std
# and truncate to integers.
for col in numerical_features:
    mean = rng.randint(10, 1000)
    std = rng.randint(1, 100)
    X[col] = (mean + std * X[col]).astype(int)

# Modify categoricals: turn the numeric codes into string labels ('str_<code>').
for col in categorical_features:
    X[col] = X[col].apply(lambda x: f'str_{x}' if pd.notna(x) else x)

# Create NaNs in dataset: keep a random 70% of each column; index alignment on
# assignment leaves the rows that were not sampled as NaN.
for col in categorical_features + numerical_features:
    X[col] = X[col].sample(frac=0.7, random_state=rng)

df = X.merge(y, left_index=True, right_index=True)

In [3]:
# Peek at three randomly chosen rows of the assembled dataset.
df.sample(n=3)

Unnamed: 0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,label
8886,446.0,565.0,,,str_6.0,str_1.0,str_0.0,str_0.0,1
1514,473.0,549.0,683.0,898.0,,str_0.0,str_0.0,str_0.0,1
5296,,551.0,,807.0,str_7.0,,str_0.0,str_1.0,1


In [4]:
# Hold out the last 10% of the rows as the test set (no shuffling, so row order is kept).
train_df, test_df = train_test_split(df, shuffle=False, test_size=0.1)

# Split each partition into the feature frame and the label series.
X_train = train_df[categorical_features + numerical_features]
y_train = train_df['label']
X_test = test_df[categorical_features + numerical_features]
y_test = test_df['label']

### Preprocessing + Training

In [5]:
# Categorical columns: fill gaps with the literal 'UNK', then integer-encode;
# categories that were not seen while fitting are encoded as -1.
cat = [
    (
        [c],
        [
            SimpleImputer(strategy='constant', fill_value='UNK'),
            OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
        ],
    )
    for c in categorical_features
]
# Numerical columns: impute with SimpleImputer's default settings.
num = [([n], [SimpleImputer()]) for n in numerical_features]

# Per-column preprocessing that returns a DataFrame (numerical columns first).
mapper = DataFrameMapper(features=num + cat, df_out=True)
clf = CatBoostClassifier(iterations=1000, learning_rate=0.01, metric_period=100)

# Preprocessing and classifier are chained so they are fitted and applied together.
pipeline = Pipeline(steps=[('preprocess', mapper), ('clf', clf)])

pipeline.fit(X_train, y_train)

0:	learn: 0.6863287	total: 58.1ms	remaining: 58s
100:	learn: 0.4325719	total: 558ms	remaining: 4.96s
200:	learn: 0.3935335	total: 1.09s	remaining: 4.32s
300:	learn: 0.3793807	total: 2.17s	remaining: 5.04s
400:	learn: 0.3720991	total: 2.7s	remaining: 4.04s
500:	learn: 0.3667096	total: 3.19s	remaining: 3.18s
600:	learn: 0.3618758	total: 3.69s	remaining: 2.45s
700:	learn: 0.3577696	total: 4.18s	remaining: 1.78s
800:	learn: 0.3536301	total: 4.66s	remaining: 1.16s
900:	learn: 0.3500026	total: 5.15s	remaining: 566ms
999:	learn: 0.3457159	total: 5.89s	remaining: 0us


Pipeline(steps=[('preprocess',
                 DataFrameMapper(df_out=True, drop_cols=[],
                                 features=[(['feat_1'], [SimpleImputer()]),
                                           (['feat_2'], [SimpleImputer()]),
                                           (['feat_3'], [SimpleImputer()]),
                                           (['feat_4'], [SimpleImputer()]),
                                           (['feat_5'],
                                            [SimpleImputer(fill_value='UNK',
                                                           strategy='constant'),
                                             OrdinalEncoder()]),
                                           (['feat_6'],
                                            [SimpleImputer(fill_value='UNK',
                                                           strategy='constant'),
                                             OrdinalEncoder()]),
                                           (['f

In [6]:
# Run only the fitted preprocessing step of the pipeline on the held-out features.
preprocessed_X_test = pipeline.named_steps['preprocess'].transform(X_test)

In [47]:
# Raw test features (numerical columns first) before any preprocessing.
X_test.loc[:, numerical_features + categorical_features].head(5)

Unnamed: 0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8
9000,629.0,,,4.0,,str_0.0,,str_1.0
9001,,,,246.0,str_1.0,str_2.0,str_1.0,str_4.0
9002,795.0,,434.0,,str_1.0,,str_1.0,str_5.0
9003,731.0,969.0,,-7.0,str_1.0,str_5.0,str_0.0,
9004,526.0,1009.0,439.0,,,str_1.0,,str_2.0


In [45]:
# Same rows after imputation and ordinal encoding.
preprocessed_X_test.loc[:, numerical_features + categorical_features].head(5)

Unnamed: 0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8
9000,629.0,984.340446,452.364098,4.0,0.0,1.0,0.0,2.0
9001,636.122757,984.340446,452.364098,246.0,2.0,3.0,2.0,5.0
9002,795.0,984.340446,434.0,75.207028,2.0,0.0,2.0,6.0
9003,731.0,969.0,452.364098,-7.0,2.0,6.0,1.0,0.0
9004,526.0,1009.0,439.0,75.207028,0.0,2.0,0.0,3.0


In [48]:
import os
from joblib import dump, load

# Neither joblib.dump nor DataFrame.to_csv creates missing directories, so make
# sure the output folder exists before writing into it.
os.makedirs('params', exist_ok=True)

# Persist the fitted pipeline together with the held-out rows it was evaluated on.
dump(pipeline, 'params/pipeline.joblib')
test_df.to_csv('params/test_df.csv')

In [49]:
def evaluation(pipeline, X, y):
    """Score a fitted classifier pipeline with ROC AUC.

    Parameters
    ----------
    pipeline : fitted estimator exposing ``predict_proba``.
    X : features passed to ``pipeline.predict_proba``.
    y : true labels passed to ``roc_auc_score``.

    Returns
    -------
    dict
        ``{'auc': <ROC AUC computed from column 1 of predict_proba>}``.
    """
    # Column 1 of predict_proba is used as the score for the AUC.
    positive_scores = pipeline.predict_proba(X)[:, 1]
    auc = roc_auc_score(y, positive_scores)
    return {'auc': auc}

In [50]:
# AUC on the data the pipeline was fitted on.
evaluation(pipeline, X=X_train, y=y_train)

{'auc': 0.9311860182931898}

In [51]:
# AUC on the held-out data.
evaluation(pipeline, X=X_test, y=y_test)

{'auc': 0.899970342583241}

### Alternative: one-hot encoding + scaling + logistic regression

In [59]:
# Categorical columns: missing values become an explicit 'UNK' level, then the
# column is one-hot encoded. handle_unknown='ignore' makes a category that was
# not seen during fit encode as all zeros instead of raising at predict time,
# in line with the unknown-tolerant OrdinalEncoder of the CatBoost pipeline.
cat = [([c], [SimpleImputer(strategy='constant', fill_value='UNK'),
              OneHotEncoder(handle_unknown='ignore')]) for c in categorical_features]
# Numerical columns: impute with SimpleImputer's default settings, then standardize.
num = [([n], [SimpleImputer(), StandardScaler()]) for n in numerical_features]
mapper = DataFrameMapper(num + cat, df_out=True)
clf = LogisticRegression()

# NOTE: this rebinds `mapper`, `clf` and `pipeline`; cells below this point
# refer to the logistic-regression pipeline.
pipeline = Pipeline([
    ('preprocess', mapper),
    ('clf', clf)
])

pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocess',
                 DataFrameMapper(df_out=True, drop_cols=[],
                                 features=[(['feat_1'],
                                            [SimpleImputer(),
                                             StandardScaler()]),
                                           (['feat_2'],
                                            [SimpleImputer(),
                                             StandardScaler()]),
                                           (['feat_3'],
                                            [SimpleImputer(),
                                             StandardScaler()]),
                                           (['feat_4'],
                                            [SimpleImputer(),
                                             StandardScaler()]),
                                           (['feat_5'],
                                            [SimpleImputer(fill_value='UNK',
                                            

In [62]:
# Run only the fitted preprocessing step of the pipeline on the held-out features.
preprocessed_X_test = pipeline.named_steps['preprocess'].transform(X_test)

In [67]:
# Raw test features, transposed so every feature is a row.
X_test.loc[:, numerical_features + categorical_features].head(5).transpose()

Unnamed: 0,9000,9001,9002,9003,9004
feat_1,629.0,,795.0,731.0,526.0
feat_2,,,,969.0,1009.0
feat_3,,,434.0,,439.0
feat_4,4.0,246.0,,-7.0,
feat_5,,str_1.0,str_1.0,str_1.0,
feat_6,str_0.0,str_2.0,,str_5.0,str_1.0
feat_7,,str_1.0,str_1.0,str_0.0,
feat_8,str_1.0,str_4.0,str_5.0,,str_2.0


In [66]:
# Preprocessed test features, transposed so every output column is a row.
preprocessed_X_test.head(5).transpose()

Unnamed: 0,9000,9001,9002,9003,9004
feat_1,-0.08120688,0.0,1.811367,1.0817,-1.255515
feat_2,4.815402e-15,4.815402e-15,4.815402e-15,-0.649771,1.044498
feat_3,0.0,0.0,-0.5055161,0.0,-0.367879
feat_4,-0.9907284,2.376303,0.0,-1.143775,0.0
feat_5_x0_UNK,1.0,0.0,0.0,0.0,1.0
feat_5_x0_str_0.0,0.0,0.0,0.0,0.0,0.0
feat_5_x0_str_1.0,0.0,1.0,1.0,1.0,0.0
feat_5_x0_str_2.0,0.0,0.0,0.0,0.0,0.0
feat_5_x0_str_3.0,0.0,0.0,0.0,0.0,0.0
feat_5_x0_str_4.0,0.0,0.0,0.0,0.0,0.0


In [60]:
# AUC on the data the pipeline was fitted on.
evaluation(pipeline, X=X_train, y=y_train)

{'auc': 0.8813778495237811}

In [61]:
# AUC on the held-out data.
evaluation(pipeline, X=X_test, y=y_test)

{'auc': 0.8673832539797047}