# Baseline Model Pipeline

By: Aditya Mengani, Ognjen Sosa, Sanjay Elangovan, Song Park, Sophia Skowronski

**Can we improve on the baseline scores using different encoding, imputing, and scaling schemes?**
- Averaged Logistic Regression accuracy Score: 0.5
- Averaged Linear Regression accuracy score: 0.2045
- Averaged K-Nearest Neighbour accuracy score: 0.6198
- Averaged Naive Bayes accuracy score: 0.649

**`p1_tag` ~  `rank` + `total_funding_usd` + `employee_count` (ordinal) + `country` (nominal) + `category_groups` (nominal)**

In [1]:
'''Data analysis'''
import numpy as np
import pandas as pd
import csv
import warnings
import os
import time
import math
import itertools
import statistics

'''Plotting'''
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

'''Stat'''
import statsmodels.api as sm
from scipy.stats import chi2_contingency

'''ML'''
import prince
import category_encoders as ce
from sklearn import metrics, svm, preprocessing, utils
from sklearn.metrics import mean_squared_error, r2_score, f1_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model  import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100*(start_mem-end_mem)/start_mem))
    return df

## Reading in data

In [3]:
df = pd.read_csv('files/output/baseline.csv')
print('Starting Dataframe Columns:\n\n{}\n'.format(df.columns.to_list()))

# Have industry mapper for 'ind_1'...'ind_46' columns
industries = ['Software', 'Information Technology', 'Internet Services', 'Data and Analytics',
              'Sales and Marketing', 'Media and Entertainment', 'Commerce and Shopping', 
              'Financial Services', 'Apps', 'Mobile', 'Science and Engineering', 'Hardware',
              'Health Care', 'Education', 'Artificial Intelligence', 'Professional Services', 
              'Design', 'Community and Lifestyle', 'Real Estate', 'Advertising',
              'Transportation', 'Consumer Electronics', 'Lending and Investments',
              'Sports', 'Travel and Tourism', 'Food and Beverage',
              'Content and Publishing', 'Consumer Goods', 'Privacy and Security',
              'Video', 'Payments', 'Sustainability', 'Events', 'Manufacturing',
              'Clothing and Apparel', 'Administrative Services', 'Music and Audio',
              'Messaging and Telecommunications', 'Energy', 'Platforms', 'Gaming',
              'Government and Military', 'Biotechnology', 'Navigation and Mapping',
              'Agriculture and Farming', 'Natural Resources']
industry_map = {industry:'ind_'+str(idx+1) for idx,industry in enumerate(industries)}

# Create 
df_simple = df[['p1_tag', 'rank', 'country', 'employee_size', 'category_groups', 'total_funding_usd']]
df_simple = reduce_mem_usage(df_simple)

print('\nEnding Dataframe Columns:\n\n{}'.format(df_simple.columns.to_list()))

print('\nDataframe shape:', df_simple.shape)

del industries, industry_map

Starting Dataframe Columns:

['ind_enc_1', 'ind_enc_2', 'ind_enc_3', 'ind_enc_4', 'ind_enc_5', 'ind_enc_6', 'ind_enc_7', 'ind_enc_8', 'ind_enc_9', 'ind_enc_10', 'ind_enc_11', 'ind_enc_12', 'ind_enc_13', 'ind_enc_14', 'ind_enc_15', 'ind_enc_16', 'ind_enc_17', 'ind_enc_18', 'ind_enc_19', 'ind_enc_20', 'ind_enc_21', 'ind_enc_22', 'ind_enc_23', 'ind_enc_24', 'ind_enc_25', 'ind_enc_26', 'ind_enc_27', 'ind_enc_28', 'ind_enc_29', 'ind_enc_30', 'ind_enc_31', 'ind_enc_32', 'ind_enc_33', 'ind_enc_34', 'ind_enc_35', 'ind_enc_36', 'ind_enc_37', 'ind_enc_38', 'ind_enc_39', 'ind_enc_40', 'ind_enc_41', 'ind_enc_42', 'ind_enc_43', 'ind_enc_44', 'ind_enc_45', 'ind_enc_46', 'country_enc_1', 'country_enc_2', 'country_enc_3', 'country_enc_4', 'country_enc_5', 'country_enc_6', 'country_enc_7', 'country_enc_8', 'country_enc_9', 'country_enc_10', 'country_enc_11', 'country_enc_12', 'country_enc_13', 'country_enc_14', 'country_enc_15', 'employee_size', 'category_groups', 'country', 'p1_tag', 'rank', 'employee

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(np.int8)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(np.float32)


In [4]:
from datetime import datetime

In [6]:
###########################
# Pledge 1% Company UUIDs #
###########################
print('*'*100)

p1 = pd.read_csv('files/p1.csv')
print('PLEDGE 1% cols: {}\nSHAPE: {}\n'.format(p1.columns.to_list(), p1.shape))

#################
# Organizations #
#################
print('*'*100)

org = pd.read_csv('files/csv/organizations.csv')
print('ORGANIZATION cols: {}\nSHAPE: {}\n'.format(org.columns.to_list(), org.shape))

# Merge p1 and org dataframes on the organization uuid
df = pd.merge(org.copy(),p1.copy(),how='outer',on='uuid')

# Convert Boolean to binary
df['p1_tag'] = df['p1_tag'].apply(lambda x: 1 if x == True else 0)
p1['p1_tag'] = 1

# Convert employee_count 'unknown' to NaN to get accurate missing value count
df['employee_count'] = df['employee_count'].apply(lambda x: np.NaN if x == 'unknown' else x)

# Review Pandas Profiling Report of dataframe & update columns
df = df[['uuid','name','rank','status','employee_count','total_funding_usd','num_funding_rounds','primary_role','region','country_code','category_list','category_groups_list','founded_on','created_at','updated_at','p1_date','p1_tag']]

##############
# Timestamps #
##############

# Convert to datetime objects
df['p1_date'] = pd.to_datetime(df['p1_date'])
p1['p1_date'] = pd.to_datetime(p1['p1_date'])

# Get OutOfBoundsDatetime error if do not coerce for CB native timestamp columns
df['created_at'] = pd.to_datetime(df['created_at'],errors='coerce').dt.strftime('%Y-%m-%d')
df['updated_at'] = pd.to_datetime(df['updated_at'],errors='coerce').dt.strftime('%Y-%m-%d')
df['founded_on'] = pd.to_datetime(df['founded_on'],errors='coerce')

# Reduce storage for numerical features
df = reduce_mem_usage(df)

# Create new pledge1 dataframe that sorts by chronological order that the company took the pledge
pledge1 = df[df['p1_tag'] == 1].sort_values('p1_date')

#Get age of each company
now = datetime.now().date()
df['founded_on2'] = pd.to_datetime(df['founded_on']).dt.date
df['founded_on2'].fillna(now, inplace = True)

age = []
for i in range (len(df['founded_on'])):
    age.append(round(((now - df['founded_on2'][i]).days)/365,3))
    
age_series = pd.Series(age)
df['age'] = age_series

print(f"There are {df['age'].value_counts().get(0)} entries with no founded_on date. Let's remove these from the dataset.")
df['age'].replace(0, None, inplace=True)
print(f"Now there are {df['age'].value_counts().get(0)} with the value of 0.")

df_simple['age'] = df['age']

****************************************************************************************************
PLEDGE 1% cols: ['uuid', 'p1_tag', 'p1_date']
SHAPE: (7822, 3)

****************************************************************************************************


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


ORGANIZATION cols: ['uuid', 'name', 'type', 'permalink', 'cb_url', 'rank', 'created_at', 'updated_at', 'legal_name', 'roles', 'domain', 'homepage_url', 'country_code', 'state_code', 'region', 'city', 'address', 'postal_code', 'status', 'short_description', 'category_list', 'category_groups_list', 'num_funding_rounds', 'total_funding_usd', 'total_funding', 'total_funding_currency_code', 'founded_on', 'last_funding_on', 'closed_on', 'employee_count', 'email', 'phone', 'facebook_url', 'linkedin_url', 'twitter_url', 'logo_url', 'alias1', 'alias2', 'alias3', 'primary_role', 'num_exits']
SHAPE: (1131310, 41)

Mem. usage decreased to 132.71 Mb (14.6% reduction)
There are 252682 entries with no founded_on date. Let's remove these from the dataset.
Now there are None with the value of 0.


In [8]:
# Select equal sample of non-Pledge 1% organizations
df_p1 = df_simple[df_simple['p1_tag']==1]
df_notp1 = df_simple[df_simple['p1_tag']==0].sample(n=df_p1.shape[0], replace=False)
df_model = pd.concat([df_p1, df_notp1]).reset_index(drop=True)
df_model = reduce_mem_usage(df_model)

# Create variable for each feature type: categorical and numerical
numeric_features = df_model.select_dtypes(include=['int8', 'int16', 'int32', 'int64', 'float16', 'float32','float64']).drop(['p1_tag'], axis=1).columns
categorical_features = df_model.select_dtypes(include=['object']).columns
print('Numeric features:', numeric_features.to_list())
print('Categorical features:', categorical_features.to_list())

X = df_model.drop('p1_tag', axis=1)
y = df_model['p1_tag']
y = preprocessing.LabelEncoder().fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print('Training data shape:', X_train.shape)
print('Train label shape:', y_train.shape)
print('Test data shape:',  X_test.shape)
print('Test label shape:', y_test.shape)

Mem. usage decreased to  0.52 Mb (14.6% reduction)
Numeric features: ['rank', 'total_funding_usd', 'age']
Categorical features: ['country', 'employee_size', 'category_groups']
Training data shape: (12531, 6)
Train label shape: (12531,)
Test data shape: (3133, 6)
Test label shape: (3133,)


#### Run through pipeline to determine best categorical feature encoder

From: <a href='https://towardsdatascience.com/an-easier-way-to-encode-categorical-features-d840ff6b3900'>An Easier Way to Encode Categorical Features</a>

In [5]:
results = {}
classifier_list = []
LRR = LogisticRegression(max_iter=10000, tol=0.1)
KNN = KNeighborsClassifier(n_neighbors=30, p=1, leaf_size=25)
BNB = BernoulliNB()
GNB = GaussianNB()
classifier_list.append(('LRR', LRR, {'classifier__C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000]}))
classifier_list.append(('KNN', KNN, {}))
classifier_list.append(('BNB', BNB, {'classifier__alpha': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0]}))
classifier_list.append(('GNB', GNB, {'classifier__var_smoothing': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0]}))

#classifier_list.append(('SVM', svm.SVC()))
#classifier_list.append(('CART', DecisionTreeClassifier()))
#classifier_list.append(('LDA', LinearDiscriminantAnalysis()))

encoder_list = [ce.backward_difference.BackwardDifferenceEncoder, 
                ce.basen.BaseNEncoder,
                ce.binary.BinaryEncoder,
                ce.cat_boost.CatBoostEncoder,
                ce.hashing.HashingEncoder,
                ce.helmert.HelmertEncoder,
                ce.james_stein.JamesSteinEncoder,
                ce.one_hot.OneHotEncoder,
                ce.leave_one_out.LeaveOneOutEncoder,
                ce.m_estimate.MEstimateEncoder,
                ce.ordinal.OrdinalEncoder,
                ce.polynomial.PolynomialEncoder,
                ce.sum_coding.SumEncoder,
                ce.target_encoder.TargetEncoder,
                ce.woe.WOEEncoder]

for label, classifier, params in classifier_list:
    results[label] = {}
    for encoder in encoder_list:
        results[label][encoder.__name__] = {}
        print('{} with {}'.format(label, encoder.__name__))
        
        #numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),('scaler', MinMaxScaler())])
        numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),('scaler', StandardScaler())])

        categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                                                  ('woe', encoder())])

        preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features),
                                                       ('cat', categorical_transformer, categorical_features)])
        pipe = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', classifier)])
        if params != {}:
            try:
                search = GridSearchCV(pipe, params, n_jobs=-1)
                search.fit(X_train, y_train)
                print('Best parameter (CV score={:.3f}): {}'.format(search.best_score_, search.best_params_))
                model = search.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                score = f1_score(y_test, y_pred)
                print('Best score: {:.4f}\n'.format(score))
                results[label][encoder.__name__]['score'] = score
                results[label][encoder.__name__]['best_params'] = search.best_params_
            except:
                print('Something went wrong w/ GridSearch or pipeline fitting.')
        else:
            try:
                model = pipe.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                score = f1_score(y_test, y_pred)
                print('Score: {:.4f}\n'.format(score))
                results[label][encoder.__name__]['score'] = score
            except:
                print('Something went wrong with pipeline fitting')

LRR with BackwardDifferenceEncoder
Best parameter (CV score=0.712): {'classifier__C': 1.0}
Best score: 0.7234

LRR with BaseNEncoder
Best parameter (CV score=0.676): {'classifier__C': 0.1}
Best score: 0.6802

LRR with BinaryEncoder
Best parameter (CV score=0.676): {'classifier__C': 0.1}
Best score: 0.6802

LRR with CatBoostEncoder
Best parameter (CV score=0.705): {'classifier__C': 100.0}
Best score: 0.7078

LRR with HashingEncoder
Best parameter (CV score=nan): {'classifier__C': 0.0001}
Best score: 0.6505

LRR with HelmertEncoder
Best parameter (CV score=0.700): {'classifier__C': 100.0}
Best score: 0.7103

LRR with JamesSteinEncoder
Best parameter (CV score=0.695): {'classifier__C': 1.0}
Best score: 0.6991

LRR with OneHotEncoder
Best parameter (CV score=0.717): {'classifier__C': 1.0}
Best score: 0.7232

LRR with LeaveOneOutEncoder
Best parameter (CV score=0.704): {'classifier__C': 1000}
Best score: 0.7082

LRR with MEstimateEncoder
Best parameter (CV score=0.688): {'classifier__C': 0.

  raw_poly = scores.reshape((-1, 1)) ** np.arange(n).reshape((1, -1))


Something went wrong w/ GridSearch or pipeline fitting.
LRR with SumEncoder
Best parameter (CV score=0.717): {'classifier__C': 1.0}
Best score: 0.7234

LRR with TargetEncoder
Best parameter (CV score=0.698): {'classifier__C': 1.0}
Best score: 0.7062

LRR with WOEEncoder
Best parameter (CV score=0.702): {'classifier__C': 0.001}
Best score: 0.7095

KNN with BackwardDifferenceEncoder
Score: 0.6593

KNN with BaseNEncoder
Score: 0.6908

KNN with BinaryEncoder
Score: 0.6908

KNN with CatBoostEncoder
Score: 0.7091

KNN with HashingEncoder
Score: 0.6434

KNN with HelmertEncoder
Score: 0.6691

KNN with JamesSteinEncoder
Score: 0.6922

KNN with OneHotEncoder
Score: 0.7093

KNN with LeaveOneOutEncoder
Score: 0.7061

KNN with MEstimateEncoder
Score: 0.6879

KNN with OrdinalEncoder
Score: 0.6533

KNN with PolynomialEncoder


  raw_poly = scores.reshape((-1, 1)) ** np.arange(n).reshape((1, -1))


Something went wrong with pipeline fitting
KNN with SumEncoder
Score: 0.7096

KNN with TargetEncoder
Score: 0.7100

KNN with WOEEncoder
Score: 0.7138

BNB with BackwardDifferenceEncoder
Best parameter (CV score=0.508): {'classifier__alpha': 0.0001}
Best score: 0.2665

BNB with BaseNEncoder
Best parameter (CV score=0.657): {'classifier__alpha': 10.0}
Best score: 0.6556

BNB with BinaryEncoder
Best parameter (CV score=0.657): {'classifier__alpha': 10.0}
Best score: 0.6556

BNB with CatBoostEncoder
Best parameter (CV score=0.621): {'classifier__alpha': 0.0001}
Best score: 0.6344

BNB with HashingEncoder
Best parameter (CV score=nan): {'classifier__alpha': 0.0001}
Best score: 0.5818

BNB with HelmertEncoder
Best parameter (CV score=0.699): {'classifier__alpha': 1.0}
Best score: 0.6991

BNB with JamesSteinEncoder
Best parameter (CV score=0.619): {'classifier__alpha': 0.0001}
Best score: 0.6237

BNB with OneHotEncoder
Best parameter (CV score=0.698): {'classifier__alpha': 1.0}
Best score: 0.

  raw_poly = scores.reshape((-1, 1)) ** np.arange(n).reshape((1, -1))


Something went wrong w/ GridSearch or pipeline fitting.
BNB with SumEncoder
Best parameter (CV score=0.698): {'classifier__alpha': 1.0}
Best score: 0.7000

BNB with TargetEncoder
Best parameter (CV score=0.621): {'classifier__alpha': 0.0001}
Best score: 0.6344

BNB with WOEEncoder
Best parameter (CV score=0.671): {'classifier__alpha': 10.0}
Best score: 0.6849

GNB with BackwardDifferenceEncoder
Best parameter (CV score=0.550): {'classifier__var_smoothing': 0.0001}
Best score: 0.4207

GNB with BaseNEncoder
Best parameter (CV score=0.663): {'classifier__var_smoothing': 0.1}
Best score: 0.6194

GNB with BinaryEncoder
Best parameter (CV score=0.663): {'classifier__var_smoothing': 0.1}
Best score: 0.6194

GNB with CatBoostEncoder
Best parameter (CV score=0.691): {'classifier__var_smoothing': 0.0001}
Best score: 0.6510

GNB with HashingEncoder
Best parameter (CV score=nan): {'classifier__var_smoothing': 0.0001}
Best score: 0.5445

GNB with HelmertEncoder
Best parameter (CV score=0.528): {'cl

  raw_poly = scores.reshape((-1, 1)) ** np.arange(n).reshape((1, -1))


Something went wrong w/ GridSearch or pipeline fitting.
GNB with SumEncoder
Best parameter (CV score=0.616): {'classifier__var_smoothing': 10.0}
Best score: 0.4620

GNB with TargetEncoder
Best parameter (CV score=0.692): {'classifier__var_smoothing': 0.0001}
Best score: 0.6568

GNB with WOEEncoder
Best parameter (CV score=0.696): {'classifier__var_smoothing': 0.01}
Best score: 0.6524



### Comparison with manual encoding from previous notebook + `total_funding_usd`

In [6]:
# Comparison
df_b4 = df.drop(['category_groups','country','employee_size'], axis=1)
df_b4 = df_b4.drop(df_b4.columns.to_list()[-46:], axis=1)

# Sample
df_p1 = df_b4[df_b4['p1_tag']==1]
df_notp1 = df_b4[df_b4['p1_tag']==0].sample(n=df_p1.shape[0], replace=False)
df_b4 = pd.concat([df_p1, df_notp1]).reset_index(drop=True)
df_b4 = reduce_mem_usage(df_b4)

# Impute missing data in employee_count and rank columns
imputer = SimpleImputer(missing_values=-1, strategy='median')
df_b4['employee_count'] = imputer.fit_transform(df_b4['employee_count'].values.reshape(-1, 1))
imputer = SimpleImputer(strategy='median')
df_b4['rank'] = imputer.fit_transform(df_b4['rank'].values.reshape(-1, 1))
imputer = SimpleImputer(strategy='mean')
df_b4['total_funding_usd'] = imputer.fit_transform(df_b4['total_funding_usd'].values.reshape(-1, 1))
df_num_missing = df_b4[['rank', 'employee_count', 'total_funding_usd']].isna().sum()/len(df_b4)
output_string = df_num_missing.to_string(float_format=lambda x: '{:.2f}%'.format(x*100))
print('\nMISSING VALUES BY PERCENTAGE')
print(output_string)

# Scale numeric values
#########################################
#########################################

X = df_b4.drop('p1_tag', axis=1)
y = df_b4['p1_tag']
y = preprocessing.LabelEncoder().fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print('\nTraining data shape:', X_train.shape)
print('Train label shape:', y_train.shape)
print('Test data shape:',  X_test.shape)
print('Test label shape:', y_test.shape)

KNN = KNeighborsClassifier(n_neighbors=30, p=1, leaf_size=25)
KNN.fit(X_train, y_train)
y_pred = KNN.predict(X_test)
print('\nKNN Accuracy score: {:.4f}'.format(KNN.score(X_test, y_test)))

LR = LogisticRegression(C=10)
LR.fit(X_train, y_train)
print('LRR Accuracy score: {:.4f}'.format(LR.score(X_test, y_test)))

Mem. usage decreased to  1.08 Mb (86.2% reduction)

MISSING VALUES BY PERCENTAGE
rank                0.00%
employee_count      0.00%
total_funding_usd   0.00%

Training data shape: (12531, 64)
Train label shape: (12531,)
Test data shape: (3133, 64)
Test label shape: (3133,)

KNN Accuracy score: 0.6336
LRR Accuracy score: 0.5918


In [10]:
import json

with open('results_baseline.json', 'w') as fp:
    json.dump(results, fp, sort_keys=True, indent=4)

In [11]:
with open('results_baseline.json', 'r') as fp:
    results = json.load(fp)
print(results)

{'BNB': {'BackwardDifferenceEncoder': {'best_params': {'classifier__alpha': 0.0001}, 'score': 0.26653974297953353}, 'BaseNEncoder': {'best_params': {'classifier__alpha': 10.0}, 'score': 0.6556420233463035}, 'BinaryEncoder': {'best_params': {'classifier__alpha': 10.0}, 'score': 0.6556420233463035}, 'CatBoostEncoder': {'best_params': {'classifier__alpha': 0.0001}, 'score': 0.6343825665859564}, 'HashingEncoder': {'best_params': {'classifier__alpha': 0.0001}, 'score': 0.5817531305903398}, 'HelmertEncoder': {'best_params': {'classifier__alpha': 1.0}, 'score': 0.6990725935401342}, 'JamesSteinEncoder': {'best_params': {'classifier__alpha': 0.0001}, 'score': 0.6237225147104367}, 'LeaveOneOutEncoder': {'best_params': {'classifier__alpha': 10.0}, 'score': 0.6343825665859564}, 'MEstimateEncoder': {'best_params': {'classifier__alpha': 0.0001}, 'score': 0.6343825665859564}, 'OneHotEncoder': {'best_params': {'classifier__alpha': 1.0}, 'score': 0.7004784688995215}, 'OrdinalEncoder': {'best_params': {