In this document we transform Parameters 5–10 into categorical variables and apply target mean encoding and weighted average encoding to see their effect. Here the labels are treated as ordinal variables.

In [1]:
# Standard library
import warnings
from collections import defaultdict
from multiprocessing import Pool

# Third-party
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import log_loss, accuracy_score  # fixed: stray character after accuracy_score broke the import
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Silence all warnings for the rest of the notebook (kept from the original cell).
warnings.filterwarnings("ignore")

In [2]:
# Load the raw training / testing tables from the working directory.
training = pd.read_csv('first_round_training_data.csv')
testing = pd.read_csv('first_round_testing_data.csv')
# Columns that get encoded: Parameter5 through Parameter10.
features = ["Parameter" + str(number) for number in range(5, 11)]

In [3]:
# Ordinal coding of the quality labels: Fail < Pass < Good < Excellent.
code = dict(Fail=0, Pass=1, Good=2, Excellent=3)
# Direct lookup, so an unexpected label raises KeyError instead of passing silently.
training['new_Quality'] = training['Quality_label'].apply(code.__getitem__)

In [4]:
def to_category_fit(values):
    '''
    Build a codebook that maps each distinct value to an integer category code.

    values: iterable of hashable values.

    Returns a defaultdict whose keys are the original values and whose values are
    integer codes 1, 2, 3, ... assigned in order of first appearance. Looking up a
    value that was never seen returns 0 (and stores that key in the codebook).
    '''
    # defaultdict(int) yields 0 for unseen keys and, unlike a lambda factory,
    # keeps the codebook picklable (e.g. for multiprocessing or saving to disk).
    codebook = defaultdict(int)
    for value in values:
        # Plain membership test: it does not trigger the default factory.
        if value not in codebook:
            # The right-hand side is evaluated before insertion, so codes start at 1.
            codebook[value] = len(codebook) + 1
    return codebook

In [5]:
def to_category_apply(values, codebook):
    '''
    Encode a group of values with a codebook produced by to_category_fit().

    values: iterable of values to encode.
    codebook: mapping from value to code; each value is looked up with codebook[value].

    Returns a list holding the code of every input value, in input order.
    '''
    encoded = []
    for item in values:
        encoded.append(codebook[item])
    return encoded

In [6]:
# Quick check of the category codes on Parameter5 (first ten rows).
codebook = to_category_fit(values=training['Parameter5'])
to_category_apply(values=training['Parameter5'], codebook=codebook)[:10]

[1, 2, 2, 2, 2, 3, 4, 5, 5, 6]

In [7]:
def target_mean_encoding_fit(variable, targets):
    '''
    Fit a target mean encoding for one categorical variable.

    variable: the categorical values (integers, strings or other values).
    targets: the labels, aligned element by element with variable.

    Returns a defaultdict that maps each category to the mean target of that
    category. Looking up a category that was not seen here returns the mean of
    all targets.
    '''
    global_mean = np.mean(targets)
    frame = pd.DataFrame({'variable': variable, 'targets': targets})
    # One row per category: column 0 is the category, column 1 its mean target.
    per_category = frame.groupby(['variable'], as_index=False).agg('mean')
    codebook = defaultdict(lambda : global_mean)
    categories = per_category.iloc[:, 0].to_numpy()
    category_means = per_category.iloc[:, 1].to_numpy()
    codebook.update(zip(categories, category_means))
    return codebook

In [8]:
def target_mean_encoding_apply(variable, codebook):
    '''
    Encode a categorical variable with a fitted target mean codebook.

    variable: the categorical values (integers, strings or other values).
    codebook: the encoding rules, i.e. the output of target_mean_encoding_fit();
              every value is looked up with codebook[value].

    Returns the encoded values as a list, in input order.
    '''
    return list(map(codebook.__getitem__, variable))

In [9]:
def weighted_average_encoding_apply(variable, codebook, k, f):
    '''
    Return the weighted average encoding of the input categorical variable.

    variable: numeric values to encode (they are also used to compute the weights).
    codebook: the encoding rules, i.e. the output of target_mean_encoding_fit();
              its default value is taken as the overall target mean.
    k, f: parameters of the logistic weight 1 / (1 + exp(k - value / f)).

    Returns a numpy array: weight * category mean + (1 - weight) * overall mean.

    Attention: the weight is computed from the raw values of `variable`, so it is
    sensitive to their scale; make sure the values are normalized before using
    this encoding.
    '''
    # Read the overall mean from the default factory so that no sentinel key is
    # inserted into the caller's codebook.
    default_factory = getattr(codebook, 'default_factory', None)
    if default_factory is not None:
        overall_mean = default_factory()
    else:
        # No default factory available: keep the original sentinel lookup.
        overall_mean = codebook['this key cannot exist']
    target_means = np.array([codebook[value] for value in variable])
    self_weights = 1 / (1 + np.exp(k - np.array(variable) / f))
    result = self_weights * target_means + (1 - self_weights) * overall_mean
    return result

In [10]:
# Quick check of the target mean encoding on Parameter5 (first ten rows).
codebook = target_mean_encoding_fit(variable=training['Parameter5'], targets=training['new_Quality'])
target_mean_encoding_apply(variable=training['Parameter5'], codebook=codebook)[:10]

[0.7142857142857143,
 1.2777777777777777,
 1.2777777777777777,
 1.2777777777777777,
 1.2777777777777777,
 1.4027777777777777,
 1.691358024691358,
 1.7912087912087913,
 1.7912087912087913,
 1.7619047619047619]

In [11]:
# Disabled example; k (inflection point) and f (steepness) must be supplied:
# weighted_average_encoding_apply(training['Parameter5'], codebook, k, f)[:10]

In [12]:
def get_target_mean_encoding(training, validation, col_list, target_name):
    '''
    Replace the columns in col_list with their target mean encoding.

    training, validation: DataFrames containing the columns in col_list;
                          training must also contain target_name.
    col_list: names of the columns to encode.
    target_name: name of the label column in training.

    The codebook of each column is fitted on training only and then applied to
    both training and validation. Note that training rows are therefore encoded
    with means that include their own labels.

    Returns the transformed (training, validation) as new DataFrames; the frames
    passed in are left untouched.
    '''
    # Work on copies so the caller's frames (or slices of a larger frame) are
    # never modified behind the caller's back.
    training = training.copy()
    validation = validation.copy()
    for column in col_list:
        codebook = target_mean_encoding_fit(training[column].values, training[target_name].values)
        # Whole-column assignment lets the column take the dtype of the encoding.
        training[column] = target_mean_encoding_apply(training[column], codebook)
        validation[column] = target_mean_encoding_apply(validation[column], codebook)

    return training, validation

In [13]:
def get_weighted_average_encoding(training, validation, col_list, target_name, k, f):
    '''
    Replace the columns in col_list with their weighted average encoding.

    training, validation: DataFrames containing the columns in col_list;
                          training must also contain target_name.
    col_list: names of the (numeric) columns to encode.
    target_name: name of the label column in training.
    k is inflection point and f is steepness; both are passed through to
    weighted_average_encoding_apply().

    The codebook of each column is fitted on training only and then applied to
    both training and validation.

    Returns the transformed (training, validation) as new DataFrames; the frames
    passed in are left untouched.
    '''
    # Work on copies so the caller's frames are not modified.
    training = training.copy()
    validation = validation.copy()
    for column in col_list:
        codebook = target_mean_encoding_fit(training[column].values, training[target_name].values)
        # Both encodings are computed from the original values before either
        # column is overwritten.
        training_encoded = weighted_average_encoding_apply(training[column].values, codebook, k, f)
        validation_encoded = weighted_average_encoding_apply(validation[column].values, codebook, k, f)
        training[column] = training_encoded
        validation[column] = validation_encoded

    return training, validation

In [14]:
# Take explicit copies: the encoder assigns into these frames, and assigning into
# a bare column selection of `training` / `testing` triggers SettingWithCopyWarning.
train = training[features + ['new_Quality']].copy()
test = testing[features + ['Group']].copy()
# Codebooks are fitted on `train` and applied to both frames.
train, test = get_target_mean_encoding(train, test, features, 'new_Quality')

In [15]:
# First ten encoded training rows.
train.head(10)

Unnamed: 0,Parameter5,Parameter6,Parameter7,Parameter8,Parameter9,Parameter10,new_Quality
0,0.714286,1.0,1.36,1.301518,1.368514,1.444444,1
1,1.277778,1.415094,1.36,1.301518,1.368514,1.444444,0
2,1.277778,1.418182,1.36,1.301518,1.368514,1.444444,0
3,1.277778,1.415094,1.36,1.301518,1.368514,1.444444,0
4,1.277778,1.415094,1.36,1.301518,1.368514,1.444444,0
5,1.402778,1.363636,1.36,1.301518,1.368514,1.444444,2
6,1.691358,1.476744,1.791259,1.814165,1.431075,1.365079,2
7,1.791209,1.847826,1.791259,1.814165,1.431075,1.365079,2
8,1.791209,1.138462,1.791259,1.814165,1.431075,1.365079,1
9,1.761905,1.346939,1.791259,1.814165,1.431075,1.365079,2


In [16]:
# First ten encoded testing rows.
test.head(10)

Unnamed: 0,Parameter5,Parameter6,Parameter7,Parameter8,Parameter9,Parameter10,Group
0,1.791209,1.816667,1.670707,1.691781,1.836601,1.484333,0
1,0.5,1.206897,1.791259,1.01227,0.963576,1.58,0
2,0.5,1.206897,1.791259,1.01227,0.963576,1.58,0
3,1.484333,1.847826,1.670707,1.814165,1.484333,1.365079,0
4,1.521739,1.476744,1.670707,1.814165,1.484333,1.365079,0
5,1.521739,1.476744,1.670707,1.814165,1.484333,1.365079,0
6,1.792453,1.476744,1.670707,1.814165,1.484333,1.365079,0
7,1.792453,1.476744,1.670707,1.814165,1.484333,1.365079,0
8,1.521739,1.875,1.670707,1.814165,1.484333,1.365079,0
9,1.521739,1.476744,1.670707,1.814165,1.484333,1.365079,0


In [17]:
# Number of cross-validation folds.
N = 5

skf = StratifiedKFold(n_splits=N)
# One [train positions, validation positions] pair per fold, stratified on the label.
indices = [
    [fold_train_positions, fold_valid_positions]
    for fold_train_positions, fold_valid_positions in skf.split(train[features], train[['new_Quality']])
]

In [18]:
def my_CV(i):
    '''
    Train and evaluate a CatBoost classifier on cross-validation fold i.

    i: position of the fold in the module-level `indices` list.

    Reads the module-level `train`, `features` and `indices`.
    Returns a tuple (accuracy, negative log-loss) measured on the validation part
    of the fold.

    NOTE(review): `train` was encoded with codebooks fitted on all training rows,
    so the validation rows' labels contributed to their own features; the scores
    may be optimistic. Confirm whether per-fold encoding is wanted.
    '''
    train_index, test_index = indices[i]
    model = CatBoostClassifier(iterations=2000,
                               depth=8,
                               learning_rate=0.005,
                               silent=True,
                               loss_function='MultiClass',
                               random_state=666)
    # The split yields positional indices, so select rows with .iloc; label-based
    # .loc only gives the same rows when the index is 0..n-1.
    fold_train = train.iloc[train_index]
    fold_valid = train.iloc[test_index]
    model.fit(fold_train[features], fold_train[['new_Quality']])
    X_valid = fold_valid[features]
    y_valid = fold_valid[['new_Quality']]
    probs = model.predict_proba(X_valid)
    neg_log_loss = -log_loss(y_valid, probs)
    predictions = model.predict(X_valid)
    accuracy = accuracy_score(y_valid, predictions)
    return (accuracy, neg_log_loss)

In [19]:
# Evaluate the folds in parallel; the context manager shuts the worker
# processes down once the (blocking) map call has returned.
with Pool(processes=8) as pool:
    results = pool.map(my_CV, range(N))
# Mean over folds: [accuracy, negative log-loss].
np.mean(np.array(results), axis=0)

array([ 0.50082029, -1.14111656])

In [20]:
# Hand-kept log of earlier cross-validation runs.
# Each row: [mean accuracy, mean negative log-loss]
# Trailing comment: presumably (iterations, depth, learning_rate) of the run — TODO confirm.
# The last row is what this cell displays.
[ 0.50149112, -1.14072467] # 2000, 10, 0.005
[ 0.4993175 , -1.14390057] # 2000, 10, 0.01
[ 0.50265168, -1.14782422] # 2500, 10, 0.01
[ 0.50232307, -1.14001423] # 2500, 10, 0.005
[ 0.50281834, -1.14255961] # 2000, 8, 0.01
[ 0.48865443, -1.16249407] # 2500, 10, 0.001
[ 0.50398821, -1.13970718] # 2500, 8, 0.005

[0.50398821, -1.13970718]

In [21]:
# Final model, trained on the full training table.
# NOTE(review): this fits and predicts on the raw `training[features]` /
# `testing[features]`, not on the target-mean-encoded `train` / `test` that the
# cross-validation used — confirm which one is intended.
clf = CatBoostClassifier(iterations=2500, depth=8, learning_rate=0.005, verbose=100, loss_function='MultiClass', random_state=666)
clf.fit(training[features], training['new_Quality'])
probs = clf.predict_proba(testing[features])
# Assumes predict_proba columns follow ascending class label (0=Fail, 1=Pass,
# 2=Good, 3=Excellent, per the `code` mapping) — verify against clf.classes_.
prob_columns = ['prob_Fail', 'prob_Pass', 'prob_Good', 'prob_Excellent']
# Assign each probability column directly as floats; pre-filling the columns with
# integer 0 and writing floats through .loc is an incompatible-dtype assignment.
for position, column in enumerate(prob_columns):
    testing[column] = probs[:, position]

0:	learn: 1.3836406	total: 87.9ms	remaining: 3m 39s
100:	learn: 1.2319390	total: 3.36s	remaining: 1m 19s
200:	learn: 1.1655877	total: 6.58s	remaining: 1m 15s
300:	learn: 1.1316289	total: 9.81s	remaining: 1m 11s
400:	learn: 1.1119287	total: 13.1s	remaining: 1m 8s
500:	learn: 1.0991515	total: 16.2s	remaining: 1m 4s
600:	learn: 1.0898771	total: 19.4s	remaining: 1m 1s
700:	learn: 1.0831931	total: 22.4s	remaining: 57.6s
800:	learn: 1.0779284	total: 25.4s	remaining: 54s
900:	learn: 1.0734154	total: 28.6s	remaining: 50.8s
1000:	learn: 1.0697117	total: 31.8s	remaining: 47.6s
1100:	learn: 1.0662009	total: 34.9s	remaining: 44.3s
1200:	learn: 1.0629203	total: 37.9s	remaining: 41s
1300:	learn: 1.0599304	total: 41s	remaining: 37.8s
1400:	learn: 1.0572551	total: 44s	remaining: 34.5s
1500:	learn: 1.0546755	total: 47.1s	remaining: 31.3s
1600:	learn: 1.0521720	total: 50.1s	remaining: 28.1s
1700:	learn: 1.0499307	total: 53.2s	remaining: 25s
1800:	learn: 1.0476922	total: 56.3s	remaining: 21.9s
1900:	lear

In [22]:
# Average the predicted class probabilities within each Group.
# Several columns must be selected with a list: tuple-style selection
# (['a', 'b'] without the inner brackets) raises in pandas >= 2.0.
ratio_columns = ['prob_Excellent', 'prob_Good', 'prob_Pass', 'prob_Fail']
prediction = testing.groupby(['Group'], as_index=False)[ratio_columns].mean()
# Rename to the output headers, in the same order as ratio_columns.
prediction.columns = ['Group', 'Excellent ratio', 'Good ratio', 'Pass ratio', 'Fail ratio']
prediction.to_csv('category.csv', index=False)