In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold, KFold
from sklearn.decomposition import PCA
from catboost import CatBoostClassifier
from transforms import *

import warnings
# NOTE(review): this silences every Python warning for the rest of the session,
# not just a specific noisy one.
warnings.filterwarnings("ignore")

# Module-level PCA estimator with default settings. In the cells visible here it
# is referenced only inside the disabled (string-quoted) feature variants.
pca = PCA()

In [2]:
# Read the labelled training table and the unlabelled testing table.
training = pd.read_csv('first_round_training_data.csv')
testing = pd.read_csv('first_round_testing_data.csv')

# Columns Parameter5 .. Parameter10 receive the log transform below.
features = ["Parameter{}".format(k) for k in range(5, 11)]

# Base-10 logarithm via the change-of-base identity ln(x) / ln(10); the same
# transform is applied to both tables and overwrites the columns in place.
ln10 = np.log(10)
training[features] = np.log(training[features].values) / ln10
testing[features] = np.log(testing[features].values) / ln10

# Integer class codes for the quality labels; an unknown label raises KeyError.
code = {'Pass':1, 'Good':2, 'Excellent':3, 'Fail':0}
training['new_Quality'] = training['Quality_label'].map(lambda label: code[label])

In [3]:

# Baseline variant: work on copies of the loaded tables (whose Parameter5..10
# columns are already log10-scaled) and train on five of them, leaving out
# Parameter6. No encoding or PCA is applied here.
train_all = training.copy()
test_all = testing.copy()
new_features = ['Parameter{}'.format(idx) for idx in (5, 7, 8, 9, 10)]


In [4]:
# Disabled variant, kept as a bare string literal: nothing inside it runs, and
# because the string is the cell's last expression its repr is echoed as output.
# If enabled it would fit PCA on the stacked train+test Parameter5..Parameter10
# columns and write the components back into those columns.
# NOTE(review): the 6000 split assumes the training table has exactly 6000
# rows - confirm before re-enabling. `new_features[:6]` keeps all six names.
'''
# this is log data and PCA
new_features = ['Parameter'+str(i) for i in range(5, 11)]
train_all, test_all = training.copy(), testing.copy()
new_values = pca.fit_transform(pd.concat([train_all[new_features], test_all[new_features]]))
train_all[new_features] = new_values[:6000, :].copy()
test_all[new_features] = new_values[6000:, :].copy()
new_features = new_features[:6]
'''

"\n# this is log data and PCA\nnew_features = ['Parameter'+str(i) for i in range(5, 11)]\ntrain_all, test_all = training.copy(), testing.copy()\nnew_values = pca.fit_transform(pd.concat([train_all[new_features], test_all[new_features]]))\ntrain_all[new_features] = new_values[:6000, :].copy()\ntest_all[new_features] = new_values[6000:, :].copy()\nnew_features = new_features[:6]\n"

In [5]:
# Disabled variant, kept as a bare string literal: nothing inside it runs, and
# because the string is the cell's last expression its repr is echoed as output.
# If enabled it would build encoded features with get_all_encoding (not defined
# in the visible cells; presumably provided by `from transforms import *`),
# run PCA on the stacked train+test encoded columns and keep the first 15 names.
# NOTE(review): the 6000 split assumes the training table has exactly 6000
# rows - confirm before re-enabling.
'''
# this is Target Mean Encoding
train_all, test_all, new_features = get_all_encoding(training, testing, features)
new_values = pca.fit_transform(pd.concat([train_all[new_features], test_all[new_features]]))
train_all[new_features] = new_values[:6000, :].copy()
test_all[new_features] = new_values[6000:, :].copy()
new_features = new_features[:15]
'''

'\n# this is Target Mean Encoding\ntrain_all, test_all, new_features = get_all_encoding(training, testing, features)\nnew_values = pca.fit_transform(pd.concat([train_all[new_features], test_all[new_features]]))\ntrain_all[new_features] = new_values[:6000, :].copy()\ntest_all[new_features] = new_values[6000:, :].copy()\nnew_features = new_features[:15]\n'

In [6]:
# Disabled variant, kept as a bare string literal: nothing inside it runs, and
# because the string is the cell's last expression its repr is echoed as output.
# If enabled it would build encoded features with get_all_WoE (not defined in
# the visible cells; presumably provided by `from transforms import *`), run
# PCA on the stacked train+test encoded columns and keep the first 17 names.
# NOTE(review): the 6000 split assumes the training table has exactly 6000
# rows - confirm before re-enabling.
'''
# this is Weight of Evidence Encoding
train_all, test_all, new_features = get_all_WoE(training, testing, features)
new_values = pca.fit_transform(pd.concat([train_all[new_features], test_all[new_features]]))
train_all[new_features] = new_values[:6000, :].copy()
test_all[new_features] = new_values[6000:, :].copy()
new_features = new_features[:17]
'''

'\n# this is Weight of Evidence Encoding\ntrain_all, test_all, new_features = get_all_WoE(training, testing, features)\nnew_values = pca.fit_transform(pd.concat([train_all[new_features], test_all[new_features]]))\ntrain_all[new_features] = new_values[:6000, :].copy()\ntest_all[new_features] = new_values[6000:, :].copy()\nnew_features = new_features[:17]\n'

In [7]:
# Boosting hyper-parameters read by the CatBoostClassifier cell.
learning_rate = 0.1
depth = 10
l2_leaf_reg = 10
# NOTE(review): defined here but not passed to the model in the visible cells.
min_data_in_leaf = 10

In [8]:
# Multi-class CatBoost model. The settings are collected in one mapping first
# so they can be inspected or logged before the estimator is built.
model_params = dict(
    iterations=3000,
    depth=depth,
    learning_rate=learning_rate,
    silent=True,
    loss_function='MultiClass',
    l2_leaf_reg=l2_leaf_reg,
    # Overfitting detector: 'Iter' type with a wait of 100 iterations.
    od_type='Iter',
    od_wait=100,
)
model = CatBoostClassifier(**model_params)

In [9]:
# Number of cross-validation folds and the seed that fixes how rows are
# shuffled into them.
N = 5
SEED = 42

# shuffle=True without a random_state draws different folds on every run, which
# makes the validation losses and the final submission irreproducible; the
# seed pins them.
skf = StratifiedKFold(n_splits=N, shuffle=True, random_state=SEED)

# One [train_positions, validation_positions] pair per fold. The target is
# passed as a 1-D Series so the stratification sees plain class labels.
indices = []
for train_index, test_index in skf.split(training[features], training['new_Quality']):
    indices.append([train_index, test_index])

In [10]:
# Accumulator for the test-set class probabilities, averaged over all folds.
# Sized from the data (rows of test_all x number of class codes) rather than a
# hard-coded (6000, 4), so a different test size fails or works for the right
# reason.
prob_predict = np.zeros((len(test_all), len(code)))
for j in range(N):
    train_index, test_index = indices[j]
    # The fold indices produced by the splitter are row positions, so rows are
    # selected with .iloc. Label-based .loc only gives the same rows while the
    # frame still has its default 0..n-1 index.
    fold_train = train_all.iloc[train_index]
    fold_valid = train_all.iloc[test_index]
    X_train, y_train = fold_train[new_features], fold_train[['new_Quality']]
    X_test, y_test = fold_valid[new_features], fold_valid[['new_Quality']]
    # The held-out fold doubles as the evaluation set that drives the
    # overfitting detector configured on the model.
    model.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=250)
    prob_predict += model.predict_proba(test_all[new_features])
    print('{} out of {} finished.'.format(j+1, N))
# Turn the sum of per-fold probabilities into their mean.
prob_predict /= N

0:	learn: 1.3390799	test: 1.3408722	best: 1.3408722 (0)	total: 153ms	remaining: 7m 39s
250:	learn: 1.0065242	test: 1.0842564	best: 1.0842564 (250)	total: 17.5s	remaining: 3m 11s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 1.083715983
bestIteration = 286

Shrink model to first 287 iterations.
1 out of 5 finished.
0:	learn: 1.3414652	test: 1.3421876	best: 1.3421876 (0)	total: 86.6ms	remaining: 4m 19s
250:	learn: 1.0098129	test: 1.0702740	best: 1.0701862 (248)	total: 17.9s	remaining: 3m 15s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 1.069924018
bestIteration = 350

Shrink model to first 351 iterations.
2 out of 5 finished.
0:	learn: 1.3420935	test: 1.3426830	best: 1.3426830 (0)	total: 86.4ms	remaining: 4m 19s
250:	learn: 1.0049409	test: 1.0804810	best: 1.0803066 (248)	total: 19.2s	remaining: 3m 30s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 1.079486886
bestIteration = 287

Shrink model to first 288 iterations.
3 out of

In [11]:
def normalize(values, step=0.02):
    """Round each row to multiples of ``step`` without changing the row total.

    Every entry is divided by ``step`` and then moved to either its floor or
    its ceiling. Among all floor/ceiling combinations whose entries add up to
    ``round(1 / step)`` (50 for the default step), the one with the smallest
    mean absolute deviation from the scaled row is kept. Ties go to the first
    combination in enumeration order (ceiling is tried before floor, and the
    leftmost column varies slowest).

    Parameters
    ----------
    values : array-like of shape (rows, columns)
        Rows of ratios, each expected to sum to 1. The input is not modified.
    step : float, default 0.02
        Grid size every output entry must be a multiple of.

    Returns
    -------
    numpy.ndarray
        New float array of the same shape holding the rounded rows.

    Raises
    ------
    ValueError
        If no floor/ceiling combination of a row reaches the target total,
        which happens when the row does not sum to 1.
    """
    from itertools import product

    scaled = np.asarray(values, dtype=float) / step
    target = round(1 / step)
    upper = np.ceil(scaled)
    lower = np.floor(scaled)
    for i in range(scaled.shape[0]):
        # Reset per row: previously a row without any valid combination
        # silently reused the previous row's result (or failed with an
        # UnboundLocalError when it was the first row).
        best_option = None
        smallest_error = np.inf
        # 2**columns candidates per row, enumerated in the same order as
        # nested per-column loops over [ceil, floor].
        for combo in product(*zip(upper[i], lower[i])):
            if sum(combo) != target:
                continue
            candidate = np.array(combo)
            error = np.mean(np.abs(candidate - scaled[i]))
            if error < smallest_error:
                smallest_error = error
                best_option = candidate
        if best_option is None:
            raise ValueError(
                'row {} cannot be rounded to multiples of {} with total {}; '
                'does it sum to 1?'.format(i, step, target * step)
            )
        scaled[i] = best_option
    return scaled * step

In [12]:
def get_prediction(prob_predict, submit=False, name='submission'):
    """Aggregate per-row class probabilities into per-group ratios.

    The four probability columns are stored on the module-level ``testing``
    frame (a deliberate side effect kept from the original), averaged per
    ``Group`` and then snapped onto the 0.02 grid by ``normalize``.

    Parameters
    ----------
    prob_predict : array-like of shape (len(testing), 4)
        Class probabilities per test row; columns are read in the order
        Fail, Pass, Good, Excellent.
        NOTE(review): presumably the model's class order 0..3 - confirm.
    submit : bool, default False
        When true, write ``<name>.csv`` (raw group means) and
        ``<name>_rounded.csv`` (after ``normalize``).
    name : str, default 'submission'
        File-name stem for the two CSV files.

    Returns
    -------
    (matrix1, matrix2) : tuple of numpy.ndarray
        Raw and rounded group ratios, columns ordered Excellent, Good, Pass,
        Fail, one row per group.

    Raises
    ------
    ValueError
        If ``prob_predict`` does not have one row per test row and four columns.
    """
    class_cols = ['Fail ratio', 'Pass ratio', 'Good ratio', 'Excellent ratio']
    output_cols = ['Excellent ratio', 'Good ratio', 'Pass ratio', 'Fail ratio']

    prob_predict = np.asarray(prob_predict)
    if prob_predict.shape != (len(testing), len(class_cols)):
        raise ValueError(
            'prob_predict has shape {}, expected {}'.format(
                prob_predict.shape, (len(testing), len(class_cols))
            )
        )
    # Column-by-column assignment creates the columns directly, so no
    # zero-filled placeholders are needed first.
    for position, column in enumerate(class_cols):
        testing[column] = prob_predict[:, position]

    # Average only the ratio columns: taking the mean of every column of
    # `testing` is wasted work and raises on pandas >= 2 if any column is
    # non-numeric.
    submission = testing.groupby('Group', as_index=False)[class_cols].mean()
    submission = submission[['Group'] + output_cols]
    if submit:
        submission.to_csv('{}.csv'.format(name), index=False)
    matrix1 = submission[output_cols].values.copy()
    matrix2 = normalize(matrix1)
    submission[output_cols] = matrix2.copy()
    if submit:
        submission.to_csv('{}_rounded.csv'.format(name), index=False)
    return matrix1, matrix2

In [13]:
# Aggregate the averaged fold predictions per group and write both CSV files
# ('submission.csv' and 'submission_rounded.csv', from the default name).
matrix1, matrix2 = get_prediction(prob_predict, submit=True)