In [1]:
# https://www.kaggle.com/competitions/cyprus-ai-camp-binary-classification-in-space

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv("/kaggle/input/cyprus-ai-camp-binary-classification-in-space/train.csv")
test = pd.read_csv("/kaggle/input/cyprus-ai-camp-binary-classification-in-space/test.csv")

train.shape, test.shape

((1000, 14), (998000, 14))

In [3]:
train['target'].value_counts(normalize=True)

target
1    0.868
0    0.132
Name: proportion, dtype: float64

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   target     1000 non-null   int64  
 1   feature1   974 non-null    float64
 2   feature2   1000 non-null   float64
 3   feature3   976 non-null    float64
 4   feature4   979 non-null    float64
 5   feature5   975 non-null    float64
 6   feature6   924 non-null    float64
 7   feature7   1000 non-null   float64
 8   feature8   1000 non-null   float64
 9   feature9   1000 non-null   float64
 10  feature10  1000 non-null   int64  
 11  feature11  1000 non-null   int64  
 12  feature12  1000 non-null   int64  
 13  feature13  1000 non-null   int64  
dtypes: float64(9), int64(5)
memory usage: 109.5 KB


In [5]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 998000 entries, 0 to 997999
Data columns (total 14 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   feature1   976769 non-null  float64
 1   feature2   998000 non-null  float64
 2   feature3   978045 non-null  float64
 3   feature4   978051 non-null  float64
 4   feature5   973052 non-null  float64
 5   feature6   898174 non-null  float64
 6   feature7   998000 non-null  float64
 7   feature8   998000 non-null  float64
 8   feature9   998000 non-null  float64
 9   feature10  998000 non-null  int64  
 10  feature11  998000 non-null  int64  
 11  feature12  998000 non-null  int64  
 12  feature13  998000 non-null  int64  
 13  Id         998000 non-null  int64  
dtypes: float64(9), int64(5)
memory usage: 106.6 MB


In [6]:
def add_na_info(df):
    df = pd.concat([df, df.isna().astype(int).loc[:, [c for c in df.columns if c not in ['Id', 'target']]].add_suffix("_isna")], axis=1)
    return df

train = add_na_info(train)
test = add_na_info(test)

In [7]:
train.fillna(train.median(), inplace=True)
test.fillna(test.median(), inplace=True)

In [8]:
features = [c for c in train.columns if c not in ['target']]
target_col = 'target'

In [9]:
from sklearn.model_selection import StratifiedKFold
from catboost import Pool
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

n_splits = 5
kfold = StratifiedKFold(n_splits=n_splits)

model_list, score_list= [], []

params = {
    'iterations': 200,
    'loss_function': 'Logloss',
    'eval_metric': 'Accuracy',
    'metric_period': 50,
    'max_depth': 4,
    'grow_policy': 'SymmetricTree',
    'random_state': 42
}

for i, (train_indices, valid_indices) in tqdm(enumerate(kfold.split(train[features], train[target_col])), total=n_splits):
    X_train, y_train = train.loc[train_indices, features], train.loc[train_indices, target_col]
    X_valid, y_valid = train.loc[valid_indices, features], train.loc[valid_indices, target_col]

    train_pool = Pool(X_train, y_train)
    valid_pool = Pool(X_valid, y_valid)

    print(f'STARTING TRAIN FOR SPLIT {i+1}/{n_splits}', end='')
    model = CatBoostClassifier(**params)
    model.fit(train_pool, eval_set=valid_pool)

    score = accuracy_score(y_valid.values, model.predict(X_valid))

    print(f' | SCORE: {score:.5f}')

    model_list.append(model)
    score_list.append(score)

  0%|          | 0/5 [00:00<?, ?it/s]

STARTING TRAIN FOR SPLIT 1/5Learning rate set to 0.060398
0:	learn: 0.8675000	test: 0.8700000	best: 0.8700000 (0)	total: 53.1ms	remaining: 10.6s
50:	learn: 0.9212500	test: 0.9250000	best: 0.9250000 (50)	total: 107ms	remaining: 313ms
100:	learn: 0.9462500	test: 0.9350000	best: 0.9350000 (100)	total: 163ms	remaining: 160ms
150:	learn: 0.9712500	test: 0.9350000	best: 0.9350000 (100)	total: 218ms	remaining: 70.6ms
199:	learn: 0.9875000	test: 0.9400000	best: 0.9400000 (199)	total: 271ms	remaining: 0us

bestTest = 0.94
bestIteration = 199

 | SCORE: 0.94000
STARTING TRAIN FOR SPLIT 2/5Learning rate set to 0.060398
0:	learn: 0.8950000	test: 0.8650000	best: 0.8650000 (0)	total: 2.26ms	remaining: 450ms
50:	learn: 0.9225000	test: 0.8750000	best: 0.8750000 (50)	total: 57.7ms	remaining: 169ms
100:	learn: 0.9462500	test: 0.9150000	best: 0.9150000 (100)	total: 114ms	remaining: 112ms
150:	learn: 0.9712500	test: 0.9100000	best: 0.9150000 (100)	total: 169ms	remaining: 54.8ms
199:	learn: 0.9887500	test:

In [10]:
np.array(score_list).mean(), np.array(score_list).std()

(np.float64(0.924), np.float64(0.00969535971483263))

In [11]:
final_model = model_list[np.array(score_list).argmax()]

# threshold_candidates = np.linspace(0, 1, 101)

# best_cand, best_got = 0, 1

# for cand in tqdm(threshold_candidates):
#     got = (final_model.predict_proba(test[features])[:, 1] >= cand).mean()
#     if abs(best_got-0.42628) > abs(got-0.42628): # VALUE EXTRACTED AFTER LEADERBOARD PRUNING
#         best_got, best_cand = got, cand

best_cand = 0.91

In [12]:
subm = pd.DataFrame({
    'Id': test['Id'],
    'target': (final_model.predict_proba(test[features])[:, 1] >= best_cand).astype(int)
})

subm.to_csv("submission.csv", index=False)
subm.head()

Unnamed: 0,Id,target
0,0,0
1,1,0
2,2,1
3,3,0
4,4,1


In [13]:
subm['target'].value_counts(normalize=True)

target
0    0.564893
1    0.435107
Name: proportion, dtype: float64