In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))
        
from pathlib import Path
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
# from lightgbm import LGBMClassifier
#ignore warning messages 
# import warnings
# warnings.filterwarnings('ignore') 
# import random 
# from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold


from sklearn.metrics import log_loss

# Root folder of the competition data inside Kaggle's read-only input directory.
COMPETITION_DIR = '/kaggle/input/tabular-playground-series-nov-2022/'
path = Path(COMPETITION_DIR)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Competition inputs, both indexed by row id: the training labels and the
# submission template that is filled in at the end of the notebook.
labels = pd.read_csv(path / 'train_labels.csv', index_col='id')
submission = pd.read_csv(path / 'sample_submission.csv', index_col='id')

# ids of the labeled rows and of the submission rows (kept for later use)
gt_ids = labels.index
sub_ids = submission.index

# sorted names of every entry found in the submission folder
submission_dir = path / 'submission_files'
subs = sorted(entry.name for entry in submission_dir.iterdir())

**Loading the predictions of every submission file into the X_train dataframe**

In [None]:
# Stack the `pred` column of every submission file side by side:
# one row per prediction row, one column per submission file.
submission_dir = path / 'submission_files'

# The first file is only used to learn how many rows each file has.
s0 = pd.read_csv(submission_dir / subs[0], index_col='id')

n_rows, n_files = s0.shape[0], len(subs)
pred_matrix = np.zeros((n_rows, n_files))
for col_idx, file_name in enumerate(subs):
    # Values are copied by position; the `id` index of each file is not compared.
    file_frame = pd.read_csv(submission_dir / file_name, index_col='id')
    pred_matrix[:, col_idx] = file_frame['pred'].values

X_train = pd.DataFrame(pred_matrix, columns=subs)

In [None]:
# Preview the first 10 rows of the stacked prediction matrix.
X_train.head(10)

**Clipping predictions and dropping bad submissions (AUC or accuracy below 0.5)**

In [None]:
# Force every prediction into the closed range given below.
# NOTE(review): the two margins differ (1e-7 at the bottom, 1e-8 at the top) —
# confirm the asymmetry is intentional.
X_train = X_train.clip(lower=0.0000001, upper=0.99999999)

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score

# Score every submission column against the known labels.
# The number of labeled rows is taken from `labels` instead of a hard-coded
# 20000, so the slice can never disagree with the label count.
# Assumes the labeled rows are the first len(labels) rows of every file
# (same assumption the original positional slice made).
n_labeled = len(labels)
labeled_preds = X_train.iloc[:n_labeled]

aucs = {}
acc = {}

for column in labeled_preds.columns:
    column_preds = labeled_preds[column]
    aucs[column] = roc_auc_score(labels, column_preds)
    # Accuracy is measured on the probabilities rounded to hard 0/1 predictions.
    acc[column] = accuracy_score(labels, np.round(column_preds))

# Build the Series directly from the dicts: the previous
# DataFrame.from_dict(...).squeeze() collapses to a scalar when there is only
# one column, which breaks the .rename()/.lt() calls that follow.
auc_df = pd.Series(aucs, name='AUC', dtype=float)
acc_df = pd.Series(acc, name='ACC', dtype=float)
print('Models with flipped probabilites: ')
print(auc_df[auc_df.lt(0.5)]) 
print(acc_df[acc_df.lt(0.5)]) 

In [None]:
# Submissions scoring below 0.5 on either metric are removed from the features.
low_auc_columns = auc_df.index[auc_df.lt(0.5)]
low_acc_columns = acc_df.index[acc_df.lt(0.5)]

drop_cols1 = low_auc_columns.tolist()
# Keep only the names not already in drop_cols1 so no column is dropped twice.
drop_cols2 = [col for col in low_acc_columns if col not in drop_cols1]

# Flipping these columns (1 - p) was tried as an alternative and is disabled;
# they are dropped instead.
X_train = X_train.drop(columns=drop_cols1 + drop_cols2)

In [None]:
# Row-wise summary features over the submission columns only.
# Both statistics are computed from the same base columns: previously 'std'
# was calculated after 'mean' had been appended, so the mean itself was counted
# as an extra sample. Excluding existing 'mean'/'std' columns also makes the
# cell safe to re-run.
base_preds = X_train.drop(columns=['mean', 'std'], errors='ignore')
X_train['mean'] = base_preds.mean(axis=1)
X_train['std'] = base_preds.std(axis=1)

In [None]:
# Disabled filter: would keep only the columns whose values all lie in [0, 1].
# X_train = X_train.loc[:, X_train.max()<=1]
# X_train = X_train.loc[:, X_train.min()>=0]

# Shape of the feature matrix after the steps above.
X_train.shape

**PCA**

Thanks to @infrarosso for the PCA approach! Please upvote his notebook:

https://www.kaggle.com/code/infrarosso/tps-nov-2022-eda-lgbm-stacking
(link for upvoting)

In [None]:
VARIANCE_TH = 0.98

# Fit a PCA with all components to see how the variance is distributed.
pca = PCA()
pca_samples = pca.fit_transform(X_train)

# Cumulative share of the total variance explained by the first 1, 2, ... components.
explained = pca.explained_variance_
cumulative_ratio = np.cumsum(explained) / explained.sum()

# k = smallest number of leading components whose cumulative share is strictly
# above VARIANCE_TH (same rule as the previous while-loop). The result is capped
# at the number of available components: the loop used to index past the end of
# explained_variance_ (IndexError) when the threshold was never exceeded,
# e.g. with VARIANCE_TH = 1.0.
k = int(np.searchsorted(cumulative_ratio, VARIANCE_TH, side='right')) + 1
k = min(k, len(explained))
print(F"{VARIANCE_TH*100:.0f}% explained variance with {k}/{len(X_train.columns)} features")

In [None]:
# Refit PCA keeping only the first k components and project every row onto them.
# random_state is fixed because, with the default svd_solver='auto',
# scikit-learn may choose a randomized solver when n_components is small
# relative to the data, which would make the reduced features differ between runs.
pca = PCA(n_components=k, random_state=42)
X_train_reduced = pd.DataFrame(pca.fit_transform(X_train))
X_train_reduced.shape

In [None]:
# Split the reduced features into the labeled part (used for training) and the
# remaining rows (to be predicted). The boundary comes from the number of
# labels rather than a hard-coded 20000, so X and y always have the same length.
# Assumes the labeled rows come first — the same assumption the original
# positional slices made.
n_labeled = len(labels)
X = X_train_reduced.iloc[:n_labeled]
X_test = X_train_reduced.iloc[n_labeled:]
y = labels

In [None]:
from sklearn.model_selection import train_test_split

# X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2, random_state=42)

Thanks to @craigmthomas for some CatBoost ideas:

https://www.kaggle.com/code/craigmthomas/tps-nov-2022-catboost-starter
(link for upvoting :)

**Catboost model**

In [None]:
import catboost
from sklearn.model_selection import StratifiedKFold

# Stratified k-fold training of a CatBoost classifier on the PCA features.
# Each fold trains on the other folds, early-stops on its own validation fold
# and contributes one set of test-set probabilities to `preds`.

n_folds = 10

k_fold = StratifiedKFold(n_splits=n_folds, random_state=42, shuffle=True)

preds_eval = []  # defined for compatibility; nothing in this cell fills it
preds = []       # one array of test-set probabilities per fold

MAX_ITER = 10000
PATIENCE = 100
DISPLAY_FREQ = 100

# Presumably the learning rates tried while tuning (kept from the original notes):
# 0.016 -> 0.015 -> 0.01475 -> 0.01462 -> 0.01459 -> 0.01456 <- 0.01453 <- 0.0145 <- 0.01425 <- 0.014 <- 0.01 <- 0.005

MODEL_PARAMS = {'random_seed': 1234,    
                'learning_rate': 0.01459,                
                'iterations': MAX_ITER,
                'early_stopping_rounds': PATIENCE,
                'metric_period': DISPLAY_FREQ,
                'use_best_model': True,
                'eval_metric': 'Logloss',
                'verbose': 1,
                'task_type': 'GPU'
               }

model = catboost.CatBoostClassifier(**MODEL_PARAMS)

for train_index, test_index in k_fold.split(X, y):
    # Fold-specific names: the loop used to rebind the global `X_train`
    # (the stacked feature frame built earlier) to a fold subset, so re-running
    # any earlier cell after training silently worked on the wrong data.
    X_fold_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_fold_train, y_valid = y.iloc[train_index], y.iloc[test_index]

    # early_stopping_rounds and metric_period are already set through
    # MODEL_PARAMS with the same values, so they are not repeated here.
    model.fit(X=X_fold_train, y=y_fold_train,
              eval_set=[(X_valid, y_valid)]
             )

    preds.append(model.predict_proba(X_test)[:,1])
    print("Logloss valid = {}".format(log_loss(y_valid, model.predict_proba(X_valid)[:,1])))
    # Computed on all labeled rows, including the ones this fold trained on.
    print("Logloss full = {}".format(log_loss(labels, model.predict_proba(X)[:,1])))
    


In [None]:
# score = log_loss(labels, model.predict_proba(X))
# score

**Making prediction**

In [None]:
# Per-fold test-set probability arrays collected in the cross-validation loop.
preds

In [None]:
# Blend the folds: element-wise mean of the per-fold prediction arrays.
fold_matrix = np.asarray(preds)
pred = fold_matrix.mean(axis=0)
pred

In [None]:
# Put the blended probabilities into the `pred` column of the submission
# template (values are matched to rows by position) and display the result.
submission = submission.assign(pred=pred)
submission

In [None]:
# Write the final predictions to the working directory; the `id` index is
# written as the first column (to_csv keeps the index by default).
submission.to_csv('submission.csv')