<a href="https://www.kaggle.com/code/anirudhg15/single-model-tps-may-2022-w-xgb-w-b?scriptVersionId=135566154" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import os
import pandas as pd
import numpy as np
import xgboost as xgb
import wandb
from sklearn import pipeline
from sklearn import compose
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.experimental import enable_iterative_imputer
from sklearn import impute
from sklearn import metrics
from wandb.xgboost import WandbCallback
from kaggle_secrets import UserSecretsClient

In [None]:
# Authenticate with Weights & Biases: read the secret stored under the label
# "wandb_api" from the Kaggle secrets client and pass it to wandb.login as the
# API key, so the key never appears in the notebook itself.
user_secrets = UserSecretsClient()
api_key = user_secrets.get_secret("wandb_api")
wandb.login(key=api_key)

In [None]:
# Load the competition's train and test tables (train first, then test) from
# the Kaggle input directory.
df_train, df_test = (
    pd.read_csv(f'/kaggle/input/tabular-playground-series-may-2022/{split}.csv')
    for split in ('train', 'test')
)

In [None]:
# New feature: the number of distinct characters in each row's 'f_27' string,
# computed identically for the train and test frames.
df_train['f_27_unique_len'] = df_train['f_27'].map(lambda text: len(set(text)))
df_test['f_27_unique_len'] = df_test['f_27'].map(lambda text: len(set(text)))

In [None]:
# Expand 'f_27' into ten integer columns, one per character position
# (0 through 9): each value is the character's code point offset from 'A'.
for i in range(10):
    df_train[f'f_27_ord{i}'] = df_train['f_27'].str[i].map(ord).sub(ord('A'))
    df_test[f'f_27_ord{i}'] = df_test['f_27'].str[i].map(ord).sub(ord('A'))

# The raw string column has been fully encoded above, so it is removed.
df_train = df_train.drop(columns=['f_27'])
df_test = df_test.drop(columns=['f_27'])

In [None]:
# Split the engineered training frame into train/validation parts inside a
# dedicated W&B run, write both parts to CSV, and log them together as one
# artifact whose metadata records the split parameters. The context manager
# closes the run when the block exits.
# NOTE(review): this run is named 'dev_run' while the later runs in this
# notebook use 'dev-run' — confirm whether the difference is intended.
with wandb.init(entity=None, project='my-test-project', name='dev_run',
                job_type='train-val-split') as run:

    # Split parameters; also stored in the artifact metadata below.
    test_size = 0.2
    random_state = 42

    train_data, val_data = model_selection.train_test_split(
        df_train, test_size=test_size, random_state=random_state)

    # Artifact describing the split (description typo "valiation" corrected).
    split_log = wandb.Artifact(name='kaggle_tps_may_split',
                               type='train-val-dataset',
                               description='Full-train dataset split into train and validation',
                               metadata={'test_size': test_size,
                                         'random_state': random_state}
                               )

    # Persist both parts to the Kaggle working directory so they can be
    # attached to the artifact as files.
    train_path = '/kaggle/working/train_split.csv'
    val_path = '/kaggle/working/val_split.csv'
    train_data.to_csv(train_path, index=False)
    val_data.to_csv(val_path, index=False)

    # Attach our processed data to the Artifact
    split_log.add_file(train_path)
    split_log.add_file(val_path)

    # Log the Artifact
    run.log_artifact(split_log)

In [None]:
# Open a W&B run tagged with the 'model_pipeline_def' job type; the handle is
# kept in `run`.
run = wandb.init(
    job_type='model_pipeline_def',
    name='dev-run',
    project='my-test-project',
)

In [None]:
# Preprocessing for numeric columns: min-max scaling is the only active step.
numeric_transformer = pipeline.Pipeline(
    steps=[
        # Disabled: ('imputer', impute.IterativeImputer(random_state=42)),
        ('scaler', preprocessing.MinMaxScaler()),
    ],
)

# Disabled categorical branch, kept for reference:
# categorical_transformer = pipeline.Pipeline(steps=[
#     ('encoder', preprocessing.OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=9999999))
# ])

In [None]:
# Column-wise preprocessing: every column whose dtype is not 'object' is routed
# through `numeric_transformer`. No other transformer is active.
preprocessor = compose.ColumnTransformer(
    transformers=[
        (
            'num',
            numeric_transformer,
            compose.make_column_selector(dtype_exclude='object'),
        ),
        # Disabled: ('cat', categorical_transformer, compose.make_column_selector(dtype_include='object'))
    ],
)

In [None]:
# Separate the 'target' label from the remaining columns for both splits.
# Copies are taken so later edits do not touch train_data / val_data.
# NOTE(review): only 'target' is removed here; if the CSV carries a
# row-identifier column it stays in the feature set — confirm intended.
train_features = train_data.drop(columns=['target']).copy()
train_target = train_data['target'].copy()
val_features = val_data.drop(columns=['target']).copy()
val_target = val_data['target'].copy()

In [None]:
# Start the W&B run that records model training.
#
# The run opened earlier for 'model_pipeline_def' is never finished
# explicitly. Finish any still-active run first so this call always creates a
# fresh run with the 'model_training' job type, rather than depending on how
# wandb.init treats an already-active run in the current environment.
# wandb.finish() is expected to do nothing when no run is active.
wandb.finish()
run = wandb.init(project='my-test-project',
                 name='dev-run',
                 job_type='model_training')

In [None]:
# Gradient-boosted classifier configured for GPU histogram training, with a
# W&B callback attached so the callback receives training events.
xgb_model = xgb.XGBClassifier(
    tree_method='gpu_hist',
    max_depth=11,
    learning_rate=0.15,
    n_estimators=3000,
    gamma=0.2,
    subsample=0.9,
    reg_lambda=21,
    max_delta_step=7,
    # Disabled: colsample_bytree=0.95,
    # Disabled: colsample_bylevel=0.7,
    random_state=42,
    eval_metric='auc',
    callbacks=[WandbCallback()],
)

# Full model: column preprocessing followed by the XGBoost classifier.
clf_pipe = pipeline.Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('xgb_model', xgb_model),
    ],
)

In [None]:
# Fit the preprocessing + classifier pipeline on the training split.
clf_pipe.fit(X=train_features, y=train_target)

In [None]:
# Score the validation split and keep the second probability column
# (index 1) as the prediction used for evaluation.
val_proba = clf_pipe.predict_proba(val_features)
preds = val_proba[:, 1]

In [None]:
# Compute ROC AUC on the validation split and record it on the active run
# under the key 'roc_auc_score'.
val_auc = metrics.roc_auc_score(y_true=val_target, y_score=preds)
run.log({'roc_auc_score': val_auc})

In [None]:
# Close the run held in `run` now that the validation metric has been logged.
run.finish()

In [None]:
# Build the submission: start from the sample submission file, replace its
# 'target' column with the fitted pipeline's index-1 probability for each test
# row, and write the result to 'submission.csv'.
# NOTE(review): assumes the sample submission rows are in the same order as
# test.csv — confirm.
sub = pd.read_csv('/kaggle/input/tabular-playground-series-may-2022/sample_submission.csv')
pred_test = clf_pipe.predict_proba(df_test)[:, 1]
sub = sub.assign(target=pred_test)
sub.to_csv('submission.csv', index=False)

# Preview the first rows of the submission.
sub.head()