<a href="https://www.kaggle.com/code/ahana09/ps-s3e4?scriptVersionId=118288111" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings(action='ignore')
import plotly.express as px

from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, Normalizer

from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm.sklearn import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

In [None]:
# Read the competition train/test files, the external `creditcardfraud` table
# (kept as `original`) and the sample submission template, in that order.
train, test, original, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s3e4/train.csv",
        "/kaggle/input/playground-series-s3e4/test.csv",
        "/kaggle/input/creditcardfraud/creditcard.csv",
        "/kaggle/input/playground-series-s3e4/sample_submission.csv",
    )
)

In [None]:
# Preview the first rows of the competition training frame.
train.head()

In [None]:
# Preview the first rows of the frame loaded from creditcard.csv.
original.head()

In [None]:
# Report (rows, columns) for each loaded frame, one tuple per line.
print(train.shape, test.shape, original.shape, sep="\n")

In [None]:
# Remove the `id` column so it is not picked up as a feature later on.
# errors='ignore' keeps the cell re-runnable: a second execution is a no-op
# instead of raising KeyError because `id` is already gone.
train = train.drop(columns='id', errors='ignore')

In [None]:
# Summary of column dtypes and non-null counts for the training frame.
train.info()

In [None]:
# Annotated heatmap of the pairwise correlation matrix of every column in `train`.
fig, ax = plt.subplots(figsize=(25, 25))
sns.heatmap(train.corr(), annot=True, ax=ax)
plt.show()

In [None]:
# Number of distinct values per column, smallest first.
train.nunique().sort_values()

In [None]:
# Rank every column by the magnitude of its correlation with `Class`.
correlation_with_target = (
    train.corr()['Class']
    .abs()
    .sort_values(ascending=False)
)
correlation_with_target

* Features V3, V1, V14, and V8 are highly correlated with the target (`Class`).

In [None]:
# Rank every column by the magnitude of its correlation with `Amount`.
correlation_with_amount = (
    train.corr()['Amount']
    .abs()
    .sort_values(ascending=False)
)
correlation_with_amount

* Features V2 and V20 are correlated with `Amount` (absolute correlations of 0.56 and 0.53, respectively).

In [None]:
# Bar chart of row counts per `Class` value; the trailing `;` suppresses the
# Axes repr that would otherwise be echoed as cell output.
sns.countplot(x = 'Class', hue = 'Class', data = train);

* The training dataset is heavily imbalanced.

# Pre-processing

In [None]:
# Bring the external frame onto the competition schema (integer `Class`, same
# columns in the same order as `train`) and append its rows to the training
# data under a fresh 0..n-1 index.
original = original.assign(
    Class=original['Class'].eq(1).astype(np.int64)
)[list(train.columns)]
train = pd.concat([train, original], ignore_index=True)

In [None]:
# Shape of the training frame after the external rows were appended.
train.shape

In [None]:
# Drop the `id` column from the test frame and split the combined training
# frame into the feature matrix `X` and the target vector `y`.
# errors='ignore' keeps the cell re-runnable: once `id` has been removed, a
# second execution no longer raises KeyError.
test = test.drop(columns='id', errors='ignore')
X = train.drop(columns='Class')
y = train['Class']

In [None]:
# Scale the model inputs with statistics learned from the training features.
#
# The scaler is fitted on `X` only (never on the target column and never on
# the test set) and the *same* fitted transform is then applied to `test`, so
# the models are trained and queried on identically scaled features.
# NOTE: this rebinds `X` and `test`; run it once per fresh `X`/`test`, since a
# second execution would scale the already-scaled test frame again.
rs = RobustScaler()
X = pd.DataFrame(rs.fit_transform(X), columns=X.columns, index=X.index)
# Select the training columns explicitly so the column order matches the fit.
test = pd.DataFrame(rs.transform(test[X.columns]), columns=X.columns, index=test.index)

# Modeling

In [None]:
# Settings shared by the training cells: number of CV folds, the seed used for
# the fold shuffling, and the number of boosting rounds per model.
SPLITS, RANDOM, ESTIMATORS = 5, 50, 100

# Accumulators filled in by later cells: fitted fold models, per-fold AUC
# scores, and per-model test-set probabilities.
clfs, scores, y_pred = [], [], []

In [None]:
# Stratified K-fold cross-validation for CatBoost.
# Each fold's fitted model is kept in `clfs` for the final blend, and each
# fold's validation ROC AUC is added to the shared `scores` list.
skf = StratifiedKFold(n_splits=SPLITS, shuffle=True, random_state=RANDOM)

catboost_scores = []  # AUC per fold for this model only
for train_idx, test_idx in skf.split(X, y):
    # skf.split yields positional indices, so select rows with .iloc; this
    # stays correct even if the frame's index is not a clean 0..n-1 range.
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    clf = CatBoostClassifier(n_estimators=ESTIMATORS, task_type="GPU")
    # Plain arrays are used for fit, eval_set and predict alike so the model
    # sees one consistent input type within the fold.
    clf.fit(
        X_train.values,
        y_train,
        eval_set=[(X_test.values, y_test)],
        early_stopping_rounds=20,
        verbose=False,
    )
    preds = clf.predict_proba(X_test.values)
    clfs.append(clf)
    catboost_scores.append(roc_auc_score(y_test, preds[:, 1]))

scores.extend(catboost_scores)
# Report the mean over this model's folds only.
print(f'mean score: {np.mean(catboost_scores)}')

In [None]:
# Stratified K-fold cross-validation for XGBoost.
# Each fold's fitted model is kept in `clfs` for the final blend, and each
# fold's validation ROC AUC is added to the shared `scores` list.
# NOTE(review): passing `early_stopping_rounds` to fit() and
# tree_method='gpu_hist' are reported as deprecated in newer xgboost
# releases — confirm against the installed version.
skf = StratifiedKFold(n_splits=SPLITS, shuffle=True, random_state=RANDOM)

xgb_scores = []  # AUC per fold for this model only
for train_idx, test_idx in skf.split(X, y):
    # skf.split yields positional indices, so select rows with .iloc.
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    clf1 = XGBClassifier(n_estimators=ESTIMATORS, tree_method='gpu_hist')
    clf1.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=20, verbose=False)
    preds = clf1.predict_proba(X_test)
    clfs.append(clf1)
    xgb_scores.append(roc_auc_score(y_test, preds[:, 1]))

scores.extend(xgb_scores)
# Average this model's folds only; averaging the shared `scores` list would
# also include folds recorded by previously run training cells.
print(f'mean score: {np.mean(xgb_scores)}')

In [None]:
# Stratified K-fold cross-validation for LightGBM.
# Each fold's fitted model is kept in `clfs` for the final blend, and each
# fold's validation ROC AUC is added to the shared `scores` list.
# NOTE(review): the `early_stopping_rounds` and `verbose` arguments of fit()
# are reported as removed in newer lightgbm releases (callbacks are used
# instead) — confirm against the installed version.
skf = StratifiedKFold(n_splits=SPLITS, shuffle=True, random_state=RANDOM)

lgbm_scores = []  # AUC per fold for this model only
for train_idx, test_idx in skf.split(X, y):
    # skf.split yields positional indices, so select rows with .iloc.
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    clf1 = LGBMClassifier(n_estimators=ESTIMATORS, device='gpu')
    clf1.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=20, verbose=False)
    preds = clf1.predict_proba(X_test)
    clfs.append(clf1)
    lgbm_scores.append(roc_auc_score(y_test, preds[:, 1]))

scores.extend(lgbm_scores)
# Average this model's folds only; averaging the shared `scores` list would
# also include folds recorded by previously run training cells.
print(f'mean score: {np.mean(lgbm_scores)}')

In [None]:
# Positive-class probability on the test set from every fitted fold model.
# The list is rebuilt from scratch on each run, so re-executing this cell
# cannot append duplicate predictions and skew the blended average.
y_pred = [c.predict_proba(test)[:, 1] for c in clfs]

In [None]:
# Blend the models: element-wise mean across the stacked per-model predictions.
final_pred = np.mean(np.stack(y_pred), axis=0)
final_pred

In [None]:
# Put the blended probabilities into the submission template's `Class` column.
sub['Class'] = final_pred

In [None]:
# Quick look at the first rows of the submission before it is written out.
sub.head()

In [None]:
# Write the submission file; index=False leaves the DataFrame index out of the CSV.
sub.to_csv('submission.csv',index=False)