In [136]:
import pandas as pd
import xgboost as xgb
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score

In [99]:
# XGBoost hyperparameters used for training; they are also embedded in the
# file name of the saved model.
ETA = 0.3              # passed to XGBoost as 'eta'
MAX_DEPTH = 3          # passed to XGBoost as 'max_depth'
MIN_CHILD_WEIGHT = 1   # passed to XGBoost as 'min_child_weight'
NUM_BOOST_ROUND = 100  # number of boosting rounds given to xgb.train

In [100]:
# Extract the dataset archive into ../dataset/ (-d); -o overwrites files from a
# previous extraction without prompting, so the cell can be re-run safely.
!unzip -o "../dataset/creditcard.zip" -d "../dataset/"

Archive:  ../dataset/creditcard.zip
  inflating: ../dataset/creditcard.csv  


In [101]:
# Load the CSV extracted from the archive into a DataFrame.
df = pd.read_csv("../dataset/creditcard.csv")

In [102]:
# Lower-case every column label so later cells can refer to the target as "class".
df.columns = [column.lower() for column in df.columns]

In [103]:
# Hold out 20% of the rows as the test set, then split 25% of the remainder
# (i.e. 20% of the whole dataset) off as validation: a 60/20/20 split overall.
# NOTE(review): neither split is stratified on the target column - confirm this
# is intended if the classes are imbalanced.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.20,
    random_state=1,
)
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=1,
)

In [104]:
# Pull the "class" target out of each split as a NumPy array, before the
# column is removed from the feature frames.
y_train, y_val, y_test, y_full_train = (
    frame["class"].values
    for frame in (df_train, df_val, df_test, df_full_train)
)

In [105]:
# Remove the target from every feature frame so it cannot leak into the model inputs.
for frame in (df_train, df_val, df_test, df_full_train):
    del frame["class"]

In [106]:
# Replace the shuffled row labels left by train_test_split with a fresh 0..n-1
# index on every split. All four frames are handled the same way (reassignment
# rather than inplace=True), so re-running the cell never mutates an object
# that another cell may already have displayed.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
df_full_train = df_full_train.reset_index(drop=True)

In [107]:
def prepare_data(df_train, y_train):
    """Encode a feature frame and bundle it with its labels as an XGBoost DMatrix.

    A new DictVectorizer (dense output) is fitted on the row dictionaries of
    ``df_train``. The encoded matrix is then wrapped in a ``DMatrix`` labelled
    with ``y_train`` and tagged with the vectorizer's feature names.

    Args:
        df_train: DataFrame of features, one row per sample.
        y_train: Labels aligned with the rows of ``df_train``.

    Returns:
        Tuple ``(dmatrix, vectorizer)``; the fitted vectorizer is returned so the
        same encoding can be applied to other data later.
    """
    vectorizer = DictVectorizer(sparse=False)
    feature_matrix = vectorizer.fit_transform(df_train.to_dict(orient='records'))

    return (
        xgb.DMatrix(
            feature_matrix,
            label=y_train,
            feature_names=list(vectorizer.get_feature_names_out()),
        ),
        vectorizer,
    )

In [108]:
def train(df_train, y_train):
    """Train an XGBoost binary classifier using the notebook-level hyperparameters.

    The features are encoded through ``prepare_data`` and a booster is fitted
    for ``NUM_BOOST_ROUND`` rounds with the ``ETA``, ``MAX_DEPTH`` and
    ``MIN_CHILD_WEIGHT`` settings.

    Args:
        df_train: DataFrame of features, one row per sample.
        y_train: Binary labels aligned with the rows of ``df_train``.

    Returns:
        Tuple ``(booster, vectorizer)``: the trained model and the fitted
        DictVectorizer needed to encode inputs for it.
    """
    dtrain, vectorizer = prepare_data(df_train, y_train)
    booster = xgb.train(
        {
            'eta': ETA,
            'max_depth': MAX_DEPTH,
            'min_child_weight': MIN_CHILD_WEIGHT,
            'objective': 'binary:logistic',
            'seed': 1,  # fixed seed passed to XGBoost
            'verbosity': 1
        },
        dtrain,
        num_boost_round=NUM_BOOST_ROUND,
    )
    return booster, vectorizer

In [125]:
def predict(df, dv, model):
    """Score the rows of ``df`` with a trained booster.

    Args:
        df: DataFrame of features, one row per sample.
        dv: Already-fitted DictVectorizer; it is only used to transform, never refitted.
        model: Trained model exposing ``predict`` on an ``xgb.DMatrix``.

    Returns:
        Whatever ``model.predict`` yields for the encoded rows (probabilities for
        the 'binary:logistic' booster trained in this notebook).
    """
    encoded_rows = dv.transform(df.to_dict(orient='records'))
    dmatrix = xgb.DMatrix(
        encoded_rows,
        feature_names=list(dv.get_feature_names_out()),
    )
    return model.predict(dmatrix)

In [126]:
# Train the final model on the combined train + validation rows.
model, dv = train(df_train=df_full_train, y_train=y_full_train)

In [131]:
# Sanity check: ROC AUC of the final model on the held-out test split,
# rounded to three decimals for display.
y_test_pred_proba = predict(df_test, dv, model)
auc = round(
    roc_auc_score(y_true=y_test, y_score=y_test_pred_proba),
    3,
)
auc

0.965

In [145]:
# Build the output path; the file name records the hyperparameters of this run.
output_file = '../model/xgboost_eta=%s_depth=%s_minchild=%s_round=%s.bin' % (
    ETA,
    MAX_DEPTH,
    MIN_CHILD_WEIGHT,
    NUM_BOOST_ROUND,
)
output_file

'../model/xgboost_eta=0.3_depth=3_minchild=1_round=100.bin'

In [146]:
# Persist the booster and its fitted DictVectorizer together as one pickled tuple.
# NOTE(review): open() does not create missing directories - confirm ../model/ exists.
with open(output_file, mode='wb') as model_file:
    pickle.dump((model, dv), model_file)