In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.metrics import confusion_matrix,precision_recall_curve,auc,roc_auc_score,roc_curve,recall_score,classification_report,accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE



In [4]:
data = pd.read_csv("creditcard.csv")
data['normAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
data = data.drop(['Time','Amount'],axis=1)
X = data.iloc[:,data.columns != 'Class']
Y = data.iloc[:,data.columns == 'Class']
pca = PCA(n_components=25)
X = pca.fit_transform(X)
X_train,testX,Y_train,testY = train_test_split(X,Y,test_size=0.20,random_state=21, stratify=Y)
X_train,valX,Y_train,valY = train_test_split(X_train,Y_train,test_size=0.20,random_state=21, stratify=Y_train)

In [5]:
X_train = np.array(X_train)
valX = np.array(valX)
testX = np.array(testX)
Y_train = np.array(Y_train)
valY = np.array(valY)
testY = np.array(testY)

# Cross Validation

In [None]:
xg = XGBClassifier()
param_grid = {'learning_rate':np.arange(0.01,0.5,0.1),'n_estimators': range(50,100),'min_child_weight':range(2,4),
              'gamma': range(0,3)}
CV_lr = GridSearchCV(estimator=xg,param_grid=param_grid,cv=5,scoring='f1',n_jobs=-1)
CV_lr.fit(X=X_train,y=Y_train)
best_param = CV_lr.best_params_
print("Best Paramters for 50/50 Splits: ",best_param)

# Test Set Predictions

In [10]:
xg = XGBClassifier(n_estimators=70,learning_rate=0.31, gamma=0, min_child_weight=2)
xg.fit(X_train,Y_train)
y_pred1 = xg.predict(valX)
y_pred2 = xg.predict(testX)
print ("Score on train set is: ", accuracy_score(valY,y_pred1))
print ("Score for test data is", accuracy_score(testY,y_pred2))
print("Classification report for train set")
print(classification_report(valY,y_pred1))
print("Confusion matrix for train set")
print(confusion_matrix(valY,y_pred1))
print("Confusion matrix for train set")
print(confusion_matrix(testY,y_pred2))
print("Classification report for test set")
print(classification_report(testY,y_pred2))

Score on train set is:  0.9994294366784436
Score for test data is 0.9995786664794073
Classification report for train set
             precision    recall  f1-score   support

          0       1.00      1.00      1.00     45490
          1       0.96      0.70      0.81        79

avg / total       1.00      1.00      1.00     45569

Confusion matrix for train set
[[45488     2]
 [   24    55]]
Confusion matrix for train set
[[56858     6]
 [   18    80]]
Classification report for test set
             precision    recall  f1-score   support

          0       1.00      1.00      1.00     56864
          1       0.93      0.82      0.87        98

avg / total       1.00      1.00      1.00     56962

