In [1]:
import numpy as np
import pandas as pd


data = pd.read_csv("creditcard.csv")

data.head()

data['Class'].value_counts()

len(data['Class'])

print("percentage of fraudulent data instances: {}".format(data['Class'].value_counts()[1] *100 /len(data['Class'])))
print("percentage of normal data instances: {}".format(data['Class'].value_counts()[0] *100 /len(data['Class'])))

# Rescaling the data

from sklearn.preprocessing import RobustScaler

rs = RobustScaler()

data['scaled_amount'] = rs.fit_transform(data['Amount'].values.reshape(-1, 1))
data['scaled_time'] = rs.fit_transform(data['Time'].values.reshape(-1, 1))

data.drop(['Amount', 'Time'], axis = 1, inplace = True)

from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LogisticRegression

X = data.drop(['Class'], axis = 1)
y = data['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0, 
                                                    stratify = y)

print(X_train.shape, y_train.shape)
print(y_train.value_counts())



percentage of fraudulent data instances: 0.1727485630620034
percentage of normal data instances: 99.827251436938
(227845, 30) (227845,)
0    227451
1       394
Name: Class, dtype: int64


In [2]:
from sklearn.svm import SVC 
from sklearn.naive_bayes import GaussianNB 
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from collections import Counter

In [3]:
from imblearn.under_sampling import RandomUnderSampler


In [8]:
all_results=[]
all_dataset=dict()

In [5]:
 def training_testing_function(X,y,name):
        corr = X.corr()
        columns = np.full((corr.shape[0],), True, dtype=bool)
        for i in range(corr.shape[0]):
            for j in range(i+1, corr.shape[0]):
                if corr.iloc[i,j] >= 0.9:
                    if columns[j]:
                        columns[j] = False
        selected_columns = X.columns[columns]
        X = X[selected_columns]


        rfc = RandomForestClassifier();

        # fit random forest classifier on the training set
        rfc.fit(X, y.values.ravel());
        # extract important features
        score = np.round(rfc.feature_importances_,3)

        importances = pd.DataFrame({'feature':X.columns,'importance':score})
        importances = importances.sort_values('importance',ascending=False).set_index('feature')

        final_features=list(importances[:20].index)

        X=X[final_features]

        # Train KNeighborsClassifier Model
        KNN_Classifier = KNeighborsClassifier(n_jobs=-1)
        # KNN_Classifier.fit(X_train_random_oversampled, y_train_random_oversampled.values.ravel()); 

        # Train LogisticRegression Model
        LGR_Classifier = LogisticRegression(multi_class='auto', random_state=1,solver='lbfgs',max_iter=400)
        # LGR_Classifier.fit(X_train_random_oversampled, y_train_random_oversampled.values.ravel());

        # Train Gaussian Naive Baye Model
        GNB_Classifier = GaussianNB()
        # GNB_Classifier.fit(X_train_random_oversampled, y_train_random_oversampled.values.ravel())

        # Train Decision Tree Model
        DTC_Classifier = tree.DecisionTreeClassifier(criterion='entropy', random_state=0)
        # DTC_Classifier.fit(X_train_random_oversampled, y_train_random_oversampled.values.ravel());

        # Train RandomForestClassifier Model
        #RF_Classifier = RandomForestClassifier(criterion='entropy', n_jobs=-1, random_state=0)
        #RF_Classifier.fit(X_train, Y_train);  

        # Train SVM Model
        #SVC_Classifier = SVC(random_state=0)
        #SVC_Classifier.fit(X_train, Y_train)

        ## Train Ensemble Model (This method combines all the individual models above except RandomForest)
        combined_model = [('Naive Baye Classifier', GNB_Classifier), 
                         ('Decision Tree Classifier', DTC_Classifier), 
                         ('KNeighborsClassifier', KNN_Classifier), 
                         ('LogisticRegression', LGR_Classifier)
                        ]
        VC =  VotingClassifier(estimators = combined_model,voting = 'soft')
        VC.fit(X, y.values.ravel());

        models = []
        #models.append(('SVM Classifier', SVC_Classifier))
        # models.append(('Naive Baye Classifier', GNB_Classifier))
        # models.append(('Decision Tree Classifier', DTC_Classifier))
        #models.append(('RandomForest Classifier', RF_Classifier))
        # models.append(('KNeighborsClassifier', KNN_Classifier))
        # models.append(('LogisticRegression', LGR_Classifier))
        models.append(('VotingClassifier', VC))

        for i, v in models:
    #         accuracy = metrics.accuracy_score(y_test.values.ravel(), v.predict(X_test[final_features]))
    #         confusion_matrix = metrics.confusion_matrix(y_test.values.ravel(), v.predict(X_test[final_features]))
    #         classification = metrics.classification_report(y_test.values.ravel(), v.predict(X_test[final_features]))
            f1 = metrics.f1_score(y_test.values.ravel(),v.predict(X_test[final_features]))
            all_results.append((name,f1))
            print("{} Completed".format(name))
    #         print('============================== {} Model Test Results =============================='.format(i))
    #         print()
    #         print ("Model Accuracy:" "\n", accuracy)
    #         print()
    #         print("Confusion matrix:" "\n", confusion_matrix)
    #         print()
    #         print("Classification report:" "\n", classification) 
    #         print()        






#### 1. Random Undersample

In [4]:
undersample = RandomUnderSampler(sampling_strategy='majority')
# fit and apply the transform
all_dataset["X_train_undersample"], all_dataset["y_train_undersample"] = undersample.fit_resample(X_train, y_train)
# summarize class distribution
print(Counter(all_dataset["y_train_undersample"]))

Counter({0: 394, 1: 394})


In [None]:
training_testing_function(all_dataset["X_train_undersample"],all_dataset["y_train_undersample"],"Random Undersample")

#### 2. Instance Hardness Threshold

In [5]:
from imblearn.under_sampling import InstanceHardnessThreshold

In [6]:
iht = InstanceHardnessThreshold(sampling_strategy='majority', random_state=42)
all_dataset["X_train_res"], all_dataset["y_train_res"] = iht.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(all_dataset["y_train_res"]))

Resampled dataset shape Counter({0: 222288, 1: 394})


In [6]:
training_testing_function(all_dataset["X_train_res"],all_dataset["y_train_res"],"IHT")

#### 3. Cluster Centroid

In [31]:
from imblearn.under_sampling import ClusterCentroids

In [32]:
all_dataset["X_cluster_centroids"],all_dataset["y_cluster_centroids"]  = ClusterCentroids().fit_resample(X_train,y_train)

In [34]:
training_testing_function(all_dataset["X_cluster_centroids"],all_dataset["y_cluster_centroids"],"Cluster centroids")

Cluster centroids Completed


In [36]:
all_results

[('SMOTE', 0.7545454545454545),
 ('ADASYN', 0.6259541984732825),
 ('Borderline SMOTE', 0.7843137254901962),
 ('SVM SMOTE', 0.7839195979899498),
 ('Cluster centroids', 0.3110307414104882)]

#### 4. Near Miss

In [39]:
from imblearn.under_sampling import NearMiss

In [40]:
all_dataset['X_near_miss'],all_dataset['y_near_miss'] = NearMiss().fit_resample(X_train,y_train)
training_testing_function(all_dataset['X_near_miss'],all_dataset['y_near_miss'],"Near Miss")

Near Miss Completed


In [41]:
all_results

[('SMOTE', 0.7545454545454545),
 ('ADASYN', 0.6259541984732825),
 ('Borderline SMOTE', 0.7843137254901962),
 ('SVM SMOTE', 0.7839195979899498),
 ('Cluster centroids', 0.3110307414104882),
 ('Near Miss', 0.007315406567788095)]

#### 5. One Sided Selection

In [43]:
from imblearn.under_sampling import OneSidedSelection

In [44]:
all_dataset['X_one_sided_selection'],all_dataset['y_one_sided_selection'] = OneSidedSelection().fit_resample(X_train,y_train)
training_testing_function(all_dataset['X_one_sided_selection'],all_dataset['y_one_sided_selection'],"One Sided Selection")

One Sided Selection Completed


In [45]:
all_results

[('SMOTE', 0.7545454545454545),
 ('ADASYN', 0.6259541984732825),
 ('Borderline SMOTE', 0.7843137254901962),
 ('SVM SMOTE', 0.7839195979899498),
 ('Cluster centroids', 0.3110307414104882),
 ('Near Miss', 0.007315406567788095),
 ('One Sided Selection', 0.7789473684210526)]

#### 6. Tomek Links

In [4]:
from imblearn.under_sampling import TomekLinks

In [None]:
all_dataset['X_tomek_links'],all_dataset['y_tomek_links'] = TomekLinks().fit_resample(X_train,y_train)


In [11]:
training_testing_function(all_dataset['X_tomek_links'],all_dataset['y_tomek_links'],"Tomek Links")

Tomek Links Completed


In [12]:
all_results

[('Tomek Links', 0.7914438502673796)]

### Oversampling data

#### 1. Random Oversampling

In [17]:
ros = RandomOverSampler(random_state=0)
all_dataset["X_train_random_oversampled"], all_dataset["y_train_random_oversampled"] = ros.fit_resample(X_train, y_train)
from collections import Counter
print(sorted(Counter(all_dataset["y_train_random_oversampled"]).items()))

[(0, 227451), (1, 227451)]


In [None]:
training_testing_function(all_dataset["X_train_random_oversampled"],all_dataset["y_train_random_oversampled"],"Random Oversampled")

#### 2. SMOTE

In [12]:
from imblearn.over_sampling import SMOTE, ADASYN
all_dataset["X_smote"], all_dataset["y_smote"] = SMOTE().fit_resample(X_train, y_train)
print(sorted(Counter(all_dataset["y_smote"]).items()))



[(0, 227451), (1, 227451)]


In [16]:
training_testing_function(all_dataset['X_smote'],all_dataset['y_smote'],"SMOTE")

SMOTE Completed


In [17]:
all_results

[('SMOTE', 0.7545454545454545)]

#### 3. ADASYN

In [20]:
all_dataset['X_adasyn'],all_dataset['y_adasyn'] = ADASYN().fit_resample(X_train, y_train)
print(sorted(Counter(all_dataset["y_adasyn"]).items()))

[(0, 227451), (1, 227448)]


In [22]:
training_testing_function(all_dataset['X_adasyn'],all_dataset['y_adasyn'],"ADASYN")

ADASYN Completed


In [23]:
all_results

[('SMOTE', 0.7545454545454545), ('ADASYN', 0.6259541984732825)]

#### 4. Variations of SMOTE

In [24]:
from imblearn.over_sampling import BorderlineSMOTE,SVMSMOTE,KMeansSMOTE

In [29]:
all_dataset["X_borderline_smote"],all_dataset["y_borderline_smote"] = BorderlineSMOTE().fit_resample(X_train,y_train)
all_dataset["X_svm_smote"],all_dataset["y_svm_smote"] = SVMSMOTE().fit_resample(X_train,y_train)

# X_kmeans_smote,y_kmeans_smote = KMeansSMOTE().fit_resample(X_train,y_train)


# all_dataset["X_kmeans_smote"],all_dataset["y_kmeans_smote"]= X_kmeans_smote,y_kmeans_smote

training_testing_function(all_dataset["X_borderline_smote"],all_dataset["y_borderline_smote"],"Borderline SMOTE")
training_testing_function(all_dataset["X_svm_smote"],all_dataset["y_svm_smote"],"SVM SMOTE")

# training_testing_function(all_dataset["X_kmeans_smote"],all_dataset["y_kmeans_smote"],"KMeans SMOTE")



Borderline SMOTE Completed
SVM SMOTE Completed


In [30]:
all_results

[('SMOTE', 0.7545454545454545),
 ('ADASYN', 0.6259541984732825),
 ('Borderline SMOTE', 0.7843137254901962),
 ('SVM SMOTE', 0.7839195979899498)]

### Combination of Undersampling and Oversampling

#### 1. SMOTETomek

In [13]:
from imblearn.combine import SMOTETomek

In [14]:
all_dataset['X_smote_tomek'],all_dataset['y_smote_tomek'] = SMOTETomek().fit_resample(X_train,y_train)
training_testing_function(all_dataset['X_smote_tomek'],all_dataset['y_smote_tomek'],"SMOTE Tomek")

SMOTE Tomek Completed


In [15]:
all_results

[('Tomek Links', 0.7914438502673796), ('SMOTE Tomek', 0.7248908296943231)]

## Working with different dataset

In [9]:
dataset2 = pd.read_csv("dataset2.csv")
dataset2.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [20]:
dataset2.shape

(6362620, 11)

In [10]:
dataset2.isnull().values.any()

False

In [11]:
dataset2.drop("isFlaggedFraud",axis=1,inplace=True)


In [12]:
y=dataset2['isFraud']
dataset2.drop('isFraud',axis=1,inplace=True)
X=dataset2

Since the destination account balances being zero is a strong indicator of fraud, we do not impute the account balance (before the transaction is made) with a statistic or from a distribution with a subsequent adjustment for the amount transacted. Doing so would mask this indicator of fraud and make fraudulent transactions appear genuine. Instead, below we replace the value of 0 with -1 which will be more useful to a suitable machine-learning (ML) algorithm detecting fraud.

In [14]:
X.loc[(X.oldbalanceDest == 0) & (X.newbalanceDest == 0) & (X.amount != 0), ['oldbalanceDest', 'newbalanceDest']] = - 1

The data also has several transactions with zero balances in the originating account both before and after a non-zero amount is transacted. In this case, the fraction of such transactions is much smaller in fraudulent (0.3%) compared to genuine transactions (47%). Once again, from similar reasoning as above, instead of imputing a numerical value we replace the value of 0 with a null value.

In [18]:
X.loc[(X.oldbalanceOrg == 0) & (X.newbalanceOrig == 0) & (X.amount != 0),['oldbalanceOrg', 'newbalanceOrig']] = np.nan

Motivated by the possibility of zero-balances serving to differentiate between fraudulent and genuine transactions, we create 2 new features (columns) recording errors in the originating and destination accounts for each transaction. These new features turn out to be important in obtaining the best performance from the ML algorithm that we will finally use.

In [19]:
X['errorbalanceOrg'] = X.newbalanceOrig + X.amount - X.oldbalanceOrg
X['errorbalanceDest'] = X.oldbalanceDest + X.amount - X.newbalanceDest