In [1]:
import numpy as np
import pandas as pd

from imblearn.under_sampling import RandomUnderSampler
from sklearn.svm import SVC 
from sklearn.naive_bayes import GaussianNB 
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler,StandardScaler
from imblearn.over_sampling import RandomOverSampler
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

In [2]:
data = pd.read_csv("creditcard.csv")

data.head()

data['Class'].value_counts()

len(data['Class'])

print("percentage of fraudulent data instances: {}".format(data['Class'].value_counts()[1] *100 /len(data['Class'])))
print("percentage of normal data instances: {}".format(data['Class'].value_counts()[0] *100 /len(data['Class'])))

# Rescaling the data



rs = RobustScaler()

data['scaled_amount'] = rs.fit_transform(data['Amount'].values.reshape(-1, 1))
data['scaled_time'] = rs.fit_transform(data['Time'].values.reshape(-1, 1))

data.drop(['Amount', 'Time'], axis = 1, inplace = True)


# from sklearn.linear_model import LogisticRegression

X = data.drop(['Class'], axis = 1)
y = data['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0, 
                                                    stratify = y)

print(X_train.shape, y_train.shape)
print(y_train.value_counts())



percentage of fraudulent data instances: 0.1727485630620034
percentage of normal data instances: 99.827251436938
(227845, 30) (227845,)
0    227451
1       394
Name: Class, dtype: int64


In [2]:
all_results=[]
all_dataset=dict()

In [3]:
 def training_testing_function(X,y,name):
        
        #removing highly correlated features
        corr = X.corr()
        columns = np.full((corr.shape[0],), True, dtype=bool)
        for i in range(corr.shape[0]):
            for j in range(i+1, corr.shape[0]):
                if corr.iloc[i,j] >= 0.8:
                    if columns[j]:
                        columns[j] = False
        selected_columns = X.columns[columns]
        X1 = X[selected_columns]
        
         # Train KNeighborsClassifier Model
        KNN_Classifier = KNeighborsClassifier(n_jobs=-1)
        
        # Train LogisticRegression Model
        LGR_Classifier = LogisticRegression(multi_class='auto', random_state=1,solver='lbfgs',max_iter=400)

        # Train Gaussian Naive Baye Model
        GNB_Classifier = GaussianNB()

        # Train Decision Tree Model
        DTC_Classifier = tree.DecisionTreeClassifier(criterion='entropy', random_state=0)
        # DTC_Classifier.fit(X_train_random_oversampled, y_train_random_oversampled.values.ravel());

        ## Train Ensemble Model (This method combines all the individual models above except RandomForest)
        combined_model = [('Naive Baye Classifier', GNB_Classifier), 
                         ('Decision Tree Classifier', DTC_Classifier), 
                         ('KNeighborsClassifier', KNN_Classifier), 
                         ('LogisticRegression', LGR_Classifier)
                        ]
        VC =  VotingClassifier(estimators = combined_model,voting = 'soft')
        VC.fit(X1, y.values.ravel());

        models = []
        models.append(('VotingClassifier', VC))

        for i, v in models:
            accuracy = metrics.accuracy_score(y_test.values.ravel(), v.predict(X_test[selected_columns]))
    #         confusion_matrix = metrics.confusion_matrix(y_test.values.ravel(), v.predict(X_test[final_features]))
    #         classification = metrics.classification_report(y_test.values.ravel(), v.predict(X_test[final_features]))
            f1 = metrics.f1_score(y_test.values.ravel(),v.predict(X_test[selected_columns]))
            auc_score = metrics.roc_auc_score(y_test.values.ravel(),v.predict(X_test[selected_columns]))
            
            all_results.append((name+str(" RCF"),accuracy,f1,auc_score)) #RCF = Removing correlated features
            print("{} Completed".format(name + str(" RCF")))
        
        
        
        
      

        rfc = RandomForestClassifier();

        # fit random forest classifier on the training set
        rfc.fit(X, y.values.ravel());
        # extract important features
        score = np.round(rfc.feature_importances_,3)

        importances = pd.DataFrame({'feature':X.columns,'importance':score})
        importances = importances.sort_values('importance',ascending=False).set_index('feature')

        min_features=min(len(importances),20)
        
        final_features=list(importances[:20].index)

        X2=X[final_features]

        # Train KNeighborsClassifier Model
        KNN_Classifier = KNeighborsClassifier(n_jobs=-1)
        # KNN_Classifier.fit(X_train_random_oversampled, y_train_random_oversampled.values.ravel()); 

        # Train LogisticRegression Model
        LGR_Classifier = LogisticRegression(multi_class='auto', random_state=1,solver='lbfgs',max_iter=400)
        # LGR_Classifier.fit(X_train_random_oversampled, y_train_random_oversampled.values.ravel());

        # Train Gaussian Naive Baye Model
        GNB_Classifier = GaussianNB()
        # GNB_Classifier.fit(X_train_random_oversampled, y_train_random_oversampled.values.ravel())

        # Train Decision Tree Model
        DTC_Classifier = tree.DecisionTreeClassifier(criterion='entropy', random_state=0)
        # DTC_Classifier.fit(X_train_random_oversampled, y_train_random_oversampled.values.ravel());

        ## Train Ensemble Model (This method combines all the individual models above except RandomForest)
        combined_model = [('Naive Baye Classifier', GNB_Classifier), 
                         ('Decision Tree Classifier', DTC_Classifier), 
                         ('KNeighborsClassifier', KNN_Classifier), 
                         ('LogisticRegression', LGR_Classifier)
                        ]
        VC =  VotingClassifier(estimators = combined_model,voting = 'soft')
        VC.fit(X2, y.values.ravel());

        models = []

        models.append(('VotingClassifier', VC))

        for i, v in models:
            accuracy = metrics.accuracy_score(y_test.values.ravel(), v.predict(X_test[final_features]))
    #         confusion_matrix = metrics.confusion_matrix(y_test.values.ravel(), v.predict(X_test[final_features]))
    #         classification = metrics.classification_report(y_test.values.ravel(), v.predict(X_test[final_features]))
            f1 = metrics.f1_score(y_test.values.ravel(),v.predict(X_test[final_features]))
            auc_score = metrics.roc_auc_score(y_test.values.ravel(),v.predict(X_test[final_features]))
        
            all_results.append((name+str(" Random Forest  Selection"),accuracy,f1,auc_score))
            print("{} Completed".format(name+str(" Random Forest Selection")))
    #         print('============================== {} Model Test Results =============================='.format(i))
    #         print()
    #         print ("Model Accuracy:" "\n", accuracy)
    #         print()
    #         print("Confusion matrix:" "\n", confusion_matrix)
    #         print()
    #         print("Classification report:" "\n", classification) 
    #         print()     
        
#         clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)

#         # Build step forward feature selection
#         sfs1 = sfs(clf,
#            k_features=5,
#            forward=True,
#            floating=False,
#            verbose=2,
#            scoring='f1',
#            cv=5)

#         # Perform SFFS
#         sfs1 = sfs1.fit(X, y.values.ravel())
        
#         feat_cols = np.array(sfs1.k_feature_idx_) - 1
        fs = SelectKBest(score_func=f_classif, k=5)
        # apply feature selection
        X3 = fs.fit_transform(X, y)
        
        X3_test = fs.transform(X_test)

                  
     # Train KNeighborsClassifier Model
        KNN_Classifier = KNeighborsClassifier(n_jobs=-1)
        # KNN_Classifier.fit(X_train_random_oversampled, y_train_random_oversampled.values.ravel()); 

        # Train LogisticRegression Model
        LGR_Classifier = LogisticRegression(multi_class='auto', random_state=1,solver='lbfgs',max_iter=400)
        # LGR_Classifier.fit(X_train_random_oversampled, y_train_random_oversampled.values.ravel());

        # Train Gaussian Naive Baye Model
        GNB_Classifier = GaussianNB()
        # GNB_Classifier.fit(X_train_random_oversampled, y_train_random_oversampled.values.ravel())

        # Train Decision Tree Model
        DTC_Classifier = tree.DecisionTreeClassifier(criterion='entropy', random_state=0)
        # DTC_Classifier.fit(X_train_random_oversampled, y_train_random_oversampled.values.ravel());

        ## Train Ensemble Model (This method combines all the individual models above except RandomForest)
        combined_model = [('Naive Baye Classifier', GNB_Classifier), 
                         ('Decision Tree Classifier', DTC_Classifier), 
                         ('KNeighborsClassifier', KNN_Classifier), 
                         ('LogisticRegression', LGR_Classifier)
                        ]
        VC =  VotingClassifier(estimators = combined_model,voting = 'soft')
        VC.fit(X3, y.values.ravel());

        models = []

        models.append(('VotingClassifier', VC))

        for i, v in models:
            accuracy = metrics.accuracy_score(y_test.values.ravel(), v.predict(X3_test))
    #         confusion_matrix = metrics.confusion_matrix(y_test.values.ravel(), v.predict(X_test[final_features]))
    #         classification = metrics.classification_report(y_test.values.ravel(), v.predict(X_test[final_features]))
            f1 = metrics.f1_score(y_test.values.ravel(),v.predict(X3_test))
            auc_score = metrics.roc_auc_score(y_test.values.ravel(),v.predict(X3_test))
        
            all_results.append((name+str(" Forward feature selection"),accuracy,f1,auc_score))
            print("{} Completed".format(name+str(" Forward feature selection")))
    #         print('============================== {} Model Test Results =============================='.format(i))
    #         print()
    #         print ("Model Accuracy:" "\n", accuracy)
    #         print()
    #         print("Confusion matrix:" "\n", confusion_matrix)
    #         print()
    #         print("Classification report:" "\n", classification) 
    #         print()     
            






#### 1. Random Undersample

In [34]:
undersample = RandomUnderSampler(sampling_strategy='majority')
# fit and apply the transform
all_dataset["X_train_undersample"], all_dataset["y_train_undersample"] = undersample.fit_resample(X_train, y_train)
# summarize class distribution
print(Counter(all_dataset["y_train_undersample"]))

Counter({0: 394, 1: 394})


In [35]:
training_testing_function(all_dataset["X_train_undersample"],all_dataset["y_train_undersample"],"Random Undersample")

Random Undersample RCF Completed
Random Undersample Random Forest Selection Completed


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   40.1s finished

[2021-03-23 12:36:49] Features: 1/5 -- score: 0.871118295201924[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  29 out of  29 | elapsed:   37.4s finished

[2021-03-23 12:37:26] Features: 2/5 -- score: 0.9229159825883422[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  28 out of  28 | elapsed:   36.6s finished

[2021-03-23 12:38:03] Features: 3/5 -- score: 0.9319451559996944[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 o

Random Undersample Forward feature selection Completed


In [47]:
all_results

[('Random Undersample RCF',
  0.9735086548927355,
  0.1001788908765653,
  0.9154260290216255),
 ('Random Undersample Random Forest  Selection',
  0.9836031038236017,
  0.1539855072463768,
  0.9255751998323245),
 ('Random Undersample Forward feature selection',
  0.9614128717390541,
  0.06785411365564037,
  0.8889947228187842),
 ('IHT RCF', 0.9990344440153085, 0.7417840375586854, 0.9027446797514729),
 ('IHT Random Forest  Selection',
  0.9989291106351603,
  0.7214611872146119,
  0.9026919222950857),
 ('IHT Forward feature selection',
  0.9988588883817282,
  0.5962732919254659,
  0.7447660655427055),
 ('IHT RCF', 0.9990344440153085, 0.7417840375586854, 0.9027446797514729),
 ('IHT Random Forest  Selection',
  0.9990168884519505,
  0.7383177570093458,
  0.9027358868420752),
 ('IHT Forward feature selection',
  0.9987711105649381,
  0.5454545454545454,
  0.7141626135541442)]

#### 2. Instance Hardness Threshold

In [43]:
from imblearn.under_sampling import InstanceHardnessThreshold

In [62]:
iht = InstanceHardnessThreshold(sampling_strategy='majority', random_state=42)
all_dataset["X_train_res"], all_dataset["y_train_res"] = iht.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(all_dataset["y_train_res"]))

Resampled dataset shape Counter({0: 222288, 1: 394})


In [63]:
training_testing_function(all_dataset["X_train_res"],all_dataset["y_train_res"],"IHT")

IHT RCF Completed
IHT Random Forest Selection Completed
IHT Forward feature selection Completed


In [64]:
all_results

[('IHT RCF', 0.9990344440153085, 0.7417840375586854, 0.9027446797514729),
 ('IHT Random Forest  Selection',
  0.9990344440153085,
  0.7417840375586854,
  0.9027446797514729),
 ('IHT Forward feature selection',
  0.9991046662687406,
  0.7582938388625592,
  0.907873099295993)]

#### 3. Cluster Centroid

In [5]:
from imblearn.under_sampling import ClusterCentroids

In [6]:
all_dataset["X_cluster_centroids"],all_dataset["y_cluster_centroids"]  = ClusterCentroids().fit_resample(X_train,y_train)

In [7]:
training_testing_function(all_dataset["X_cluster_centroids"],all_dataset["y_cluster_centroids"],"Cluster centroids")

Cluster centroids RCF Completed
Cluster centroids Random Forest Selection Completed
Cluster centroids Forward feature selection Completed


In [8]:
all_results

[('Cluster centroids RCF',
  0.9212984094659598,
  0.036949516648764766,
  0.8994624122862425),
 ('Cluster centroids Random Forest  Selection',
  0.9923106632491837,
  0.2819672131147541,
  0.9350297308005926),
 ('Cluster centroids Forward feature selection',
  0.9918015519118009,
  0.26917057902973396,
  0.9347747364280545)]

#### 4. Near Miss

In [9]:
from imblearn.under_sampling import NearMiss

In [10]:
all_dataset['X_near_miss'],all_dataset['y_near_miss'] = NearMiss().fit_resample(X_train,y_train)
training_testing_function(all_dataset['X_near_miss'],all_dataset['y_near_miss'],"Near Miss")

Near Miss RCF Completed
Near Miss Random Forest Selection Completed
Near Miss Forward feature selection Completed


In [11]:
all_results

[('Cluster centroids RCF',
  0.9212984094659598,
  0.036949516648764766,
  0.8994624122862425),
 ('Cluster centroids Random Forest  Selection',
  0.9923106632491837,
  0.2819672131147541,
  0.9350297308005926),
 ('Cluster centroids Forward feature selection',
  0.9918015519118009,
  0.26917057902973396,
  0.9347747364280545),
 ('Near Miss RCF',
  0.5980653769179453,
  0.007456539645380847,
  0.7375673644528155),
 ('Near Miss Random Forest  Selection',
  0.6635300726800323,
  0.008996897621509825,
  0.7754493715043699),
 ('Near Miss Forward feature selection',
  0.6825778589234929,
  0.009857072449482503,
  0.8002694219218356)]

#### 5. One Sided Selection

In [12]:
from imblearn.under_sampling import OneSidedSelection

In [13]:
all_dataset['X_one_sided_selection'],all_dataset['y_one_sided_selection'] = OneSidedSelection().fit_resample(X_train,y_train)
training_testing_function(all_dataset['X_one_sided_selection'],all_dataset['y_one_sided_selection'],"One Sided Selection")

One Sided Selection RCF Completed
One Sided Selection Random Forest Selection Completed
One Sided Selection Forward feature selection Completed


In [14]:
all_results

[('Cluster centroids RCF',
  0.9212984094659598,
  0.036949516648764766,
  0.8994624122862425),
 ('Cluster centroids Random Forest  Selection',
  0.9923106632491837,
  0.2819672131147541,
  0.9350297308005926),
 ('Cluster centroids Forward feature selection',
  0.9918015519118009,
  0.26917057902973396,
  0.9347747364280545),
 ('Near Miss RCF',
  0.5980653769179453,
  0.007456539645380847,
  0.7375673644528155),
 ('Near Miss Random Forest  Selection',
  0.6635300726800323,
  0.008996897621509825,
  0.7754493715043699),
 ('Near Miss Forward feature selection',
  0.6825778589234929,
  0.009857072449482503,
  0.8002694219218356),
 ('One Sided Selection RCF',
  0.9993679997191109,
  0.8085106382978723,
  0.8876320013092462),
 ('One Sided Selection Random Forest  Selection',
  0.9993153330290369,
  0.7914438502673796,
  0.8774191267671952),
 ('One Sided Selection Forward feature selection',
  0.9993679997191109,
  0.8064516129032258,
  0.8825387534023176)]

#### 6. Tomek Links

In [15]:
from imblearn.under_sampling import TomekLinks

In [16]:
all_dataset['X_tomek_links'],all_dataset['y_tomek_links'] = TomekLinks().fit_resample(X_train,y_train)


In [17]:
training_testing_function(all_dataset['X_tomek_links'],all_dataset['y_tomek_links'],"Tomek Links")

Tomek Links RCF Completed
Tomek Links Random Forest Selection Completed
Tomek Links Forward feature selection Completed


In [18]:
all_results

[('Cluster centroids RCF',
  0.9212984094659598,
  0.036949516648764766,
  0.8994624122862425),
 ('Cluster centroids Random Forest  Selection',
  0.9923106632491837,
  0.2819672131147541,
  0.9350297308005926),
 ('Cluster centroids Forward feature selection',
  0.9918015519118009,
  0.26917057902973396,
  0.9347747364280545),
 ('Near Miss RCF',
  0.5980653769179453,
  0.007456539645380847,
  0.7375673644528155),
 ('Near Miss Random Forest  Selection',
  0.6635300726800323,
  0.008996897621509825,
  0.7754493715043699),
 ('Near Miss Forward feature selection',
  0.6825778589234929,
  0.009857072449482503,
  0.8002694219218356),
 ('One Sided Selection RCF',
  0.9993679997191109,
  0.8085106382978723,
  0.8876320013092462),
 ('One Sided Selection Random Forest  Selection',
  0.9993153330290369,
  0.7914438502673796,
  0.8774191267671952),
 ('One Sided Selection Forward feature selection',
  0.9993679997191109,
  0.8064516129032258,
  0.8825387534023176),
 ('Tomek Links RCF',
  0.999367999

### Oversampling data

#### 1. Random Oversampling

In [19]:
ros = RandomOverSampler(random_state=0)
all_dataset["X_train_random_oversampled"], all_dataset["y_train_random_oversampled"] = ros.fit_resample(X_train, y_train)
from collections import Counter
print(sorted(Counter(all_dataset["y_train_random_oversampled"]).items()))

[(0, 227451), (1, 227451)]


In [20]:
training_testing_function(all_dataset["X_train_random_oversampled"],all_dataset["y_train_random_oversampled"],"Random Oversampled")

Random Oversampled RCF Completed
Random Oversampled Random Forest Selection Completed
Random Oversampled Forward feature selection Completed


In [21]:
all_results

[('Cluster centroids RCF',
  0.9212984094659598,
  0.036949516648764766,
  0.8994624122862425),
 ('Cluster centroids Random Forest  Selection',
  0.9923106632491837,
  0.2819672131147541,
  0.9350297308005926),
 ('Cluster centroids Forward feature selection',
  0.9918015519118009,
  0.26917057902973396,
  0.9347747364280545),
 ('Near Miss RCF',
  0.5980653769179453,
  0.007456539645380847,
  0.7375673644528155),
 ('Near Miss Random Forest  Selection',
  0.6635300726800323,
  0.008996897621509825,
  0.7754493715043699),
 ('Near Miss Forward feature selection',
  0.6825778589234929,
  0.009857072449482503,
  0.8002694219218356),
 ('One Sided Selection RCF',
  0.9993679997191109,
  0.8085106382978723,
  0.8876320013092462),
 ('One Sided Selection Random Forest  Selection',
  0.9993153330290369,
  0.7914438502673796,
  0.8774191267671952),
 ('One Sided Selection Forward feature selection',
  0.9993679997191109,
  0.8064516129032258,
  0.8825387534023176),
 ('Tomek Links RCF',
  0.999367999

#### 2. SMOTE

In [22]:
from imblearn.over_sampling import SMOTE, ADASYN
all_dataset["X_smote"], all_dataset["y_smote"] = SMOTE().fit_resample(X_train, y_train)
print(sorted(Counter(all_dataset["y_smote"]).items()))



[(0, 227451), (1, 227451)]


In [23]:
training_testing_function(all_dataset['X_smote'],all_dataset['y_smote'],"SMOTE")

SMOTE RCF Completed
SMOTE Random Forest Selection Completed
SMOTE Forward feature selection Completed


In [24]:
all_results

[('Cluster centroids RCF',
  0.9212984094659598,
  0.036949516648764766,
  0.8994624122862425),
 ('Cluster centroids Random Forest  Selection',
  0.9923106632491837,
  0.2819672131147541,
  0.9350297308005926),
 ('Cluster centroids Forward feature selection',
  0.9918015519118009,
  0.26917057902973396,
  0.9347747364280545),
 ('Near Miss RCF',
  0.5980653769179453,
  0.007456539645380847,
  0.7375673644528155),
 ('Near Miss Random Forest  Selection',
  0.6635300726800323,
  0.008996897621509825,
  0.7754493715043699),
 ('Near Miss Forward feature selection',
  0.6825778589234929,
  0.009857072449482503,
  0.8002694219218356),
 ('One Sided Selection RCF',
  0.9993679997191109,
  0.8085106382978723,
  0.8876320013092462),
 ('One Sided Selection Random Forest  Selection',
  0.9993153330290369,
  0.7914438502673796,
  0.8774191267671952),
 ('One Sided Selection Forward feature selection',
  0.9993679997191109,
  0.8064516129032258,
  0.8825387534023176),
 ('Tomek Links RCF',
  0.999367999

#### 3. ADASYN

In [25]:
all_dataset['X_adasyn'],all_dataset['y_adasyn'] = ADASYN().fit_resample(X_train, y_train)
print(sorted(Counter(all_dataset["y_adasyn"]).items()))

[(0, 227451), (1, 227448)]


In [26]:
training_testing_function(all_dataset['X_adasyn'],all_dataset['y_adasyn'],"ADASYN")

ADASYN RCF Completed
ADASYN Random Forest Selection Completed
ADASYN Forward feature selection Completed


In [27]:
all_results

[('Cluster centroids RCF',
  0.9212984094659598,
  0.036949516648764766,
  0.8994624122862425),
 ('Cluster centroids Random Forest  Selection',
  0.9923106632491837,
  0.2819672131147541,
  0.9350297308005926),
 ('Cluster centroids Forward feature selection',
  0.9918015519118009,
  0.26917057902973396,
  0.9347747364280545),
 ('Near Miss RCF',
  0.5980653769179453,
  0.007456539645380847,
  0.7375673644528155),
 ('Near Miss Random Forest  Selection',
  0.6635300726800323,
  0.008996897621509825,
  0.7754493715043699),
 ('Near Miss Forward feature selection',
  0.6825778589234929,
  0.009857072449482503,
  0.8002694219218356),
 ('One Sided Selection RCF',
  0.9993679997191109,
  0.8085106382978723,
  0.8876320013092462),
 ('One Sided Selection Random Forest  Selection',
  0.9993153330290369,
  0.7914438502673796,
  0.8774191267671952),
 ('One Sided Selection Forward feature selection',
  0.9993679997191109,
  0.8064516129032258,
  0.8825387534023176),
 ('Tomek Links RCF',
  0.999367999

#### 4. Variations of SMOTE

In [28]:
from imblearn.over_sampling import BorderlineSMOTE,SVMSMOTE,KMeansSMOTE

In [29]:
all_dataset["X_borderline_smote"],all_dataset["y_borderline_smote"] = BorderlineSMOTE().fit_resample(X_train,y_train)
all_dataset["X_svm_smote"],all_dataset["y_svm_smote"] = SVMSMOTE().fit_resample(X_train,y_train)

# X_kmeans_smote,y_kmeans_smote = KMeansSMOTE().fit_resample(X_train,y_train)


# all_dataset["X_kmeans_smote"],all_dataset["y_kmeans_smote"]= X_kmeans_smote,y_kmeans_smote

training_testing_function(all_dataset["X_borderline_smote"],all_dataset["y_borderline_smote"],"Borderline SMOTE")
training_testing_function(all_dataset["X_svm_smote"],all_dataset["y_svm_smote"],"SVM SMOTE")

# training_testing_function(all_dataset["X_kmeans_smote"],all_dataset["y_kmeans_smote"],"KMeans SMOTE")



Borderline SMOTE RCF Completed
Borderline SMOTE Random Forest Selection Completed
Borderline SMOTE Forward feature selection Completed
SVM SMOTE RCF Completed
SVM SMOTE Random Forest Selection Completed
SVM SMOTE Forward feature selection Completed


In [30]:
all_results

[('Cluster centroids RCF',
  0.9212984094659598,
  0.036949516648764766,
  0.8994624122862425),
 ('Cluster centroids Random Forest  Selection',
  0.9923106632491837,
  0.2819672131147541,
  0.9350297308005926),
 ('Cluster centroids Forward feature selection',
  0.9918015519118009,
  0.26917057902973396,
  0.9347747364280545),
 ('Near Miss RCF',
  0.5980653769179453,
  0.007456539645380847,
  0.7375673644528155),
 ('Near Miss Random Forest  Selection',
  0.6635300726800323,
  0.008996897621509825,
  0.7754493715043699),
 ('Near Miss Forward feature selection',
  0.6825778589234929,
  0.009857072449482503,
  0.8002694219218356),
 ('One Sided Selection RCF',
  0.9993679997191109,
  0.8085106382978723,
  0.8876320013092462),
 ('One Sided Selection Random Forest  Selection',
  0.9993153330290369,
  0.7914438502673796,
  0.8774191267671952),
 ('One Sided Selection Forward feature selection',
  0.9993679997191109,
  0.8064516129032258,
  0.8825387534023176),
 ('Tomek Links RCF',
  0.999367999

### Combination of Undersampling and Oversampling

#### 1. SMOTETomek

In [5]:
from imblearn.combine import SMOTETomek

In [6]:
all_dataset['X_smote_tomek'],all_dataset['y_smote_tomek'] = SMOTETomek().fit_resample(X_train,y_train)
training_testing_function(all_dataset['X_smote_tomek'],all_dataset['y_smote_tomek'],"SMOTE Tomek")

SMOTE Tomek RCF Completed
SMOTE Tomek Random Forest Selection Completed
SMOTE Tomek Forward feature selection Completed


In [7]:
all_results

[('SMOTE Tomek RCF',
  0.9976651100733822,
  0.5128205128205129,
  0.8562196016560818),
 ('SMOTE Tomek Random Forest  Selection',
  0.9991046662687406,
  0.767123287671233,
  0.9282460909237077),
 ('SMOTE Tomek Forward feature selection',
  0.9966468873986166,
  0.4558404558404559,
  0.9066420919802924)]

## Working with different dataset

In [6]:
dataset2 = pd.read_csv("dataset2.csv")
dataset2.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [7]:
dataset2.shape

(6362620, 11)

In [8]:
dataset2.isnull().values.any()

False

In [9]:
dataset2.drop("isFlaggedFraud",axis=1,inplace=True)


In [10]:
all_zero=list(dataset2.loc[dataset2["isFraud"]==0].index)

In [11]:
sample_zero = np.random.choice(all_zero,200000,replace=False)

In [12]:
all_one=list(dataset2.loc[dataset2["isFraud"]==1].index)

In [13]:
selected_sample = np.concatenate([sample_zero,all_one])

In [14]:
len(selected_sample)

208213

In [15]:
new_dataset2 = dataset2.loc[selected_sample]

In [16]:
new_dataset2.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
410191,18,PAYMENT,22850.78,C710129801,12573.0,0.0,M1501898196,0.0,0.0,0
3194556,249,PAYMENT,11295.63,C483785706,22971.0,11675.37,M775004462,0.0,0.0,0
6142488,545,CASH_IN,301213.13,C1969014000,10921773.42,11222986.55,C569154124,10232881.64,9931668.52,0
3563767,260,CASH_OUT,163724.2,C1782582199,22244.0,0.0,C699092914,1688281.41,1852005.6,0
692185,36,PAYMENT,15829.25,C1586700803,490.0,0.0,M1373803299,0.0,0.0,0


In [17]:
y=new_dataset2['isFraud']
new_dataset2.drop('isFraud',axis=1,inplace=True)
X=new_dataset2

Since the destination account balances being zero is a strong indicator of fraud, we do not impute the account balance (before the transaction is made) with a statistic or from a distribution with a subsequent adjustment for the amount transacted. Doing so would mask this indicator of fraud and make fraudulent transactions appear genuine. Instead, below we replace the value of 0 with -1 which will be more useful to a suitable machine-learning (ML) algorithm detecting fraud.

In [18]:
X.loc[(X.oldbalanceDest == 0) & (X.newbalanceDest == 0) & (X.amount != 0), ['oldbalanceDest', 'newbalanceDest']] = - 1

The data also has several transactions with zero balances in the originating account both before and after a non-zero amount is transacted. In this case, the fraction of such transactions is much smaller in fraudulent (0.3%) compared to genuine transactions (47%). Once again, from similar reasoning as above, instead of imputing a numerical value we replace the value of 0 with a null value.

In [19]:
# X.loc[(X.oldbalanceOrg == 0) & (X.newbalanceOrig == 0) & (X.amount != 0),['oldbalanceOrg', 'newbalanceOrig']] = np.nan

Motivated by the possibility of zero-balances serving to differentiate between fraudulent and genuine transactions, we create 2 new features (columns) recording errors in the originating and destination accounts for each transaction. These new features turn out to be important in obtaining the best performance from the ML algorithm that we will finally use.

In [20]:
X['errorbalanceOrg'] = X.newbalanceOrig + X.amount - X.oldbalanceOrg
X['errorbalanceDest'] = X.oldbalanceDest + X.amount - X.newbalanceDest

In [21]:
# log transformed amount
X['ln_amount'] = np.log1p(X['amount'])

In [22]:
# correcting few negative errors
X['errorbalanceOrg'] = X['errorbalanceOrg'].apply(lambda x: 0 if x < 0 else x)

# log transformed origin account balance error
X['ln_errorBalanceOrig'] = np.log1p(X['errorbalanceOrg'])

In [23]:
X['errorbalanceDest'] = X['errorbalanceDest'].apply(lambda x: 0 if x < 0 else x)

# log transformed origin account balance error
X['ln_errorBalanceDest'] = np.log1p(X['errorbalanceDest'])

In [24]:
X["type"].value_counts()

CASH_OUT    74504
PAYMENT     67545
CASH_IN     44130
TRANSFER    20684
DEBIT        1350
Name: type, dtype: int64

In [25]:
X["type_transfer"]=X["type"].replace({'CASH_OUT':0,'PAYMENT':1,'CASH_IN':2,'TRANSFER':3,'DEBIT':4})

In [26]:
X.drop("type",axis=1,inplace=True)

In [27]:
X.head()

Unnamed: 0,step,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,errorbalanceOrg,errorbalanceDest,ln_amount,ln_errorBalanceOrig,ln_errorBalanceDest,type_transfer
410191,18,22850.78,C710129801,12573.0,0.0,M1501898196,-1.0,-1.0,10277.78,22850.78,10.036784,9.237837,10.036784,1
3194556,249,11295.63,C483785706,22971.0,11675.37,M775004462,-1.0,-1.0,0.0,11295.63,9.33226,0.0,9.33226,1
6142488,545,301213.13,C1969014000,10921773.42,11222986.55,C569154124,10232881.64,9931668.52,602426.26,602426.25,12.615577,13.308722,13.308722,2
3563767,260,163724.2,C1782582199,22244.0,0.0,C699092914,1688281.41,1852005.6,141480.2,0.01,12.005945,11.859922,0.00995,0
692185,36,15829.25,C1586700803,490.0,0.0,M1373803299,-1.0,-1.0,15339.25,15829.25,9.669678,9.638235,9.669678,1


In [28]:
X.drop(["amount","errorbalanceOrg","errorbalanceDest"],axis=1,inplace=True)


In [29]:
X.head()

Unnamed: 0,step,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,ln_amount,ln_errorBalanceOrig,ln_errorBalanceDest,type_transfer
410191,18,C710129801,12573.0,0.0,M1501898196,-1.0,-1.0,10.036784,9.237837,10.036784,1
3194556,249,C483785706,22971.0,11675.37,M775004462,-1.0,-1.0,9.33226,0.0,9.33226,1
6142488,545,C1969014000,10921773.42,11222986.55,C569154124,10232881.64,9931668.52,12.615577,13.308722,13.308722,2
3563767,260,C1782582199,22244.0,0.0,C699092914,1688281.41,1852005.6,12.005945,11.859922,0.00995,0
692185,36,C1586700803,490.0,0.0,M1373803299,-1.0,-1.0,9.669678,9.638235,9.669678,1


In [30]:
X["type_name"]=X.step

In [31]:
X.loc[(X.nameOrig.str.contains("C")) & (X.nameDest.str.contains("C")),"type_name"]=0
X.loc[(X.nameOrig.str.contains("C")) & (X.nameDest.str.contains("M")),"type_name"]=1
X.loc[(X.nameOrig.str.contains("M")) & (X.nameDest.str.contains("C")),"type_name"]=2
X.loc[(X.nameOrig.str.contains("M")) & (X.nameDest.str.contains("M")),"type_name"]=3

In [32]:
X.tail()

Unnamed: 0,step,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,ln_amount,ln_errorBalanceOrig,ln_errorBalanceDest,type_transfer,type_name
6362615,743,C786484425,339682.13,0.0,C776919290,0.0,339682.13,12.735768,0.0,0.0,0,0
6362616,743,C1529008245,6311409.28,0.0,C1881841831,-1.0,-1.0,15.65787,0.0,15.65787,3,0
6362617,743,C1162922333,6311409.28,0.0,C1365125890,68488.84,6379898.11,15.65787,0.0,0.009950331,0,0
6362618,743,C1685995037,850002.52,0.0,C2080388513,-1.0,-1.0,13.652996,0.0,13.653,3,0
6362619,743,C1280323807,850002.52,0.0,C873221189,6510099.11,7360101.63,13.652996,0.0,9.313226e-10,0,0


In [33]:
X.drop(["nameOrig","nameDest"],axis=1,inplace=True)

In [34]:
X.head()

Unnamed: 0,step,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,ln_amount,ln_errorBalanceOrig,ln_errorBalanceDest,type_transfer,type_name
410191,18,12573.0,0.0,-1.0,-1.0,10.036784,9.237837,10.036784,1,1
3194556,249,22971.0,11675.37,-1.0,-1.0,9.33226,0.0,9.33226,1,1
6142488,545,10921773.42,11222986.55,10232881.64,9931668.52,12.615577,13.308722,13.308722,2,0
3563767,260,22244.0,0.0,1688281.41,1852005.6,12.005945,11.859922,0.00995,0,0
692185,36,490.0,0.0,-1.0,-1.0,9.669678,9.638235,9.669678,1,1


In [35]:
#first proceed without step column
step_col = X.step
X.drop("step",axis=1,inplace=True)


In [36]:
X.head()

Unnamed: 0,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,ln_amount,ln_errorBalanceOrig,ln_errorBalanceDest,type_transfer,type_name
410191,12573.0,0.0,-1.0,-1.0,10.036784,9.237837,10.036784,1,1
3194556,22971.0,11675.37,-1.0,-1.0,9.33226,0.0,9.33226,1,1
6142488,10921773.42,11222986.55,10232881.64,9931668.52,12.615577,13.308722,13.308722,2,0
3563767,22244.0,0.0,1688281.41,1852005.6,12.005945,11.859922,0.00995,0,0
692185,490.0,0.0,-1.0,-1.0,9.669678,9.638235,9.669678,1,1


In [37]:
# dividing into training and testing

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0, stratify = y)

In [39]:
X.isnull().values.any()

False

In [40]:
# ss = StandardScaler()
# ss.fit(X_train.loc[:,"oldbalanceOrg"].values.reshape(-1,1))
# X_train.loc[:,"scaled_oldbalanceOrg"]= ss.transform(X_train.loc[:,"oldbalanceOrg"].values.reshape(-1,1))
# X_test.loc[:,"scaled_oldbalanceOrg"] = ss.transform(X_test.loc[:,"oldbalanceOrg"].values.reshape(-1,1))

# ss = StandardScaler()
# ss.fit(X_train.loc[:,"newbalanceOrig"].values.reshape(-1,1))
# X_train.loc[:,"scaled_newbalanceOrig"]= ss.transform(X_train.loc[:,"newbalanceOrig"].values.reshape(-1,1))
# X_test.loc[:,"scaled_newbalanceOrig"] = ss.transform(X_test.loc[:,"newbalanceOrig"].values.reshape(-1,1))

# ss = StandardScaler()
# ss.fit(X_train.loc[:,"oldbalanceDest"].values.reshape(-1,1))
# X_train.loc[:,"scaled_oldbalanceDest"]= ss.transform(X_train.loc[:,"oldbalanceDest"].values.reshape(-1,1))
# X_test.loc[:,"scaled_oldbalanceDest"] = ss.transform(X_test.loc[:,"oldbalanceDest"].values.reshape(-1,1))

# ss = StandardScaler()
# ss.fit(X_train.loc[:,"newbalanceDest"].values.reshape(-1,1))
# X_train.loc[:,"scaled_newbalanceDest"]= ss.transform(X_train.loc[:,"newbalanceDest"].values.reshape(-1,1))
# X_test.loc[:,"scaled_newbalanceDest"] = ss.transform(X_test.loc[:,"newbalanceDest"].values.reshape(-1,1))


cols=["oldbalanceOrg","newbalanceOrig","oldbalanceDest","newbalanceDest","ln_amount","ln_errorBalanceOrig","ln_errorBalanceDest"]
for c in cols:
    ss = StandardScaler()
    ss.fit(X_train.loc[:,str(c)].values.reshape(-1,1))
    transformed1 = ss.transform(X_train.loc[:,str(c)].values.reshape(-1,1))
    print(transformed1.shape)
    X_train["scaled_"+str(c)] = transformed1
    X_train.drop(c,axis=1,inplace=True)
    transformed2 = ss.transform(X_test.loc[:,str(c)].values.reshape(-1,1))
    X_test["scaled_"+str(c)] = transformed2
    X_test.drop(c,axis=1,inplace=True)
    


















(166570, 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,co

(166570, 1)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


(166570, 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,co

(166570, 1)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


(166570, 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,co

(166570, 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(166570, 1)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [41]:
X_train.head()

Unnamed: 0,type_transfer,type_name,scaled_oldbalanceOrg,scaled_newbalanceOrig,scaled_oldbalanceDest,scaled_newbalanceDest,scaled_ln_amount,scaled_ln_errorBalanceOrig,scaled_ln_errorBalanceDest
240937,3,0,-0.296385,-0.286881,0.204165,0.430443,1.572934,1.0275,-1.098093
257089,2,0,-0.291391,-0.280304,-0.321572,-0.334316,-1.35468,0.097777,0.468263
4040151,0,0,-0.292276,-0.286881,-0.321572,-0.240526,0.845047,0.753697,-1.098093
1276758,0,0,-0.273548,-0.286881,0.829287,0.803669,0.928903,0.746227,-1.098093
4926349,3,0,-0.296385,-0.286881,2.281562,2.206544,1.266347,0.915958,-1.098093


In [42]:
X_train.to_csv("X_train_dataset2_preprocessed.csv",index=False)
X_test.to_csv("X_test_dataset2_preprocessed.csv",index=False)

In [43]:
y_train.to_csv("y_train_dataset2_preprocessed.csv",index=False)
y_test.to_csv("y_test_dataset2_preprocessed.csv",index=False)

  """Entry point for launching an IPython kernel.
  


In [52]:
X_train = pd.read_csv("X_train_dataset2_preprocessed.csv")
X_test = pd.read_csv("X_test_dataset2_preprocessed.csv")

y_train = pd.read_csv("y_train_dataset2_preprocessed.csv",header=None)
y_test = pd.read_csv("y_test_dataset2_preprocessed.csv",header=None)


#### 1. Random Undersample

In [94]:
undersample = RandomUnderSampler(sampling_strategy='majority')
# fit and apply the transform
all_dataset["X_train_undersample_dataset2"], all_dataset["y_train_undersample_dataset2"] = undersample.fit_resample(X_train, y_train)
# summarize class distribution
print(Counter(all_dataset["y_train_undersample_dataset2"]))

Counter({0: 6570, 1: 6570})


In [95]:
training_testing_function(all_dataset["X_train_undersample_dataset2"],all_dataset["y_train_undersample_dataset2"],"Random Undersample dataset 2")

Random Undersample dataset 2 RCF Completed
Random Undersample dataset 2 Random Forest Selection Completed
Random Undersample dataset 2 Forward feature selection Completed


In [96]:
all_results

[('SMOTE Tomek RCF',
  0.9976651100733822,
  0.5128205128205129,
  0.8562196016560818),
 ('SMOTE Tomek Random Forest  Selection',
  0.9991046662687406,
  0.767123287671233,
  0.9282460909237077),
 ('SMOTE Tomek Forward feature selection',
  0.9966468873986166,
  0.4558404558404559,
  0.9066420919802924),
 ('Random Undersample dataset 2 RCF',
  0.9634032130249982,
  0.6817042606516291,
  0.9777399650030433),
 ('Random Undersample dataset 2 Random Forest  Selection',
  0.9647719904905987,
  0.690048594971477,
  0.9787442863664029),
 ('Random Undersample dataset 2 RCF',
  0.9638354585404509,
  0.6845412651864264,
  0.9785486077297626),
 ('Random Undersample dataset 2 Random Forest  Selection',
  0.9659006315587254,
  0.697098976109215,
  0.9796236077297628),
 ('Random Undersample dataset 2 Forward feature selection',
  0.974233364551065,
  0.7525939589578049,
  0.9833774650030432)]

#### 2. Instance Hardness Threshold

In [97]:
from imblearn.under_sampling import InstanceHardnessThreshold

In [98]:
iht = InstanceHardnessThreshold(sampling_strategy='majority', random_state=42)
all_dataset["X_train_res_dataset2"], all_dataset["y_train_res_dataset2"] = iht.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(all_dataset["y_train_res_dataset2"]))

Resampled dataset shape Counter({0: 158482, 1: 6570})


In [99]:
training_testing_function(all_dataset["X_train_res_dataset2"],all_dataset["y_train_res_dataset2"],"IHT dataset 2")

IHT dataset 2 RCF Completed
IHT dataset 2 Random Forest Selection Completed
IHT dataset 2 Forward feature selection Completed


In [100]:
all_results

[('SMOTE Tomek RCF',
  0.9976651100733822,
  0.5128205128205129,
  0.8562196016560818),
 ('SMOTE Tomek Random Forest  Selection',
  0.9991046662687406,
  0.767123287671233,
  0.9282460909237077),
 ('SMOTE Tomek Forward feature selection',
  0.9966468873986166,
  0.4558404558404559,
  0.9066420919802924),
 ('Random Undersample dataset 2 RCF',
  0.9634032130249982,
  0.6817042606516291,
  0.9777399650030433),
 ('Random Undersample dataset 2 Random Forest  Selection',
  0.9647719904905987,
  0.690048594971477,
  0.9787442863664029),
 ('Random Undersample dataset 2 RCF',
  0.9638354585404509,
  0.6845412651864264,
  0.9785486077297626),
 ('Random Undersample dataset 2 Random Forest  Selection',
  0.9659006315587254,
  0.697098976109215,
  0.9796236077297628),
 ('Random Undersample dataset 2 Forward feature selection',
  0.974233364551065,
  0.7525939589578049,
  0.9833774650030432),
 ('IHT dataset 2 RCF',
  0.9962058449199145,
  0.9535840188014102,
  0.992188572732806),
 ('IHT dataset 2 Ra

#### 3. Cluster Centroid

In [58]:
from imblearn.under_sampling import ClusterCentroids

In [59]:
all_dataset["X_cluster_centroids_dataset2"],all_dataset["y_cluster_centroids_dataset2"]  = ClusterCentroids().fit_resample(X_train,y_train)


KeyboardInterrupt



In [None]:
training_testing_function(all_dataset["X_cluster_centroids_dataset2"],all_dataset["y_cluster_centroids_dataset2"],"Cluster centroids dataset 2")

In [None]:
all_results

#### 4. Near Miss

In [79]:
from imblearn.under_sampling import NearMiss

In [80]:
all_dataset['X_near_miss_dataset2'],all_dataset['y_near_miss_dataset2'] = NearMiss().fit_resample(X_train,y_train)
training_testing_function(all_dataset['X_near_miss_dataset2'],all_dataset['y_near_miss_dataset2'],"Near Miss dataset 2")

Near Miss dataset 2 Completed


In [81]:
all_results

[('Near Miss dataset 2',
  0.481089258698941,
  0.12126387702818103,
  0.6855306527693245)]

#### 5. One Sided Selection

In [82]:
from imblearn.under_sampling import OneSidedSelection

In [83]:
all_dataset['X_one_sided_selection_dataset2'],all_dataset['y_one_sided_selection_dataset2'] = OneSidedSelection().fit_resample(X_train,y_train)
training_testing_function(all_dataset['X_one_sided_selection_dataset2'],all_dataset['y_one_sided_selection_dataset2'],"One Sided Selection dataset 2")

One Sided Selection dataset 2 Completed


In [84]:
all_results

[('Near Miss dataset 2',
  0.481089258698941,
  0.12126387702818103,
  0.6855306527693245),
 ('One Sided Selection dataset 2',
  0.9960857767211776,
  0.9521010872759329,
  0.9912506086427268)]

#### 6. Tomek Links

In [85]:
from imblearn.under_sampling import TomekLinks

In [86]:
all_dataset['X_tomek_links_dataset2'],all_dataset['y_tomek_links_dataset2'] = TomekLinks().fit_resample(X_train,y_train)


In [87]:
training_testing_function(all_dataset['X_tomek_links_dataset2'],all_dataset['y_tomek_links_dataset2'],"Tomek Links dataset 2")

Tomek Links dataset 2 Completed


In [88]:
all_results

[('Near Miss dataset 2',
  0.481089258698941,
  0.12126387702818103,
  0.6855306527693245),
 ('One Sided Selection dataset 2',
  0.9960857767211776,
  0.9521010872759329,
  0.9912506086427268),
 ('Tomek Links dataset 2',
  0.9960857767211776,
  0.9521010872759329,
  0.9912506086427268)]

### Oversampling data

#### 1. Random Oversampling

In [89]:
ros = RandomOverSampler(random_state=0)
all_dataset["X_train_random_oversampled_dataset2"], all_dataset["y_train_random_oversampled_dataset2"] = ros.fit_resample(X_train, y_train)

print(sorted(Counter(all_dataset["y_train_random_oversampled_dataset2"]).items()))

[(0, 160000), (1, 160000)]


In [90]:
training_testing_function(all_dataset["X_train_random_oversampled_dataset2"],all_dataset["y_train_random_oversampled_dataset2"],"Random Oversampled dataset2")

Random Oversampled dataset2 Completed


In [91]:
all_results

[('Near Miss dataset 2',
  0.481089258698941,
  0.12126387702818103,
  0.6855306527693245),
 ('One Sided Selection dataset 2',
  0.9960857767211776,
  0.9521010872759329,
  0.9912506086427268),
 ('Tomek Links dataset 2',
  0.9960857767211776,
  0.9521010872759329,
  0.9912506086427268),
 ('Random Oversampled dataset2',
  0.9882093028840382,
  0.8695190007972363,
  0.991819750456482)]

#### 2. SMOTE

In [92]:
from imblearn.over_sampling import SMOTE, ADASYN
all_dataset["X_smote_dataset2"], all_dataset["y_smote_dataset2"] = SMOTE().fit_resample(X_train, y_train)
print(sorted(Counter(all_dataset["y_smote_dataset2"]).items()))



[(0, 160000), (1, 160000)]


In [93]:
training_testing_function(all_dataset['X_smote_dataset2'],all_dataset['y_smote_dataset2'],"SMOTE dataset2")

SMOTE dataset2 Completed


In [94]:
all_results

[('Near Miss dataset 2',
  0.481089258698941,
  0.12126387702818103,
  0.6855306527693245),
 ('One Sided Selection dataset 2',
  0.9960857767211776,
  0.9521010872759329,
  0.9912506086427268),
 ('Tomek Links dataset 2',
  0.9960857767211776,
  0.9521010872759329,
  0.9912506086427268),
 ('Random Oversampled dataset2',
  0.9882093028840382,
  0.8695190007972363,
  0.991819750456482),
 ('SMOTE dataset2', 0.9852316115553634, 0.84178029328531, 0.990269750456482)]

#### 3. ADASYN

In [95]:
all_dataset['X_adasyn_dataset2'],all_dataset['y_adasyn_dataset2'] = ADASYN().fit_resample(X_train, y_train)
print(sorted(Counter(all_dataset["y_adasyn_dataset2"]).items()))

[(0, 160000), (1, 160103)]


In [96]:
training_testing_function(all_dataset['X_adasyn_dataset2'],all_dataset['y_adasyn_dataset2'],"ADASYN dataset2")

ADASYN dataset2 Completed


In [97]:
all_results

[('Near Miss dataset 2',
  0.481089258698941,
  0.12126387702818103,
  0.6855306527693245),
 ('One Sided Selection dataset 2',
  0.9960857767211776,
  0.9521010872759329,
  0.9912506086427268),
 ('Tomek Links dataset 2',
  0.9960857767211776,
  0.9521010872759329,
  0.9912506086427268),
 ('Random Oversampled dataset2',
  0.9882093028840382,
  0.8695190007972363,
  0.991819750456482),
 ('SMOTE dataset2', 0.9852316115553634, 0.84178029328531, 0.990269750456482),
 ('ADASYN dataset2',
  0.9785318060658454,
  0.785405664906385,
  0.9867822504564819)]

#### 4. Variations of SMOTE

In [98]:
from imblearn.over_sampling import BorderlineSMOTE,SVMSMOTE,KMeansSMOTE

In [99]:
all_dataset["X_borderline_smote_dataset2"],all_dataset["y_borderline_smote_dataset2"] = BorderlineSMOTE().fit_resample(X_train,y_train)
all_dataset["X_svm_smote_dataset2"],all_dataset["y_svm_smote_dataset2"] = SVMSMOTE().fit_resample(X_train,y_train)

# X_kmeans_smote,y_kmeans_smote = KMeansSMOTE().fit_resample(X_train,y_train)


# all_dataset["X_kmeans_smote"],all_dataset["y_kmeans_smote"]= X_kmeans_smote,y_kmeans_smote

training_testing_function(all_dataset["X_borderline_smote_dataset2"],all_dataset["y_borderline_smote_dataset2"],"Borderline SMOTE dataset2")
training_testing_function(all_dataset["X_svm_smote_dataset2"],all_dataset["y_svm_smote_dataset2"],"SVM SMOTE dataset2")

# training_testing_function(all_dataset["X_kmeans_smote"],all_dataset["y_kmeans_smote"],"KMeans SMOTE")



Borderline SMOTE dataset2 Completed
SVM SMOTE dataset2 Completed


In [100]:
all_results

[('Near Miss dataset 2',
  0.481089258698941,
  0.12126387702818103,
  0.6855306527693245),
 ('One Sided Selection dataset 2',
  0.9960857767211776,
  0.9521010872759329,
  0.9912506086427268),
 ('Tomek Links dataset 2',
  0.9960857767211776,
  0.9521010872759329,
  0.9912506086427268),
 ('Random Oversampled dataset2',
  0.9882093028840382,
  0.8695190007972363,
  0.991819750456482),
 ('SMOTE dataset2', 0.9852316115553634, 0.84178029328531, 0.990269750456482),
 ('ADASYN dataset2',
  0.9785318060658454,
  0.785405664906385,
  0.9867822504564819),
 ('Borderline SMOTE dataset2',
  0.9814854837547727,
  0.8093000247341083,
  0.9883197504564819),
 ('SVM SMOTE dataset2',
  0.9812693609970463,
  0.8074074074074075,
  0.9879154290931224)]

### Combination of Undersampling and Oversampling

#### 1. SMOTETomek

In [101]:
from imblearn.combine import SMOTETomek

In [102]:
all_dataset['X_smote_tomek_dataset2'],all_dataset['y_smote_tomek_dataset2'] = SMOTETomek().fit_resample(X_train,y_train)
training_testing_function(all_dataset['X_smote_tomek_dataset2'],all_dataset['y_smote_tomek_dataset2'],"SMOTE Tomek dataset2")

SMOTE Tomek dataset2 Completed


In [103]:
all_results

[('Near Miss dataset 2',
  0.481089258698941,
  0.12126387702818103,
  0.6855306527693245),
 ('One Sided Selection dataset 2',
  0.9960857767211776,
  0.9521010872759329,
  0.9912506086427268),
 ('Tomek Links dataset 2',
  0.9960857767211776,
  0.9521010872759329,
  0.9912506086427268),
 ('Random Oversampled dataset2',
  0.9882093028840382,
  0.8695190007972363,
  0.991819750456482),
 ('SMOTE dataset2', 0.9852316115553634, 0.84178029328531, 0.990269750456482),
 ('ADASYN dataset2',
  0.9785318060658454,
  0.785405664906385,
  0.9867822504564819),
 ('Borderline SMOTE dataset2',
  0.9814854837547727,
  0.8093000247341083,
  0.9883197504564819),
 ('SVM SMOTE dataset2',
  0.9812693609970463,
  0.8074074074074075,
  0.9879154290931224),
 ('SMOTE Tomek dataset2',
  0.984823379679658,
  0.8381147540983607,
  0.990057250456482)]

In [1]:
l1 = [('Random Undersample',
       0.9697342087707594,
  0.08975712777191129,
  0.918628801408014),
 ('IHT', 0.9989642217618764, 0.7281105990783411, 0.9027095081138815),
 ('Cluster centroids',
  0.9917839963484428,
  0.2664576802507837,
  0.929672695611728),
 ('One Sided Selection',
  0.9993153330290369,
  0.7914438502673796,
  0.8774191267671952),
('Tomek Links', 0.9992802219023208, 0.7783783783783784, 0.8672150451345422),
 ('SMOTE', 0.9988939995084443, 0.7272727272727273, 0.9281405760109334),
 ('ADASYN', 0.9984375548611355, 0.6482213438735178, 0.9177254645527316),
 ('Borderline SMOTE',
  0.9992802219023208,
  0.7960199004975125,
  0.9079610283899717),
 ('SVM SMOTE', 0.9992626663389628, 0.7878787878787878, 0.8977657396667164),
 ('SMOTE Tomek', 0.9989642217618764, 0.7377777777777779, 0.9230824997415963)
]

f1

In [2]:
sorted(l1,key=lambda x:x[1],reverse=True)

[('One Sided Selection',
  0.9993153330290369,
  0.7914438502673796,
  0.8774191267671952),
 ('Tomek Links', 0.9992802219023208, 0.7783783783783784, 0.8672150451345422),
 ('Borderline SMOTE',
  0.9992802219023208,
  0.7960199004975125,
  0.9079610283899717),
 ('SVM SMOTE', 0.9992626663389628, 0.7878787878787878, 0.8977657396667164),
 ('IHT', 0.9989642217618764, 0.7281105990783411, 0.9027095081138815),
 ('SMOTE Tomek', 0.9989642217618764, 0.7377777777777779, 0.9230824997415963),
 ('SMOTE', 0.9988939995084443, 0.7272727272727273, 0.9281405760109334),
 ('ADASYN', 0.9984375548611355, 0.6482213438735178, 0.9177254645527316),
 ('Cluster centroids',
  0.9917839963484428,
  0.2664576802507837,
  0.929672695611728),
 ('Random Undersample',
  0.9697342087707594,
  0.08975712777191129,
  0.918628801408014)]

auc

In [5]:
sorted(l1,key=lambda x:x[2],reverse=True)

[('Borderline SMOTE',
  0.9992802219023208,
  0.7960199004975125,
  0.9079610283899717),
 ('One Sided Selection',
  0.9993153330290369,
  0.7914438502673796,
  0.8774191267671952),
 ('SVM SMOTE', 0.9992626663389628, 0.7878787878787878, 0.8977657396667164),
 ('Tomek Links', 0.9992802219023208, 0.7783783783783784, 0.8672150451345422),
 ('SMOTE Tomek', 0.9989642217618764, 0.7377777777777779, 0.9230824997415963),
 ('IHT', 0.9989642217618764, 0.7281105990783411, 0.9027095081138815),
 ('SMOTE', 0.9988939995084443, 0.7272727272727273, 0.9281405760109334),
 ('ADASYN', 0.9984375548611355, 0.6482213438735178, 0.9177254645527316),
 ('Cluster centroids',
  0.9917839963484428,
  0.2664576802507837,
  0.929672695611728),
 ('Random Undersample',
  0.9697342087707594,
  0.08975712777191129,
  0.918628801408014)]

In [3]:

l2 = [('Random Undersample dataset 2',
  0.9653963451240305,
  0.69399023147165,
  0.9793611077297626),
 ('IHT dataset 2', 0.996109790360925, 0.9523248969982343, 0.9906794659160073),
('Near Miss dataset 2',
  0.481089258698941,
  0.12126387702818103,
  0.6855306527693245),
 ('One Sided Selection dataset 2',
  0.9960857767211776,
  0.9521010872759329,
  0.9912506086427268),
 ('Tomek Links dataset 2',
  0.9960857767211776,
  0.9521010872759329,
  0.9912506086427268),
 ('Random Oversampled dataset2',
  0.9882093028840382,
  0.8695190007972363,
  0.991819750456482),
 ('SMOTE dataset2', 0.9852316115553634, 0.84178029328531, 0.990269750456482),
 ('ADASYN dataset2',
  0.9785318060658454,
  0.785405664906385,
  0.9867822504564819),
 ('Borderline SMOTE dataset2',
  0.9814854837547727,
  0.8093000247341083,
  0.9883197504564819),
 ('SVM SMOTE dataset2',
  0.9812693609970463,
  0.8074074074074075,
  0.9879154290931224),
 ('SMOTE Tomek dataset2',
  0.984823379679658,
  0.8381147540983607,
  0.990057250456482)]

f1

In [4]:
sorted(l2,key=lambda x:x[1],reverse=True)

[('IHT dataset 2', 0.996109790360925, 0.9523248969982343, 0.9906794659160073),
 ('One Sided Selection dataset 2',
  0.9960857767211776,
  0.9521010872759329,
  0.9912506086427268),
 ('Tomek Links dataset 2',
  0.9960857767211776,
  0.9521010872759329,
  0.9912506086427268),
 ('Random Oversampled dataset2',
  0.9882093028840382,
  0.8695190007972363,
  0.991819750456482),
 ('SMOTE dataset2', 0.9852316115553634, 0.84178029328531, 0.990269750456482),
 ('SMOTE Tomek dataset2',
  0.984823379679658,
  0.8381147540983607,
  0.990057250456482),
 ('Borderline SMOTE dataset2',
  0.9814854837547727,
  0.8093000247341083,
  0.9883197504564819),
 ('SVM SMOTE dataset2',
  0.9812693609970463,
  0.8074074074074075,
  0.9879154290931224),
 ('ADASYN dataset2',
  0.9785318060658454,
  0.785405664906385,
  0.9867822504564819),
 ('Random Undersample dataset 2',
  0.9653963451240305,
  0.69399023147165,
  0.9793611077297626),
 ('Near Miss dataset 2',
  0.481089258698941,
  0.12126387702818103,
  0.685530652

auc

In [6]:
sorted(l2,key=lambda x:x[2],reverse=True)

[('IHT dataset 2', 0.996109790360925, 0.9523248969982343, 0.9906794659160073),
 ('One Sided Selection dataset 2',
  0.9960857767211776,
  0.9521010872759329,
  0.9912506086427268),
 ('Tomek Links dataset 2',
  0.9960857767211776,
  0.9521010872759329,
  0.9912506086427268),
 ('Random Oversampled dataset2',
  0.9882093028840382,
  0.8695190007972363,
  0.991819750456482),
 ('SMOTE dataset2', 0.9852316115553634, 0.84178029328531, 0.990269750456482),
 ('SMOTE Tomek dataset2',
  0.984823379679658,
  0.8381147540983607,
  0.990057250456482),
 ('Borderline SMOTE dataset2',
  0.9814854837547727,
  0.8093000247341083,
  0.9883197504564819),
 ('SVM SMOTE dataset2',
  0.9812693609970463,
  0.8074074074074075,
  0.9879154290931224),
 ('ADASYN dataset2',
  0.9785318060658454,
  0.785405664906385,
  0.9867822504564819),
 ('Random Undersample dataset 2',
  0.9653963451240305,
  0.69399023147165,
  0.9793611077297626),
 ('Near Miss dataset 2',
  0.481089258698941,
  0.12126387702818103,
  0.685530652