In [1]:
import numpy as np
import pandas as pd

from imblearn.under_sampling import RandomUnderSampler
from sklearn.svm import SVC 
from sklearn.naive_bayes import GaussianNB 
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler,StandardScaler

In [2]:
data = pd.read_csv("creditcard.csv")

data.head()

data['Class'].value_counts()

len(data['Class'])

print("percentage of fraudulent data instances: {}".format(data['Class'].value_counts()[1] *100 /len(data['Class'])))
print("percentage of normal data instances: {}".format(data['Class'].value_counts()[0] *100 /len(data['Class'])))

# Rescaling the data



rs = RobustScaler()

data['scaled_amount'] = rs.fit_transform(data['Amount'].values.reshape(-1, 1))
data['scaled_time'] = rs.fit_transform(data['Time'].values.reshape(-1, 1))

data.drop(['Amount', 'Time'], axis = 1, inplace = True)


# from sklearn.linear_model import LogisticRegression

X = data.drop(['Class'], axis = 1)
y = data['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0, 
                                                    stratify = y)

print(X_train.shape, y_train.shape)
print(y_train.value_counts())



percentage of fraudulent data instances: 0.1727485630620034
percentage of normal data instances: 99.827251436938
(227845, 30) (227845,)
0    227451
1       394
Name: Class, dtype: int64


In [36]:
all_results=[]
all_dataset=dict()

In [None]:
 def training_testing_function(X,y,name):
        corr = X.corr()
        columns = np.full((corr.shape[0],), True, dtype=bool)
        for i in range(corr.shape[0]):
            for j in range(i+1, corr.shape[0]):
                if corr.iloc[i,j] >= 0.9:
                    if columns[j]:
                        columns[j] = False
        selected_columns = X.columns[columns]
        X = X[selected_columns]


        rfc = RandomForestClassifier();

        # fit random forest classifier on the training set
        rfc.fit(X, y.values.ravel());
        # extract important features
        score = np.round(rfc.feature_importances_,3)

        importances = pd.DataFrame({'feature':X.columns,'importance':score})
        importances = importances.sort_values('importance',ascending=False).set_index('feature')

        min_features=min(len(importances),20)
        
        final_features=list(importances[:20].index)

        X=X[final_features]

        # Train KNeighborsClassifier Model
        KNN_Classifier = KNeighborsClassifier(n_jobs=-1)
        # KNN_Classifier.fit(X_train_random_oversampled, y_train_random_oversampled.values.ravel()); 

        # Train LogisticRegression Model
        LGR_Classifier = LogisticRegression(multi_class='auto', random_state=1,solver='lbfgs',max_iter=400)
        # LGR_Classifier.fit(X_train_random_oversampled, y_train_random_oversampled.values.ravel());

        # Train Gaussian Naive Baye Model
        GNB_Classifier = GaussianNB()
        # GNB_Classifier.fit(X_train_random_oversampled, y_train_random_oversampled.values.ravel())

        # Train Decision Tree Model
        DTC_Classifier = tree.DecisionTreeClassifier(criterion='entropy', random_state=0)
        # DTC_Classifier.fit(X_train_random_oversampled, y_train_random_oversampled.values.ravel());

        # Train RandomForestClassifier Model
        #RF_Classifier = RandomForestClassifier(criterion='entropy', n_jobs=-1, random_state=0)
        #RF_Classifier.fit(X_train, Y_train);  

        # Train SVM Model
        #SVC_Classifier = SVC(random_state=0)
        #SVC_Classifier.fit(X_train, Y_train)

        ## Train Ensemble Model (This method combines all the individual models above except RandomForest)
        combined_model = [('Naive Baye Classifier', GNB_Classifier), 
                         ('Decision Tree Classifier', DTC_Classifier), 
                         ('KNeighborsClassifier', KNN_Classifier), 
                         ('LogisticRegression', LGR_Classifier)
                        ]
        VC =  VotingClassifier(estimators = combined_model,voting = 'soft')
        VC.fit(X, y.values.ravel());

        models = []
        #models.append(('SVM Classifier', SVC_Classifier))
        # models.append(('Naive Baye Classifier', GNB_Classifier))
        # models.append(('Decision Tree Classifier', DTC_Classifier))
        #models.append(('RandomForest Classifier', RF_Classifier))
        # models.append(('KNeighborsClassifier', KNN_Classifier))
        # models.append(('LogisticRegression', LGR_Classifier))
        models.append(('VotingClassifier', VC))

        for i, v in models:
    #         accuracy = metrics.accuracy_score(y_test.values.ravel(), v.predict(X_test[final_features]))
    #         confusion_matrix = metrics.confusion_matrix(y_test.values.ravel(), v.predict(X_test[final_features]))
    #         classification = metrics.classification_report(y_test.values.ravel(), v.predict(X_test[final_features]))
            f1 = metrics.f1_score(y_test.values.ravel(),v.predict(X_test[final_features]))
            all_results.append((name,f1))
            print("{} Completed".format(name))
    #         print('============================== {} Model Test Results =============================='.format(i))
    #         print()
    #         print ("Model Accuracy:" "\n", accuracy)
    #         print()
    #         print("Confusion matrix:" "\n", confusion_matrix)
    #         print()
    #         print("Classification report:" "\n", classification) 
    #         print()        






#### 1. Random Undersample

In [4]:
undersample = RandomUnderSampler(sampling_strategy='majority')
# fit and apply the transform
all_dataset["X_train_undersample"], all_dataset["y_train_undersample"] = undersample.fit_resample(X_train, y_train)
# summarize class distribution
print(Counter(all_dataset["y_train_undersample"]))

Counter({0: 394, 1: 394})


In [None]:
training_testing_function(all_dataset["X_train_undersample"],all_dataset["y_train_undersample"],"Random Undersample")

#### 2. Instance Hardness Threshold

In [5]:
from imblearn.under_sampling import InstanceHardnessThreshold

In [6]:
iht = InstanceHardnessThreshold(sampling_strategy='majority', random_state=42)
all_dataset["X_train_res"], all_dataset["y_train_res"] = iht.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(all_dataset["y_train_res"]))

Resampled dataset shape Counter({0: 222288, 1: 394})


In [6]:
training_testing_function(all_dataset["X_train_res"],all_dataset["y_train_res"],"IHT")

#### 3. Cluster Centroid

In [31]:
from imblearn.under_sampling import ClusterCentroids

In [32]:
all_dataset["X_cluster_centroids"],all_dataset["y_cluster_centroids"]  = ClusterCentroids().fit_resample(X_train,y_train)

In [34]:
training_testing_function(all_dataset["X_cluster_centroids"],all_dataset["y_cluster_centroids"],"Cluster centroids")

Cluster centroids Completed


In [36]:
all_results

[('SMOTE', 0.7545454545454545),
 ('ADASYN', 0.6259541984732825),
 ('Borderline SMOTE', 0.7843137254901962),
 ('SVM SMOTE', 0.7839195979899498),
 ('Cluster centroids', 0.3110307414104882)]

#### 4. Near Miss

In [39]:
from imblearn.under_sampling import NearMiss

In [40]:
all_dataset['X_near_miss'],all_dataset['y_near_miss'] = NearMiss().fit_resample(X_train,y_train)
training_testing_function(all_dataset['X_near_miss'],all_dataset['y_near_miss'],"Near Miss")

Near Miss Completed


In [41]:
all_results

[('SMOTE', 0.7545454545454545),
 ('ADASYN', 0.6259541984732825),
 ('Borderline SMOTE', 0.7843137254901962),
 ('SVM SMOTE', 0.7839195979899498),
 ('Cluster centroids', 0.3110307414104882),
 ('Near Miss', 0.007315406567788095)]

#### 5. One Sided Selection

In [43]:
from imblearn.under_sampling import OneSidedSelection

In [44]:
all_dataset['X_one_sided_selection'],all_dataset['y_one_sided_selection'] = OneSidedSelection().fit_resample(X_train,y_train)
training_testing_function(all_dataset['X_one_sided_selection'],all_dataset['y_one_sided_selection'],"One Sided Selection")

One Sided Selection Completed


In [45]:
all_results

[('SMOTE', 0.7545454545454545),
 ('ADASYN', 0.6259541984732825),
 ('Borderline SMOTE', 0.7843137254901962),
 ('SVM SMOTE', 0.7839195979899498),
 ('Cluster centroids', 0.3110307414104882),
 ('Near Miss', 0.007315406567788095),
 ('One Sided Selection', 0.7789473684210526)]

#### 6. Tomek Links

In [4]:
from imblearn.under_sampling import TomekLinks

In [None]:
all_dataset['X_tomek_links'],all_dataset['y_tomek_links'] = TomekLinks().fit_resample(X_train,y_train)


In [11]:
training_testing_function(all_dataset['X_tomek_links'],all_dataset['y_tomek_links'],"Tomek Links")

Tomek Links Completed


In [12]:
all_results

[('Tomek Links', 0.7914438502673796)]

### Oversampling data

#### 1. Random Oversampling

In [17]:
ros = RandomOverSampler(random_state=0)
all_dataset["X_train_random_oversampled"], all_dataset["y_train_random_oversampled"] = ros.fit_resample(X_train, y_train)
from collections import Counter
print(sorted(Counter(all_dataset["y_train_random_oversampled"]).items()))

[(0, 227451), (1, 227451)]


In [None]:
training_testing_function(all_dataset["X_train_random_oversampled"],all_dataset["y_train_random_oversampled"],"Random Oversampled")

#### 2. SMOTE

In [12]:
from imblearn.over_sampling import SMOTE, ADASYN
all_dataset["X_smote"], all_dataset["y_smote"] = SMOTE().fit_resample(X_train, y_train)
print(sorted(Counter(all_dataset["y_smote"]).items()))



[(0, 227451), (1, 227451)]


In [16]:
training_testing_function(all_dataset['X_smote'],all_dataset['y_smote'],"SMOTE")

SMOTE Completed


In [17]:
all_results

[('SMOTE', 0.7545454545454545)]

#### 3. ADASYN

In [20]:
all_dataset['X_adasyn'],all_dataset['y_adasyn'] = ADASYN().fit_resample(X_train, y_train)
print(sorted(Counter(all_dataset["y_adasyn"]).items()))

[(0, 227451), (1, 227448)]


In [22]:
training_testing_function(all_dataset['X_adasyn'],all_dataset['y_adasyn'],"ADASYN")

ADASYN Completed


In [23]:
all_results

[('SMOTE', 0.7545454545454545), ('ADASYN', 0.6259541984732825)]

#### 4. Variations of SMOTE

In [24]:
from imblearn.over_sampling import BorderlineSMOTE,SVMSMOTE,KMeansSMOTE

In [29]:
all_dataset["X_borderline_smote"],all_dataset["y_borderline_smote"] = BorderlineSMOTE().fit_resample(X_train,y_train)
all_dataset["X_svm_smote"],all_dataset["y_svm_smote"] = SVMSMOTE().fit_resample(X_train,y_train)

# X_kmeans_smote,y_kmeans_smote = KMeansSMOTE().fit_resample(X_train,y_train)


# all_dataset["X_kmeans_smote"],all_dataset["y_kmeans_smote"]= X_kmeans_smote,y_kmeans_smote

training_testing_function(all_dataset["X_borderline_smote"],all_dataset["y_borderline_smote"],"Borderline SMOTE")
training_testing_function(all_dataset["X_svm_smote"],all_dataset["y_svm_smote"],"SVM SMOTE")

# training_testing_function(all_dataset["X_kmeans_smote"],all_dataset["y_kmeans_smote"],"KMeans SMOTE")



Borderline SMOTE Completed
SVM SMOTE Completed


In [30]:
all_results

[('SMOTE', 0.7545454545454545),
 ('ADASYN', 0.6259541984732825),
 ('Borderline SMOTE', 0.7843137254901962),
 ('SVM SMOTE', 0.7839195979899498)]

### Combination of Undersampling and Oversampling

#### 1. SMOTETomek

In [13]:
from imblearn.combine import SMOTETomek

In [14]:
all_dataset['X_smote_tomek'],all_dataset['y_smote_tomek'] = SMOTETomek().fit_resample(X_train,y_train)
training_testing_function(all_dataset['X_smote_tomek'],all_dataset['y_smote_tomek'],"SMOTE Tomek")

SMOTE Tomek Completed


In [15]:
all_results

[('Tomek Links', 0.7914438502673796), ('SMOTE Tomek', 0.7248908296943231)]

## Working with different dataset

In [2]:
dataset2 = pd.read_csv("dataset2.csv")
dataset2.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [3]:
dataset2.shape

(6362620, 11)

In [4]:
dataset2.isnull().values.any()

False

In [5]:
dataset2.drop("isFlaggedFraud",axis=1,inplace=True)


In [6]:
y=dataset2['isFraud']
dataset2.drop('isFraud',axis=1,inplace=True)
X=dataset2

Since the destination account balances being zero is a strong indicator of fraud, we do not impute the account balance (before the transaction is made) with a statistic or from a distribution with a subsequent adjustment for the amount transacted. Doing so would mask this indicator of fraud and make fraudulent transactions appear genuine. Instead, below we replace the value of 0 with -1 which will be more useful to a suitable machine-learning (ML) algorithm detecting fraud.

In [7]:
X.loc[(X.oldbalanceDest == 0) & (X.newbalanceDest == 0) & (X.amount != 0), ['oldbalanceDest', 'newbalanceDest']] = - 1

The data also has several transactions with zero balances in the originating account both before and after a non-zero amount is transacted. In this case, the fraction of such transactions is much smaller in fraudulent (0.3%) compared to genuine transactions (47%). Once again, from similar reasoning as above, instead of imputing a numerical value we replace the value of 0 with a null value.

In [8]:
# X.loc[(X.oldbalanceOrg == 0) & (X.newbalanceOrig == 0) & (X.amount != 0),['oldbalanceOrg', 'newbalanceOrig']] = np.nan

Motivated by the possibility of zero-balances serving to differentiate between fraudulent and genuine transactions, we create 2 new features (columns) recording errors in the originating and destination accounts for each transaction. These new features turn out to be important in obtaining the best performance from the ML algorithm that we will finally use.

In [9]:
X['errorbalanceOrg'] = X.newbalanceOrig + X.amount - X.oldbalanceOrg
X['errorbalanceDest'] = X.oldbalanceDest + X.amount - X.newbalanceDest

In [10]:
# log transformed amount
X['ln_amount'] = np.log1p(X['amount'])

In [11]:
# correcting few negative errors
X['errorbalanceOrg'] = X['errorbalanceOrg'].apply(lambda x: 0 if x < 0 else x)

# log transformed origin account balance error
X['ln_errorBalanceOrig'] = np.log1p(X['errorbalanceOrg'])

In [12]:
X['errorbalanceDest'] = X['errorbalanceDest'].apply(lambda x: 0 if x < 0 else x)

# log transformed origin account balance error
X['ln_errorBalanceDest'] = np.log1p(X['errorbalanceDest'])

In [13]:
X["type"].value_counts()

CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: type, dtype: int64

In [14]:
X["type_transfer"]=X["type"].replace({'CASH_OUT':0,'PAYMENT':1,'CASH_IN':2,'TRANSFER':3,'DEBIT':4})

In [15]:
X.drop("type",axis=1,inplace=True)

In [16]:
X.head()

Unnamed: 0,step,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,errorbalanceOrg,errorbalanceDest,ln_amount,ln_errorBalanceOrig,ln_errorBalanceDest,type_transfer
0,1,9839.64,C1231006815,170136.0,160296.36,M1979787155,-1.0,-1.0,0.0,9839.64,9.194276,0.0,9.194276,1
1,1,1864.28,C1666544295,21249.0,19384.72,M2044282225,-1.0,-1.0,0.0,1864.28,7.531166,0.0,7.531166,1
2,1,181.0,C1305486145,181.0,0.0,C553264065,-1.0,-1.0,0.0,181.0,5.204007,0.0,5.204007,3
3,1,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,0.0,21363.0,5.204007,0.0,9.969463,0
4,1,11668.14,C2048537720,41554.0,29885.86,M1230701703,-1.0,-1.0,0.0,11668.14,9.364703,0.0,9.364703,1


In [17]:
X.drop(["amount","errorbalanceOrg","errorbalanceDest"],axis=1,inplace=True)


In [18]:
X.head()

Unnamed: 0,step,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,ln_amount,ln_errorBalanceOrig,ln_errorBalanceDest,type_transfer
0,1,C1231006815,170136.0,160296.36,M1979787155,-1.0,-1.0,9.194276,0.0,9.194276,1
1,1,C1666544295,21249.0,19384.72,M2044282225,-1.0,-1.0,7.531166,0.0,7.531166,1
2,1,C1305486145,181.0,0.0,C553264065,-1.0,-1.0,5.204007,0.0,5.204007,3
3,1,C840083671,181.0,0.0,C38997010,21182.0,0.0,5.204007,0.0,9.969463,0
4,1,C2048537720,41554.0,29885.86,M1230701703,-1.0,-1.0,9.364703,0.0,9.364703,1


In [19]:
X["type_name"]=X.step

In [20]:
X.loc[(X.nameOrig.str.contains("C")) & (X.nameDest.str.contains("C")),"type_name"]=0
X.loc[(X.nameOrig.str.contains("C")) & (X.nameDest.str.contains("M")),"type_name"]=1
X.loc[(X.nameOrig.str.contains("M")) & (X.nameDest.str.contains("C")),"type_name"]=2
X.loc[(X.nameOrig.str.contains("M")) & (X.nameDest.str.contains("M")),"type_name"]=3

In [21]:
X.tail()

Unnamed: 0,step,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,ln_amount,ln_errorBalanceOrig,ln_errorBalanceDest,type_transfer,type_name
6362615,743,C786484425,339682.13,0.0,C776919290,0.0,339682.13,12.735768,0.0,0.0,0,0
6362616,743,C1529008245,6311409.28,0.0,C1881841831,-1.0,-1.0,15.65787,0.0,15.65787,3,0
6362617,743,C1162922333,6311409.28,0.0,C1365125890,68488.84,6379898.11,15.65787,0.0,0.009950331,0,0
6362618,743,C1685995037,850002.52,0.0,C2080388513,-1.0,-1.0,13.652996,0.0,13.653,3,0
6362619,743,C1280323807,850002.52,0.0,C873221189,6510099.11,7360101.63,13.652996,0.0,9.313226e-10,0,0


In [22]:
X.drop(["nameOrig","nameDest"],axis=1,inplace=True)

In [23]:
X.head()

Unnamed: 0,step,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,ln_amount,ln_errorBalanceOrig,ln_errorBalanceDest,type_transfer,type_name
0,1,170136.0,160296.36,-1.0,-1.0,9.194276,0.0,9.194276,1,1
1,1,21249.0,19384.72,-1.0,-1.0,7.531166,0.0,7.531166,1,1
2,1,181.0,0.0,-1.0,-1.0,5.204007,0.0,5.204007,3,0
3,1,181.0,0.0,21182.0,0.0,5.204007,0.0,9.969463,0,0
4,1,41554.0,29885.86,-1.0,-1.0,9.364703,0.0,9.364703,1,1


In [24]:
#first proceed without step column
step_col = X.step
X.drop("step",axis=1,inplace=True)


In [25]:
X.head()

Unnamed: 0,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,ln_amount,ln_errorBalanceOrig,ln_errorBalanceDest,type_transfer,type_name
0,170136.0,160296.36,-1.0,-1.0,9.194276,0.0,9.194276,1,1
1,21249.0,19384.72,-1.0,-1.0,7.531166,0.0,7.531166,1,1
2,181.0,0.0,-1.0,-1.0,5.204007,0.0,5.204007,3,0
3,181.0,0.0,21182.0,0.0,5.204007,0.0,9.969463,0,0
4,41554.0,29885.86,-1.0,-1.0,9.364703,0.0,9.364703,1,1


In [26]:
# dividing into training and testing

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0, stratify = y)

In [28]:
X.isnull().values.any()

False

In [29]:
# ss = StandardScaler()
# ss.fit(X_train.loc[:,"oldbalanceOrg"].values.reshape(-1,1))
# X_train.loc[:,"scaled_oldbalanceOrg"]= ss.transform(X_train.loc[:,"oldbalanceOrg"].values.reshape(-1,1))
# X_test.loc[:,"scaled_oldbalanceOrg"] = ss.transform(X_test.loc[:,"oldbalanceOrg"].values.reshape(-1,1))

# ss = StandardScaler()
# ss.fit(X_train.loc[:,"newbalanceOrig"].values.reshape(-1,1))
# X_train.loc[:,"scaled_newbalanceOrig"]= ss.transform(X_train.loc[:,"newbalanceOrig"].values.reshape(-1,1))
# X_test.loc[:,"scaled_newbalanceOrig"] = ss.transform(X_test.loc[:,"newbalanceOrig"].values.reshape(-1,1))

# ss = StandardScaler()
# ss.fit(X_train.loc[:,"oldbalanceDest"].values.reshape(-1,1))
# X_train.loc[:,"scaled_oldbalanceDest"]= ss.transform(X_train.loc[:,"oldbalanceDest"].values.reshape(-1,1))
# X_test.loc[:,"scaled_oldbalanceDest"] = ss.transform(X_test.loc[:,"oldbalanceDest"].values.reshape(-1,1))

# ss = StandardScaler()
# ss.fit(X_train.loc[:,"newbalanceDest"].values.reshape(-1,1))
# X_train.loc[:,"scaled_newbalanceDest"]= ss.transform(X_train.loc[:,"newbalanceDest"].values.reshape(-1,1))
# X_test.loc[:,"scaled_newbalanceDest"] = ss.transform(X_test.loc[:,"newbalanceDest"].values.reshape(-1,1))


cols=["newbalanceOrig","oldbalanceDest","newbalanceDest","ln_amount","ln_errorBalanceOrig","ln_errorBalanceDest"]
for c in cols:
    ss = StandardScaler()
    ss.fit(X_train.loc[:,str(c)].values.reshape(-1,1))
    transformed1 = ss.transform(X_train.loc[:,str(c)].values.reshape(-1,1))
    print(transformed1.shape)
    X_train["scaled_"+str(c)] = transformed1
    X_train.drop(c,axis=1,inplace=True)
    transformed2 = ss.transform(X_test.loc[:,str(c)].values.reshape(-1,1))
    X_test["scaled_"+str(c)] = transformed2
    


















(5090096, 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(5090096, 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(5090096, 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(5090096, 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(5090096, 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(5090096, 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [30]:
X_train.head()

Unnamed: 0,oldbalanceOrg,type_transfer,type_name,scaled_newbalanceOrig,scaled_oldbalanceDest,scaled_newbalanceDest,scaled_ln_amount,scaled_ln_errorBalanceOrig,scaled_ln_errorBalanceDest
1676258,1795445.96,2,0,0.375444,0.720231,0.589706,0.616397,0.756734,1.281672
1532444,40146.0,1,1,-0.281507,-0.32252,-0.331817,-0.967398,-1.815307,0.608824
4275262,0.0,0,0,-0.292224,0.717033,0.647739,0.1665,0.449845,-1.104619
1998321,391333.71,0,0,-0.160904,-0.282204,-0.292531,-1.059221,-1.815307,-1.104619
2613451,0.0,1,1,-0.292224,-0.32252,-0.331817,-1.849223,-0.293825,0.306983


In [32]:
ss=StandardScaler()
ss.fit(X_train["oldbalanceOrg"].values.reshape(-1,1))
X_train["scaled_oldbalanceOrg"]= ss.transform(X_train["oldbalanceOrg"].values.reshape(-1,1))
X_train.drop("oldbalanceOrg",axis=1,inplace=True)
X_test["scaled_oldbalanceOrg"]= ss.transform(X_test["oldbalanceOrg"].values.reshape(-1,1))
X_test.drop("oldbalanceOrg",axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [33]:
X_train.head()


Unnamed: 0,type_transfer,type_name,scaled_newbalanceOrig,scaled_oldbalanceDest,scaled_newbalanceDest,scaled_ln_amount,scaled_ln_errorBalanceOrig,scaled_ln_errorBalanceDest,scaled_oldbalanceOrg
1676258,2,0,0.375444,0.720231,0.589706,0.616397,0.756734,1.281672,0.333363
1532444,1,1,-0.281507,-0.32252,-0.331817,-0.967398,-1.815307,0.608824,-0.274603
4275262,0,0,-0.292224,0.717033,0.647739,0.1665,0.449845,-1.104619,-0.288508
1998321,0,0,-0.160904,-0.282204,-0.292531,-1.059221,-1.815307,-1.104619,-0.152965
2613451,1,1,-0.292224,-0.32252,-0.331817,-1.849223,-0.293825,0.306983,-0.288508


In [34]:
X_train.to_csv("X_train_dataset2_preprocessed.csv")
X_test.to_csv("X_test_dataset2_preprocessed.csv")

In [35]:
y_train.to_csv("y_train_dataset2_preprocessed.csv")
y_test.to_csv("y_test_dataset2_preprocessed.csv")

  """Entry point for launching an IPython kernel.
  


#### 1. Random Undersample

In [38]:
undersample = RandomUnderSampler(sampling_strategy='majority')
# fit and apply the transform
all_dataset["X_train_undersample_dataset2"], all_dataset["y_train_undersample_dataset2"] = undersample.fit_resample(X_train, y_train)
# summarize class distribution
print(Counter(all_dataset["y_train_undersample_dataset2"]))

Counter({0: 6570, 1: 6570})


In [39]:
training_testing_function(all_dataset["X_train_undersample_dataset2"],all_dataset["y_train_undersample_dataset2"],"Random Undersample dataset 2")

Random Undersample dataset 2 Completed


In [40]:
all_results

[('Random Undersample dataset 2', 0.06860114055686012)]

#### 2. Instance Hardness Threshold

In [42]:
from imblearn.under_sampling import InstanceHardnessThreshold

In [43]:
iht = InstanceHardnessThreshold(sampling_strategy='majority', random_state=42)
all_dataset["X_train_res_dataset2"], all_dataset["y_train_res_dataset2"] = iht.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(all_dataset["y_train_res_dataset2"]))

KeyboardInterrupt: 

In [6]:
training_testing_function(all_dataset["X_train_res_dataset2"],all_dataset["y_train_res_dataset2"],"IHT dataset 2")

#### 3. Cluster Centroid

In [44]:
from imblearn.under_sampling import ClusterCentroids

In [None]:
all_dataset["X_cluster_centroids_dataset2"],all_dataset["y_cluster_centroids_dataset2"]  = ClusterCentroids().fit_resample(X_train,y_train)

In [34]:
training_testing_function(all_dataset["X_cluster_centroids"],all_dataset["y_cluster_centroids"],"Cluster centroids")

Cluster centroids Completed


In [36]:
all_results

[('SMOTE', 0.7545454545454545),
 ('ADASYN', 0.6259541984732825),
 ('Borderline SMOTE', 0.7843137254901962),
 ('SVM SMOTE', 0.7839195979899498),
 ('Cluster centroids', 0.3110307414104882)]

#### 4. Near Miss

In [39]:
from imblearn.under_sampling import NearMiss

In [40]:
all_dataset['X_near_miss'],all_dataset['y_near_miss'] = NearMiss().fit_resample(X_train,y_train)
training_testing_function(all_dataset['X_near_miss'],all_dataset['y_near_miss'],"Near Miss")

Near Miss Completed


In [41]:
all_results

[('SMOTE', 0.7545454545454545),
 ('ADASYN', 0.6259541984732825),
 ('Borderline SMOTE', 0.7843137254901962),
 ('SVM SMOTE', 0.7839195979899498),
 ('Cluster centroids', 0.3110307414104882),
 ('Near Miss', 0.007315406567788095)]

#### 5. One Sided Selection

In [43]:
from imblearn.under_sampling import OneSidedSelection

In [44]:
all_dataset['X_one_sided_selection'],all_dataset['y_one_sided_selection'] = OneSidedSelection().fit_resample(X_train,y_train)
training_testing_function(all_dataset['X_one_sided_selection'],all_dataset['y_one_sided_selection'],"One Sided Selection")

One Sided Selection Completed


In [45]:
all_results

[('SMOTE', 0.7545454545454545),
 ('ADASYN', 0.6259541984732825),
 ('Borderline SMOTE', 0.7843137254901962),
 ('SVM SMOTE', 0.7839195979899498),
 ('Cluster centroids', 0.3110307414104882),
 ('Near Miss', 0.007315406567788095),
 ('One Sided Selection', 0.7789473684210526)]

#### 6. Tomek Links

In [4]:
from imblearn.under_sampling import TomekLinks

In [None]:
all_dataset['X_tomek_links'],all_dataset['y_tomek_links'] = TomekLinks().fit_resample(X_train,y_train)


In [11]:
training_testing_function(all_dataset['X_tomek_links'],all_dataset['y_tomek_links'],"Tomek Links")

Tomek Links Completed


In [12]:
all_results

[('Tomek Links', 0.7914438502673796)]