## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import pickle
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler 
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from pandas.tools.plotting import scatter_matrix
from sklearn.decomposition import PCA
from imblearn.combine import SMOTEENN
import seaborn as sns

## 1. Load Dataset

In [2]:
data = pd.read_csv('dataset.csv',nrows = 50000)

In [3]:
data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


### 1.2 Dataset Information

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 11 columns):
step              50000 non-null int64
type              50000 non-null object
amount            50000 non-null float64
nameOrig          50000 non-null object
oldbalanceOrg     50000 non-null float64
newbalanceOrig    50000 non-null float64
nameDest          50000 non-null object
oldbalanceDest    50000 non-null float64
newbalanceDest    50000 non-null float64
isFraud           50000 non-null int64
isFlaggedFraud    50000 non-null int64
dtypes: float64(5), int64(3), object(3)
memory usage: 4.2+ MB


### 1.2 Dataset Description

In [5]:
data.describe()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,7.4538,156264.5,733308.3,747274.1,844467.1,1164400.0,0.002,0.0
std,2.056636,324394.9,2202405.0,2239971.0,2433388.0,2915203.0,0.044677,0.0
min,1.0,0.63,0.0,0.0,0.0,0.0,0.0,0.0
25%,7.0,7734.572,0.0,0.0,0.0,0.0,0.0,0.0
50%,8.0,33414.82,17030.72,0.0,2126.0,0.0,0.0,0.0
75%,9.0,187839.0,133171.6,143873.4,463205.0,864926.4,0.0,0.0
max,9.0,10000000.0,28547240.0,28617400.0,30143200.0,31976990.0,1.0,0.0


## 2. Pre-Processing

### 2.1 Feature Engineering

### make some kind of feature engineering split the first character on the   nameDest to indicate the customer from the marchant .as shown below only the customer how make fraud 

In [6]:
data['char']=data['nameDest'].apply(lambda x: x[0])

In [7]:
data = data[[ 'type', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest','char', 'isFraud']]
data.head()

Unnamed: 0,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,char,isFraud
0,PAYMENT,9839.64,170136.0,160296.36,0.0,0.0,M,0
1,PAYMENT,1864.28,21249.0,19384.72,0.0,0.0,M,0
2,TRANSFER,181.0,181.0,0.0,0.0,0.0,C,1
3,CASH_OUT,181.0,181.0,0.0,21182.0,0.0,C,1
4,PAYMENT,11668.14,41554.0,29885.86,0.0,0.0,M,0


In [8]:
data.groupby(['char',"isFraud"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest
char,isFraud,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
C,0,27988,27988,27988,27988,27988,27988
C,1,100,100,100,100,100,100
M,0,21912,21912,21912,21912,21912,21912


In [9]:
data['isFraud'].value_counts()

0    49900
1      100
Name: isFraud, dtype: int64

### 2.2 Missing Values Check

In [10]:
data.isnull().sum()

type              0
amount            0
oldbalanceOrg     0
newbalanceOrig    0
oldbalanceDest    0
newbalanceDest    0
char              0
isFraud           0
dtype: int64

### 2.3 Label Encoding

### transfrom from strings types to numeric ones to fit in the model 

In [11]:
data['type'] = LabelEncoder().fit_transform(data['type'])
data['char'] = LabelEncoder().fit_transform(data['char'])
data.head()

Unnamed: 0,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,char,isFraud
0,3,9839.64,170136.0,160296.36,0.0,0.0,1,0
1,3,1864.28,21249.0,19384.72,0.0,0.0,1,0
2,4,181.0,181.0,0.0,0.0,0.0,0,1
3,1,181.0,181.0,0.0,21182.0,0.0,0,1
4,3,11668.14,41554.0,29885.86,0.0,0.0,1,0


## 3. Model Buliding

### 3.1 Split Data into Training & Test

In [12]:
features = data.iloc[:,[0,1,2,3,4,5,6]].values
labels = data.iloc[:,-1].values
validation_size = 0.3
seed = 0
scoring = 'accuracy'
x_train, x_validation, y_train, y_validation = model_selection.train_test_split(features,labels, test_size=validation_size, random_state=seed)
print("Train Size is: ",len(x_train))
print("Test Size is: ",len(x_validation))

Train Size is:  35000
Test Size is:  15000


### Over Sampling for training

### in the fraud , we need to uncrease the fraud class(outliers) thee are 3 ways to make this 
### 1 - to oversample the fraud class but it can lead us to outliers 
### 2 - to downsample the other class : to decrease the samples but it may lead us to data leakage
### 3 - the last one and the most common , to oversample and clean to to remove outliers if exist using     SMOTEENN class

In [13]:
sme=SMOTEENN(random_state=42, ratio =1)
x_sme, y_sme = sme.fit_sample(x_train, y_train)
print(y_sme)

[0 0 0 ..., 1 1 1]


### Over Sampling for testing

In [14]:
sme=SMOTEENN(random_state=42, ratio =1)
xt_sme, yt_sme = sme.fit_sample(x_validation, y_validation)
print(yt_sme)

[0 0 0 ..., 1 1 1]


### 3.2 Cross Validation

In [15]:
## Check Algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('RF',RandomForestClassifier()))
models.append(('NB', GaussianNB()))
# models.append(('SVM', SVC()))
# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10,random_state=seed)
    cv_results = model_selection.cross_val_score(model, x_sme, y_sme, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LR: 0.962173 (0.019936)
LDA: 0.877077 (0.111327)
KNN: 0.996129 (0.003222)
CART: 0.998035 (0.001135)
RF: 0.999403 (0.000562)
NB: 0.608615 (0.377735)


### 3.3 Model Building

In [17]:
RF = RandomForestClassifier(n_estimators=5)
RF.fit(x_sme, y_sme)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

## 4. Model Evaluation

In [18]:
y_pred = RF.predict(xt_sme)
print(accuracy_score(yt_sme, y_pred))
print(confusion_matrix(yt_sme, y_pred))
print(classification_report(yt_sme, y_pred))

0.903807003257
[[14546    55]
 [ 2780 12091]]
              precision    recall  f1-score   support

           0       0.84      1.00      0.91     14601
           1       1.00      0.81      0.90     14871

   micro avg       0.90      0.90      0.90     29472
   macro avg       0.92      0.90      0.90     29472
weighted avg       0.92      0.90      0.90     29472



## 5. Model Serialization

In [20]:
pickle.dump(RF, open('finalModel', 'wb'))