In [1]:
import pandas as pd
import numpy as np

#### Load the data
We'll use Pandas to read the data from file

In [2]:
# Location of the transactions CSV, relative to the notebook's working directory.
DATA_PATH = 'data.csv'

# Read the whole file into memory as a single DataFrame.
df = pd.read_csv(DATA_PATH)

#### Get to know your data set
We can start by exploring the various data types in the set

In [3]:
# Print a summary of the frame: row count, column names, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
step              int64
type              object
amount            float64
nameOrig          object
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest          object
oldbalanceDest    float64
newbalanceDest    float64
isFraud           int64
isFlaggedFraud    int64
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB



From the column listing above, our feature set should most likely consist of the columns step through newbalanceDest, while the label column is isFraud
Let's create two variables holding the X and Y values for our model

In [4]:
# Name the feature and label columns explicitly instead of slicing by position
# ([:9] / [9:10]), so the selection cannot silently shift if the CSV layout changes.
feature_columns = [
    'step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg',
    'newbalanceOrig', 'nameDest', 'oldbalanceDest', 'newbalanceDest',
]
label_columns = ['isFraud']

# Feature matrix: the nine transaction attributes.
X = df[feature_columns]
# Label: kept as a one-column DataFrame, exactly as the positional slice produced.
# `isFlaggedFraud` is used neither as a feature nor as the label.
Y = df[label_columns]


#### Transform the data
Transforming the data includes multiple steps, e.g.:
- Replacing any missing values, infinity or NaN
- Transforming textual features
- Normalizing the feature space

##### Explore missing values
Running the following command indicates that we do not have any missing values

In [5]:
# Count missing entries (NaN / None) in each column.
# Note: isna() does not treat +/-infinity as missing, so infinite values are not counted here.
df.isna().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

##### Transform textual features
The type, nameOrig and nameDest are textual features which we need to transform to a shape the machine learning algorithm can understand.
The type column has a finite set of options, so we can apply one-hot encoding to it to create binary columns.

Pandas provides a very useful function for this, get_dummies. We can use it to create the new binary columns, drop the original column and concatenate the two dataframes.

In [6]:
# One binary indicator column per distinct transaction type, named `type_<value>`.
type_dummies = pd.get_dummies(X['type'], prefix='type')

# Swap the textual `type` column for its indicator columns, which are appended
# after the remaining features.
X = pd.concat([X.drop(columns=['type']), type_dummies], axis=1)
X.head(10)

Unnamed: 0,step,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
0,1,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0,0,1,0
1,1,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0,0,1,0
2,1,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,0,0,0,0,1
3,1,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,0,1,0,0,0
4,1,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0,0,1,0
5,1,7817.71,C90045638,53860.0,46042.29,M573487274,0.0,0.0,0,0,0,1,0
6,1,7107.77,C154988899,183195.0,176087.23,M408069119,0.0,0.0,0,0,0,1,0
7,1,7861.64,C1912850431,176087.23,168225.59,M633326333,0.0,0.0,0,0,0,1,0
8,1,4024.36,C1265012928,2671.0,0.0,M1176932104,0.0,0.0,0,0,0,1,0
9,1,5337.77,C712410124,41720.0,36382.23,C195600860,41898.0,40348.79,0,0,1,0,0


#### Transforming nameOrig and nameDest
The next two columns to transform are nameOrig and nameDest.
We can use a label encoder for this.

In [8]:
from sklearn.preprocessing import LabelEncoder

# Boolean mask over X's columns: True where the column still holds Python objects (text).
categorical_feature_mask = X.dtypes == object
categorical_cols = list(X.columns[categorical_feature_mask])

# A single encoder instance is re-fitted for every column, so after the loop
# it only remembers the classes of the last column processed.
le = LabelEncoder()

# Replace each textual column with integer codes.
# NOTE(review): the encoder is fitted on the full data set, before the
# train/test split further down; confirm that is acceptable for this analysis.
for col_name in categorical_cols:
    X[col_name] = le.fit_transform(X[col_name])
X.head(10)

Unnamed: 0,step,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
0,1,9839.64,757869,170136.0,160296.36,1662094,0.0,0.0,0,0,0,1,0
1,1,1864.28,2188998,21249.0,19384.72,1733924,0.0,0.0,0,0,0,1,0
2,1,181.0,1002156,181.0,0.0,439685,0.0,0.0,0,0,0,0,1
3,1,181.0,5828262,181.0,0.0,391696,21182.0,0.0,0,1,0,0,0
4,1,11668.14,3445981,41554.0,29885.86,828919,0.0,0.0,0,0,0,1,0
5,1,7817.71,6026525,53860.0,46042.29,2247218,0.0,0.0,0,0,0,1,0
6,1,7107.77,1805947,183195.0,176087.23,2063363,0.0,0.0,0,0,0,1,0
7,1,7861.64,2999171,176087.23,168225.59,2314008,0.0,0.0,0,0,0,1,0
8,1,4024.36,869140,2671.0,0.0,768940,0.0,0.0,0,0,0,1,0
9,1,5337.77,5407276,41720.0,36382.23,282960,41898.0,40348.79,0,0,1,0,0


#### Split the data
To evaluate the performance of the model, we will need a dataset for training and one for testing

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
# The fraud label is heavily imbalanced, so stratify on it: train and test then
# contain the same proportion of fraudulent rows instead of leaving the number
# of rare positives in each split to chance.
# Y is a one-column DataFrame; pass that column as a 1-D Series to `stratify`.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=42,
    stratify=Y.iloc[:, 0],
)
X_train.shape

(4262955, 13)

#### Train the model

In [10]:
# Convert the one-column label frames into flat 1-D arrays, the shape the
# classifier expects for y.
# np.asarray(...).ravel() accepts both a DataFrame and an ndarray, so re-running
# this cell is harmless (the previous `.values.flatten()` raised AttributeError
# on a second run, because an ndarray has no `.values`).
Y_train = np.asarray(Y_train).ravel()
Y_test = np.asarray(Y_test).ravel()

In [11]:
from sklearn.ensemble import RandomForestClassifier

# Baseline model: 100 shallow trees (depth 2), seeded for reproducibility.
# n_jobs=-1 builds the trees on all available CPU cores; the default is a single
# core, and the previous run of this cell on ~4.3M rows was interrupted before it
# finished. n_jobs only controls parallelism, not the model's hyper-parameters.
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=2,
    random_state=0,
    n_jobs=-1,
)
clf.fit(X_train, Y_train)

KeyboardInterrupt: 

#### Evaluate the model
Once the model has been trained, let's evaluate how accurate it is

#### Accuracy

In [None]:
# Mean accuracy of the baseline forest on the held-out test split.
metrics = clf.score(X=X_test, y=Y_test)
print(metrics)


Not bad, 99.9% accurate! But remember that the dataset was highly unbalanced (we can just guess non-fraudulent on all transactions and get 99.9% accuracy)
Let's look at other metrics

#### Precision-Recall

- **Precision**: High precision means that all transactions we flagged as fraudulent actually were fraudulent (no false positives)
- **Recall**: High recall means that we did not miss any fraudulent transactions (no false negatives)

There is a trade-off between the two; let's see how we can illustrate that in a precision-recall curve

In [None]:
from sklearn.metrics import classification_report

# Display names for the two classes, in label order
# (presumably 0 = non-fraudulent, 1 = fraudulent; verify against isFraud).
target_names = ['non-fraudulent', 'fraudulent']

# Hard class predictions of the baseline model for the held-out rows.
pred = clf.predict(X_test)

# Per-class precision, recall, F1 and support as a text table.
report = classification_report(y_true=Y_test, y_pred=pred, target_names=target_names)
print(report)

Not bad! We can see that for fraudulent transactions, our recall is very high, but precision is low. This means that we create quite a few false positives but that is better than missing fraudulent activity

In [None]:
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

# Predicted probability of the second class (column 1 of predict_proba) for each
# test row. The slice is already 1-D, so no extra flatten is needed.
Y_score = clf.predict_proba(X_test)[:, 1]

# precision and recall each have one more entry than thresholds; the final
# entry has no matching threshold and is dropped when plotting.
precisions, recalls, thresholds = precision_recall_curve(Y_test, Y_score)

# Explicit figure/axes, with labels and a title so the plot is readable on its own.
fig, ax = plt.subplots()
ax.plot(thresholds, precisions[:-1], "b--", label="Precision")
ax.plot(thresholds, recalls[:-1], "g--", label="Recall")
ax.set_xlabel("Decision threshold")
ax.set_ylabel("Score")
ax.set_title("Precision and recall vs. decision threshold")
ax.legend(loc="center left")
ax.set_ylim([0, 1])
ax.set_xlim([0, 1])

plt.show()

#### Iterate
So our model is far from great, let's see if we can improve it. 
One of the issues we are seeing right now is that, despite its accuracy, it is missing a lot of fraudulent transactions.
A struggle we have is that our dataset is highly imbalanced. We can address this by weighting our inputs by class

Let's train our model again, this time boosting our minority class, the fraudulent transactions

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Same forest as the baseline, but class_weight="balanced" re-weights the
# classes inversely to their frequency so the rare fraud class counts for more.
# n_jobs=-1 builds the trees on all available CPU cores; the default is a single
# core, which is very slow on a training set of this size. It only affects
# parallelism, not the model's hyper-parameters.
clf_with_weights = RandomForestClassifier(
    n_estimators=100,
    max_depth=2,
    random_state=0,
    class_weight="balanced",
    n_jobs=-1,
)
clf_with_weights.fit(X_train, Y_train)

In [None]:
# Mean accuracy of the class-weighted forest on the held-out test split.
metrics = clf_with_weights.score(X=X_test, y=Y_test)
print(metrics)

Our accuracy dropped, but let's see if we can outweigh that with better precision and recall

In [None]:
from sklearn.metrics import classification_report

# Display names for the two classes, in label order
# (presumably 0 = non-fraudulent, 1 = fraudulent; verify against isFraud).
target_names = ['non-fraudulent', 'fraudulent']

# Hard class predictions of the class-weighted model for the held-out rows.
pred_with_weights = clf_with_weights.predict(X_test)

# Per-class precision, recall, F1 and support as a text table.
report_with_weights = classification_report(
    y_true=Y_test,
    y_pred=pred_with_weights,
    target_names=target_names,
)
print(report_with_weights)

Wow! We can see that recall in particular greatly increased for the fraudulent cases, meaning that we produce far fewer false negatives

In [None]:
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

# Predicted probability of the second class (column 1 of predict_proba) from the
# class-weighted model. The slice is already 1-D, so no extra flatten is needed.
Y_score = clf_with_weights.predict_proba(X_test)[:, 1]

# precision and recall each have one more entry than thresholds; the final
# entry has no matching threshold and is dropped when plotting.
precisions, recalls, thresholds = precision_recall_curve(Y_test, Y_score)

# Explicit figure/axes, with labels and a title so the plot is readable on its own.
fig, ax = plt.subplots()
ax.plot(thresholds, precisions[:-1], "b--", label="Precision")
ax.plot(thresholds, recalls[:-1], "g--", label="Recall")
ax.set_xlabel("Decision threshold")
ax.set_ylabel("Score")
ax.set_title("Precision and recall vs. decision threshold (class-weighted model)")
ax.legend(loc="center left")
ax.set_ylim([0, 1])
ax.set_xlim([0, 1])

plt.show()