In [1]:
import seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the data
#transactions = pd.read_csv('transactions_modified.csv')
transactions = pd.read_csv('transactions.csv')
transactions.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
0,8,CASH_OUT,158007.12,C424875646,0.0,0.0,C1298177219,474016.32,1618631.97,0
1,236,CASH_OUT,457948.3,C1342616552,0.0,0.0,C1323169990,2720411.37,3178359.67,0
2,37,CASH_IN,153602.99,C900876541,11160428.67,11314031.67,C608741097,3274930.56,3121327.56,0
3,331,CASH_OUT,49555.14,C177696810,10865.0,0.0,C462716348,0.0,49555.14,0
4,250,CASH_OUT,29648.02,C788941490,0.0,0.0,C1971700992,56933.09,86581.1,0


In [2]:
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199999 entries, 0 to 199998
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   step            199999 non-null  int64  
 1   type            199999 non-null  object 
 2   amount          199999 non-null  float64
 3   nameOrig        199999 non-null  object 
 4   oldbalanceOrg   199999 non-null  float64
 5   newbalanceOrig  199999 non-null  float64
 6   nameDest        199999 non-null  object 
 7   oldbalanceDest  199999 non-null  float64
 8   newbalanceDest  199999 non-null  float64
 9   isFraud         199999 non-null  int64  
dtypes: float64(5), int64(2), object(3)
memory usage: 15.3+ MB


In [3]:
# Summary statistics on amount column
transactions['amount'].describe()

count    1.999990e+05
mean     1.802425e+05
std      6.255482e+05
min      0.000000e+00
25%      1.338746e+04
50%      7.426695e+04
75%      2.086376e+05
max      5.204280e+07
Name: amount, dtype: float64

In [4]:
# How many fraudulent transactions?
fraudulent_transactions = transactions[transactions['isFraud'] == 1]
print(len(fraudulent_transactions))

282


In [5]:
transactions['type'].unique()

array(['CASH_OUT', 'CASH_IN', 'PAYMENT', 'TRANSFER', 'DEBIT'],
      dtype=object)

In [6]:
# Create isPayment field
def payment_type_function(x):
    if x == 'PAYMENT':
        return 1
    elif x == 'DEBIT':
        return 1
    else:
        return 0

transactions['isPayment'] = transactions['type'].apply(payment_type_function)

# Create isMovement field
def movement_function(x):
    if x == 'CASH_OUT':
        return 1
    elif x == 'TRANSFER':
        return 1
    else:
        return 0

transactions['isMovement'] = transactions['type'].apply(movement_function)

# Create accountDiff field
transactions['accountDiff'] = abs(transactions['oldbalanceOrg'] - transactions['oldbalanceDest'])

In [7]:
# Create features and label variables

features = np.array(transactions[['amount', 'isPayment', 'isMovement', 'accountDiff']])
label = transactions['isFraud']

In [8]:
# Split dataset

features_train, features_test, label_train, label_test = train_test_split(features, label, test_size= 0.3)

In [9]:
# Normalize the features variables
normalizer = StandardScaler()
features_train = normalizer.fit_transform(features_train)

In [10]:
# Create logistic regression model

model = LogisticRegression()

# Fit the model to the training data
model.fit(features_train, label_train)

# Score the model on the training data
model.score(features_train, label_train)

0.9985642754591104

In [11]:
# Score the model on the test data
model.score(features_test, label_test)

0.7502

In [12]:
# Print the model coefficients
print(model.coef_)
print(model.intercept_)

[[ 0.24471605 -0.73693098  2.25975048 -0.68458033]]
[-9.00975488]


> It appears isPayment and isMovement are important features

In [13]:
# New transaction data
transaction1 = np.array([123456.78, 0.0, 1.0, 54670.1])
transaction2 = np.array([98765.43, 1.0, 0.0, 8524.75])
transaction3 = np.array([543678.31, 1.0, 0.0, 510025.5])

# Create a new transaction
your_transaction = np.array([42350.00, 0.0, 0.0, 34240.00])

# Combine new transactions into a single array

sample_transactions = np.stack((transaction1, transaction2, transaction3, your_transaction))

In [14]:
# Normalize the new transactions
sample_transactions = normalizer.transform(sample_transactions)

In [15]:
# Predict fraud on the new transactions
model.predict(sample_transactions)


array([0, 0, 0, 0])

In [16]:
# Show probabilities on the new transactions
model.predict_proba(sample_transactions)

array([[9.96577545e-01, 3.42245541e-03],
       [9.99992370e-01, 7.62955875e-06],
       [9.99991713e-01, 8.28678871e-06],
       [9.99964890e-01, 3.51104299e-05]])