In [1]:
import seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the data
transactions = pd.read_csv('transactions_modified.csv')
print(transactions.head())

   step      type      amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0   206  CASH_OUT    62927.08   C473782114           0.00            0.00   
1   380   PAYMENT    32851.57  C1915112886           0.00            0.00   
2   570  CASH_OUT  1131750.38  C1396198422     1131750.38            0.00   
3   184  CASH_OUT    60519.74   C982551468       60519.74            0.00   
4   162   CASH_IN    46716.01  C1759889425     7668050.60      7714766.61   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isPayment  \
0  C2096898696       649420.67       712347.75        0          0   
1   M916879292            0.00            0.00        0          1   
2  C1612235515       313070.53      1444820.92        1          0   
3  C1378644910        54295.32       182654.50        1          0   
4  C2059152908      2125468.75      2078752.75        0          0   

   isMovement  accountDiff  
0           1    649420.67  
1           0         0.00  
2           1    818679.85  


In [5]:
print(transactions.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   step            1000 non-null   int64  
 1   type            1000 non-null   object 
 2   amount          1000 non-null   float64
 3   nameOrig        1000 non-null   object 
 4   oldbalanceOrg   1000 non-null   float64
 5   newbalanceOrig  1000 non-null   float64
 6   nameDest        1000 non-null   object 
 7   oldbalanceDest  1000 non-null   float64
 8   newbalanceDest  1000 non-null   float64
 9   isFraud         1000 non-null   int64  
 10  isPayment       1000 non-null   int64  
 11  isMovement      1000 non-null   int64  
 12  accountDiff     1000 non-null   float64
dtypes: float64(6), int64(4), object(3)
memory usage: 101.7+ KB
None


In [6]:
print(transactions.describe())

              step        amount  oldbalanceOrg  newbalanceOrig  \
count  1000.000000  1.000000e+03   1.000000e+03    1.000000e+03   
mean    280.664000  5.373080e+05   1.049284e+06    6.376146e+05   
std     167.174593  1.423692e+06   3.226500e+06    2.717351e+06   
min       1.000000  0.000000e+00   0.000000e+00    0.000000e+00   
25%     164.000000  2.933705e+04   1.287500e+02    0.000000e+00   
50%     261.000000  1.265305e+05   4.371400e+04    0.000000e+00   
75%     373.250000  3.010378e+05   4.080914e+05    3.795663e+04   
max     741.000000  1.000000e+07   5.039905e+07    4.039905e+07   

       oldbalanceDest  newbalanceDest      isFraud    isPayment   isMovement  \
count    1.000000e+03    1.000000e+03  1000.000000  1000.000000  1000.000000   
mean     1.028848e+06    1.302326e+06     0.282000     0.220000     0.605000   
std      2.678541e+06    3.038042e+06     0.450198     0.414454     0.489095   
min      0.000000e+00    0.000000e+00     0.000000     0.000000     0.000000

In [7]:
# How many fraudulent transactions?
display(np.sum(transactions.isFraud))

282


In [10]:
# Summary statistics on amount column
display(transactions.amount.describe())

count    1.000000e+03
mean     5.373080e+05
std      1.423692e+06
min      0.000000e+00
25%      2.933705e+04
50%      1.265305e+05
75%      3.010378e+05
max      1.000000e+07
Name: amount, dtype: float64

type
CASH_OUT    400
PAYMENT     214
TRANSFER    205
CASH_IN     175
DEBIT         6
Name: count, dtype: int64

In [11]:
# Create isPayment field
display(transactions.type.value_counts())
transactions['isPayment'] = transactions['type'].apply(lambda x: 1 if x in ['PAYMENT', 'DEBIT'] else 0)
display(transactions.isPayment.value_counts())

type
CASH_OUT    400
PAYMENT     214
TRANSFER    205
CASH_IN     175
DEBIT         6
Name: count, dtype: int64

isPayment
0    780
1    220
Name: count, dtype: int64

In [12]:
# Create isMovement field
transactions['isMovement'] = transactions['type'].apply(lambda x: 1 if x in ['CASH_OUT', 'TRANSFER'] else 0)
display(transactions.isMovement.value_counts())

isMovement
1    605
0    395
Name: count, dtype: int64

In [17]:
# Create accountDiff field
transactions['accountDiff'] = transactions['oldbalanceOrg'] - transactions['oldbalanceDest']

# Code to test that accounttDiff worked correctly
test_list = ['accountDiff', 'oldbalanceOrg', 'oldbalanceDest']

for item in test_list:
    display(transactions[item][2])



818679.8499999999

1131750.38

313070.53

In [18]:
# Create features and label variables
features = transactions[['amount', 'isPayment', 'isMovement', 'accountDiff']]

label = transactions['isFraud']

In [23]:
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.3)

# Normalize the features variables
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit the model to the training data
model = LogisticRegression()
model.fit(X_train, y_train)

# Score the model on the training data
display(model.score(X_train, y_train))

# Score the model on the test data
display(model.score(X_test, y_test))

# Print the model coefficients
display(model.coef_)

0.8414285714285714

0.8733333333333333

array([[ 1.91570153, -0.26730454,  2.90621003,  1.03281935]])

In [28]:
# New transaction data
transaction1 = np.array([123456.78, 0.0, 1.0, 54670.1])
transaction2 = np.array([98765.43, 1.0, 0.0, 8524.75])
transaction3 = np.array([543678.31, 1.0, 0.0, 510025.5])

# Create a new transaction
your_transaction = np.array([14585.24, 1.0, 1.0, 768547.5])

# Combine new transactions into a single array
sample_transactions = np.array([transaction1, transaction2, transaction3, your_transaction])
display(sample_transactions)

# Normalize the new transactions
sample_transactions = scaler.transform(sample_transactions)

# Predict fraud on the new transactions
display(model.predict(sample_transactions))

# Show probabilities on the new transactions
display(model.predict_proba(sample_transactions))

array([[1.2345678e+05, 0.0000000e+00, 1.0000000e+00, 5.4670100e+04],
       [9.8765430e+04, 1.0000000e+00, 0.0000000e+00, 8.5247500e+03],
       [5.4367831e+05, 1.0000000e+00, 0.0000000e+00, 5.1002550e+05],
       [1.4585240e+04, 1.0000000e+00, 1.0000000e+00, 7.6854750e+05]])



array([0, 0, 0, 0], dtype=int64)

array([[6.09140310e-01, 3.90859690e-01],
       [9.99177941e-01, 8.22059040e-04],
       [9.98277865e-01, 1.72213474e-03],
       [7.46698024e-01, 2.53301976e-01]])

In [33]:
display(np.sum(model.predict(X_test)))
display(np.sum(model.predict(X_train)))

51

112