In [1]:
import seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

  import pandas.util.testing as tm


#### Load Data
Data source: https://www.kaggle.com/datasets/ealaxi/paysim1?resource=download

In [4]:
df = pd.read_csv('PS_20174392719_1491204439457_log.csv')
print(df.head(), df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB
   step      type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   PAYMENT   9839.64  C1231006815       170136.0       160296.36   
1     1   PAYMENT   1864.28  C1666544295        21249.0        19384.72   
2     1  TRANSFER    181.00  C1305486145          181.0            0.00   
3     1  CASH_OUT    181.00   C840083671          181.0            0.00   
4     1   PAYMENT  11668.14  C2048537720        41554.0        29885.86

In [5]:
TotalFraud = df.isFraud.sum()
print(TotalFraud)

8213


In [47]:
# The data is imbalanced

#### EDA

The amount section looks important. 

In [7]:
df.amount.describe()

count    6.362620e+06
mean     1.798619e+05
std      6.038582e+05
min      0.000000e+00
25%      1.338957e+04
50%      7.487194e+04
75%      2.087215e+05
max      9.244552e+07
Name: amount, dtype: float64

In [12]:
df.type.unique()

array(['PAYMENT', 'TRANSFER', 'CASH_OUT', 'DEBIT', 'CASH_IN'],
      dtype=object)

In [21]:
# Type column is object type with two tyoes of transaction
df['isPayment'] = 0
df['isPayment'][df['type'].isin(['PAYMENT','DEBIT'])] = 1

df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,isPayment
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0,1
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0,1
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0,1


In [23]:
df['isMovement'] = 0
df['isMovement'][df['type'].isin(['CASH_OUT', 'TRANSFER'])] = 1
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,isPayment,isMovement
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0,1,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0,1,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0,0,1
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0,0,1
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0,1,0


In [24]:
# another key factor to investigate would be the difference in value between the origin and destination account.

df['accountDiff'] = np.abs(df['oldbalanceOrg'] - df['oldbalanceDest'])
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,isPayment,isMovement,accountDiff
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0,1,0,170136.0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0,1,0,21249.0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0,0,1,181.0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0,0,1,21001.0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0,1,0,41554.0


#### Training and testing data

In [33]:
feat_columns = ['amount','isPayment','isMovement','accountDiff']
features = df[feat_columns]
label = df.isFraud

# Split the data

X_train, X_test, y_train, y_test = train_test_split(features, label, test_size = 0.3)

#### Normalize the feature variables

In [35]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Logistic Regression

In [36]:
model = LogisticRegression()
model.fit(X_train, y_train)

LogisticRegression()

In [40]:
model.score(X_train, y_train)

0.9986880965927334

In [41]:
model.score(X_test, y_test)

0.9986944581529831

In [42]:
print(model.coef_, model.intercept_)

[[ 0.22059827 -1.03797328  3.61568183 -0.65991015]] [-10.84770134]


feat_columns = ['amount','isPayment','isMovement','accountDiff']

amount seems to be least significant and isMovement seems to be most significant contributor

In [44]:
# Check on new transactions:
# New transaction data
transaction1 = np.array([123456.78, 0.0, 1.0, 54670.1])
transaction2 = np.array([98765.43, 1.0, 0.0, 8524.75])
transaction3 = np.array([543678.31, 1.0, 0.0, 510025.5])

sample_transactions = np.stack((transaction1,transaction2,transaction3))

sample_transactions = scaler.transform(sample_transactions)

In [45]:
model.predict(sample_transactions)

array([0, 1, 0])

In [46]:
model.predict_proba(sample_transactions)

array([[1., 0.],
       [0., 1.],
       [1., 0.]])

## Modified data

In [76]:
# Modified_data

df = pd.read_csv('transaction_modified.csv')

In [77]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isPayment,isMovement,accountDiff
0,206,CASH_OUT,62927.08,C473782114,0.0,0.0,C2096898696,649420.67,712347.75,0,0,1,649420.67
1,380,PAYMENT,32851.57,C1915112886,0.0,0.0,M916879292,0.0,0.0,0,1,0,0.0
2,570,CASH_OUT,1131750.38,C1396198422,1131750.38,0.0,C1612235515,313070.53,1444820.92,1,0,1,818679.85
3,184,CASH_OUT,60519.74,C982551468,60519.74,0.0,C1378644910,54295.32,182654.5,1,0,1,6224.42
4,162,CASH_IN,46716.01,C1759889425,7668050.6,7714766.61,C2059152908,2125468.75,2078752.75,0,0,0,5542581.85


In [78]:
TotalFraud = df.isFraud.sum()
print(TotalFraud)

282


In [81]:
df.shape

(1000, 13)

In [82]:
# Type column is object type with two tyoes of transaction
df['isPayment'] = 0
df['isPayment'][df['type'].isin(['PAYMENT','DEBIT'])] = 1

df['isMovement'] = 0
df['isMovement'][df['type'].isin(['CASH_OUT', 'TRANSFER'])] = 1

df['accountDiff'] = np.abs(df['oldbalanceOrg'] - df['oldbalanceDest'])

df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isPayment,isMovement,accountDiff
0,206,CASH_OUT,62927.08,C473782114,0.0,0.0,C2096898696,649420.67,712347.75,0,0,1,649420.67
1,380,PAYMENT,32851.57,C1915112886,0.0,0.0,M916879292,0.0,0.0,0,1,0,0.0
2,570,CASH_OUT,1131750.38,C1396198422,1131750.38,0.0,C1612235515,313070.53,1444820.92,1,0,1,818679.85
3,184,CASH_OUT,60519.74,C982551468,60519.74,0.0,C1378644910,54295.32,182654.5,1,0,1,6224.42
4,162,CASH_IN,46716.01,C1759889425,7668050.6,7714766.61,C2059152908,2125468.75,2078752.75,0,0,0,5542581.85


In [83]:
feat_columns = ['amount','isPayment','isMovement','accountDiff']
features = df[feat_columns]
label = df.isFraud

# Split the data

X_train, X_test, y_train, y_test = train_test_split(features, label, test_size = 0.3)

In [84]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [85]:
model = LogisticRegression()
model.fit(X_train, y_train)

LogisticRegression()

In [86]:
model.score(X_train, y_train)

0.8471428571428572

In [87]:
model.score(X_test, y_test)

0.8233333333333334

In [88]:
# Check on new transactions:
# New transaction data
transaction1 = np.array([123456.78, 0.0, 1.0, 54670.1])
transaction2 = np.array([98765.43, 1.0, 0.0, 8524.75])
transaction3 = np.array([543678.31, 1.0, 0.0, 510025.5])

sample_transactions = np.stack((transaction1,transaction2,transaction3))

sample_transactions = scaler.transform(sample_transactions)

  "X does not have valid feature names, but"


In [89]:
model.predict(sample_transactions)

array([0, 0, 0])

In [90]:
model.predict_proba(sample_transactions)

array([[0.61267309, 0.38732691],
       [0.99809491, 0.00190509],
       [0.99619443, 0.00380557]])