# Logistic regression

In [1]:
import seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the data
df = pd.read_csv("utils/transactions.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199999 entries, 0 to 199998
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   step            199999 non-null  int64  
 1   type            199999 non-null  object 
 2   amount          199999 non-null  float64
 3   nameOrig        199999 non-null  object 
 4   oldbalanceOrg   199999 non-null  float64
 5   newbalanceOrig  199999 non-null  float64
 6   nameDest        199999 non-null  object 
 7   oldbalanceDest  199999 non-null  float64
 8   newbalanceDest  199999 non-null  float64
 9   isFraud         199999 non-null  int64  
dtypes: float64(5), int64(2), object(3)
memory usage: 15.3+ MB


In [3]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
0,8,CASH_OUT,158007.12,C424875646,0.0,0.0,C1298177219,474016.32,1618631.97,0
1,236,CASH_OUT,457948.3,C1342616552,0.0,0.0,C1323169990,2720411.37,3178359.67,0
2,37,CASH_IN,153602.99,C900876541,11160428.67,11314031.67,C608741097,3274930.56,3121327.56,0
3,331,CASH_OUT,49555.14,C177696810,10865.0,0.0,C462716348,0.0,49555.14,0
4,250,CASH_OUT,29648.02,C788941490,0.0,0.0,C1971700992,56933.09,86581.1,0


In [4]:
# Summary statistics on amount column
df["amount"].describe()

count    1.999990e+05
mean     1.802425e+05
std      6.255482e+05
min      0.000000e+00
25%      1.338746e+04
50%      7.426695e+04
75%      2.086376e+05
max      5.204280e+07
Name: amount, dtype: float64

In [5]:
# Create isPayment field
df["isPayment"] = [1 if ((x == "PAYMENT") | (x == "DEBIT")) else 0 for x in df["type"]]

# Create isMovement field
df["isMovement"] = [1 if ((x == "CASH_OUT") | (x == "TRANSFER")) else 0 for x in df["type"]]

# Create accountDiff field
df["accountDiff"] = abs(df["oldbalanceDest"] - df["oldbalanceOrg"])

In [6]:
# Create features and label variables
features = df[["amount", "isPayment", "isMovement", "accountDiff"]].values
label = df["isFraud"].values

# Split dataset
features_train, features_test, label_train, label_test = \
  train_test_split(features, label, test_size = 0.3)

# Normalize the features variables
scaler = StandardScaler()
features_train = scaler.fit_transform(features_train)
features_test = scaler.transform(features_test)

In [7]:
# Fit the model to the training data
logr = LogisticRegression()
logr.fit(features_train, label_train)

# Score the model on the training data
print(logr.score(features_train, label_train))

# Score the model on the test data
print(logr.score(features_test, label_test))

# Print the model coefficients
print(features_train)
print(logr.coef_)
  # Most important feature: amount
  # Least important feature: isMovement

0.9986357045407467
0.99835
[[-0.2844173   1.37248801 -0.87714558 -0.39443575]
 [-0.28579407  1.37248801 -0.87714558 -0.32589488]
 [ 4.48260206 -0.72860382  1.1400616   0.32394145]
 ...
 [-0.05480161 -0.72860382  1.1400616  -0.30438467]
 [-0.26625183  1.37248801 -0.87714558 -0.39443575]
 [-0.27965222  1.37248801 -0.87714558 -0.39443575]]
[[ 0.21202324 -0.72595058  2.246339   -0.53391072]]


In [8]:
# New transaction data
transaction1 = np.array([123456.78, 0.0, 1.0, 54670.1])
transaction2 = np.array([98765.43, 1.0, 0.0, 8524.75])
transaction3 = np.array([543678.31, 1.0, 0.0, 510025.5])

# Create a new transaction
your_transaction = np.array([1000.0, 0.0, 1.0, 1000.0])

# Combine new transactions into a single array
sample_transactions = np.stack((transaction1, transaction2, transaction3, your_transaction))

# Normalize the new transactions
sample_transactions = scaler.transform(sample_transactions)

# Predict fraud on the new transactions
print(logr.predict(sample_transactions))

# Show probabilities on the new transactions
print(logr.predict_proba(sample_transactions))

[0 0 0 0]
[[9.96835723e-01 3.16427680e-03]
 [9.99992584e-01 7.41586897e-06]
 [9.99991873e-01 8.12719901e-06]
 [9.96949252e-01 3.05074769e-03]]
