In [20]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [21]:
# Step 1
# Load the smaller data set
# transactions = pd.read_csv(r"C:\Users\becas\OneDrive\Documents\credit-card-fraud-prediction\transactions_modified_credit_card_fraud_prediction.csv")
# Load the larger data set
transactions = pd.read_csv(r"C:\Users\becas\OneDrive\Documents\credit-card-fraud-prediction\Large_transactions_modified_credit_card_fraud_prediction.csv")

print('Number of fraudulent transactions:', transactions['isFraud'].sum())

# Preview the first 5 rows
# print("First five rows of the dataset:")
# print(transactions.head())

# Get info about the dataset
print("\nDataset info:")
print(transactions.info())

Number of fraudulent transactions: 282

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199999 entries, 0 to 199998
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   step            199999 non-null  int64  
 1   type            199999 non-null  object 
 2   amount          199999 non-null  float64
 3   nameOrig        199999 non-null  object 
 4   oldbalanceOrg   199999 non-null  float64
 5   newbalanceOrig  199999 non-null  float64
 6   nameDest        199999 non-null  object 
 7   oldbalanceDest  199999 non-null  float64
 8   newbalanceDest  199999 non-null  float64
 9   isFraud         199999 non-null  int64  
dtypes: float64(5), int64(2), object(3)
memory usage: 15.3+ MB
None


In [22]:
# Step 2
print(transactions['amount'].describe())

count    1.999990e+05
mean     1.802425e+05
std      6.255482e+05
min      0.000000e+00
25%      1.338746e+04
50%      7.426695e+04
75%      2.086376e+05
max      5.204280e+07
Name: amount, dtype: float64


In [23]:
# Step 3
# Start by creating the column and setting all values to 0
transactions['isPayment'] = 0

# Then set it to 1 where type is PAYMENT or DEBIT
transactions.loc[transactions['type'] == 'PAYMENT', 'isPayment'] = 1
transactions.loc[transactions['type'] == 'DEBIT', 'isPayment'] = 1


In [24]:
# Step 4
transactions['isMovement'] = 0
transactions.loc[transactions['type'] == 'CASH_OUT', 'isMovement'] = 1
transactions.loc[transactions['type'] == 'TRANSFER', 'isMovement'] = 1

In [25]:
# Step 5 
transactions['accountDiff'] = abs(transactions['oldbalanceOrg'] - transactions['oldbalanceDest'])

In [26]:
# Step 6
features = transactions[['amount','isPayment','isMovement', 'accountDiff']]
label = transactions['isFraud']

In [27]:
# Step 7
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size = 0.3, random_state = 42)

In [28]:
# Step 8
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [29]:
# Step 9
model = LogisticRegression()
model.fit(X_train_scaled, y_train)
print("Model training complete!")

Model training complete!


In [30]:
# Step 10
train_score = model.score(X_train_scaled, y_train)
print("How well does the model do on the training data?")
print("Training accuracy:", round(train_score * 100, 2), "%")

How well does the model do on the training data?
Training accuracy: 99.86 %


In [31]:
# Step 11
test_score = model.score(X_test_scaled, y_test)
print("Test accuracy:", round(test_score * 100, 2), "%")

Test accuracy: 99.86 %


In [32]:
# Step 12
print("Feature coefficients:", model.coef_)

Feature coefficients: [[ 0.27085886 -0.86017198  2.1629665  -0.93256293]]


In [33]:
# Step 13 
# New transaction data
transaction1 = np.array([123456.78, 0.0, 1.0, 54670.1])
transaction2 = np.array([98765.43, 1.0, 0.0, 8524.75])
transaction3 = np.array([543678.31, 1.0, 0.0, 510025.5])

# Create your own transaction — feel free to change the values!
your_transaction = np.array([25000.00, 0.0, 1.0, 18000.0])

print("Sample transactions ready for prediction.")

Sample transactions ready for prediction.


In [34]:
# Step 14
sample_transactions = np.array([
    transaction1,
    transaction2,
    transaction3,
    your_transaction
])
# Optional: check the shape
print("Shape of sample_transactions:", sample_transactions.shape)

Shape of sample_transactions: (4, 4)


In [35]:
# Step 15
sample_transactions = scaler.transform(sample_transactions)



In [36]:
# Step 16
# Predict if each transaction is fraudulent (1) or not (0)
predictions = model.predict(sample_transactions)
print("Predicted fraud flags:", predictions)

# Get fraud probabilities for each transaction
probabilities = model.predict_proba(sample_transactions)
print("Fraud probabilities (per transaction):")
print(probabilities)

Predicted fraud flags: [0 0 0 0]
Fraud probabilities (per transaction):
[[9.96528861e-01 3.47113881e-03]
 [9.99992733e-01 7.26678035e-06]
 [9.99992107e-01 7.89307951e-06]
 [9.96650395e-01 3.34960467e-03]]


In [37]:
# Step 17 - return to step one to load the larger data set 