In [3]:
import numpy as np
import pandas as pd
# Read the transaction records from the CSV file and preview the first rows.
csv_path = 'card.csv'
data = pd.read_csv(csv_path)
print(data.head(n=5))

   step      type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   PAYMENT   9839.64  C1231006815       170136.0       160296.36   
1     1   PAYMENT   1864.28  C1666544295        21249.0        19384.72   
2     1  TRANSFER    181.00  C1305486145          181.0            0.00   
3     1  CASH_OUT    181.00   C840083671          181.0            0.00   
4     1   PAYMENT  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0      0.0             0.0  
1  M2044282225             0.0             0.0      0.0             0.0  
2   C553264065             0.0             0.0      1.0             0.0  
3    C38997010         21182.0             0.0      1.0             0.0  
4  M1230701703             0.0             0.0      0.0             0.0  


In [4]:
# Count the missing entries in every column.
missing_per_column = data.isna().sum()
print(missing_per_column)

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     1
newbalanceOrig    1
nameDest          1
oldbalanceDest    1
newbalanceDest    1
isFraud           1
isFlaggedFraud    1
dtype: int64


In [5]:
# Number of rows for each transaction type, most frequent first.
type_column = data['type']
print(type_column.value_counts())

type
CASH_OUT    430421
PAYMENT     410077
CASH_IN     263977
TRANSFER    100524
DEBIT         8285
Name: count, dtype: int64


In [11]:
# Summarise missing data for the affected columns only: first the absolute
# number of nulls, then the share of rows (in percent, two decimals).
null_counts = data.isna().sum()
has_nulls = null_counts > 0
null_percent = (data.isna().mean() * 100).round(2)

print("Null values per column:")
print(null_counts[has_nulls])
print("\nPercentage of null values:")
print(null_percent[has_nulls])

Null values per column:
oldbalanceOrg     1
newbalanceOrig    1
nameDest          1
oldbalanceDest    1
newbalanceDest    1
isFraud           1
isFlaggedFraud    1
dtype: int64

Percentage of null values:
oldbalanceOrg     0.0
newbalanceOrig    0.0
nameDest          0.0
oldbalanceDest    0.0
newbalanceDest    0.0
isFraud           0.0
isFlaggedFraud    0.0
dtype: float64


In [12]:
# Replace missing balances with 0 in the four balance columns.
numeric_cols = [
    'oldbalanceOrg',
    'newbalanceOrig',
    'oldbalanceDest',
    'newbalanceDest',
]
balances_filled = data[numeric_cols].fillna(value=0)
data[numeric_cols] = balances_filled

In [13]:
# Missing destination names get the placeholder 'UNKNOWN'.
dest_names = data['nameDest']
data['nameDest'] = dest_names.fillna(value='UNKNOWN')

In [14]:
# Missing fraud flags are set to 0.
# NOTE(review): this assigns a 0 label to any row whose isFraud value was missing —
# confirm that imputing the target is intended rather than dropping such rows.
fraud_cols = ['isFraud', 'isFlaggedFraud']
flags_filled = data[fraud_cols].fillna(value=0)
data[fraud_cols] = flags_filled

In [15]:
# Re-check the per-column null counts after the fills above.
remaining_nulls = data.isna().sum()
print(remaining_nulls)

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64


In [19]:
# Number of rows for each transaction type, most frequent first.
type_column = data['type']
print(type_column.value_counts())

type
CASH_OUT    430293
PAYMENT     409971
CASH_IN     263903
TRANSFER    100492
DEBIT         8280
Name: count, dtype: int64


In [20]:
import plotly.express as px

# Count rows per transaction type. The counts are stored under their own name
# instead of `type`, which would shadow the Python builtin for the rest of the
# kernel session.
type_counts = data['type'].value_counts()
transactions = type_counts.index   # slice labels
quantity = type_counts.values      # slice sizes

# Donut chart (hole=0.5) of the share of each transaction type.
# NOTE(review): `values` and `names` are passed as separate arrays, so the `data`
# frame argument does not appear to contribute to the figure — confirm and drop it.
fig = px.pie(data, values=quantity, names=transactions, hole=0.5, title="Distribution of Transaction Type")
fig.show()

In [23]:
# Same fill rules as the earlier cleaning cells, gathered in one place:
# balances and fraud flags default to 0, destination names to 'UNKNOWN'.
numeric_cols = ['oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
fraud_flag_cols = ['isFraud', 'isFlaggedFraud']
zero_fill_cols = numeric_cols + fraud_flag_cols

data[zero_fill_cols] = data[zero_fill_cols].fillna(0)
data['nameDest'] = data['nameDest'].fillna('UNKNOWN')

In [24]:
# One-hot encode the transaction type into type_* indicator columns.
# get_dummies removes the source column, so guard on its presence: re-running
# this cell after the encoding has already happened would otherwise raise a
# KeyError for the missing 'type' column.
if 'type' in data.columns:
    data = pd.get_dummies(data, columns=['type'])

In [25]:
# The two name columns hold string identifiers, which cannot be correlated;
# leave them out of the frame used for the correlation matrix.
identifier_cols = ['nameOrig', 'nameDest']
data_for_corr = data.drop(columns=identifier_cols)

In [27]:
# Pairwise correlation of all remaining columns, then the isFraud column of that
# matrix ranked from the most positive to the most negative coefficient.
# A NaN entry means no coefficient could be computed for that column
# (presumably because the column is constant — verify).
correlation = data_for_corr.corr()
fraud_correlation = correlation['isFraud'].sort_values(ascending=False)

print("\nCorrelation with isFraud:")
print(fraud_correlation)


Correlation with isFraud:
isFraud           1.000000
amount            0.138712
type_TRANSFER     0.053084
step              0.022643
type_CASH_OUT     0.011108
oldbalanceOrg     0.004682
newbalanceDest   -0.000169
type_DEBIT       -0.002941
oldbalanceDest   -0.007979
newbalanceOrig   -0.010126
type_CASH_IN     -0.018704
type_PAYMENT     -0.025344
isFlaggedFraud         NaN
Name: isFraud, dtype: float64


In [29]:
# Collapse the one-hot type_* columns back into a single numeric code per row
# (stored in 'transaction_type'). Indicator columns that are absent are skipped.
type_mapping = {"CASH_OUT": 1, 'PAYMENT': 2, "CASH_IN": 3, "TRANSFER": 4, "DEBIT": 5}
for type_name, type_value in type_mapping.items():
    column_name = f'type_{type_name}'
    if column_name in data.columns:
        data.loc[data[column_name] == 1, 'transaction_type'] = type_value

# Turn the 0/1 fraud flag into readable class labels.
# Series.map converts every value that is not a key of the mapping into NaN, so
# running this cell a second time (when the column already holds the string
# labels) would erase the whole target column. Only relabel while the column is
# still numeric, which makes the cell safe to re-run.
fraud_labels = {0: 'No Fraud', 1: 'Fraud'}
if pd.api.types.is_numeric_dtype(data['isFraud']):
    data['isFraud'] = data['isFraud'].map(fraud_labels)

In [30]:
# Preview the first rows after encoding and relabelling.
print(data.head(n=5))

   step    amount     nameOrig  oldbalanceOrg  newbalanceOrig     nameDest  \
0     1   9839.64  C1231006815       170136.0       160296.36  M1979787155   
1     1   1864.28  C1666544295        21249.0        19384.72  M2044282225   
2     1    181.00  C1305486145          181.0            0.00   C553264065   
3     1    181.00   C840083671          181.0            0.00    C38997010   
4     1  11668.14  C2048537720        41554.0        29885.86  M1230701703   

   oldbalanceDest  newbalanceDest   isFraud  isFlaggedFraud  type_CASH_IN  \
0             0.0             0.0  No Fraud             0.0         False   
1             0.0             0.0  No Fraud             0.0         False   
2             0.0             0.0     Fraud             0.0         False   
3         21182.0             0.0     Fraud             0.0         False   
4             0.0             0.0  No Fraud             0.0         False   

   type_CASH_OUT  type_DEBIT  type_PAYMENT  type_TRANSFER  transacti

In [33]:
from sklearn.model_selection import train_test_split

# Model inputs, in this fixed order: the five one-hot transaction-type indicators
# followed by amount, oldbalanceOrg and newbalanceOrig. Anything passed to
# model.predict later must use the same column order.
feature_cols = ['type_CASH_OUT', 'type_PAYMENT', 'type_CASH_IN', 'type_TRANSFER', 'type_DEBIT',
                'amount', 'oldbalanceOrg', 'newbalanceOrig']

# The indicator columns are boolean while the others are floats; converting such a
# mixed frame without a dtype produces a generic object array. Ask for floats
# explicitly so x is a compact numeric matrix (True/False become 1.0/0.0).
x = np.array(data[feature_cols], dtype=float)

# Target: the isFraud label of every row.
y = np.array(data['isFraud'])

In [34]:
from sklearn.tree import DecisionTreeClassifier

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.10, random_state=42)

# Seed the tree as well: the classifier shuffles candidate features at each split,
# so without random_state two runs on identical data can produce different trees
# and a different score.
model = DecisionTreeClassifier(random_state=42)
model.fit(x_train, y_train)

# score() is the mean accuracy on the held-out rows.
# NOTE(review): if fraud cases are rare in this dataset, accuracy alone can look
# high for a weak model — consider also reporting precision/recall.
print(model.score(x_test, y_test))

0.9992992233746104


In [37]:
# One hand-written transaction to classify. The entries are listed in the same
# order as the columns the model was trained on.
sample_transaction = {
    'type_CASH_OUT': 0,
    'type_PAYMENT': 0,
    'type_CASH_IN': 0,
    'type_TRANSFER': 1,
    'type_DEBIT': 0,
    'amount': 9000.60,
    'oldbalanceOrg': 9000.60,
    'newbalanceOrig': 0.0,
}
# predict expects a 2-D array: one row per transaction.
features = np.array([list(sample_transaction.values())])
print(model.predict(features))

['Fraud']
