In [2]:
import kagglehub

# Fetch the most recent published version of the dataset through kagglehub
dataset_handle = "kelvinobiri/credit-card-transactions"
path = kagglehub.dataset_download(dataset_handle)

# Report where kagglehub placed the files
print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/kelvinobiri/credit-card-transactions?dataset_version_number=1...


100%|██████████| 5.96M/5.96M [00:03<00:00, 1.61MB/s]

Extracting files...
Path to dataset files: /Users/mohandsabry/.cache/kagglehub/datasets/kelvinobiri/credit-card-transactions/versions/1





In [3]:
import pandas  as pd
# Read the transactions table from the downloaded dataset directory
transactions_csv = path + "/transactions.csv"
data = pd.read_csv(transactions_csv)

# Column names, dtypes, non-null counts and memory footprint
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199999 entries, 0 to 199998
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   step            199999 non-null  int64  
 1   type            199999 non-null  object 
 2   amount          199999 non-null  float64
 3   nameOrig        199999 non-null  object 
 4   oldbalanceOrg   199999 non-null  float64
 5   newbalanceOrig  199999 non-null  float64
 6   nameDest        199999 non-null  object 
 7   oldbalanceDest  199999 non-null  float64
 8   newbalanceDest  199999 non-null  float64
 9   isFraud         199999 non-null  int64  
dtypes: float64(5), int64(2), object(3)
memory usage: 15.3+ MB


In [4]:
# Count rows that are exact repeats of an earlier row
is_repeated_row = data.duplicated()
is_repeated_row.sum()

np.int64(0)

In [5]:
# Relative frequency of each value of the target column (class balance)
fraud_flags = data['isFraud']
fraud_flags.value_counts(normalize=True)

isFraud
0    0.99859
1    0.00141
Name: proportion, dtype: float64

In [6]:
# Preview the first five rows of the raw table
data.head(n=5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
0,8,CASH_OUT,158007.12,C424875646,0.0,0.0,C1298177219,474016.32,1618631.97,0
1,236,CASH_OUT,457948.3,C1342616552,0.0,0.0,C1323169990,2720411.37,3178359.67,0
2,37,CASH_IN,153602.99,C900876541,11160428.67,11314031.67,C608741097,3274930.56,3121327.56,0
3,331,CASH_OUT,49555.14,C177696810,10865.0,0.0,C462716348,0.0,49555.14,0
4,250,CASH_OUT,29648.02,C788941490,0.0,0.0,C1971700992,56933.09,86581.1,0


In [7]:
# Drop the nameOrig / nameDest columns, which this analysis treats as irrelevant.
# The frame is reassigned rather than mutated with inplace=True, and
# errors='ignore' lets the cell be re-run safely: once the columns are gone a
# second execution is a no-op instead of raising KeyError.
data = data.drop(columns=['nameOrig', 'nameDest'], errors='ignore')

In [11]:
from sklearn.model_selection import train_test_split
# Separate the target column from the predictor columns
y = data['isFraud']
X = data.drop(columns=['isFraud'])

# First cut: hold out 30% of the rows, stratified on the target
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Second cut: divide the held-out rows evenly into validation and test sets
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)

# Report the dimensions of every partition
for caption, partition in (
    ("Train shape:", X_train),
    ("Validation shape:", X_val),
    ("Test shape:", X_test),
):
    print(caption, partition.shape)

Train shape: (139999, 7)
Validation shape: (30000, 7)
Test shape: (30000, 7)


In [12]:
# Print the normalised target distribution of each partition, one after another
distribution_reports = (
    ("Class distribution in Training set:", y_train),
    ("\nClass distribution in Validation set:", y_val),
    ("\nClass distribution in Test set:", y_test),
)
for heading, target in distribution_reports:
    print(heading)
    print(target.value_counts(normalize=True))

Class distribution in Training set:
isFraud
0    0.998593
1    0.001407
Name: proportion, dtype: float64

Class distribution in Validation set:
isFraud
0    0.998567
1    0.001433
Name: proportion, dtype: float64

Class distribution in Test set:
isFraud
0    0.9986
1    0.0014
Name: proportion, dtype: float64


In [13]:
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Columns to be encoded as categories; every other training column is numeric
categorical_features = ['type']
numeric_features = list(
    filter(lambda name: name not in categorical_features, X_train.columns)
)


In [14]:
# One transformer per column family:
#   numeric columns     -> StandardScaler
#   categorical columns -> OneHotEncoder configured with handle_unknown='ignore'
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

In [15]:
# Route each column family to its transformer: (step name, transformer, columns)
column_steps = [
    ('cat', categorical_transformer, categorical_features),
    ('num', numeric_transformer, numeric_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)


In [16]:
# Fit the preprocessing on the training rows only, then apply the already-fitted
# transformer to the validation and test rows
X_train_prep = preprocessor.fit_transform(X_train)
X_val_prep, X_test_prep = (
    preprocessor.transform(partition) for partition in (X_val, X_test)
)

# Report the dimensions after preprocessing
for caption, matrix in (
    ("Train transformed shape:", X_train_prep),
    ("Validation transformed shape:", X_val_prep),
    ("Test transformed shape:", X_test_prep),
):
    print(caption, matrix.shape)

Train transformed shape: (139999, 11)
Validation transformed shape: (30000, 11)
Test transformed shape: (30000, 11)


In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
# Forest hyper-parameters, gathered in one place
forest_settings = {
    'n_estimators': 200,
    'random_state': 42,
    'class_weight': 'balanced_subsample',
    'n_jobs': -1,
}

# Build the classifier and train it on the preprocessed training rows
rf = RandomForestClassifier(**forest_settings)
rf.fit(X_train_prep, y_train)

# Hard-label predictions for the validation rows
y_val_pred = rf.predict(X_val_prep)

# Per-class precision / recall / F1 on the validation set
validation_report = classification_report(y_val, y_val_pred, digits=4)
print("Validation Results with RandomForest:")
print(validation_report)

Validation Results with RandomForest:
              precision    recall  f1-score   support

           0     0.9996    1.0000    0.9998     29957
           1     1.0000    0.7442    0.8533        43

    accuracy                         0.9996     30000
   macro avg     0.9998    0.8721    0.9266     30000
weighted avg     0.9996    0.9996    0.9996     30000



In [18]:
from sklearn.metrics import roc_auc_score

# Take column 1 of the predicted probabilities as the positive-class score
val_class_probabilities = rf.predict_proba(X_val_prep)
y_val_proba = val_class_probabilities[:, 1]

# Area under the ROC curve on the validation set
validation_auc = roc_auc_score(y_val, y_val_proba)
print("ROC AUC:", validation_auc)

ROC AUC: 0.9876664304107204


In [30]:
def generate_random_transaction(random_state=None):
    """Build a one-row DataFrame describing a synthetic transaction.

    With probability 0.2 the transaction is drawn from the "suspicious"
    ranges (amount in [5000, 20000), both origin balances in [0, 1000));
    otherwise it is drawn from the "normal" ranges (amount in [10, 5000),
    both origin balances in [0, 50000)). The step, type and destination
    balances are drawn the same way in both cases. All values are sampled
    independently, so the balances are not guaranteed to be consistent
    with the amount.

    Parameters
    ----------
    random_state : None, int or numpy.random.RandomState, optional
        Source of randomness.
        * None (default): use NumPy's global legacy generator, exactly as
          before, so ``np.random.seed`` still controls the result.
        * int: seed a private ``numpy.random.RandomState`` for a
          reproducible draw that leaves the global state untouched.
        * ``numpy.random.RandomState``: draw from the supplied generator.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns ``step``, ``type``, ``amount``,
        ``oldbalanceOrg``, ``newbalanceOrig``, ``oldbalanceDest`` and
        ``newbalanceDest``.
    """
    # Resolve the randomness source. The np.random module exposes the same
    # rand/uniform/randint/choice functions as a RandomState instance, so it
    # can stand in for one when no explicit generator is requested.
    if random_state is None:
        rng = np.random
    elif isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    # NOTE: the order of the draws below matches the original implementation
    # so that a given seed yields the same transaction as before.

    # 80% normal, 20% suspicious
    if rng.rand() < 0.2:
        # suspicious transaction: larger amount, small origin balances
        amount = rng.uniform(5000, 20000)
        oldbalanceOrg = rng.uniform(0, 1000)
        newbalanceOrig = rng.uniform(0, 1000)
    else:
        # normal transaction
        amount = rng.uniform(10, 5000)
        oldbalanceOrg = rng.uniform(0, 50000)
        newbalanceOrig = rng.uniform(0, 50000)

    random_data = pd.DataFrame({
        # randint's upper bound is exclusive, so step falls in 1..743
        'step': [rng.randint(1, 744)],
        'type': [rng.choice(['PAYMENT', 'TRANSFER', 'CASH_OUT', 'DEBIT', 'CASH_IN'])],
        'amount': [amount],
        'oldbalanceOrg': [oldbalanceOrg],
        'newbalanceOrig': [newbalanceOrig],
        'oldbalanceDest': [rng.uniform(0, 100000)],
        'newbalanceDest': [rng.uniform(0, 100000)]
    })

    return random_data


In [76]:
import numpy as np
# Draw one synthetic transaction and show it
random_data = generate_random_transaction()
print("Random input transaction:")
print(random_data)

# Run it through the fitted preprocessing, then score it with the trained forest
random_data_prep = preprocessor.transform(random_data)
pred_label = rf.predict(random_data_prep)
random_class_probabilities = rf.predict_proba(random_data_prep)
pred_proba = random_class_probabilities[:, 1]

# Report the predicted class and the probability taken from column 1
for caption, value in (
    ("\nPredicted label:", pred_label[0]),
    ("Predicted probability of fraud:", pred_proba[0]),
):
    print(caption, value)

Random input transaction:
   step      type       amount  oldbalanceOrg  newbalanceOrig  oldbalanceDest  \
0   494  TRANSFER  1558.274016   42804.865904    14382.122403    51909.901916   

   newbalanceDest  
0    95032.159572  

Predicted label: 0
Predicted probability of fraud: 0.005
