In [None]:
from pathlib import Path

# --- 1. Core Data Handling & System Tools ---
# 'os' is often used for path construction (e.g., reading your CSV from the 'data' folder)
import os 
import numpy as np
import pandas as pd
import scipy as sp # Renaming to 'sp' for generic scientific functions

# For progress bars (tqdm is available)
from tqdm.notebook import tqdm 

# --- 2. Visualization Libraries ---
# All core visualization tools are available
import matplotlib.pyplot as plt
import seaborn as sns


# --- 3. Machine Learning & Modeling (scikit-learn is available) ---
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Metrics are crucial for classification problems like fraud detection
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve

# Import common models you might use (replace/add as needed)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier 


# For model interpretability
import shap


# Locate the project root relative to where the notebook is being run:
# when any component of the working directory is named "notebooks", step one
# level up from the working directory; otherwise use the working directory.
if "notebooks" in Path.cwd().parts:
    PROJECT_ROOT = Path.cwd().parent
else:
    PROJECT_ROOT = Path.cwd()

# The dataset is read from <PROJECT_ROOT>/data/fraud_detection_dataset.csv
DATA_PATH = PROJECT_ROOT.joinpath("data", "fraud_detection_dataset.csv")

# Load the CSV into a DataFrame; the trailing bare name makes the notebook
# render the frame as the cell output.
fulldf = pd.read_csv(DATA_PATH)
fulldf

In [None]:
# Data Cleaning
#
# A notebook cell renders only its final bare expression, so every check below
# prints its result explicitly; otherwise all but the last one would be
# computed and silently discarded.


# 1 - Check for Missing Values
# Per-column count of null entries.
null_vals = fulldf.isnull().sum()
print("Missing values per column:")
print(null_vals)


# 2 - Date Standardisation
# errors='coerce' turns unparseable values into NaT instead of raising, so the
# null count is taken both before and after parsing: any increase is the
# number of values that failed to parse (as opposed to already being missing).
missing_before_parse = int(fulldf['timestamp'].isna().sum())
fulldf['timestamp'] = pd.to_datetime(fulldf['timestamp'], errors='coerce')
missing_after_parse = int(fulldf['timestamp'].isna().sum())
# A value of 0 after parsing means every timestamp was converted successfully.
print(
    f"Null timestamps after parsing: {missing_after_parse} "
    f"(already missing before parsing: {missing_before_parse})"
)

# 3 - Validation of fraud label
# NOTE(review): the original author recorded 1,000,000 fraud and 1,000,000
# non-fraud rows for this dataset; the counts are printed so that can be
# confirmed on each run.
fraud_counts = fulldf['is_fraud'].value_counts()
print("is_fraud class counts:")
print(fraud_counts)

# Column data types, printed for inspection.
print("Column dtypes:")
print(fulldf.dtypes)

# Duplicate-transaction check using (user_id, timestamp) as a compound key.
# False means no two rows share the same pair. Left as the final bare
# expression so it remains the cell's output value, as before.
has_duplicate_transactions = bool(fulldf[['user_id', 'timestamp']].duplicated().any())
has_duplicate_transactions
