In [6]:
import os

def is_skipped_path(path: str, skip_dirs: list[str]) -> bool:
    """Return True when ``path`` is, or lies beneath, one of ``skip_dirs``.

    Matching is done on whole path components rather than on raw substrings,
    so excluding ``<root>/.git`` does not also hide ``<root>/.github`` or
    ``<root>/.gitignore``.

    Args:
        path: File or directory path to test.
        skip_dirs: Entries to exclude. An entry may be a full directory path,
            a relative fragment such as ``src/__pycache__``, or a bare folder
            name such as ``__pycache__``. It matches when its components
            appear as a contiguous run inside ``path``. Empty entries are
            ignored.

    Returns:
        True if any entry matches ``path``, otherwise False.
    """
    def _components(raw: str) -> list[str]:
        # normpath unifies separators and drops trailing ones; normcase folds
        # case on platforms whose paths are case-insensitive.
        return os.path.normcase(os.path.normpath(raw)).split(os.sep)

    path_parts = _components(path)
    for skip in skip_dirs:
        if not skip:
            # An empty string is a substring of every path; treat it as "no rule"
            # instead of silently excluding everything.
            continue
        skip_parts = _components(skip)
        width = len(skip_parts)
        # Slide a window of the entry's length over the path's components.
        if any(
            path_parts[start:start + width] == skip_parts
            for start in range(len(path_parts) - width + 1)
        ):
            return True
    return False

def collect_files(base_dir: str, skip_dirs: list[str]) -> list[str]:
    """Walk ``base_dir`` top-down and return the paths of all files that are kept.

    Sub-directories for which ``is_skipped_path`` returns True are removed
    from the walk before ``os.walk`` descends into them. Every file path is
    additionally checked with ``is_skipped_path`` before being recorded.

    Args:
        base_dir: Directory at which the walk starts.
        skip_dirs: Exclusion entries, passed through to ``is_skipped_path``.

    Returns:
        The kept file paths (each ``os.path.join(root, name)``), in the order
        ``os.walk`` yields them.
    """
    kept: list[str] = []

    for current, subdirs, names in os.walk(base_dir, topdown=True):
        # Slice-assign so os.walk sees the pruned list and skips those subtrees.
        subdirs[:] = [
            name
            for name in subdirs
            if not is_skipped_path(os.path.join(current, name), skip_dirs)
        ]

        candidates = (os.path.join(current, name) for name in names)
        kept.extend(
            candidate
            for candidate in candidates
            if not is_skipped_path(candidate, skip_dirs)
        )

    return kept

# Project root: one level above the notebook's current working directory
base_dir = os.path.abspath(os.pardir)

# Folders excluded from the listing, expressed as paths under base_dir
skip_dirs = [
    os.path.join(base_dir, *parts)
    for parts in (('src', '__pycache__'), ('.git',))
]

# Gather every file under base_dir that is outside the excluded folders
all_files = collect_files(base_dir, skip_dirs)

# Emit one path per line
for f in all_files:
    print(f)


f:\.vscode\Projects\p3\.dockerignore
f:\.vscode\Projects\p3\.env
f:\.vscode\Projects\p3\Dockerfile
f:\.vscode\Projects\p3\LICENSE
f:\.vscode\Projects\p3\main.py
f:\.vscode\Projects\p3\README.md
f:\.vscode\Projects\p3\requirements.txt
f:\.vscode\Projects\p3\setup.py
f:\.vscode\Projects\p3\template.py
f:\.vscode\Projects\p3\data\creditcard.csv
f:\.vscode\Projects\p3\data\dataset-metadata.json
f:\.vscode\Projects\p3\Fraud_Detection.egg-info\dependency_links.txt
f:\.vscode\Projects\p3\Fraud_Detection.egg-info\PKG-INFO
f:\.vscode\Projects\p3\Fraud_Detection.egg-info\SOURCES.txt
f:\.vscode\Projects\p3\Fraud_Detection.egg-info\top_level.txt
f:\.vscode\Projects\p3\models\__init__.py
f:\.vscode\Projects\p3\models\cat\__init__.py
f:\.vscode\Projects\p3\models\optuna\__init__.py
f:\.vscode\Projects\p3\models\voting\__init__.py
f:\.vscode\Projects\p3\models\xgb\__init__.py
f:\.vscode\Projects\p3\notebooks\01_eda.ipynb
f:\.vscode\Projects\p3\notebooks\__init__.py
f:\.vscode\Projects\p3\src\data_pre

In [None]:
import os
import sys
import mlflow
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


# Make the parent of the working directory importable so `src` resolves.
# Guarded so that re-running this cell does not keep appending duplicates.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)
from src.config.paths import get_path
# Project path configuration; DATA_PATH and REPORT_PATH are read from it later.
paths = get_path()


# Read the raw transactions and derive the helper columns in one chained step
path = os.path.join(paths.DATA_PATH,'creditcard.csv')
df = pd.read_csv(path).assign(
    # presumably Time is elapsed seconds: 3600 per hour, wrapped to 0-23
    Hour=lambda frame: (frame['Time'] // 3600) % 24,
    # whole 24-hour periods elapsed (same assumption about Time)
    Day=lambda frame: frame['Time'] // (3600 * 24),
    # log(1 + Amount), which stays defined for zero amounts
    LogAmount=lambda frame: np.log1p(frame['Amount']),
)
# Mean of the Class column expressed as a percentage
fraud_per = 100 * df['Class'].mean()


# Headline fraud rate followed by basic data-quality summaries.
# Only the first message interpolates a value; the others are plain strings.
print(f'Percentage of Frauds: {fraud_per:.2f}')
print('Missing Values per Column:\n',df.isnull().sum())
print('Statistical Summary of Data:\n',df.describe())
print('DataFrame Information:\n')
# info() writes its report to stdout itself, so it is not wrapped in print()
df.info()


# Distribution of the log-transformed transaction amount
sns.histplot(data=df,x='LogAmount')
plt.title('Histogram of LogAmount By Number of Transactions')
plt.show()

# Hour histograms for Class 0 ('Legit') and Class 1 ('Fraud') on shared axes
plt.figure(figsize=(12,6))
sns.histplot(data=df[df['Class'] == 0],x='Hour',bins=24,color='green',kde=True,label='Legit')
sns.histplot(data=df[df['Class'] == 1],x='Hour',bins=24,color='red',kde=True,label='Fraud')
plt.title('Legit / Fraud Transactions By Hour')
plt.legend()
plt.grid()
# Render this figure on its own, like the other three, instead of leaving it
# open until the heatmap's plt.show() further down.
plt.show()

# Pairwise correlations between all columns of the frame
plt.figure(figsize=(12,5))
sns.heatmap(data=df.corr(),annot=False)
plt.title('Correlational Heatmap of dataframe')

plt.show()

# Amount per class; the y axis is log-scaled
plt.figure(figsize=(12,5))
sns.boxplot(data=df,x='Class',y='Amount')
plt.yscale('log')
plt.title('Transaction Amount by Class')
plt.show()




# Feature matrix: every column except the raw Time value and the Class label
X = df.drop(columns=['Time','Class'])
y = df['Class']

# Standardise the features before PCA so the components are not driven by
# whichever column happens to have the largest scale
scaler = StandardScaler()
X_encoded = scaler.fit_transform(X)


# Two components for a 2-D scatter. random_state pins the result in case
# scikit-learn selects a randomized SVD solver for data of this size; it has
# no effect when a deterministic solver is used.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_encoded)

df_pca = pd.DataFrame(data=X_pca,columns=['PCA1','PCA2'])
# .values attaches the labels by position rather than by index alignment
df_pca['Class'] = y.values

fig = px.scatter(df_pca,x='PCA1',y='PCA2',color='Class',title='PCA Projection (Fraud/Legit)')

# 'ipykernel' is the module loaded inside a Jupyter kernel. The previous key,
# 'ipynbkernel', is not a real module, so the inline branch could never run.
if 'ipykernel' in sys.modules:
    fig.show()
else:
    # Outside a notebook: save the interactive plot and attach it to MLflow.
    # Create the report folder first so write_html cannot fail on a missing
    # directory (no-op when it already exists).
    os.makedirs(paths.REPORT_PATH, exist_ok=True)
    report_file = os.path.join(paths.REPORT_PATH,'pca_projection.html')
    fig.write_html(report_file)
    mlflow.log_artifact(report_file)
