In [2]:
import os
# Local wheel cache, located two directory levels above the working directory.
path = os.path.join(os.pardir, os.pardir, 'wheels')

# Emit every entry of the wheel directory, one name per line.
for wheel_name in os.listdir(path):
    print(wheel_name)

absl_py-2.3.0-py3-none-any.whl
annotated_types-0.7.0-py3-none-any.whl
anyio-4.9.0-py3-none-any.whl
astunparse-1.6.3-py2.py3-none-any.whl
certifi-2025.6.15-py3-none-any.whl
charset_normalizer-3.4.2-cp311-cp311-win_amd64.whl
click-8.2.1-py3-none-any.whl
colorama-0.4.6-py2.py3-none-any.whl
fastapi-0.115.13-py3-none-any.whl
flatbuffers-25.2.10-py2.py3-none-any.whl
gast-0.6.0-py3-none-any.whl
google_pasta-0.2.0-py3-none-any.whl
grpcio-1.73.0-cp311-cp311-win_amd64.whl
h11-0.16.0-py3-none-any.whl
h5py-3.14.0-cp311-cp311-win_amd64.whl
idna-3.10-py3-none-any.whl
keras-3.10.0-py3-none-any.whl
libclang-18.1.1-py2.py3-none-win_amd64.whl
markdown-3.8.2-py3-none-any.whl
markdown_it_py-3.0.0-py3-none-any.whl
MarkupSafe-3.0.2-cp311-cp311-win_amd64.whl
mdurl-0.1.2-py3-none-any.whl
ml_dtypes-0.5.1-cp311-cp311-win_amd64.whl
namex-0.1.0-py3-none-any.whl
numpy-2.1.3-cp311-cp311-win_amd64.whl
optree-0.16.0-cp311-cp311-win_amd64.whl
opt_einsum-3.4.0-py3-none-any.whl
packaging-25.0-py3-none-any.whl
pillow-11.

In [1]:
import os
import sys
import mlflow
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


# The project package (`src`) lives one level above the working directory;
# put that parent directory on the import path before importing from it.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(project_root)

from src.config.paths import get_path

# Project path configuration object (exposes DATA_PATH / REPORT_PATH used below).
paths = get_path()


# Load the raw transactions and append the derived columns in a single chain.
path = os.path.join(paths.DATA_PATH, 'creditcard.csv')
df = pd.read_csv(path).assign(
    # 'Time' is divided by 3600 and wrapped at 24 -> hour-of-day bucket.
    Hour=lambda frame: (frame['Time'] // 3600) % 24,
    # 'Time' divided by the number of seconds in a day -> day index.
    Day=lambda frame: frame['Time'] // (3600 * 24),
    # log(1 + Amount): compresses the amount scale and is defined at Amount == 0.
    LogAmount=lambda frame: np.log1p(frame['Amount']),
)

# Mean of the 'Class' column expressed as a percentage.
fraud_per = df['Class'].mean() * 100


# Quick textual overview: class balance, null counts, descriptive statistics,
# and the column/dtype listing.
missing_per_column = df.isnull().sum()
stat_summary = df.describe()

print('Percentage of Frauds: {:.2f}'.format(fraud_per))
print('Missing Values per Column:\n', missing_per_column)
print('Statistical Summary of Data:\n', stat_summary)
print('DataFrame Information:\n')
df.info()


# Exploratory plots. Each figure gets its own explicit Figure/Axes pair and its
# own plt.show(), so figures are rendered one at a time in both notebook and
# script execution.

# --- Distribution of the log-transformed transaction amount ------------------
fig_amount, ax_amount = plt.subplots()
sns.histplot(data=df, x='LogAmount', ax=ax_amount)
ax_amount.set_title('Histogram of LogAmount By Number of Transactions')
plt.show()

# --- Hour-of-day profile, legit vs. fraud -------------------------------------
# Both classes are drawn on the same axes. Raw counts would let the larger
# class set the y-scale and flatten the other one (fraud is presumably a small
# minority here -- see the printed fraud percentage), so each class is
# normalised to a density. A shared binrange gives both calls identical
# one-hour-wide bins instead of edges derived from each subset's own min/max.
fig_hour, ax_hour = plt.subplots(figsize=(12, 6))
hour_hist_kwargs = dict(
    x='Hour',
    bins=24,
    binrange=(0, 24),
    stat='density',
    kde=True,
    ax=ax_hour,
)
sns.histplot(data=df[df['Class'] == 0], color='green', label='Legit', **hour_hist_kwargs)
sns.histplot(data=df[df['Class'] == 1], color='red', label='Fraud', **hour_hist_kwargs)
ax_hour.set_title('Legit / Fraud Transactions By Hour')
ax_hour.legend()
ax_hour.grid()
plt.show()

# --- Pairwise correlation of all columns --------------------------------------
fig_corr, ax_corr = plt.subplots(figsize=(12, 5))
sns.heatmap(data=df.corr(), annot=False, ax=ax_corr)
ax_corr.set_title('Correlational Heatmap of dataframe')
plt.show()

# --- Transaction amount per class ---------------------------------------------
# NOTE(review): a log y-axis cannot display non-positive values; if 'Amount'
# contains zeros they will not be drawn -- confirm this is acceptable.
fig_box, ax_box = plt.subplots(figsize=(12, 5))
sns.boxplot(data=df, x='Class', y='Amount', ax=ax_box)
ax_box.set_yscale('log')
ax_box.set_title('Transaction Amount by Class')
plt.show()




# 2-D PCA projection of the feature space, coloured by class.

# Feature matrix: every column except the raw timestamp and the target label.
X = df.drop(columns=['Time', 'Class'])
y = df['Class']

# Standardise the features before PCA.
# (The name `X_encoded` is kept for compatibility; it holds the scaled values.)
scaler = StandardScaler()
X_encoded = scaler.fit_transform(X)

# A fixed random_state keeps the projection reproducible across runs in case
# scikit-learn selects a randomized SVD solver for this input size.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_encoded)

df_pca = pd.DataFrame(data=X_pca, columns=['PCA1', 'PCA2'])
# .values drops the index so labels are assigned positionally to the PCA rows.
df_pca['Class'] = y.values

fig = px.scatter(df_pca, x='PCA1', y='PCA2', color='Class', title='PCA Projection (Fraud/Legit)')

# The Jupyter kernel package is imported as `ipykernel`; its presence in
# sys.modules is used as the "running inside a notebook" signal. The previous
# check looked for 'ipynbkernel', which is never loaded, so the figure was
# never displayed inline and the export branch always ran.
if 'ipykernel' in sys.modules:
    fig.show()
else:
    # Script execution: save the interactive figure as HTML and log it to MLflow.
    os.makedirs(paths.REPORT_PATH, exist_ok=True)  # report dir may not exist yet
    pca_html_path = os.path.join(paths.REPORT_PATH, 'pca_projection.html')
    fig.write_html(pca_html_path)
    mlflow.log_artifact(pca_html_path)


KeyboardInterrupt: 