In [None]:
# import the necessary dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the raw credit card transactions CSV into a DataFrame
csv_path = '../data/raw/creditcard.csv'
df_cc = pd.read_csv(csv_path)

## Checking the data out

In [None]:
# Preview the first five rows of the dataset
df_cc.head(n=5)

In [None]:
# Basic information about the dataset, such as column datatypes and non-null counts
df_cc.info()

In [None]:
# Statistical summary of the dataset
df_cc.describe()

## Data Cleaning

In [None]:
# Check for missing values: count nulls per column, then report whether
# any column has a non-zero count (this cell only inspects, it does not fill or drop)
null_counts = df_cc.isna().sum()
null_counts.any()

In [None]:
# Count fully duplicated rows (the first occurrence of each is not counted);
# the actual removal happens in the next cell
df_cc.duplicated(keep='first').sum()

In [None]:
# Discard duplicated rows, keeping the first occurrence of each
df_cc = df_cc.drop_duplicates()

# Sanity check: the duplicate count should now be zero
df_cc.duplicated().sum()

## Exploratory Data Analysis

In [None]:
# Bar chart of how many transactions fall into each value of the target 'Class'
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df_cc, x='Class', ax=ax)
ax.set_title('Distribution of Fraudulent vs Non-Fraudulent Transactions')
ax.set_xlabel('Is Fraud')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Histogram (with KDE overlay) of the 'Amount' column
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data=df_cc, x='Amount', bins=100, kde=True, ax=ax)
ax.set_title('Distribution of Transaction Amounts')
ax.set_xlabel('Amount')
ax.set_ylabel('Frequency')
# Only the 0-5000 range of the x-axis is shown, for better visualization
ax.set_xlim(0, 5000)
plt.show()

In [None]:
# Histogram (with KDE overlay) of the 'Time' column
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data=df_cc, x='Time', bins=100, kde=True, ax=ax)
ax.set_title('Distribution of Transaction Time')
ax.set_xlabel('Time (seconds)')
ax.set_ylabel('Frequency')
plt.show()

### Bivariate analysis

In [None]:
# Box plot comparing the 'Amount' distribution across the two 'Class' values
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df_cc, x='Class', y='Amount', ax=ax)
ax.set_title('Transaction Amounts by Fraud Status')
ax.set_xlabel('Is Fraud')
ax.set_ylabel('Amount')
# Only the 0-2000 range of the y-axis is shown, for better visualization
ax.set_ylim(0, 2000)
plt.show()

In [None]:
# Pairwise correlations between 'Time', 'Amount' and 'Class'.
# The PCA components (V1-V28) are left out because there are too many of them
# to show in a single readable heatmap.
corr_columns = ['Time', 'Amount', 'Class']
correlation_matrix = df_cc[corr_columns].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix of Time, Amount, and Class')
plt.show()

## Data Transformation
For the credit card dataset, the primary transformation needed is to scale the `Time` and `Amount` columns, as the `V` columns are already scaled.

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise 'Amount' and 'Time' into new 'scaled_amount' / 'scaled_time' columns.
#
# Each column gets its own scaler. Re-fitting a single StandardScaler on 'Time'
# after 'Amount' would overwrite the fitted 'Amount' parameters, so new amounts
# could no longer be transformed consistently later on.
#
# The guard makes the cell safe to re-run: once 'Time' and 'Amount' have been
# dropped, a second execution skips the transformation instead of raising KeyError.
#
# NOTE(review): the scalers are fitted on the whole frame; if a train/test split
# happens later, consider fitting on the training portion only - confirm.
if {'Time', 'Amount'}.issubset(df_cc.columns):
    amount_scaler = StandardScaler()
    time_scaler = StandardScaler()

    # fit_transform expects a 2-D input; ravel() turns the (n, 1) result back
    # into a 1-D column
    df_cc['scaled_amount'] = amount_scaler.fit_transform(
        df_cc['Amount'].to_numpy().reshape(-1, 1)
    ).ravel()
    df_cc['scaled_time'] = time_scaler.fit_transform(
        df_cc['Time'].to_numpy().reshape(-1, 1)
    ).ravel()

    # Backward-compatible alias: `scaler` previously ended up fitted on 'Time'
    scaler = time_scaler

    # Drop the original columns
    df_cc.drop(['Time', 'Amount'], axis=1, inplace=True)

df_cc[['scaled_amount', 'scaled_time']].head()

## Handle Class Imbalance
Just like the fraud dataset, the credit card dataset is highly imbalanced. We will use SMOTE to address this.

In [None]:
from imblearn.over_sampling import SMOTE

# Split the frame into the target ('Class') and the feature matrix (everything else)
y_cc = df_cc['Class']
X_cc = df_cc.drop(columns='Class')

# Oversample with SMOTE, using a fixed seed
# NOTE(review): resampling is applied to the full dataset; if a train/test split
# follows, synthetic samples may leak into the evaluation data - confirm.
smote = SMOTE(random_state=42)
X_cc_resampled, y_cc_resampled = smote.fit_resample(X_cc, y_cc)

# Report the class counts before and after resampling
print("Class distribution before SMOTE:\n", y_cc.value_counts())
print("\nClass distribution after SMOTE:\n", pd.Series(y_cc_resampled).value_counts())