### Task 2: Exploratory Data Analysis (EDA)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw transaction data (path is relative to the notebook's directory).
df = pd.read_csv('../data/raw/data.csv')

# 1. Initial Overview
# DataFrame.info() writes its report straight to stdout and returns None, so
# it must not be wrapped in print() (that would emit a stray "None" line).
df.info()
print(df.describe())  # summary statistics for the numeric columns
print(df.nunique())   # number of distinct values in each column

# 2. Data Quality Check
# Per-column count of missing (NaN/None) entries.
print("Missing values:\n", df.isnull().sum())

# 3. Transaction Amount Analysis
# Histogram of the raw 'Amount' column, 50 bins, on a 12x6 inch canvas.
amount_fig = plt.figure(figsize=(12, 6))
amount_ax = amount_fig.gca()
sns.histplot(df['Amount'], bins=50, ax=amount_ax)
amount_ax.set_title('Distribution of Transaction Amounts')
plt.show()

# 4. Fraud Analysis
# The mean of the 'FraudResult' column is reported as a percentage.
# NOTE(review): this equals the fraud share only if the column is a 0/1 flag — confirm.
fraud_flags = df['FraudResult']
fraud_rate = fraud_flags.mean()
print(f"Fraud rate: {fraud_rate:.2%}")

# 5. RFM Analysis (Potential Proxy Variable)
# Calculate Recency, Frequency, Monetary metrics per customer.
#
# The CSV is loaded without `parse_dates`, so 'TransactionStartTime' is not a
# datetime column at this point; subtracting raw strings raises TypeError.
# Convert on a copy so `df` itself is left untouched (pd.to_datetime is a
# no-op if the column is already datetime).
transactions = df.assign(
    TransactionStartTime=pd.to_datetime(df['TransactionStartTime'])
)

# Reference point for recency: the most recent transaction in the whole
# dataset. Computed once here instead of once per group inside the lambda.
snapshot_time = transactions['TransactionStartTime'].max()

rfm = transactions.groupby('AccountId').agg({
    # Recency: whole days between the snapshot and the account's last transaction.
    'TransactionStartTime': lambda x: (snapshot_time - x.max()).days,
    # Frequency: number of transactions recorded for the account.
    'TransactionId': 'count',
    # Monetary: total and average transaction amount for the account.
    'Amount': ['sum', 'mean']
})

# 6. Correlation Analysis
# Pairwise correlation between the numeric columns only, drawn as an
# annotated heatmap.
numeric_columns = df.select_dtypes(include=[np.number])
corr_matrix = numeric_columns.corr()
heatmap_ax = sns.heatmap(corr_matrix, annot=True)
heatmap_ax.set_title('Correlation Matrix')
plt.show()