In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Every figure and the text report below are written under 'screenshots/';
# create the directory up front (no error if it already exists).
os.makedirs('screenshots', exist_ok=True)

# Load data
# The path is kept in one variable so the error message always names the file
# that was actually attempted (the previous message omitted the leading '../').
data_path = '../data/raw/data.csv'
try:
    df = pd.read_csv(data_path)
except FileNotFoundError:
    # Show the resolved absolute path too: a relative path depends on the
    # current working directory, which is the usual cause of this failure.
    print(
        f'Error: {data_path} not found '
        f'(resolved to {os.path.abspath(data_path)}). Please check the file path.'
    )
    raise

# Collected report lines; each later section appends its own findings and the
# list is joined with newlines at the end of the script.

# 1. Summary Statistics
# describe(include='all') covers both numeric and non-numeric columns.
eda_insights = [
    '=== Summary Statistics ===',
    str(df.describe(include='all')),
]

# 2. Missing Values
# Per-column null counts, plus the same counts as a percentage of all rows
# rounded to two decimals.
missing_values = df.isnull().sum()
missing_percentage = (missing_values / len(df) * 100).round(2)
eda_insights.extend([
    '\n=== Missing Values ===',
    str(missing_values),
    '\nPercentage of Missing Values:',
    str(missing_percentage),
])

# 3. Numerical Distribution: Amount
# Histogram (50 bins) with a KDE overlay, saved to the screenshots directory.
plt.figure(figsize=(10, 6))
sns.histplot(df['Amount'], bins=50, kde=True)
plt.title('Distribution of Transaction Amount')
plt.xlabel('Amount')
plt.ylabel('Frequency')
plt.savefig('screenshots/amount_distribution.png')
plt.close()  # close the figure so the next section starts a fresh one

amount_mean = df['Amount'].mean()
amount_median = df['Amount'].median()
amount_skew = df['Amount'].skew()
eda_insights.append('\n=== Amount Distribution ===')
eda_insights.append(f'Mean Amount: {amount_mean:.2f}, Median Amount: {amount_median:.2f}')
eda_insights.append(f'Skewness of Amount: {amount_skew:.2f}')
# Derive the shape statement from the measured skewness instead of asserting
# it unconditionally, so the report stays truthful if the data changes.
# |skewness| > 1 is used here as the threshold for "highly skewed".
if pd.isna(amount_skew):
    eda_insights.append('Skewness of the Amount column could not be computed (too few values).')
elif amount_skew > 1:
    eda_insights.append('The Amount column is highly skewed with a long right tail, indicating a few large transactions.')
elif amount_skew < -1:
    eda_insights.append('The Amount column is highly skewed with a long left tail, indicating a few unusually low values.')
else:
    eda_insights.append('The Amount column is not strongly skewed (|skewness| <= 1).')

# 4. Categorical Distribution: ProductCategory
# Frequency of each category, most common first; reused for both the bar
# chart and the "top 3" line of the report.
category_counts = df['ProductCategory'].value_counts()

plt.figure(figsize=(10, 6))
category_counts.plot(kind='bar')
plt.title('Product Category Distribution')
plt.xlabel('Product Category')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.savefig('screenshots/product_category.png')
plt.close()

top_three = category_counts.head(3).to_dict()
unique_categories = df['ProductCategory'].nunique()
eda_insights.extend([
    '\n=== Product Category Distribution ===',
    f'Top 3 Categories: {top_three}',
    f'Number of unique categories: {unique_categories}',
])

# 5. Correlation Matrix
# Select every numeric dtype (not only float64/int64) so narrower integer or
# float columns are not silently left out of the analysis.
numeric_cols = df.select_dtypes(include='number').columns
eda_insights.append('\n=== Correlation Analysis ===')
if len(numeric_cols) > 1:
    # Compute the matrix once and reuse it for the heatmap and the report.
    corr_matrix = df[numeric_cols].corr()

    plt.figure(figsize=(10, 6))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Correlation Matrix of Numeric Features')
    plt.savefig('screenshots/correlation_matrix.png')
    plt.close()

    eda_insights.append('Key correlations:')
    # Walk only the pairs above the diagonal. This skips self-correlations by
    # position rather than by comparing to 1.0 (which would also hide pairs
    # that really are perfectly correlated) and lists each pair once instead
    # of as both (a, b) and (b, a). NaN correlations (e.g. constant columns)
    # fail the > 0.5 comparison and are left out.
    corr_cols = list(corr_matrix.columns)
    high_corr = pd.Series(
        {
            (first, second): corr_matrix.loc[first, second]
            for pos, first in enumerate(corr_cols)
            for second in corr_cols[pos + 1:]
            if abs(corr_matrix.loc[first, second]) > 0.5
        },
        dtype='float64',
    )
    eda_insights.append(str(high_corr))
else:
    eda_insights.append('Insufficient numeric columns for correlation analysis.')

# 6. Outliers: Amount
# Box plot for a visual check, saved to the screenshots directory.
plt.figure(figsize=(10, 6))
sns.boxplot(x=df['Amount'])
plt.title('Outliers in Transaction Amount')
plt.xlabel('Amount')
plt.savefig('screenshots/amount_outliers.png')
plt.close()

# Flag values lying more than 1.5 * IQR below the first quartile or above the
# third quartile.
q1 = df['Amount'].quantile(0.25)
q3 = df['Amount'].quantile(0.75)
iqr = q3 - q1
lower_fence = q1 - 1.5 * iqr
upper_fence = q3 + 1.5 * iqr
outliers = df.loc[(df['Amount'] < lower_fence) | (df['Amount'] > upper_fence), 'Amount']

eda_insights.append('\n=== Outlier Analysis ===')
eda_insights.append(f'Number of outliers in Amount: {len(outliers)}')
if outliers.empty:
    # min()/max() of an empty Series are NaN, which would print "nan to nan".
    eda_insights.append('No Amount values fall outside the 1.5 * IQR fences.')
else:
    eda_insights.append(f'Outlier range: {outliers.min():.2f} to {outliers.max():.2f}')

# 7. Transaction Time Analysis (if TransactionStartTime exists)
# Skipped entirely when the column is absent. Note that this section modifies
# df: it converts TransactionStartTime to datetime in place and adds 'Hour'.
if 'TransactionStartTime' in df.columns:
    df['TransactionStartTime'] = pd.to_datetime(df['TransactionStartTime'])
    df['Hour'] = df['TransactionStartTime'].dt.hour

    plt.figure(figsize=(10, 6))
    # Hour is an integer variable, so use discrete bins (width 1, centred on
    # each hour). With bins=24 the edges were spaced (max - min) / 24 apart,
    # which does not line up with whole hours.
    sns.histplot(df['Hour'], discrete=True)
    plt.title('Transaction Time Distribution (Hour of Day)')
    plt.xlabel('Hour of Day')
    plt.ylabel('Frequency')
    plt.savefig('screenshots/transaction_time.png')
    plt.close()

    eda_insights.append('\n=== Transaction Time Analysis ===')
    # value_counts() sorts by frequency, so the first three index values are
    # the three busiest hours.
    eda_insights.append(f'Peak transaction hours: {df["Hour"].value_counts().head(3).index.tolist()}')

# Build the report text once and reuse it for both the file and the console.
eda_report = '\n'.join(eda_insights)

# Save insights to file
# An explicit UTF-8 encoding keeps the output independent of the platform's
# default locale encoding, which may be unable to represent non-ASCII
# characters coming from the data.
with open('screenshots/eda_insights.txt', 'w', encoding='utf-8') as f:
    f.write(eda_report)

# Print insights for sharing
print(eda_report)


=== Summary Statistics ===
              TransactionId        BatchId       AccountId  \
count                 95662          95662           95662   
unique                95662          94809            3633   
top     TransactionId_76871  BatchId_67019  AccountId_4841   
freq                      1             28           30893   
mean                    NaN            NaN             NaN   
std                     NaN            NaN             NaN   
min                     NaN            NaN             NaN   
25%                     NaN            NaN             NaN   
50%                     NaN            NaN             NaN   
75%                     NaN            NaN             NaN   
max                     NaN            NaN             NaN   

             SubscriptionId       CustomerId CurrencyCode  CountryCode  \
count                 95662            95662        95662      95662.0   
unique                 3627             3742            1          NaN   
top   