# Imports

In [41]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Loading Data

In [42]:
# Read the transactions CSV (path is relative to the current working directory) into a DataFrame
data = pd.read_csv('./Fraudulent_E-Commerce_Transaction_Data_2.csv')

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple
data.shape

In [None]:
# Preview the first rows (head() returns 5 rows by default)
data.head()

In [None]:
# Print a per-column summary: dtype and non-null count, plus memory usage
data.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max); by default
# describe() reports numeric columns only when the frame has mixed dtypes
data.describe()

# Exploratory Data Analysis

In [47]:
# Convert 'Transaction Date' to datetime and the three categorical columns
# to pandas' category dtype
data['Transaction Date'] = pd.to_datetime(data['Transaction Date'])
for cat_col in ('Payment Method', 'Product Category', 'Device Used'):
    data[cat_col] = data[cat_col].astype('category')


In [None]:
# Count the null entries in every column and print the per-column totals
missing_values = data.isna().sum()
print(missing_values)

In [None]:
# Visualize the distribution of the target variable:
# print the class counts, plot them as bars, then print the label mean as a percentage
is_fraud = data['Is Fraudulent']
fraud_counts = is_fraud.value_counts()
print(fraud_counts)
ax = sns.barplot(x=fraud_counts.index, y=fraud_counts.values)
ax.set_title("Fraud vs Non-Fraud Distribution")
plt.show()
# Mean of the label column scaled by 100 (a fraud percentage, assuming a 0/1 label)
print(f"Fraud Ratio: {is_fraud.mean() * 100}%")


In [None]:
# Box plot of 'Transaction Amount', one box per value of the fraud label
ax = sns.boxplot(data=data, x='Is Fraudulent', y='Transaction Amount')
ax.set_title("Transaction Amount by Fraud Status")
plt.show()

In [None]:
# Histogram of transaction amounts with a KDE curve overlaid, on an 8x6 inch figure
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data['Transaction Amount'], kde=True, ax=ax)
ax.set(
    xlabel='Transaction Amount',
    ylabel='Count',
    title='Distribution of Transaction Amount',
)
plt.show()

In [None]:
# Analyze temporal patterns: derive hour-of-day and day-of-month features from
# the transaction timestamp and plot fraud vs non-fraud counts for each.
# The .dt accessor requires 'Transaction Date' to already be a datetime column.
data['Transaction Hour'] = data['Transaction Date'].dt.hour
data['Transaction Day'] = data['Transaction Date'].dt.day  # day of the month (1-31), not the weekday

# discrete=True gives every integer hour its own unit-width, centred bar;
# automatic binning on integer data can merge or split hours unevenly.
sns.histplot(data=data, x='Transaction Hour', hue='Is Fraudulent', multiple='stack', discrete=True)
plt.title("Fraud Distribution by Hour of Day")
plt.show()

# The x axis is the day of the month, so the title says so (it previously
# claimed "Day of Week", which did not match the plotted column).
sns.countplot(data=data, x='Transaction Day', hue='Is Fraudulent')
plt.title("Fraud by Day of Month")
plt.xticks(rotation=45)
plt.show()


In [None]:
# Analyze fraud rates across categorical features: for each column, compute the
# mean of 'Is Fraudulent' per category and draw the categories from highest to
# lowest rate.
categorical_cols = ['Payment Method', 'Product Category', 'Device Used']

for col in categorical_cols:
    # observed=False spells out the grouping behaviour used so far for
    # category-dtype keys, which avoids pandas' deprecation warning about the
    # changing default.
    fraud_rate = (
        data.groupby(col, observed=False)['Is Fraudulent']
        .mean()
        .sort_values(ascending=False)
    )
    plt.figure(figsize=(8,4))
    # The index is categorical, so seaborn would lay the bars out in category
    # order and silently discard the sort above; pass the sorted labels
    # explicitly to keep the descending order on the axis.
    sns.barplot(x=fraud_rate.index, y=fraud_rate.values, order=fraud_rate.index.tolist())
    plt.title(f"Fraud Rate by {col}")
    plt.xticks(rotation=45)
    plt.show()


In [None]:
# Customer demographics analysis: one box plot per attribute, split by fraud label
for demo_col, demo_title in (
    ('Account Age Days', "Account Age vs Fraudulent Activity"),
    ('Customer Age', "Customer Age vs Fraudulent Activity"),
):
    sns.boxplot(x='Is Fraudulent', y=demo_col, data=data)
    plt.title(demo_title)
    plt.show()


In [None]:
# Flag rows whose shipping and billing addresses differ (1 = differ, 0 = equal),
# then plot the mean of 'Is Fraudulent' for each flag value
addr_differs = data['Shipping Address'].ne(data['Billing Address'])
data['Address Mismatch'] = addr_differs.astype(int)
ax = sns.barplot(data=data, x='Address Mismatch', y='Is Fraudulent')
ax.set_title("Fraud Rate by Address Mismatch")
plt.show()

In [None]:
# Correlation analysis: pairwise correlations between the selected numeric
# columns and the fraud label, drawn as an annotated heatmap
numeric_cols = [
    'Transaction Amount',
    'Quantity',
    'Customer Age',
    'Account Age Days',
    'Transaction Hour',
]
corr_cols = [*numeric_cols, 'Is Fraudulent']
corr = data[corr_cols].corr()

ax = sns.heatmap(corr, annot=True, cmap='coolwarm')
ax.set_title("Correlation Heatmap")
plt.show()
