# https://www.kaggle.com/datasets/shriyashjagtap/fraudulent-e-commerce-transactions/code

In [59]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import plotly.express as px

In [60]:
# Absolute local path to the Kaggle CSV export (raw string so the backslashes
# are not treated as escapes). Machine-specific: adjust before running elsewhere.
fraud_path = r'F:\DataSpell\Pandas_training\Fraud\data_set\Fraudulent_E-Commerce_Transaction_Data.csv'

In [61]:
# Load the transactions, parsing the timestamp column on read, and discard
# rows in which every field is missing.
fraud = (
    pd.read_csv(fraud_path, parse_dates=['Transaction Date'])
    .dropna(how='all')
)

In [62]:
def is_fraudulent(row):
    """Return True when the raw label equals 1, otherwise False."""
    return bool(row == 1)

# Convert the 0/1 label column to booleans, element by element.
fraud['Is Fraudulent'] = fraud['Is Fraudulent'].map(is_fraudulent)

In [63]:
# Low-cardinality text columns: normalise case, then store as categoricals.
for column in ('Payment Method', 'Product Category', 'Device Used'):
    fraud[column] = fraud[column].str.lower().astype('category')

# Integer-like columns: downcast to the smallest integer dtype that fits.
for column in ('Quantity', 'Customer Age', 'Account Age Days', 'Transaction Hour'):
    fraud[column] = pd.to_numeric(fraud[column], downcast='integer')

# Keep the label column as a plain boolean.
fraud['Is Fraudulent'] = fraud['Is Fraudulent'].astype(bool)

In [64]:
# Preview the first rows to check the dtype conversions above.
fraud.head()

In [65]:
%%time 
# Recorded fraud.info() memory footprints: 179.8+ MB and 102.5+ MB
# (presumably before / after the dtype conversions — confirm).
# fraud.info() # 179.8+ MB # 102.5+ MB
# Distinct-value count per column; the trailing note is a recorded wall time.
fraud.nunique() # 2.29 s

In [66]:
# Column dtypes, non-null counts and memory usage of the cleaned frame.
fraud.info()

# 1. How many unique customers are there in the dataset?

In [67]:
# Preview the first rows before answering the question.
fraud.head()

In [68]:
# Number of distinct customer identifiers (missing values are not counted).
fraud['Customer ID'].nunique()

# 2. What is the total number of fraudulent transactions?

In [69]:
# Count the rows whose fraud label equals 1.
int((fraud['Is Fraudulent'] == 1).sum())

# 3. Calculate the total transaction amount for each payment method.

In [70]:
# Display the frame for reference.
fraud

In [71]:
# Total transaction amount per payment method, largest total first.
(
    fraud
    .groupby('Payment Method', observed=True)['Transaction Amount']
    .sum()
    .sort_values(ascending=False)
)

# 4. Identify the top 5 product categories by transaction amount.

In [72]:
# Top 5 product categories by total transaction amount.
# `observed=True` matches the other categorical group-bys in this notebook and
# avoids the deprecated default; `nlargest(5)` limits the answer to the
# five categories the question asks for.
(
    fraud
    .groupby('Product Category', observed=True)['Transaction Amount']
    .sum()
    .nlargest(5)
)

# 5. What is the average customer age?

In [73]:
# Arithmetic mean of customer age over all transactions (one value per row,
# so customers with several transactions are counted several times).
fraud['Customer Age'].mean()

# 6. How many unique customer locations are present?

In [74]:
# Display the frame for reference.
fraud

In [75]:
# Number of distinct customer locations (missing values are not counted).
fraud['Customer Location'].nunique()

# 7. Find the quantity of products sold per product category.

In [76]:
# Total quantity sold per product category.
# `observed=True` keeps this consistent with the other categorical group-bys
# in the notebook and avoids relying on the deprecated default.
fraud.groupby('Product Category', observed=True)['Quantity'].sum()

# 8. Determine the most common device used for transactions.

In [77]:
# Number of transactions (non-null amounts) per device.
# `observed=True` keeps this consistent with the other categorical group-bys
# in the notebook and avoids relying on the deprecated default.
fraud.groupby('Device Used', observed=True)['Transaction Amount'].count()

In [78]:
# Label of the device with the highest occurrence count.
fraud['Device Used'].value_counts().idxmax()

# 9. What are the top 10 most common IP addresses?

In [79]:
# value_counts() already sorts from most to least frequent, so the first
# ten entries are the ten most common addresses.
fraud['IP Address'].value_counts().head(10)

# 10. Calculate the average transaction amount per day of the week.

In [80]:
# fraud['Transaction Date'].dt.dayofweek
# Weekday name of every transaction, used directly as the grouping key.
day_name = fraud['Transaction Date'].dt.day_name()
# Mean amount per weekday name. groupby sorts its keys by default, so the
# result is ordered alphabetically by name rather than Monday -> Sunday.
fraud.groupby(day_name)['Transaction Amount'].mean()

# Data Cleaning

# 1. Check for missing values in the dataset.

In [81]:
# Count of missing values in each column.
fraud.isna().sum()

# 2. Identify any duplicate transactions, based on Transaction ID.

In [82]:
# Number of rows whose Transaction ID occurs more than once.
# keep=False marks every member of a duplicate group, not just the repeats.
# (The check previously ran on 'Transaction Hour', which repeats for almost
# every row and says nothing about duplicate transactions.)
fraud['Transaction ID'].duplicated(keep=False).sum()

# 3. Standardize the format of Transaction Date to a datetime format (YYYY-MM-DD HH:MM:SS).

In [83]:
# Render each timestamp as a 'YYYY-MM-DD HH:MM:SS' string.
# The result is a separate Series of strings; the frame's datetime column is left unchanged.
fraud_date = fraud['Transaction Date'].dt.strftime('%Y-%m-%d %H:%M:%S')
fraud_date

# 4. Are there any anomalies in the Customer Age column (e.g., ages that are too high or too low)?

In [84]:
# Summary statistics (count, mean, std, min, quartiles, max); the min and max
# are the quickest way to spot implausible ages.
fraud['Customer Age'].describe()

In [85]:
# plt.figure(figsize=(20, 8))
# sns.boxplot(x=fraud['Customer Age'])
# plt.show()

In [86]:
# fig = px.box(fraud, y='Customer Age')
# fig.update_layout(autosize=False, width=1000, height=1000)
# fig.show()

# 5. Validate the format of IP addresses; identify any records with invalid IP address formats.

In [87]:
import ipaddress

def is_valid_ip(ip):
    """Report whether *ip* is accepted by ``ipaddress.ip_address``.

    Returns True for a value the standard library parses as an IPv4 or IPv6
    address, and False when parsing raises ``ValueError``.
    """
    try:
        ipaddress.ip_address(ip)
    except ValueError:
        return False
    else:
        return True

In [88]:
# Boolean mask: True where the address parses as a valid IP.
fraud_valid_ip = fraud['IP Address'].map(is_valid_ip).astype(bool)
# Rows whose address failed validation.
invalid_ips = fraud.loc[~fraud_valid_ip]
invalid_ips

# Data Analysis

# 1. What is the average transaction amount for fraudulent vs non-fraudulent transactions?

In [89]:
# Display the frame for reference.
fraud

In [90]:
# Subset of transactions labelled fraudulent.
# Note: `.size` is rows x columns (total cell count), not the number of rows.
fraud_is = fraud.loc[fraud['Is Fraudulent'].eq(True)]
fraud_is.size

In [91]:
# Subset of transactions labelled non-fraudulent.
# Note: `.size` is rows x columns (total cell count), not the number of rows.
fraud_not = fraud.loc[fraud['Is Fraudulent'].eq(False)]
fraud_not.size

In [92]:
# Mean transaction amount for each value of the fraud label.
fraud.groupby('Is Fraudulent')['Transaction Amount'].mean()

# 2. Compare the average account age days for fraudulent and non-fraudulent transactions.

In [93]:
# Mean account age (in days) for each value of the fraud label.
fraud.groupby('Is Fraudulent')['Account Age Days'].mean()

# 3. What time of day (broken down into morning, afternoon, evening, night) sees the highest average transaction amount?

In [94]:
# Hour component of each transaction timestamp.
# NOTE(review): `hour` is not referenced again in this cell; the hour is
# recomputed where the time-of-day labels are built.
hour = fraud['Transaction Date'].dt.hour

def time_of_day(row):
    """Map an hour value to a part-of-day label.

    [6, 12) -> 'Morning', [12, 17) -> 'Afternoon', [17, 20) -> 'Evening';
    every other value falls through to 'Night'.
    """
    day_parts = (
        (6, 12, 'Morning'),
        (12, 17, 'Afternoon'),
        (17, 20, 'Evening'),
    )
    for start, end, label in day_parts:
        if start <= row < end:
            return label
    return 'Night'
    
# Label every transaction with its part of day, derived from the timestamp hour.
show_time_of_day = fraud['Transaction Date'].dt.hour.map(time_of_day)
# Place the new column at position 4 of the frame.
fraud.insert(loc=4, column='Time of the Day', value=show_time_of_day)

In [95]:
# Mean transaction amount per part of day, returned as a one-column frame.
fraud.groupby('Time of the Day')[['Transaction Amount']].mean()

# 4. Is there a difference in the average transaction amount between weekdays and weekends?

In [96]:
def is_weekends(day):
    """Return 'Weekday' for day numbers below 5, otherwise 'Weekend'."""
    return 'Weekday' if day < 5 else 'Weekend'

# 1-based day of week for display (pandas numbers Monday as 0, so Monday -> 1).
days = fraud['Transaction Date'].dt.dayofweek + 1
# Classify on the 0-based day of WEEK (0-4 = Mon-Fri, 5-6 = Sat-Sun), which is
# the scale `is_weekends` expects. The previous `.dt.day` was the day of the
# MONTH, so only the 1st-4th of each month were ever labelled 'Weekday'.
weekday_weekend = fraud['Transaction Date'].dt.dayofweek.apply(is_weekends)

In [97]:
# Add the derived day columns at fixed positions (4 and 5) in the frame.
# Re-running this cell raises because `insert` refuses an existing column name;
# the commented-out drop below is the manual reset for that case.
fraud.insert(4, 'Day of the Week', days)
# fraud.drop('Day of the Week', axis=1, inplace=True)
fraud.insert(5, 'Weekday or Weekend', weekday_weekend)
fraud

In [98]:
# Mean and total transaction amount for each weekday/weekend label.
fraud.groupby('Weekday or Weekend')['Transaction Amount'].agg(['mean', 'sum'])

# 5. Analyze the relationship between customer age and the likelihood of a transaction being fraudulent.

In [99]:
# Encode the label as 0/1 so it can take part in a numeric correlation.
# From here on 'Is Fraudulent' holds integers rather than booleans.
fraud['Is Fraudulent'] = fraud['Is Fraudulent'].map({False: 0, True: 1})
# Mean customer age per label value; computed but not displayed in this cell.
average_age = fraud.groupby('Is Fraudulent')['Customer Age'].mean()

# Pairwise correlation matrix between age and the 0/1 fraud label.
fraud[['Customer Age', 'Is Fraudulent']].corr()

In [100]:
# Display the frame to check the 0/1 encoding of 'Is Fraudulent'.
fraud

# 6. Determine the payment method with the highest rate of fraudulent transactions.

In [101]:
# The mean of the 0/1 label is the fraud rate per payment method;
# idxmax returns the method with the highest rate.
fraud.groupby('Payment Method', observed=True)['Is Fraudulent'].mean().idxmax()

# 7. Identify the product category with the highest total revenue and calculate its share of total revenue.

In [102]:
# Revenue per product category. `observed=True` keeps this consistent with the
# other categorical group-bys in the notebook and avoids the deprecated default.
total_revenue_per_category = fraud.groupby('Product Category', observed=True)['Transaction Amount'].sum()
# Category with the largest total.
highest_revenue_category = total_revenue_per_category.idxmax()

# Share of overall revenue contributed by that category (a fraction in [0, 1]).
total_revenue = fraud['Transaction Amount'].sum()
share_of_total_revenue = total_revenue_per_category[highest_revenue_category] / total_revenue
share_of_total_revenue

# 8. What are the top 3 locations by number of fraudulent transactions?

In [103]:
# Count fraudulent transactions per location and keep the three largest counts.
top_3_locations = (
    fraud.loc[fraud['Is Fraudulent'].eq(True)]
    .groupby('Customer Location')
    .size()
    .nlargest(3)
)
top_3_locations

# 9. Calculate the monthly total transaction amount and visualize the trend over time.

In [104]:
# Move 'Transaction Date' into the index (in place) so the frame can be resampled by time.
# After this cell the data is no longer reachable as fraud['Transaction Date'];
# earlier cells that use that column will fail if re-run.
fraud.set_index('Transaction Date', inplace=True)
# 'ME' = month-end frequency: one summed amount per calendar month.
monthly_total = fraud.resample('ME')['Transaction Amount'].sum()
monthly_total

In [105]:
# monthly_total.plot(kind='line', title='Monthly Total Transaction Amount Over Time')
# plt.xlabel('Month')
# plt.ylabel('Total Transaction Amount')
# plt.show()

# 10. Determine the effect of quantity purchased on the likelihood of a transaction being fraudulent.

In [106]:
# Mean quantity per label value; computed but not displayed in this cell.
average_quantity = fraud.groupby('Is Fraudulent')['Quantity'].mean()
# Pairwise correlation matrix between quantity and the 0/1 fraud label.
correlation = fraud[['Quantity', 'Is Fraudulent']].corr()
correlation

# Advanced Analysis

# 1. Perform a cohort analysis by customer sign-up month to see how transaction behavior changes over time.

In [107]:
# Calendar month (monthly Period) of each transaction, taken from the frame's index.
transaction_month = fraud.index.to_period('M')
# Store it as a column at position 5.
fraud.insert(5, 'Transaction Month', transaction_month)
fraud

In [112]:
# Cohort = earliest transaction month seen for each customer, broadcast back to every row.
# NOTE(review): this is the first *transaction* month, used in place of a
# sign-up month — confirm that is the intended proxy.
fraud_cohort_month = fraud.groupby('Customer ID')['Transaction Month'].transform('min')
fraud.insert(6, 'Cohort Month', fraud_cohort_month)


In [116]:
# Months elapsed since the cohort month, 1-based (the cohort month itself is 1).
# Subtracting two monthly periods yields an offset whose `.n` is the month count.
fraud['Cohort Index'] = (
    (fraud['Transaction Month'] - fraud['Cohort Month'])
    .map(lambda offset: offset.n + 1)
)
# Distinct customers per (cohort month, cohort index), as a flat table.
cohort_data = (
    fraud.groupby(['Cohort Month', 'Cohort Index'])['Customer ID']
    .nunique()
    .reset_index()
)

In [119]:
# Wide table: one row per cohort month, one column per cohort index,
# values = distinct-customer counts from `cohort_data`.
cohort_counts = cohort_data.pivot(index='Cohort Month', columns='Cohort Index', values='Customer ID')

In [120]:
# The first column of the pivot is taken as each cohort's size.
cohort_sizes = cohort_counts.iloc[:,0]
# Divide each row by its cohort size: fraction of the cohort active at each index.
retention = cohort_counts.divide(cohort_sizes, axis=0)

In [122]:
# plt.figure(figsize=(10, 8))
# sns.heatmap(data=retention, annot=True, fmt='.0%', vmin=0.0, vmax=0.5, cmap='BuGn')
# plt.title('Retention rates')
# plt.show()

# 2. Predict the likely fraudulent transactions based on transaction patterns (e.g., using a simple rule-based method).

In [123]:
# Calculate the 95th percentile of transaction amounts; the rule below treats
# amounts above this value as unusually large
threshold = fraud['Transaction Amount'].quantile(0.95)

# Identify the payment method with the highest rate of fraudulent transactions
# (mean of the 'Is Fraudulent' column per method)
fraudulent_payment_method = fraud.groupby('Payment Method', observed=True)['Is Fraudulent'].mean().idxmax()

# Define a function to flag likely fraudulent transactions
def flag_fraud(row):
    """Rule-based fraud flag for a single transaction row.

    A row is flagged only when all three conditions hold: its amount exceeds
    the notebook-level `threshold`, its payment method equals the
    notebook-level `fraudulent_payment_method`, and its transaction hour is
    below 6 or above 22.
    """
    if not (row['Transaction Amount'] > threshold):
        return False
    if not (row['Payment Method'] == fraudulent_payment_method):
        return False
    hour = row['Transaction Hour']
    return bool(hour < 6 or hour > 22)

# Apply the function to the DataFrame row by row (axis=1) and store the
# resulting flag as a new column
fraud['Likely Fraudulent'] = fraud.apply(flag_fraud, axis=1)
fraud

# 3. Segment customers into groups based on their transaction behavior (e.g., using k-means clustering on features such as transaction amount, frequency).

In [127]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Prepare the data: collapse to one value per index label (the index is the
# 'Transaction Date' set earlier in the notebook), then count those labels per
# calendar month. 'ME' is the month-end alias already used for the monthly
# totals; the older 'M' alias is deprecated.
# NOTE(review): this produces one row per month, so the k-means below groups
# months, not customers. Segmenting customers would need features aggregated
# per 'Customer ID' — confirm the intent.
data = fraud.groupby(fraud.index)['Transaction Amount'].mean().resample('ME').count().to_frame()
data.columns = ['Transaction Frequency']

# Normalize the data
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)

# Determine the optimal number of clusters. k can never exceed the number of
# samples, so the candidate range is capped by the row count.
cluster_range = range(1, min(11, data.shape[0] + 1))
wcss = []
for i in cluster_range:
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(data_scaled)
    wcss.append(kmeans.inertia_)
plt.plot(cluster_range, wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

# Run the k-means algorithm with the chosen number of clusters (3, or fewer
# when there are fewer than 3 samples)
kmeans = KMeans(n_clusters=min(3, data.shape[0]), init='k-means++', max_iter=300, n_init=10, random_state=0)
clusters = kmeans.fit_predict(data_scaled)

# Add the cluster labels to the monthly table
data['Cluster'] = clusters

# Analyze the clusters: mean monthly frequency per cluster
data.groupby('Cluster').mean()

# 4. For each product category, calculate the average transaction amount and use it to identify outliers (transactions which are significantly higher or lower than the average).

In [130]:
# Placeholder: this cell only displays the frame; the per-category outlier
# analysis described in the heading above is not implemented here.
fraud

# 5. Create a heatmap to visualize the correlation between numerical features in the dataset.

In [133]:
# Keep only the numeric columns and compute their pairwise correlations
numerical_columns = fraud.select_dtypes(include='number')
corr = numerical_columns.corr()

# Draw the annotated heatmap on an explicit 10x8 figure
fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm', cbar=True, ax=heatmap_ax)
heatmap_ax.set_title('Correlation Matrix Heatmap')
plt.show()

# 6. Determine if there’s a significant difference in the average transaction amount between different devices used.

In [135]:
# Group by 'Device Used' and calculate the average 'Transaction Amount'.
# `observed=True` keeps this consistent with the other categorical group-bys
# in the notebook and avoids relying on the deprecated default.
# NOTE(review): comparing means alone does not establish statistical
# significance; a test (e.g. one-way ANOVA) would be needed to answer the
# question in the heading.
avg_transaction_by_device = fraud.groupby('Device Used', observed=True)['Transaction Amount'].mean()

# Display the result
avg_transaction_by_device

# 7. Analyze the seasonality of transactions - does transaction volume or amount peak in certain months?

In [136]:
# Group by calendar month and calculate the sum and count of 'Transaction Amount'.
# 'ME' (month-end) is the alias already used for the monthly totals earlier in
# the notebook; the older 'M' alias is deprecated.
transaction_sum_by_month = fraud.resample('ME')['Transaction Amount'].sum()
transaction_count_by_month = fraud.resample('ME')['Transaction Amount'].count()

# Plot the transaction sum by month
transaction_sum_by_month.plot(kind='line', title='Monthly Total Transaction Amount')
plt.xlabel('Month')
plt.ylabel('Total Transaction Amount')
plt.show()

# Plot the transaction count by month
transaction_count_by_month.plot(kind='line', title='Monthly Transaction Volume')
plt.xlabel('Month')
plt.ylabel('Transaction Volume')
plt.show()

# 8. Explore the impact of the account age on the transaction amount and frequency.

In [137]:

# Per account age (in days): mean transaction amount and number of transactions
avg_transaction_by_age = fraud.groupby('Account Age Days')['Transaction Amount'].mean()
transaction_count_by_age = fraud.groupby('Account Age Days').size()

# Two stacked line charts sharing the same x-axis meaning
fig, ax = plt.subplots(2, 1, figsize=(10, 8))
panels = (
    (avg_transaction_by_age, 'Average Transaction Amount by Account Age', 'Average Transaction Amount'),
    (transaction_count_by_age, 'Transaction Count by Account Age', 'Transaction Count'),
)
for panel_ax, (series, title, y_label) in zip(ax, panels):
    series.plot(kind='line', ax=panel_ax)
    panel_ax.set_title(title)
    panel_ax.set_xlabel('Account Age Days')
    panel_ax.set_ylabel(y_label)

# Display the plots
plt.tight_layout()
plt.show()

# 9. Use pivot tables to analyze the relationship between product category, payment method, and fraudulence.

In [138]:
# Fraud rate (mean of the 0/1 label) for every product category x payment
# method combination. `observed=True` keeps this consistent with the
# categorical group-bys in the notebook and avoids the deprecated default.
pivot_table = fraud.pivot_table(
    values='Is Fraudulent',
    index='Product Category',
    columns='Payment Method',
    aggfunc='mean',
    observed=True,
)

# Display the pivot table
pivot_table

# 10. Create a scatter plot to visualize the relationship between transaction amount and customer age.

In [140]:
# Scatter of transaction amount (y) against customer age (x) on a 10x8 figure
fig, scatter_ax = plt.subplots(figsize=(10, 8))
scatter_ax.scatter(fraud['Customer Age'], fraud['Transaction Amount'])
scatter_ax.set_title('Relationship between Transaction Amount and Customer Age')
scatter_ax.set_xlabel('Customer Age')
scatter_ax.set_ylabel('Transaction Amount')
plt.show()

# Data Modeling (Hypothetical)

# 1. Build a linear regression model to predict the transaction amount based on other numerical features in the dataset. Evaluate its performance.

In [141]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Numeric columns only; 'Transaction Amount' is the target, the rest are features
numerical_features = fraud.select_dtypes(include=[np.number])
y = numerical_features['Transaction Amount']
X = numerical_features.drop(columns='Transaction Amount')

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit ordinary least squares on the training split
model = LinearRegression().fit(X_train, y_train)

# Score the held-out split
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")

# 2. Construct a logistic regression model to predict whether a transaction is fraudulent based on features provided.


In [143]:
# Placeholder: this cell only displays the frame; the logistic regression
# model described in the heading above is not implemented here.
fraud

# 3. Use decision trees to understand the key factors that influence whether a transaction is fraudulent.

In [145]:
# Placeholder: this cell only displays the frame; the decision tree analysis
# described in the heading above is not implemented here.
fraud

# 4. Apply random forest classifier to improve the prediction accuracy of fraudulent transactions.

# 5. Experiment with clustering techniques to identify patterns in customer transaction behavior that could indicate fraud.

In [149]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Select relevant features for clustering
feature_columns = ['Transaction Amount', 'Customer Age', 'Account Age Days', 'Quantity']

# Handle missing values if any: keep only rows with a complete feature set.
# The mask is positional (a NumPy boolean array) so the labels can be written
# back to the right rows afterwards without relying on index labels, which may
# not be unique.
complete_rows = fraud[feature_columns].notna().all(axis=1).to_numpy()
features = fraud.loc[complete_rows, feature_columns]

# Normalize the data
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Project the four scaled features onto their first two principal components
pca = PCA(n_components=2)
features_pca = pca.fit_transform(features_scaled)

# Determine the optimal number of clusters
cluster_range = range(1, 11)
wcss = []
for i in cluster_range:
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(features_pca)
    wcss.append(kmeans.inertia_)

# Plot the WCSS to find the elbow
plt.plot(cluster_range, wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

# Apply K-Means algorithm
optimal_clusters = 3  # replace this with the number of clusters at the elbow point
kmeans = KMeans(n_clusters=optimal_clusters, init='k-means++', max_iter=300, n_init=10, random_state=0)
clusters = kmeans.fit_predict(features_pca)

# Add the cluster labels to the original data. `clusters` has one entry per
# complete row, so assigning it directly to the frame would raise a length
# mismatch as soon as any row had been dropped. Rows without a complete
# feature set get a missing label instead (nullable integer dtype).
cluster_labels = np.full(len(fraud), np.nan)
cluster_labels[complete_rows] = clusters
fraud['Cluster'] = pd.array(cluster_labels, dtype='Int64')

# Analyze the clusters
for i in range(optimal_clusters):
    print(f"Cluster {i}:")
    print(fraud[fraud['Cluster'] == i].describe())