In [None]:
# pip install ydata_profiling

# Reading the data and preparing it for analysis

In [None]:
import pandas as pd
# from ydata_profiling import ProfileReport

# Source files, given as relative paths (resolved against the current working directory).
DONOR_CSV = "BenefactorsData.csv"
PAYMENT_CSV = "TransactionalData.csv"

# One row per donor / one row per payment transaction is assumed here -- TODO confirm
# against the actual files; later cells join the two tables on the "UserID" column.
donor_df = pd.read_csv(DONOR_CSV)
payment_df = pd.read_csv(PAYMENT_CSV)

# Optional HTML profiling reports (disabled; they need ydata_profiling's ProfileReport):
# donor_profile = ProfileReport(donor_df, title = "Donors Data")
# donor_profile.to_file("Donors Data Profile.html")
# payment_profile = ProfileReport(payment_df, title = "Payments Transactions")
# payment_profile.to_file("Payments Data Profile.html")

In [None]:
# Print column names, dtypes and non-null counts: donors first, then payments.
for frame in (donor_df, payment_df):
    frame.info()

Merge datasets

In [None]:
# Inner join on UserID: only donors that have at least one payment row (and
# payments whose UserID exists in the donor table) survive the merge.
merged_df = donor_df.merge(payment_df, on="UserID", how="inner")

# Persist the joined table; the DataFrame index is written as the first,
# unnamed column because to_csv is called with its default index setting.
merged_df.to_csv("merged_dataset.csv")

Add summary data to Donor Dataset

In [None]:
# Per-donor payment summary: number of transactions and summed payment amount
# for every UserID that appears in the payment table.
payment_summary = (
    payment_df
    .groupby('UserID')
    .agg(
        NumPayments=('TransID', 'count'),
        TotalPayment=('PaymentAmount', 'sum'),
    )
    .reset_index()
)

# Attach the summary to the donor table. Any summary columns left over from a
# previous run of this cell are dropped first; without that, re-running the
# cell would make merge() emit NumPayments_x / NumPayments_y (and likewise for
# TotalPayment), and every later cell that reads 'NumPayments' would fail.
summary_columns = [col for col in payment_summary.columns if col != 'UserID']
donor_df = (
    donor_df
    .drop(columns=summary_columns, errors='ignore')
    # Left join keeps every donor; donors without payments get NaN in both columns.
    .merge(payment_summary, on='UserID', how='left')
)

# Check whether the number of payments and the total payment amount depend on Gender

In [None]:
# How many of the highest-ranked donors to list in each table below.
# The printed headings are derived from this value so they always match the
# number of rows requested (previously the headings said 10 while 20 rows
# were selected).
TOP_N = 20

# Donors with the largest number of payments
top_num_payments = donor_df.nlargest(TOP_N, 'NumPayments')
print(f"Top {TOP_N} NumPayments:")
print(top_num_payments)

# Donors with the largest total payment amount
top_total_payments = donor_df.nlargest(TOP_N, 'TotalPayment')
print(f"\nTop {TOP_N} TotalPayment:")
print(top_total_payments)

In [None]:
from scipy.stats import ttest_ind
import numpy as np

# Columns that must be present (non-null) for a donor to enter the comparison,
# and the numeric code assigned to each gender label.
analysis_columns = ['Gender', 'NumPayments', 'TotalPayment']
gender_codes = {'M': 0, 'F': 1}

# Discard donors with a missing gender or missing payment summary values.
donor_df = donor_df.dropna(subset=analysis_columns)

# Numeric gender code; labels other than 'M' / 'F' map to NaN and therefore
# end up in neither of the two groups below.
donor_df['GenderNumeric'] = donor_df['Gender'].map(gender_codes)

print(donor_df)


# Split the donors into the two gender groups.
is_male = donor_df['GenderNumeric'].eq(0)
is_female = donor_df['GenderNumeric'].eq(1)
male_data = donor_df.loc[is_male]
female_data = donor_df.loc[is_female]


def _compare_genders(column):
    """Run a two-sample t-test (unequal variances) on `column`, male vs. female."""
    return ttest_ind(male_data[column], female_data[column], equal_var=False)


# One test per payment measure.
num_payments_test = _compare_genders('NumPayments')
total_payment_test = _compare_genders('TotalPayment')

# Report the test statistic and p-value for each measure.
print("T-test for NumPayments (Gender Dependency):")
print(f"Statistic: {num_payments_test.statistic}, p-value: {num_payments_test.pvalue}")

print("\nT-test for TotalPayment (Gender Dependency):")
print(f"Statistic: {total_payment_test.statistic}, p-value: {total_payment_test.pvalue}")

# Result
# T-test for NumPayments (Gender Dependency):
# Statistic: 20.404277170355236, p-value: 1.8268465392917694e-92
# T-test for TotalPayment (Gender Dependency):
# Statistic: 4.543933993934602, p-value: 5.524716571142883e-06

# Both p-values are far below 0.05, so:
# - the mean NumPayments differs significantly between the two genders
# - the mean TotalPayment differs significantly between the two genders

# Visualize the result

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Optional seaborn styling (left disabled):
#sns.set(style="whitegrid")

# One row, two side-by-side panels: payment count on the left, payment total on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# (column to plot on the y axis, panel title) for each panel, in left-to-right order.
panels = (
    ('NumPayments', "NumPayments by Gender"),
    ('TotalPayment', "TotalPayment by Gender"),
)

# Draw one boxplot per measure, grouped by gender, and label each panel.
for ax, (column, title) in zip(axes, panels):
    sns.boxplot(data=donor_df, x='Gender', y=column, ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Gender")
    ax.set_ylabel(column)

# Render the figure (layout tightening left disabled):
#plt.tight_layout()
plt.show()

In [None]:
# Average number of payments and average total payment for each gender,
# returned as a flat table with 'Gender' as an ordinary column.
group_stats = (
    donor_df
    .groupby('Gender')
    .agg(
        NumPayments=('NumPayments', 'mean'),
        TotalPayment=('TotalPayment', 'mean'),
    )
    .reset_index()
)

print(group_stats)