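Since `purchase_frequency` and `spending` are highly correlated, we analyze the distribution of `spending` within each `purchase_frequency` class most closely and remove outliers accordingly. I refer to the histograms saved in `purchase_frequency_plots.png` to judge whether each class is approximately normally distributed.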

In [1]:
import pandas as pd
import numpy as np
from ydata_profiling import ProfileReport as report
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [8]:
# Load the customer table; the 'name' column is discarded immediately on load.
df = (
    pd.read_csv('customer_data.csv')
    .drop(columns='name')
)

In [16]:
# Mean and standard deviation of spending within each purchase-frequency level
result = (
    df.groupby('purchase_frequency')['spending']
    .agg(['mean', 'std'])
)
result

Unnamed: 0_level_0,mean,std
purchase_frequency,Unnamed: 1_level_1,Unnamed: 2_level_1
0.1,1780.840321,476.276093
0.2,3382.138481,836.719128
0.3,5103.606563,1062.00295
0.4,7026.963673,1646.964385
0.5,8672.028465,2191.612031
0.6,10382.204587,2520.899515
0.7,12328.151176,2749.214141
0.8,14051.709691,2759.070344
0.9,15457.689607,3225.984968
1.0,17088.080097,4199.229497


In [20]:
# Group the rows by purchase frequency; the plotting cell iterates over these groups.
grouped_df = df.groupby(by='purchase_frequency')

In [35]:
# Use the non-interactive Agg backend: the figure is written to disk, not shown.
plt.switch_backend('agg')

# Size the grid from the actual number of groups (5 panels per row) instead of
# assuming exactly 10, so axes[i] below can never index past the grid.
n_cols = 5
n_groups = grouped_df.ngroups
n_rows = max(1, -(-n_groups // n_cols))  # ceiling division; at least one row
fig, axes = plt.subplots(
    nrows=n_rows, ncols=n_cols, figsize=(15, 3 * n_rows), squeeze=False
)
axes = axes.flatten()

# One histogram of spending per purchase-frequency group
for i, (group, data) in enumerate(grouped_df):
    ax = axes[i]
    ax.hist(data['spending'], bins=20)
    ax.set_title('Purchase Frequency: {}'.format(group))
    ax.set_xlabel('Spending')
    ax.set_ylabel('Count')

# Hide panels left over when the group count is not a multiple of n_cols
for unused_ax in axes[n_groups:]:
    unused_ax.set_visible(False)

# Adjust the layout
fig.tight_layout()

# Save the figure instead of showing it, then close it so that re-running
# this cell does not accumulate open figures in the kernel's memory.
fig.savefig('purchase_frequency_plots.png')
plt.close(fig)

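Try a log transform and a power (square-root) transform to bring the skewed spending distributions closer to a normal distribution. Note: both transforms compress the right tail, so they are suited to right-skewed data (long right tail); if the histograms are truly left-skewed, these transforms would make the skew worse.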

In [32]:
# Log transform: same frame as df, with spending replaced by its natural log.
# assign() returns a new frame, so df itself is left untouched.
log = df.assign(spending=np.log(df['spending']))

# Regroup the transformed frame by purchase frequency for plotting
grouped_log = log.groupby(by='purchase_frequency')

In [30]:
# Use the non-interactive Agg backend: the figure is written to disk, not shown.
plt.switch_backend('agg')

# Size the grid from the actual number of groups (5 panels per row) instead of
# assuming exactly 10, so axes[i] below can never index past the grid.
n_cols = 5
n_groups = grouped_log.ngroups
n_rows = max(1, -(-n_groups // n_cols))  # ceiling division; at least one row
fig, axes = plt.subplots(
    nrows=n_rows, ncols=n_cols, figsize=(15, 3 * n_rows), squeeze=False
)
axes = axes.flatten()

# One histogram of log-transformed spending per purchase-frequency group
for i, (group, data) in enumerate(grouped_log):
    ax = axes[i]
    ax.hist(data['spending'], bins=20)
    ax.set_title('Purchase Frequency: {}'.format(group))
    # The 'spending' column of the grouped frame holds np.log(spending),
    # so label the axis with the transformed quantity.
    ax.set_xlabel('log(Spending)')
    ax.set_ylabel('Count')

# Hide panels left over when the group count is not a multiple of n_cols
for unused_ax in axes[n_groups:]:
    unused_ax.set_visible(False)

# Adjust the layout
fig.tight_layout()

# Save the figure instead of showing it, then close it so that re-running
# this cell does not accumulate open figures in the kernel's memory.
fig.savefig('log_transform_pf_plot.png')
plt.close(fig)

In [33]:
# Power transform: same frame as df, with spending replaced by its square root.
# assign() returns a new frame, so df itself is left untouched.
power = df.assign(spending=np.sqrt(df['spending']))

# Regroup the transformed frame by purchase frequency for plotting
grouped_pow = power.groupby(by='purchase_frequency')

In [34]:
# Use the non-interactive Agg backend: the figure is written to disk, not shown.
plt.switch_backend('agg')

# Size the grid from the actual number of groups (5 panels per row) instead of
# assuming exactly 10, so axes[i] below can never index past the grid.
n_cols = 5
n_groups = grouped_pow.ngroups
n_rows = max(1, -(-n_groups // n_cols))  # ceiling division; at least one row
fig, axes = plt.subplots(
    nrows=n_rows, ncols=n_cols, figsize=(15, 3 * n_rows), squeeze=False
)
axes = axes.flatten()

# One histogram of square-root-transformed spending per purchase-frequency group
for i, (group, data) in enumerate(grouped_pow):
    ax = axes[i]
    ax.hist(data['spending'], bins=20)
    ax.set_title('Purchase Frequency: {}'.format(group))
    # The 'spending' column of the grouped frame holds np.sqrt(spending),
    # so label the axis with the transformed quantity.
    ax.set_xlabel('sqrt(Spending)')
    ax.set_ylabel('Count')

# Hide panels left over when the group count is not a multiple of n_cols
for unused_ax in axes[n_groups:]:
    unused_ax.set_visible(False)

# Adjust the layout
fig.tight_layout()

# Save the figure instead of showing it, then close it so that re-running
# this cell does not accumulate open figures in the kernel's memory.
fig.savefig('power_transform_pf_plot.png')
plt.close(fig)