In [138]:
import pandas as pd
import numpy as np
from scipy import stats
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio

# Set the default template to 'plotly_dark' for all Plotly figures
pio.templates.default = 'plotly_dark'

import sys
import os
sys.path.append(os.path.abspath("../"))
from src.functions import *

### Step 1: Analyse high level details like shape, null values, data types and basic statistics

In [139]:
path = "../data/raw/customer_segmentation_data.csv"
df = pd.read_csv(path)
df.head()

Unnamed: 0,id,age,gender,income,spending_score,membership_years,purchase_frequency,preferred_category,last_purchase_amount
0,1,38,Female,99342,90,3,24,Groceries,113.53
1,2,21,Female,78852,60,2,42,Sports,41.93
2,3,60,Female,126573,30,2,28,Clothing,424.36
3,4,40,Other,47099,74,9,5,Home & Garden,991.93
4,5,65,Female,140621,21,3,25,Electronics,347.08


In [140]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    1000 non-null   int64  
 1   age                   1000 non-null   int64  
 2   gender                1000 non-null   object 
 3   income                1000 non-null   int64  
 4   spending_score        1000 non-null   int64  
 5   membership_years      1000 non-null   int64  
 6   purchase_frequency    1000 non-null   int64  
 7   preferred_category    1000 non-null   object 
 8   last_purchase_amount  1000 non-null   float64
dtypes: float64(1), int64(6), object(2)
memory usage: 70.4+ KB


In [141]:
df.id.nunique()

1000

In [142]:
df = df.set_index("id")
df.head()

Unnamed: 0_level_0,age,gender,income,spending_score,membership_years,purchase_frequency,preferred_category,last_purchase_amount
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,38,Female,99342,90,3,24,Groceries,113.53
2,21,Female,78852,60,2,42,Sports,41.93
3,60,Female,126573,30,2,28,Clothing,424.36
4,40,Other,47099,74,9,5,Home & Garden,991.93
5,65,Female,140621,21,3,25,Electronics,347.08


In [143]:
df.describe()

Unnamed: 0,age,income,spending_score,membership_years,purchase_frequency,last_purchase_amount
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,43.783,88500.8,50.685,5.469,26.596,492.34867
std,15.042213,34230.771122,28.955175,2.85573,14.243654,295.744253
min,18.0,30004.0,1.0,1.0,1.0,10.4
25%,30.0,57911.75,26.0,3.0,15.0,218.7625
50%,45.0,87845.5,50.0,5.0,27.0,491.595
75%,57.0,116110.25,76.0,8.0,39.0,747.17
max,69.0,149973.0,100.0,10.0,50.0,999.74


### Step 2: Categorize some features

In [148]:
# Set existing categories as the 'category' data type.
df['gender'] = df['gender'].astype('category')
df['preferred_category'] = df['preferred_category'].astype('category')

# Create buckets for different age ranges.
df['age_range'] = pd.cut(df['age'], [0, 20, 30, 40, 50, 60, 70])

# Create categories for low, medium and high income customers.
low = df['income'].quantile(.25)
medium = df['income'].quantile(.75)
high = df['income'].quantile(1)
df['income_level'] = pd.cut(df['income'], [0, low, medium, high], labels=['Low', 'Medium', 'High'])
df.income_level.value_counts()

df.dtypes

age                        int64
gender                  category
income                     int64
spending_score             int64
membership_years           int64
purchase_frequency         int64
preferred_category      category
last_purchase_amount     float64
age_range               category
income_level            category
dtype: object

### Step 2: Split the data into numerical and categorical dataframes

In [149]:
num_cols = df.select_dtypes("number").columns.tolist()
cat_cols = df.select_dtypes("object").columns.tolist()
cat = df.select_dtypes('category')

num_df = df[num_cols].copy()
cat_df = df[cat_cols].copy()

### Step 3: Visualize univariate data.

In [150]:
fig = make_subplots(rows=1, cols=len(cat.columns))

for i, col in enumerate(cat.columns):
    x = df[col].astype(str)
    y = df[col].value_counts()
    fig.add_trace(go.Bar(x=x.unique(), y=y, name=col), row=1, col=i+1)

fig.show()

In [147]:
fig = make_subplots(rows=1, cols=len(cat_df.columns))

for i, col in enumerate(cat_df.columns):
    x = df[col]
    fig.add_trace(go.Histogram(x=x, name=col), row=1, col=i+1)

fig.show()

ValueError: 
The 'cols' argument to make_subplots must be an int greater than 0.
    Received value of type <class 'int'>: 0

In [None]:
fig = make_subplots(rows=1, cols=len(num_df.columns))

for i, col in enumerate(num_df.columns):
    x = df[col]
    fig.add_trace(go.Histogram(x=x, name=col), row=1, col=i+1)

fig.show()

In [None]:
# Standardizing the DataFrame using scipy's zscore in order to view multiple box plots on the same scale.
num_df_standardized = num_df.apply(stats.zscore)

fig = px.box(num_df_standardized)
fig.show()

In [None]:
# The scipy skewtest / kurtosistest checks the skewness / kurtosis against that of a normal distribution. It returns the statistic (z-score) and p-value.
# the optional 'alternative' parameter accepts {‘two-sided’, ‘less’, ‘greater’}
#‘two-sided’: the skewtest / kurtosistest of the distribution underlying the sample is different from that of the normal distribution (i.e. 0)
#‘less’: the skewtest / kurtosistest of the distribution underlying the sample is less than that of the normal distribution
#‘greater’: the skewtest / kurtosistest of the distribution underlying the sample is greater than that of the normal distribution

# This line uses a custom function from src/functions.py
shape(num_df)

Unnamed: 0_level_0,skew,skew_pval,kurtosis,kurt_pval,shap_wilks_norm_pval,normaltest_pval
column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
age,-0.045931,0.550488,-1.192708,6.315498e-153,3.767758e-17,1.747195e-151
income,0.050988,0.507532,-1.167468,3.886531e-127,1.974271e-16,9.396633000000001e-126
spending_score,-0.016552,0.829571,-1.215929,2.4871349999999997e-184,3.1094610000000005e-17,8.829289e-183
membership_years,0.029799,0.698418,-1.206557,1.461831e-170,2.969011e-20,4.737741e-169
purchase_frequency,-0.08384,0.276384,-1.130455,1.7579410000000001e-99,1.772838e-16,2.585242e-98
last_purchase_amount,0.017527,0.819704,-1.273612,0.0,9.70365e-19,0.0


### Step 4: Investigate bivariate data

In [None]:
pd.crosstab(df.gender, df.preferred_category)

preferred_category,Clothing,Electronics,Groceries,Home & Garden,Sports
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Female,56,65,66,68,61
Male,56,76,71,77,77
Other,58,74,62,61,72


In [None]:
num_df.corr()

Unnamed: 0,age,income,spending_score,membership_years,purchase_frequency,last_purchase_amount
age,1.0,-0.000688,0.017707,-0.003431,-0.030137,0.061599
income,-0.000688,1.0,2e-05,-0.035783,0.000533,-0.054006
spending_score,0.017707,2e-05,1.0,0.026726,0.006708,-0.014475
membership_years,-0.003431,-0.035783,0.026726,1.0,0.069532,-0.014135
purchase_frequency,-0.030137,0.000533,0.006708,0.069532,1.0,0.0244
last_purchase_amount,0.061599,-0.054006,-0.014475,-0.014135,0.0244,1.0


In [None]:
levene_kruskal = analyze_group_differences(df, 'preferred_category', num_cols)
levene_kruskal_df = pd.DataFrame(levene_kruskal).set_index('column')

# Set test threshold.
threshold = 0.05

levene_kruskal_df['levene_hypothesis'] = np.where(levene_kruskal_df['levene_pvalue'] > threshold, 'null', 'alternative')
levene_kruskal_df['kruskal_hypothesis'] = np.where(levene_kruskal_df['kruskal_pvalue'] > threshold, 'null', 'alternative')

levene_kruskal_df


Unnamed: 0_level_0,levene_statistic,levene_pvalue,kruskal_statistic,kruskal_pvalue,levene_hypothesis,kruskal_hypothesis
column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
age,0.274134,0.894728,1.924813,0.749586,,
income,2.064232,0.083623,1.297917,0.861729,,
spending_score,1.067467,0.371435,0.601538,0.962893,,
membership_years,1.149464,0.331915,4.279576,0.36949,,
purchase_frequency,1.7689,0.133038,1.815715,0.769606,,
last_purchase_amount,0.503785,0.732978,1.85824,0.761812,,


In [None]:
levene_kruskal = analyze_group_differences(df, 'gender', num_cols)
levene_kruskal_df = pd.DataFrame(levene_kruskal).set_index('column')

# Set test threshold.
threshold = 0.05

levene_kruskal_df['levene_hypothesis'] = np.where(levene_kruskal_df['levene_pvalue'] > threshold, 'null', 'alternative')
levene_kruskal_df['kruskal_hypothesis'] = np.where(levene_kruskal_df['kruskal_pvalue'] > threshold, 'null', 'alternative')

levene_kruskal_df


Unnamed: 0_level_0,levene_statistic,levene_pvalue,kruskal_statistic,kruskal_pvalue,levene_hypothesis,kruskal_hypothesis
column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
age,0.198883,0.81968,5.10875,0.077741,,
income,4.249651,0.014543,0.124708,0.93955,alternative,
spending_score,0.016435,0.983699,2.176718,0.336769,,
membership_years,0.097959,0.906696,1.687392,0.430118,,
purchase_frequency,1.786107,0.168177,5.754787,0.056281,,
last_purchase_amount,2.700623,0.067682,6.511215,0.038557,,alternative


In [None]:
sample = np.random.choice(df.index, size=1000)
sample_data = df.loc[sample]

fig = px.scatter_matrix(
    sample_data,
    dimensions=['purchase_frequency', 'last_purchase_amount', 'income'],
    color="gender"
    )

fig.show()

In [None]:
levene_kruskal = analyze_group_differences(df, 'age_range', num_cols)
levene_kruskal_df = pd.DataFrame(levene_kruskal).set_index('column')

# Set test threshold.
threshold = 0.05

levene_kruskal_df['levene_hypothesis'] = np.where(levene_kruskal_df['levene_pvalue'] > threshold, 'null', 'alternative')
levene_kruskal_df['kruskal_hypothesis'] = np.where(levene_kruskal_df['kruskal_pvalue'] > threshold, 'null', 'alternative')

levene_kruskal_df





Unnamed: 0_level_0,levene_statistic,levene_pvalue,kruskal_statistic,kruskal_pvalue,levene_hypothesis,kruskal_hypothesis
column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
age,18.249794,3.768311e-16,355.235505,1.305708e-74,alternative,alternative
income,1.374941,0.2329566,2.877662,0.71884,,
spending_score,0.421673,0.8335887,7.934197,0.1598997,,
membership_years,1.278624,0.2724519,3.693891,0.5942733,,
purchase_frequency,0.312125,0.9056007,3.266358,0.658996,,
last_purchase_amount,1.024791,0.4027265,5.393837,0.3697276,,
