Dataset:

https://raw.githubusercontent.com/amankharwal/Website-data/master/marketing_campaign.csv

In [16]:
import numpy as np
import pandas as pd
import datetime
from datetime import date
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler, normalize
from sklearn import metrics
from sklearn.mixture import GaussianMixture
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import warnings
warnings.filterwarnings('ignore')
data = pd.read_csv('marketing_campaign.csv', header=0, sep=';')
data.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138,0,0,2012-09-04,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344,1,1,2014-03-08,38,11,...,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613,0,0,2013-08-21,26,426,...,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646,1,0,2014-02-10,26,11,...,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293,1,0,2014-01-19,94,173,...,5,0,0,0,0,0,0,3,11,0


Now I will create some new features in the dataset to define the customer personalities as a part of data preparation:

In [17]:
#Spending variable creation
data['Age'] = 2023 - data['Year_Birth']

data['Spending'] = data['MntWines'] + data['MntFruits'] + data['MntMeatProducts'] + data['MntFishProducts'] + data['MntSweetProducts'] + data['MntGoldProds']

# Seniority variable creation
last_date = date(2023,8,30)
data['Seniority'] = pd.to_datetime(data['Dt_Customer'], dayfirst=True, format='%Y-%m-%d')
data['Seniority'] = pd.to_numeric(data['Seniority'].dt.date.apply(lambda x: (last_date - x)).dt.days, downcast='integer')/30
data = data.rename(columns={'NumWebPurchases':'Web','NumCatalogPurchases':'Catalog','NumStorePurchases':'Store'})
data['Marital_Status'] = data['Marital_Status'].replace({'Divorced':'Alone','Single':'Alone','Married':'In couple','Together':'In couple', 'Absurd':'Alone','Widow':'Alone','YOLO':'Alone'})
data['Education'] = data['Education'].replace({'Basic':'Undergraduate','2n Cycle':'Undergraduate','Graduation':'Postgraduate','Master':'Postgraduate','PHD':'Postgraduate'})

data['Children'] = data['Kidhome']+data['Teenhome']
data['Has_child'] = np.where(data.Children>0, 'Has child', 'No child')
data['Children'].replace({3: '3 children', 2:'2 children',1:'1 child', 0:'No child'}, inplace=True)
data = data.rename(columns={'MntWines':'Wines','MntFruits':'Fruits', 'MntMeatProducts':'Meat','MntFishProducts':'Fish','MntSweetProducts':'Sweets','MntGoldProds':'Gold'})

data = data[['Age','Education','Marital_Status','Income','Spending','Seniority','Has_child', 'Children','Wines', 'Fruits','Meat','Fish','Sweets','Gold']]
data.head()


Unnamed: 0,Age,Education,Marital_Status,Income,Spending,Seniority,Has_child,Children,Wines,Fruits,Meat,Fish,Sweets,Gold
0,66,Postgraduate,Alone,58138,1617,134,No child,No child,635,88,546,172,88,88
1,69,Postgraduate,Alone,46344,27,115,Has child,2 children,11,1,6,2,1,6
2,58,Postgraduate,In couple,71613,776,122,No child,No child,426,49,127,111,21,42
3,39,Postgraduate,In couple,26646,53,116,Has child,1 child,11,4,20,10,3,5
4,42,PhD,In couple,58293,422,117,Has child,1 child,173,43,118,46,27,15


In [18]:
data = data.dropna(subset=['Income'])
data = data[data['Income']<600000]

### Clustering

To take a look at the clustering of clients in the dataset, I’ll define the segments of the clients. Here we will use 4 equally weighted customer segments:

   1. Stars: Old customers with high income and high spending nature.
   2. Neet Attention: New customers with below-average income and low spending nature. 
   3. High Potential: New customers with high income and high spending nature.
   4. Leaky Bucket: Old customers with below-average income and a low spending nature.

In the code section below, I will first normalize the data and then I will create customer clustering according to the metrics defined above:

In [19]:
scaler = StandardScaler()
dataset_temp = data[['Income','Seniority','Spending']]
X_std = scaler.fit_transform(dataset_temp)
X = normalize(X_std,norm='l2')

gmm = GaussianMixture(n_components=4, covariance_type='spherical',max_iter=2000,random_state=5).fit(X)
labels = gmm.predict(X)
dataset_temp['Cluster'] = labels
dataset_temp = dataset_temp.replace({0:'Stars',1:'Need attention',2:'High potential',3:'Leaky bucket'})
data = data.merge(dataset_temp.Cluster, left_index=True, right_index=True)

pd.options.display.float_format = "{:.0f}".format
summary = data[['Income','Spending','Seniority','Cluster']]
summary.set_index("Cluster",inplace = True)
summary = summary.groupby('Cluster').describe().transpose()
summary.head()

Unnamed: 0,Cluster,High potential,Leaky bucket,Need attention,Stars
Income,count,584,641,528,462
Income,mean,34757,37705,69542,73438
Income,std,12075,12397,12006,13753
Income,min,2447,1730,44802,49090
Income,25%,26489,28839,60880,65298


Now let’s plot this data to have a look at the clustering of customers:

In [20]:
PLOT = go.Figure()
for C in list(data.Cluster.unique()):

    PLOT.add_trace(go.Scatter3d(x = data[data.Cluster == C]['Income'],
                                y = data[data.Cluster == C]['Seniority'],
                                z = data[data.Cluster == C]['Spending'],
                                mode = 'markers',marker_size = 6, marker_line_width = 1,
                                name = str(C)))

PLOT.update_traces(hovertemplate='Income: %{x} <br>Seniority: %{y} <br>Spending: %{z}')

PLOT.update_layout(width=800, height=800,autosize=True, showlegend = True,
                   scene = dict(xaxis=dict(title = 'Income',titlefont_color = 'black'),
                                yaxis=dict(title = 'Seniority', titlefont_color = 'black'),
                                zaxis=dict(title = 'Spending', titlefont_color = 'black')),
                    font = dict(family = 'Gilroy', color = 'black', size = 12))

### Data Preparation for Customer Personality Analysis

Now I will prepare the data for the Apriori algorithm. Here I will be defining three segments of the customers according to the age, income and seniority:

In [21]:
#Create Age segment
cut_labels_Age = ['Young','Adult','Mature','Senior']
cut_bins = [0, 30, 45, 65, 120]
data['Age_group'] = pd.cut(data['Age'], bins=cut_bins, labels=cut_labels_Age)
#Create Income segment
cut_labels_Income = ['Low income','Low to medium income', 'Medium to high income', 'High income']
data['Income_group'] = pd.qcut(data['Income'], q=4, labels=cut_labels_Income)
#Create Seniority segment
cut_labels_Seniority = ['New customers', 'Discovering customers','Experienced customers', 'Old customers']
data['Seniority_group'] = pd.qcut(data['Seniority'], q=4, labels=cut_labels_Seniority)
data = data.drop(columns=['Age','Income','Seniority'])

In [22]:
data.head()

Unnamed: 0,Education,Marital_Status,Spending,Has_child,Children,Wines,Fruits,Meat,Fish,Sweets,Gold,Cluster,Age_group,Income_group,Seniority_group
0,Postgraduate,Alone,1617,No child,No child,635,88,546,172,88,88,Need attention,Senior,Medium to high income,Old customers
1,Postgraduate,Alone,27,Has child,2 children,11,1,6,2,1,6,Leaky bucket,Senior,Low to medium income,New customers
2,Postgraduate,In couple,776,No child,No child,426,49,127,111,21,42,Stars,Mature,High income,Discovering customers
3,Postgraduate,In couple,53,Has child,1 child,11,4,20,10,3,5,Leaky bucket,Adult,Low income,New customers
4,PhD,In couple,422,Has child,1 child,173,43,118,46,27,15,Leaky bucket,Adult,Medium to high income,New customers


Now I will define new segments according to the spending of customers on each product which will be based on:

  1.  Non Buyer
  2.  Low Buyer
  3.  Frequent Buyer
  4.  Biggest Buyer

In [23]:
cut_labels = ['Lower consumer', 'Frequent consumer', 'Biggest consumer']
data['Wines_segment'] = pd.qcut(data['Wines'][data['Wines']>0],q=[0, .25, .75, 1], labels=cut_labels).astype("object")
data['Fruits_segment'] = pd.qcut(data['Fruits'][data['Fruits']>0],q=[0, .25, .75, 1], labels=cut_labels).astype("object")
data['Meat_segment'] = pd.qcut(data['Meat'][data['Meat']>0],q=[0, .25, .75, 1], labels=cut_labels).astype("object")
data['Fish_segment'] = pd.qcut(data['Fish'][data['Fish']>0],q=[0, .25, .75, 1], labels=cut_labels).astype("object")
data['Sweets_segment'] = pd.qcut(data['Sweets'][data['Sweets']>0],q=[0, .25, .75, 1], labels=cut_labels).astype("object")
data['Gold_segment'] = pd.qcut(data['Gold'][data['Gold']>0],q=[0, .25, .75, 1], labels=cut_labels).astype("object")
data.replace(np.nan, "Non consumer",inplace=True)
data.drop(columns=['Spending','Wines','Fruits','Meat','Fish','Sweets','Gold'],inplace=True)
data = data.astype(object)

### Ariori Algorithm

The Apriori algorithm is the simplest technique to identify the underlying relationships between different types of elements. The idea behind this algorithm is that all nonempty subsets of a frequent category must also be frequent. Here I will be using the Apriori algorithm for the task of customer personality analysis with Python. Here I will use this algorithm to identify the biggest customer of wines:

In [28]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 999)
pd.options.display.float_format = "{:.3f}".format
association=data.copy() 
df = pd.get_dummies(association)
min_support = 0.08
max_len = 10
frequent_items = apriori(df, use_colnames=True, min_support=min_support, max_len=max_len + 1)
rules = association_rules(frequent_items, metric='lift', min_threshold=1)

product='Wines'
segment='Biggest consumer'
target = '{\'%s_segment_%s\'}' %(product,segment)
results_personnal_care = rules[rules['consequents'].astype(str).str.contains(target, na=False)].sort_values(by='confidence', ascending=False)
results_personnal_care.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
4215,"(Income_group_High income, Cluster_Need attention)",(Wines_segment_Biggest consumer),0.121,0.249,0.084,0.697,2.8,0.054,2.476,0.731
4209,"(Age_group_Mature, Cluster_Need attention)",(Wines_segment_Biggest consumer),0.127,0.249,0.088,0.694,2.79,0.056,2.455,0.735
4226,"(Cluster_Need attention, Seniority_group_Old customers)",(Wines_segment_Biggest consumer),0.123,0.249,0.085,0.691,2.779,0.054,2.433,0.73
4232,"(Cluster_Need attention, Meat_segment_Biggest consumer)",(Wines_segment_Biggest consumer),0.134,0.249,0.086,0.645,2.594,0.053,2.118,0.709
354,(Cluster_Need attention),(Wines_segment_Biggest consumer),0.238,0.249,0.151,0.633,2.543,0.091,2.045,0.797


Source:

https://thecleverprogrammer.com/2021/02/08/customer-personality-analysis-with-python/