# Predict Customer Personality to Boost Marketing Campaign by Using Machine Learning

## Task 1 : Conversion Rate Analysis Based On Income, Spending And Age
Goals : Find a pattern of consumer behavior.<br>
Objective : 
- Feature engineering 
- Analyze Conversion Rate with other variables such as age, income, expenses, etc 

### Import Library

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
from sklearn.decomposition import PCA
randomstate=511

### Load Data

In [None]:
# Never truncate the column list when a frame is rendered.
pd.options.display.max_columns = None

# Load the raw campaign data and peek at ten random rows.
df = pd.read_csv('./data/marketing_campaign_data.csv')
df.sample(n=10)

In [None]:
# Summarize the raw frame: column dtypes and non-null counts.
df.info()

### Feature Engineering
New Features :
- Age                = age for each customer
- AgeGroup           = age group for better interpretation in analysis ahead
- Parent             = whether the customer has at least one kid or teen at home (yes/no)
- NumChild           = how many children (kids + teens) the customer has at home
- TotalAcceptedCmp   = how many of campaigns 1–5 the customer accepted
- Total Trx          = how many transactions the customer made across all channels (deals, web, catalog, store)
- Online Trx         = how many online transactions the customer generated on our platform
- ConversionRate     = the percentage of website visitors who complete a web purchase

In [None]:
# Work on a copy so the raw frame `df` stays untouched.
dfe = df.copy()

# Age relative to the fixed reference year 2024.
dfe['Age'] = 2024 - dfe['Year_Birth']

# Age group: np.select evaluates the conditions in order, first match wins.
age_grouping = [
    (dfe['Age'] >= 60),
    (dfe['Age'] >= 40) & (dfe['Age'] < 60),
    (dfe['Age'] >= 28) & (dfe['Age'] < 40)
]
age_category = ['Old Adults', 'Middled-aged Adults', 'Young Adults']
# Pass an explicit string default: without it np.select falls back to the
# integer 0, which cannot be combined with string choices on recent NumPy
# releases (and on older ones silently labels unmatched rows '0').
# Rows matching no condition (Age < 28 or missing) are labelled 'Unknown'.
dfe['AgeGroup'] = np.select(age_grouping, age_category, default='Unknown')

# Helper for the Parent column
def has_kid(row):
    """Return 'yes' if the row has at least one kid or teen at home, else 'no'.

    `row` is any mapping-like object exposing 'Kidhome' and 'Teenhome'.
    """
    lives_with_children = row['Kidhome'] > 0 or row['Teenhome'] > 0
    return 'yes' if lives_with_children else 'no'
def _add_columns(frame, names):
    """Add the named columns element-wise, left to right (NaN propagates)."""
    total = frame[names[0]]
    for name in names[1:]:
        total = total + frame[name]
    return total


# Parent: 'yes' / 'no' flag computed row by row
dfe['Parent'] = dfe.apply(has_kid, axis=1)

# NumChild: kids plus teens at home
dfe['NumChild'] = _add_columns(dfe, ['Kidhome', 'Teenhome'])

# TotalAcceptedCmp: number of accepted campaigns among campaigns 1-5
dfe['TotalAcceptedCmp'] = _add_columns(
    dfe,
    ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5'],
)

# TotalSpending: amount spent summed over every product category
dfe['TotalSpending'] = _add_columns(
    dfe,
    ['MntCoke', 'MntFruits', 'MntMeatProducts', 'MntFishProducts',
     'MntSweetProducts', 'MntGoldProds'],
)

# TotalTrx: purchases summed over every channel
dfe['TotalTrx'] = _add_columns(
    dfe,
    ['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases'],
)

# ConversionRate: web purchases per monthly web visit.
# Zero visits yields inf (or NaN for 0/0); those rows are handled during preprocessing.
dfe['ConversionRate'] = dfe['NumWebPurchases'].div(dfe['NumWebVisitsMonth'])

In [None]:
# Columns to eyeball after feature engineering: untouched attributes first,
# then the newly derived features.
preview_cols = [
    'Education', 'Marital_Status', 'Income', 'Recency', 'NumWebVisitsMonth',
    'Complain', 'Z_CostContact', 'Z_Revenue', 'Response',
    'Age', 'AgeGroup', 'Parent', 'NumChild', 'TotalAcceptedCmp',
    'TotalSpending', 'TotalTrx', 'ConversionRate',
]
dfe[preview_cols].sample(10)

In [None]:
# Descriptive statistics for the numeric columns, including the derived ones.
dfe.describe()

In [None]:
# Rows whose ConversionRate is NaN (the purchases / visits division produced no number).
dfe[dfe.ConversionRate.isna()]

### EDA

In [None]:
# Scatter of conversion rate against income.
plt.figure(figsize=(12, 8), facecolor='#E8E8E8')
ax = sns.scatterplot(data=dfe, x='Income', y='ConversionRate', color='#D1106F')

# Restrict both axes to a fixed window.
ax.set_xlim(0, 200000000)
ax.set_ylim(0, 4.7)

# Dashed vertical reference line at Income = 110,000,000.
ax.axvline(x=110000000, color='b', linestyle='--')

ax.set_title("Customer Conversion Rate and Income Correlation", fontsize=19, fontweight='bold', y=1.02)
ax.set_xlabel('Income', fontsize=13.5)
ax.set_ylabel('Conversion Rate', fontsize=13.5)


In [None]:
# Scatter of income against total spending, cropped to a fixed window.
plt.figure(figsize=(12,8), facecolor='#E8E8E8')
sns.scatterplot(x='TotalSpending', y='Income', data=dfe, color='#D1106F')
plt.ylim(0, 122000000)
plt.xlim(0, 2700000)
plt.axvline(x=2540000, color='b', linestyle='--') # Vertical reference line at TotalSpending = 2,540,000
plt.title('Customer Income and Total Spending Correlation', fontsize=17, fontweight='bold', y=1.03)
plt.xlabel('Total Spending', fontsize=13.5)
plt.ylabel('Income', fontsize=13.5)

In [None]:
# Scatter of conversion rate against total spending.
plt.figure(figsize=(12, 8), facecolor='#E8E8E8')
ax = sns.scatterplot(data=dfe, x='TotalSpending', y='ConversionRate', color='#D1106F')
ax.set_ylim(0, 3.8)
ax.set_title('Correlation Between Conversion Rate and Total Spending', fontsize=18, fontweight='bold', y=1.02)
ax.set_xlabel('Total Spending', fontsize=13.5)
ax.set_ylabel('Conversion Rate', fontsize=13.5)

In [None]:
# Number of customers in each age group
age_counts = dfe['AgeGroup'].value_counts()
palt = ['#00D19B', '#D1106F', '#25A9D9']

# Pie chart with the share of each group printed inside its wedge
plt.figure(figsize=(12, 8), facecolor='#E8E8E8')
wedges, _labels, _percent_texts = plt.pie(
    age_counts,
    colors=palt,
    autopct='%1.1f%%',
    textprops={'size': 13},
)

# Legend maps each wedge to its age group
plt.legend(wedges, age_counts.index, loc="best")

plt.title("Distribution of Customer by Age Group", fontsize=18, fontweight='bold', y=1.03)
plt.show()

In [None]:
# Number of customers per parental status ('yes' / 'no')
parent_counts = dfe['Parent'].value_counts()
palt = ['#00D19B', '#D1106F']

# Pie chart with the share of each status printed inside its wedge
plt.figure(figsize=(12, 8), facecolor='#E8E8E8')
wedges, _labels, _percent_texts = plt.pie(
    parent_counts,
    colors=palt,
    autopct='%1.1f%%',
    textprops={'size': 13},
)

# Legend maps each wedge to its parental status
plt.legend(wedges, parent_counts.index, loc="best")

plt.title("Parent Customer Distribution", fontsize=18, fontweight='bold', y=1.02, x=0.54)
plt.show()

In [None]:

# Mean conversion rate per age group, youngest to oldest.
plt.figure(figsize=(10, 8), facecolor='#E8E8E8')
palt = ['#D1106F', '#00D19B', '#25A9D9']
age_order = ['Young Adults', 'Middled-aged Adults', 'Old Adults']
barplot = sns.barplot(
    data=dfe, x='AgeGroup', y='ConversionRate', hue='AgeGroup',
    order=age_order, legend=False, palette=palt, errorbar=None, edgecolor='black',
)

# Write each bar's value just above its top edge.
for bar in barplot.patches:
    bar_top = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    barplot.text(bar_center, bar_top + 0.01, f'{bar_top:1.2f}', ha="center", fontweight='bold')

barplot.set_ylim(0, 1.5)
barplot.set_title("Conversion Rate by Age Group", fontsize=18, fontweight='bold', y=1.03)
barplot.set_xlabel('Age Group', fontsize=12)
barplot.set_ylabel('Conversion Rate', fontsize=12)

In [None]:
# Mean total spending per age group, youngest to oldest.
plt.figure(figsize=(10, 8), facecolor='#E8E8E8')
palt = ['#D1106F', '#00D19B', '#25A9D9']
age_order = ['Young Adults', 'Middled-aged Adults', 'Old Adults']
barplot = sns.barplot(
    data=dfe, x='AgeGroup', y='TotalSpending', hue='AgeGroup',
    order=age_order, legend=False, palette=palt, errorbar=None, edgecolor='black',
)

# Label every bar with its height, shifted 10 points above the bar top.
for bar in barplot.patches:
    bar_top = bar.get_height()
    barplot.annotate(
        f'{bar_top:.2f}',
        (bar.get_x() + bar.get_width() / 2., bar_top),
        ha='center', va='center',
        xytext=(0, 10),
        textcoords='offset points',
        fontweight='bold',
    )

barplot.set_ylim(0, 820000)
barplot.set_title("Total Spending by Age Group", fontsize=18, fontweight='bold', y=1.03)
barplot.set_xlabel('Age Group', fontsize=13)
barplot.set_ylabel('Total Spending', fontsize=13)

In [None]:
# Mean number of accepted campaigns per age group, youngest to oldest.
plt.figure(figsize=(10, 8), facecolor='#E8E8E8')
palt = ['#D1106F','#00D19B' ,'#25A9D9']
age_order = ['Young Adults', 'Middled-aged Adults', 'Old Adults']
barplot = sns.barplot(data=dfe, x='AgeGroup', y='TotalAcceptedCmp',hue='AgeGroup', order=age_order, legend=False, palette=palt, errorbar=None, edgecolor='black')

# Label every bar with its height, shifted 10 points above the bar top.
for p in barplot.patches:
    barplot.annotate(format(p.get_height(), '.2f'),
                   (p.get_x() + p.get_width() / 2., p.get_height()),
                   ha = 'center', va = 'center',
                   xytext = (0, 10),
                   textcoords = 'offset points',
                   fontweight='bold')

# Title and y-label describe the plotted variable (TotalAcceptedCmp); they were
# previously copied from the spending chart and read "Total Spending".
plt.title("Total Accepted Campaigns by Age Group", fontsize=18, fontweight='bold', y=1.03)
plt.xlabel('Age Group', fontsize=13)
plt.ylabel('Total Accepted Campaigns', fontsize=13)

In [None]:
# Mean conversion rate per number of children at home.
plt.figure(figsize=(12, 8), facecolor='#E8E8E8')
palt = ['#D1106F', '#00D19B', '#25A9D9', '#D16F11']
barplot = sns.barplot(
    data=dfe, x='NumChild', y='ConversionRate', hue='NumChild',
    legend=False, palette=palt, errorbar=None, edgecolor='black',
)

# Label every bar with its height, shifted 10 points above the bar top.
for bar in barplot.patches:
    bar_top = bar.get_height()
    barplot.annotate(
        f'{bar_top:.2f}',
        (bar.get_x() + bar.get_width() / 2., bar_top),
        ha='center', va='center',
        xytext=(0, 10),
        textcoords='offset points',
        fontweight='bold',
    )

barplot.set_ylim(0, 2.2)
barplot.set_title("Customer Conversion Rate by Number of Children", fontsize=18, fontweight='bold', y=1.03)
barplot.set_xlabel('Number of Children', fontsize=13.5)
barplot.set_ylabel('Conversion Rate', fontsize=13.5)

In [None]:
# Mean conversion rate for parents versus non-parents.
plt.figure(figsize=(10, 8), facecolor='#E8E8E8')
palt = ['#D1106F', '#00D19B']
barplot = sns.barplot(
    data=dfe, x='Parent', y='ConversionRate', hue='Parent',
    legend=False, palette=palt, errorbar=None, edgecolor='black',
)

# Label every bar with its height, shifted 10 points above the bar top.
for bar in barplot.patches:
    bar_top = bar.get_height()
    barplot.annotate(
        f'{bar_top:.2f}',
        (bar.get_x() + bar.get_width() / 2., bar_top),
        ha='center', va='center',
        xytext=(0, 10),
        textcoords='offset points',
        fontweight='bold',
    )

barplot.set_ylim(0, 2.3)
barplot.set_title('Conversion Rate by Parental Status', fontsize=18, fontweight='bold', y=1.03)
barplot.set_xlabel('Parental Status', fontsize=12)
barplot.set_ylabel('Conversion Rate', fontsize=12)

In [None]:
# Mean conversion rate per education level, lowest to highest degree.
plt.figure(figsize=(10,8), facecolor='#E8E8E8')
palt = ['#D1106F','#00D19B' ,'#25A9D9', '#D16F11', '#6F11D1']
# Five distinct levels, one per palette colour. The last entry used to repeat
# 'S2', which left the fifth level out of the chart.
# NOTE(review): assumes the fifth level is labelled 'S3' in the data — confirm
# with dfe['Education'].unique().
ed_order = ['SMA', 'D3', 'S1', 'S2', 'S3']
barplot = sns.barplot(x='Education', y='ConversionRate',hue='Education', data=dfe, order=ed_order, legend=False, palette=palt, errorbar=None, edgecolor='black')

# Write each bar's value just above its top edge.
for p in barplot.patches:
    height = p.get_height()
    barplot.text(p.get_x()+p.get_width()/2.,
            height + 0.01,
            '{:1.2f}'.format(height),
            ha="center")

plt.ylim(0, 1.28)
plt.title('Conversion Rate by Education Level', fontsize=18, fontweight='bold', y=1.03)
plt.xlabel('Education', fontsize=12)
plt.ylabel('Conversion Rate', fontsize=12)


In [None]:
# Numeric columns to include in the correlation matrix.
num = [
    'Income', 'Recency', 'NumWebVisitsMonth',
    'Complain', 'Response', 'Age', 'NumChild', 'TotalAcceptedCmp',
    'TotalSpending', 'TotalTrx', 'ConversionRate',
]
correlations = dfe[num].corr()

# Annotated heatmap of the pairwise correlations.
plt.figure(figsize=(18, 10), facecolor='#E8E8E8')
heatmap_ax = sns.heatmap(correlations, annot=True, cmap='coolwarm')
heatmap_ax.set_title('Correlation Heatmap', fontsize=18, fontweight='bold', y=1.02)
plt.show()

## Task 2 : Data Cleaning & Preprocessing
Goals : Preparing raw data into clean data ready to be processed by machine learning<br><br>
Objective : 
- Handle Missing Values
- Handle Duplicate Values
- Handle Infinity values 
- Feature Selection 
- Feature Encoding
- Standardization

#### Handle missing values

In [None]:
# make a copy of previous dataframe for next step (Data Preprocessing)
# Preprocessing works on its own copy of the engineered frame.
dfp = dfe.copy()

# Count missing values per column and report only the affected columns.
missing_col = dfp.isna().sum()
display_missing_col = missing_col.loc[missing_col > 0]
print(f'Missing Values : \n \n{display_missing_col}')

In [None]:
# Percentage of missing values per column.
missing = dfp.isnull().sum()*100 / len(dfp)

percentage_missing = pd.DataFrame({'column':dfp.columns,
                                   'missing_percentage %':missing.values})
percentage_missing['missing_percentage %'] = percentage_missing['missing_percentage %'].round(2)
# Sort so the most incomplete columns come first. The sorted frame was
# previously assigned to a misspelled name and therefore discarded.
percentage_missing = (
    percentage_missing
    .sort_values('missing_percentage %', ascending=False)
    .reset_index(drop=True)
)

# Horizontal bar chart with each bar labelled by its percentage.
plt.figure(figsize=(10,8), facecolor='#E8E8E8')
ax = sns.barplot(x='missing_percentage %', y='column', data=percentage_missing, color='#E1341E')
for p in ax.patches:
    ax.annotate('%.2f' % p.get_width() + '%', xy=(p.get_width(), p.get_y()+p.get_height()/2),
                xytext=(8,0), textcoords='offset points', ha='left', va='center', fontsize=10)
plt.title('Percentage of Missing Data', fontsize=17, fontweight='bold')
plt.ylabel('Column', fontsize=12, fontweight='bold')
plt.xlabel('Percentage', fontsize=12, fontweight='bold')
plt.xlim(0,1.5)
plt.show()

In [None]:
# Show the inputs of ConversionRate next to it for every row with a missing value.
conversion_inputs = ['NumWebPurchases', 'NumWebVisitsMonth', 'ConversionRate']
missing_cr = dfp[conversion_inputs]
has_missing = missing_cr.isna().any(axis=1)
missing_crdf = missing_cr[has_missing]

print("Highlighted Missing values : \n")
display(missing_crdf)
print('*Conversion Rate not missing at Random*')

In [None]:
# Density of Income, used to judge how to fill its missing values.
plt.figure(figsize=(7, 5), facecolor='#E8E8E8')
kde_ax = sns.kdeplot(data=dfp, x='Income', fill=True, color='#D1106F')
kde_ax.set_title('Income')

plt.tight_layout()
plt.show()

In [None]:
# print total null on income and conversion rate
total_null_income = dfp['Income'].isna().sum()
total_null_conrate = dfp['ConversionRate'].isna().sum()
print(f"Total Missing Values on Income Column = {total_null_income}")
print(f"Total Missing Values on Conversion Rate Column = {total_null_conrate}")

# print median income
median_income = dfp['Income'].median()
print(f"\nIncome Median to fill the missing value: {median_income}")

# handle missing values with fill and drop method
dfp['Income'].fillna(dfp['Income'].median(), inplace=True)
dfp.dropna(subset=['ConversionRate'], inplace=True)

# checking missing values if still exist
nonull_income = dfp['Income'].isna().sum()
nonull_conrate = dfp['ConversionRate'].isna().sum()
print(f"\nMissing Values on Income Column after handling = {nonull_income}")
print(f"Missing Values on Conversion Rate Column after handling = {nonull_conrate}")

#### No Duplicates

In [None]:
# Flag rows that repeat an earlier row exactly, then count them.
duplicate_mask = dfp.duplicated()
total_duplicate = duplicate_mask.sum()
print(f"Total Duplicated Data = {total_duplicate}")

#### Fix the Infinity Value On Conversion Rate Features

In [None]:
# Print count Infiinity values in dataframe
count_inf = dfp.map(lambda x: isinstance(x, float) and x == float('inf')).sum().sum()
print(f"Count of Infinity Values :\nIt Contains {str(count_inf)} Infinite values in dataframe")

# print column where infinity values exist
col_inf = dfp.columns[dfp.map(lambda x: isinstance(x, float) and x == float('inf')).any()]
print("\nColumns where Infinity values exist:")
print(", ".join(col_inf))

In [None]:
# Turn +/-inf into NaN so that dropna can remove those rows.
dfp.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)

rows_before = len(dfp)
print(f"Dataframe Entries before dropping infinity values {rows_before}")

# Drop every row that now contains a NaN (in any column).
dfp.dropna(inplace=True)

rows_after = len(dfp)
print(f"\nDataframe Entries After dropping infinity values {rows_after}")

# Re-count +inf cells to confirm none are left.
no_inf = dfp.map(lambda cell: isinstance(cell, float) and cell == float('inf')).sum().sum()
print(f"\nChecking if inifinity values still exist in dataframe : {no_inf}")

#### Handle Outliers

In [None]:
def remove_outliers(data, columns):
    """Drop rows lying outside the 1.5 * IQR fences of the given columns.

    Columns are processed in order; each column's quartiles are computed on the
    rows that survived the filters of the preceding columns.

    Args:
        data: DataFrame to filter. It is not modified.
        columns: iterable of column names in `data` to filter on.

    Returns:
        A new DataFrame without the outlier rows.
    """
    # Start from the frame that was passed in. The global `dfp` used to be
    # copied here, so the `data` argument was silently ignored.
    result = data.copy()
    for col in columns:
        q1 = result[col].quantile(0.25)
        q3 = result[col].quantile(0.75)
        iqr = q3 - q1
        lower_fence = q1 - 1.5 * iqr
        upper_fence = q3 + 1.5 * iqr
        # NaN compares False on both sides, so rows with NaN in `col` are kept.
        is_outlier = (result[col] < lower_fence) | (result[col] > upper_fence)
        result = result[~is_outlier]
    return result

# Columns filtered with remove_outliers, applied in this order.
outliers = ['Income', 'TotalSpending', 'TotalTrx', 'ConversionRate']
dfp = remove_outliers(dfp, outliers)

#### Feature Selection

In [None]:
# Numeric features kept for clustering.
selected_features = ['Income', 'Recency', 'Age', 'TotalSpending', 'TotalTrx', 'ConversionRate']
dfp_slctd = dfp[selected_features].copy()

# Derive the dropped columns from the selection so the printed list always
# matches what was actually removed. The hand-written list named 'Age', which
# is kept, and left out several columns that are dropped.
uncssry = [col for col in dfp.columns if col not in selected_features]
print(f"drop unecessary features and redundant features : \n{uncssry}")

display(dfp_slctd.sample(5))

#### Feature Encoding
Features to label Encode :<br>
- Education
- Age Group

Features to One Hot Encode: <br>
- Marital_Status
- Parent

In [None]:
# NOTE: this cell is intentionally disabled. The selected frame (dfp_slctd)
# holds only Income, Recency, Age, TotalSpending, TotalTrx and ConversionRate,
# so the Education / AgeGroup / Marital_Status / Parent columns referenced
# below are not present in it.
# # Label Encding
# # Initialize Label Encoder as le
# le = LabelEncoder()

# dfp_slctd['Education'] = le.fit_transform(dfp_slctd['Education'])
# dfp_slctd['AgeGroup'] = le.fit_transform(dfp_slctd['AgeGroup'])


# # One hot Encoding
# ms_encoded = pd.get_dummies(dfp_slctd['Marital_Status'], prefix='Status').astype(int)
# dfp_slctd = pd.concat([dfp_slctd, ms_encoded], axis=1)

# parent_encoded = pd.get_dummies(dfp_slctd['Parent'], prefix='Parent').astype(int)
# dfp_slctd = pd.concat([dfp_slctd, parent_encoded], axis=1)

# # drop marital status and parent column after encoded(redundant)
# dfp_slctd.drop(columns=['Marital_Status', 'Parent'], inplace=True)

# print('\ndataframe after feature encoding :')
# display(dfp_slctd.head())

#### Standardization

In [None]:
# Fit a standard scaler on the selected features, then transform them.
scaler = StandardScaler().fit(dfp_slctd)
scaled_data = scaler.transform(dfp_slctd)

# Wrap the scaled array in a frame that keeps the original column names.
scaled_dfp = pd.DataFrame(data=scaled_data, columns=dfp_slctd.columns)

print('\ndataframe after scaled(standarized) :')
scaled_dfp.head()

## Task 3 : Modelling
Goals : Group customers into several clusters<br><br>
Objective : 
Apply the k-means clustering algorithm to the existing dataset, choose the correct number of clusters by looking at the elbow method, and evaluate using the silhouette score.

### PCA 1st

In [None]:
# Project the standardized features onto their first two principal components.
pca = PCA(n_components=2)
components = pca.fit_transform(scaled_dfp)
dfpca = pd.DataFrame(components, columns=['PC1', 'PC2'])

### Find the optimal n cluster with Elbow Method and Silhouette Method 

In [None]:
sse = {}
sil = []
kmax = 10
# plt.subplots returns (figure, axes); unpack it rather than binding the tuple to `fig`.
fig, (ax_elbow, ax_sil) = plt.subplots(nrows=1, ncols=2, figsize=(20, 5), facecolor='#E8E8E8')

# Elbow method: inertia for k = 1..kmax.
# random_state is fixed so the curve is reproducible between runs, and the
# upper bound follows kmax like the silhouette loop below.
for k in range(1, kmax + 1):
    kmeans = KMeans(n_clusters=k, random_state=randomstate, n_init=10).fit(dfpca)
    sse[k] = kmeans.inertia_  # inertia: sum of squared distances of samples to their closest centre
elbow_k = list(sse.keys())
elbow_sse = list(sse.values())
sns.lineplot(x=elbow_k, y=elbow_sse, ax=ax_elbow)
sns.scatterplot(x=elbow_k, y=elbow_sse, s=50, ax=ax_elbow)
ax_elbow.set_title('Elbow Method')
ax_elbow.set_xlabel("k : Number of cluster")
ax_elbow.set_ylabel("Sum of Squared Error")
ax_elbow.grid()

# Silhouette method: the score needs at least two clusters, so k starts at 2.
sil_k = list(range(2, kmax + 1))
for k in sil_k:
    kmeans = KMeans(n_clusters=k, random_state=randomstate, n_init=10).fit(dfpca)
    labels = kmeans.labels_
    sil.append(silhouette_score(dfpca, labels, metric='euclidean'))
sns.lineplot(x=sil_k, y=sil, ax=ax_sil)
sns.scatterplot(x=sil_k, y=sil, ax=ax_sil)
ax_sil.set_title('Silhouette Score Method')
ax_sil.set_xlabel("k : Number of cluster")
ax_sil.set_ylabel("Silhouette Score")
ax_sil.grid()

plt.show()



In [None]:
fig = plt.figure(figsize=(12,8))

# Yellowbrick elbow visualizer over k = 2..9.
model = KMeans(random_state=randomstate, n_init=10)
visualizer = KElbowVisualizer(model, k=(2,10))

visualizer.fit(dfpca)
# show() is the current name of the finalize-and-render call; poof() is deprecated.
visualizer.show()

In [None]:
# Collect inertia and silhouette score for k = 2..9.
k_values = range(2, 10)
inertia = []
silhouette = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=randomstate, n_init="auto").fit(dfpca)
    inertia.append(kmeans.inertia_)
    silhouette.append(silhouette_score(dfpca, kmeans.labels_))

# Both curves share the x axis; each one gets its own y axis.
line_style = {"marker": "o", "linestyle": "--"}

fig, ax1 = plt.subplots()
ax1.set_xlabel("k")
ax1.set_ylabel("inertia score", color="tab:blue")
ax1.plot(k_values, inertia, color="tab:blue", label="inertia", **line_style)
ax1.tick_params(axis="y", labelcolor="tab:blue")

ax2 = ax1.twinx()
ax2.set_ylabel("silhouette score", color="tab:red")
ax2.plot(k_values, silhouette, color="tab:red", label="silhouette", **line_style)
ax2.tick_params(axis="y", labelcolor="tab:red")

# Merge the handles of both axes into a single legend.
lines, labels = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax2.legend(lines + lines2, labels + labels2, loc="upper right")

plt.title("Inertia-Silhouette Score")
plt.show()

In [None]:
# One silhouette plot per candidate k (2..5), filling the 2x2 grid row by row.
fig, ax = plt.subplots(2, 2, figsize=(15, 8))
for n_clusters, panel in zip(range(2, 6), ax.flat):
    kmeans = KMeans(n_clusters=n_clusters, random_state=randomstate, n_init='auto')
    visualizer = SilhouetteVisualizer(kmeans, colors="yellowbrick", ax=panel)
    visualizer.fit(dfpca)
    panel.set_title(f'Silhouette plot for {n_clusters} clusters')

optimal n_cluster = 4

In [None]:
# NOTE(review): the markdown just above this cell says the optimal n_cluster
# is 4, while 5 is used here — confirm which value is intended.
k_optimal = 5

# Fit and predict on the two principal components only. The labels used to be
# predicted a second time after 'k_cluster' had been added to dfpca, which
# hands the model three columns instead of the two it was fitted on and raises
# a ValueError. Selecting the columns explicitly also makes the cell re-runnable.
pca_features = dfpca[['PC1', 'PC2']]
kmeans = KMeans(n_clusters=k_optimal, random_state=randomstate, n_init='auto')
kmeans.fit(pca_features)
label = kmeans.predict(pca_features)
dfpca['k_cluster'] = label
dfpca

In [None]:
# Customers in PCA space, coloured by assigned cluster.
plt.figure(figsize=(12, 8), facecolor='#E8E8E8')
cluster_ax = sns.scatterplot(data=dfpca, x='PC1', y='PC2', hue='k_cluster', palette='Set1')

# Mark the cluster centres with black crosses.
centroids = kmeans.cluster_centers_
cluster_ax.scatter(centroids[:, 0], centroids[:, 1], c='black', s=200, alpha=0.8, marker='x')

cluster_ax.set_title('K-Means Clustering', fontsize=18, fontweight='bold', y=1.03)
cluster_ax.set_xlabel('PCA 1', fontsize=12)
cluster_ax.set_ylabel('PCA 2', fontsize=12)
plt.show()


In [None]:
# Attach the cluster labels to the unscaled selected features.
# `label` is assigned positionally, so it must have one entry per row of dfp_slctd.
dfp_slctd.loc[:, 'k_cluster'] = label
dfp_slctd

In [None]:
# Distribution of ConversionRate within each cluster.
sns.boxplot(x='k_cluster', y='ConversionRate', data=dfp_slctd)

In [None]:
# Pairwise feature relationships, coloured by cluster.
sns.pairplot(dfp_slctd, hue='k_cluster', palette='Set1')