# Predict Customer Personality to Boost Marketing Campaign by Using Machine Learning

## Task 1 : Conversion Rate Analysis Based On Income, Spending And Age
Goals : Find a pattern of consumer behavior.<br>
Objective : 
- Feature engineering 
- Analyze Conversion Rate with other variables such as age, income, expenses, etc 

### Import Library

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Load Data

In [None]:
# Lift the column-display cap so wide frames are rendered in full.
pd.options.display.max_columns = None

# Read the raw campaign dataset and preview ten randomly chosen rows.
df = pd.read_csv('./data/marketing_campaign_data.csv')
df.sample(n=10)

In [None]:
# Print the column names, non-null counts and dtypes of the raw frame.
df.info()

### Feature Engineering
New Features :
- Age                = age for each customer
- AgeGroup           = age group for better interpretation in analysis ahead
- Parent             = whether the customer has at least one kid or teen at home
- NumChild           = how many children (kids + teens) the customer has at home
- TotalAcceptedCmp   = how many of the campaigns (AcceptedCmp1–5) the customer accepted
- Total Trx          = how many transactions the customer made in total (deals, web, catalog and store purchases)
- Online Trx         = how many online transactions the customer made on our platform
- ConversionRate     = ratio of web purchases to monthly web visits (the share of website visits that end in a web purchase)

In [None]:
# Work on a copy so the raw frame `df` stays untouched by feature engineering.
dfe = df.copy()

# Age as of the reference year used throughout this analysis.
REFERENCE_YEAR = 2024
dfe['Age'] = REFERENCE_YEAR - dfe['Year_Birth']

# Age group: the conditions are checked in order and the first match wins.
age_grouping = [
    dfe['Age'] >= 60,
    (dfe['Age'] >= 40) & (dfe['Age'] < 60),
    (dfe['Age'] >= 28) & (dfe['Age'] < 40),
]
age_category = ['Old Adults', 'Middled-aged Adults', 'Young Adults']

# np.select falls back to the integer 0 when no condition matches. That cannot
# share a dtype with the string labels (it yields '0' or a TypeError depending
# on the NumPy version), so rows matching no condition (age below 28 or
# missing) get an explicit string label instead.
dfe['AgeGroup'] = np.select(age_grouping, age_category, default='Unknown')

# Helper for the Parent column
def has_kid(row):
    """Return 'yes' if the row has a positive 'Kidhome' or 'Teenhome', else 'no'."""
    is_parent = row['Kidhome'] > 0 or row['Teenhome'] > 0
    return 'yes' if is_parent else 'no'
# 'yes' / 'no' flag: does the customer have a kid or teen at home?
dfe['Parent'] = dfe.apply(has_kid, axis=1)

# Number of children = kids at home + teens at home.
dfe['NumChild'] = dfe['Kidhome'] + dfe['Teenhome']

# Row-wise totals. skipna=False keeps the semantics of chained `+`:
# a missing component makes the total missing instead of being ignored.
campaign_cols = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5']
dfe['TotalAcceptedCmp'] = dfe[campaign_cols].sum(axis=1, skipna=False)

spending_cols = ['MntCoke', 'MntFruits', 'MntMeatProducts', 'MntFishProducts',
                 'MntSweetProducts', 'MntGoldProds']
dfe['TotalSpending'] = dfe[spending_cols].sum(axis=1, skipna=False)

purchase_cols = ['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']
dfe['TotalTrx'] = dfe[purchase_cols].sum(axis=1, skipna=False)

# ConversionRate = web purchases per web visit.
# Dividing by zero visits would produce inf, which distorts every mean taken
# over the column and is rejected by StandardScaler later on. Masking zero
# visits to NaN makes the ratio missing instead; missing ratios are filled
# with the median in the missing-value step further down.
web_visits = dfe['NumWebVisitsMonth'].where(dfe['NumWebVisitsMonth'] > 0)
dfe['ConversionRate'] = dfe['NumWebPurchases'] / web_visits

In [None]:
# Original columns of interest plus the engineered ones, shown for ten random rows.
preview_cols = [
    'Education', 'Marital_Status', 'Income', 'Recency', 'NumWebVisitsMonth',
    'Complain', 'Z_CostContact', 'Z_Revenue', 'Response',
    'Age', 'AgeGroup', 'Parent', 'NumChild', 'TotalAcceptedCmp',
    'TotalSpending', 'TotalTrx', 'ConversionRate',
]
dfe[preview_cols].sample(n=10)

In [None]:
# Summary statistics of the numeric columns, including the engineered features.
dfe.describe()

### EDA

In [None]:
# Scatter of income against conversion rate, one marker per customer.
plt.figure(facecolor='#E8E8E8', figsize=(12, 8))
sns.scatterplot(data=dfe, x='Income', y='ConversionRate', color='#D1106F')

# Clip both axes to fixed windows.
plt.xlim(0, 200_000_000)
plt.ylim(0, 4.7)

# Dashed blue reference line at an income of 110,000,000.
plt.axvline(x=110_000_000, linestyle='--', color='b')

# Title and axis labels.
plt.title(
    "Customer Conversion Rate and Income Correlation",
    y=1.02,
    fontsize=19,
    fontweight='bold',
)
plt.xlabel('Income', fontsize=13.5)
plt.ylabel('Conversion Rate', fontsize=13.5)


In [None]:
# Scatter of total spending against income, one marker per customer.
plt.figure(facecolor='#E8E8E8', figsize=(12, 8))
sns.scatterplot(data=dfe, x='TotalSpending', y='Income', color='#D1106F')

# Clip both axes to fixed windows.
plt.ylim(0, 122_000_000)
plt.xlim(0, 2_700_000)

# Dashed blue reference line at a total spending of 2,540,000.
plt.axvline(x=2_540_000, linestyle='--', color='b')

# Title and axis labels.
plt.title(
    'Customer Income and Total Spending Correlation',
    y=1.03,
    fontsize=17,
    fontweight='bold',
)
plt.xlabel('Total Spending', fontsize=13.5)
plt.ylabel('Income', fontsize=13.5)

In [None]:
# Scatter of total spending against conversion rate, one marker per customer.
plt.figure(facecolor='#E8E8E8', figsize=(12, 8))
sns.scatterplot(data=dfe, x='TotalSpending', y='ConversionRate', color='#D1106F')

# Only the y axis is clipped; the x axis keeps its automatic range.
plt.ylim(0, 3.8)

plt.title(
    'Correlation Between Conversion Rate and Total Spending',
    y=1.02,
    fontsize=18,
    fontweight='bold',
)
plt.xlabel('Total Spending', fontsize=13.5)
plt.ylabel('Conversion Rate', fontsize=13.5)

In [None]:
# Number of customers in each age group (most frequent first).
age_counts = dfe['AgeGroup'].value_counts()
palt = ['#00D19B', '#D1106F', '#25A9D9']

# Pie chart with the share of each group printed on its wedge.
plt.figure(facecolor='#E8E8E8', figsize=(12, 8))
pie_parts = plt.pie(
    age_counts,
    colors=palt,
    autopct='%1.1f%%',
    textprops=dict(size=13),
)
patches, texts, autotexts = pie_parts

# Legend mapping each wedge to its age group.
plt.legend(patches, age_counts.index, loc="best")

plt.title("Distribution of Customer by Age Group", y=1.03, fontsize=18, fontweight='bold')
plt.show()

In [None]:
# Number of customers per Parent value ('yes' / 'no'), most frequent first.
parent_counts = dfe['Parent'].value_counts()
palt = ['#00D19B', '#D1106F']

# Pie chart with the share of each value printed on its wedge.
plt.figure(facecolor='#E8E8E8', figsize=(12, 8))
pie_parts = plt.pie(
    parent_counts,
    colors=palt,
    autopct='%1.1f%%',
    textprops=dict(size=13),
)
patches, texts, autotexts = pie_parts

# Legend mapping each wedge to its Parent value.
plt.legend(patches, parent_counts.index, loc="best")

plt.title("Parent Customer Distribution", x=0.54, y=1.02, fontsize=18, fontweight='bold')
plt.show()

In [None]:

# Mean conversion rate per age group, youngest to oldest.
plt.figure(facecolor='#E8E8E8', figsize=(10, 8))
palt = ['#D1106F', '#00D19B', '#25A9D9']
age_order = ['Young Adults', 'Middled-aged Adults', 'Old Adults']
barplot = sns.barplot(
    data=dfe,
    x='AgeGroup',
    y='ConversionRate',
    hue='AgeGroup',
    order=age_order,
    legend=False,
    palette=palt,
    errorbar=None,
    edgecolor='black',
)

# Write each bar's value just above its top edge.
for bar in barplot.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    barplot.text(
        bar_center,
        bar_height + 0.01,
        f'{bar_height:1.2f}',
        ha="center",
        fontweight='bold',
    )

plt.ylim(0, 1.5)
plt.title("Conversion Rate by Age Group", y=1.03, fontsize=18, fontweight='bold')
plt.xlabel('Age Group', fontsize=12)
plt.ylabel('Conversion Rate', fontsize=12)

In [None]:
# Mean total spending per age group, youngest to oldest.
plt.figure(facecolor='#E8E8E8', figsize=(10, 8))
palt = ['#D1106F', '#00D19B', '#25A9D9']
age_order = ['Young Adults', 'Middled-aged Adults', 'Old Adults']
barplot = sns.barplot(
    data=dfe,
    x='AgeGroup',
    y='TotalSpending',
    hue='AgeGroup',
    order=age_order,
    legend=False,
    palette=palt,
    errorbar=None,
    edgecolor='black',
)

# Label each bar with its value, offset 10 points above the bar top.
for bar in barplot.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    barplot.annotate(
        f'{bar_height:.2f}',
        (bar_center, bar_height),
        ha='center',
        va='center',
        xytext=(0, 10),
        textcoords='offset points',
        fontweight='bold',
    )

plt.ylim(0, 820_000)
plt.title("Total Spending by Age Group", y=1.03, fontsize=18, fontweight='bold')
plt.xlabel('Age Group', fontsize=13)
plt.ylabel('Total Spending', fontsize=13)

In [None]:
# Mean number of accepted campaigns per age group, youngest to oldest.
plt.figure(figsize=(10, 8), facecolor='#E8E8E8')
palt = ['#D1106F', '#00D19B', '#25A9D9']
age_order = ['Young Adults', 'Middled-aged Adults', 'Old Adults']
barplot = sns.barplot(
    data=dfe,
    x='AgeGroup',
    y='TotalAcceptedCmp',
    hue='AgeGroup',
    order=age_order,
    legend=False,
    palette=palt,
    errorbar=None,
    edgecolor='black',
)

# Label each bar with its value, offset 10 points above the bar top.
for p in barplot.patches:
    barplot.annotate(
        format(p.get_height(), '.2f'),
        (p.get_x() + p.get_width() / 2., p.get_height()),
        ha='center',
        va='center',
        xytext=(0, 10),
        textcoords='offset points',
        fontweight='bold',
    )

# The title and y label name the plotted variable (TotalAcceptedCmp); they
# previously read "Total Spending", copied from the spending chart.
plt.title("Total Accepted Campaigns by Age Group", fontsize=18, fontweight='bold', y=1.03)
plt.xlabel('Age Group', fontsize=13)
plt.ylabel('Total Accepted Campaigns', fontsize=13)

In [None]:
# Mean conversion rate for each number of children.
plt.figure(facecolor='#E8E8E8', figsize=(12, 8))
palt = ['#D1106F', '#00D19B', '#25A9D9', '#D16F11']
barplot = sns.barplot(
    data=dfe,
    x='NumChild',
    y='ConversionRate',
    hue='NumChild',
    legend=False,
    palette=palt,
    errorbar=None,
    edgecolor='black',
)

# Label each bar with its value, offset 10 points above the bar top.
for bar in barplot.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    barplot.annotate(
        f'{bar_height:.2f}',
        (bar_center, bar_height),
        ha='center',
        va='center',
        xytext=(0, 10),
        textcoords='offset points',
        fontweight='bold',
    )

plt.ylim(0, 2.2)
plt.title(
    "Customer Conversion Rate by Number of Children",
    y=1.03,
    fontsize=18,
    fontweight='bold',
)
plt.xlabel('Number of Children', fontsize=13.5)
plt.ylabel('Conversion Rate', fontsize=13.5)

In [None]:
# Mean conversion rate for parents versus non-parents.
plt.figure(facecolor='#E8E8E8', figsize=(10, 8))
palt = ['#D1106F', '#00D19B']
barplot = sns.barplot(
    data=dfe,
    x='Parent',
    y='ConversionRate',
    hue='Parent',
    legend=False,
    palette=palt,
    errorbar=None,
    edgecolor='black',
)

# Label each bar with its value, offset 10 points above the bar top.
for bar in barplot.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    barplot.annotate(
        f'{bar_height:.2f}',
        (bar_center, bar_height),
        ha='center',
        va='center',
        xytext=(0, 10),
        textcoords='offset points',
        fontweight='bold',
    )

plt.ylim(0, 2.3)
plt.title('Conversion Rate by Parental Status', y=1.03, fontsize=18, fontweight='bold')
plt.xlabel('Parental Status', fontsize=12)
plt.ylabel('Conversion Rate', fontsize=12)

In [None]:
# Mean conversion rate per education level, lowest to highest level.
plt.figure(figsize=(10, 8), facecolor='#E8E8E8')
palt = ['#D1106F', '#00D19B', '#25A9D9', '#D16F11', '#6F11D1']
# The last entry used to repeat 'S2', which drew S2 twice and left S3 off the
# chart; each level must appear exactly once.
ed_order = ['SMA', 'D3', 'S1', 'S2', 'S3']
barplot = sns.barplot(
    data=dfe,
    x='Education',
    y='ConversionRate',
    hue='Education',
    order=ed_order,
    legend=False,
    palette=palt,
    errorbar=None,
    edgecolor='black',
)

# Write each bar's value just above its top edge.
for p in barplot.patches:
    height = p.get_height()
    barplot.text(
        p.get_x() + p.get_width() / 2.,
        height + 0.01,
        '{:1.2f}'.format(height),
        ha="center",
    )

plt.ylim(0, 1.28)
plt.title('Conversion Rate by Education Level', fontsize=18, fontweight='bold', y=1.03)
plt.xlabel('Education', fontsize=12)
plt.ylabel('Conversion Rate', fontsize=12)


In [None]:
# Numeric columns whose pairwise correlations are drawn.
num = [
    'Income', 'Recency', 'NumWebVisitsMonth',
    'Complain', 'Response', 'Age', 'NumChild', 'TotalAcceptedCmp',
    'TotalSpending', 'TotalTrx', 'ConversionRate',
]
corr_matrix = dfe[num].corr()

# Annotated heatmap of the correlation matrix.
plt.figure(facecolor='#E8E8E8', figsize=(18, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap', y=1.02, fontsize=18, fontweight='bold')
plt.show()

### Data Preprocessing

#### Handle missing values

In [None]:
# Preprocessing works on its own copy of the feature-engineered frame.
dfp = dfe.copy()

# Report only the columns that contain at least one missing value.
null_counts = dfp.isna().sum()
missing_col = null_counts.loc[null_counts > 0]
print(f'Missing Values : \n \n{missing_col}')

In [None]:
# Side-by-side density plots of the two columns inspected before imputation.
plt.figure(facecolor='#E8E8E8', figsize=(12, 5))

kde_panels = [('Income', 'Income'), ('ConversionRate', 'Conversion Rate')]
for position, (column, panel_title) in enumerate(kde_panels, start=1):
    plt.subplot(1, 2, position)
    sns.kdeplot(data=dfp, x=column, fill=True, color='#D1106F')
    plt.title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Fill the missing Income and ConversionRate values with each column's median,
# reporting the missing counts before and after.
income_median = dfp['Income'].median()
conversion_median = dfp['ConversionRate'].median()

print(f"Total Missing Values on Income Column = {dfp['Income'].isna().sum()}")
print(f"Total Missing Values on Conversion Rate Column = {dfp['ConversionRate'].isna().sum()}")

print(f"\nIncome Median to fill the missing value: {income_median}")
print(f"Conversion Rate Median to fill the missing value: {conversion_median}")

# Assign the filled column back instead of calling fillna(inplace=True) on
# dfp[col]: the in-place form acts on an intermediate object, is deprecated in
# pandas 2.x and leaves dfp unchanged under Copy-on-Write.
dfp['Income'] = dfp['Income'].fillna(income_median)
dfp['ConversionRate'] = dfp['ConversionRate'].fillna(conversion_median)

print(f"\nMissing Values on Income Column after handling = {dfp['Income'].isna().sum()}")
print(f"Missing Values on Conversion Rate Column after handling = {dfp['ConversionRate'].isna().sum()}")


#### No Duplicates

In [None]:
# Count the rows that are exact duplicates of an earlier row.
print(f"Total Duplicated = {dfp.duplicated().sum()}")

#### Fix the Infinity Value On Conversion Rate Features

#### Feature Selection

In [None]:
# List every column currently in the preprocessing frame.
dfp.columns

In [None]:
# Features kept for modelling; every other column is listed in `uncssry`.
selected_features = [
    'Education', 'Marital_Status', 'Income',
    'Recency',
    'AgeGroup', 'Parent', 'NumChild', 'TotalAcceptedCmp',
    'TotalSpending', 'TotalTrx', 'ConversionRate',
]
dfp_slctd = dfp[selected_features].copy()

# Columns left out of the selection, printed for reference only.
uncssry = [
    'Unnamed: 0', 'ID', 'Year_Birth', 'Kidhome', 'Teenhome', 'Dt_Customer',
    'MntCoke', 'MntFruits', 'MntMeatProducts', 'MntFishProducts',
    'MntSweetProducts', 'MntGoldProds',
    'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases',
    'NumStorePurchases', 'NumWebVisitsMonth',
    'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1', 'AcceptedCmp2',
    'Complain', 'Z_CostContact', 'Z_Revenue', 'Response', 'Age',
]
print(f"drop unecessary features and redundant features : \n{uncssry}")

In [None]:
# Display the new dataframe that holds only the selected features.
dfp_slctd

#### Feature Encoding

Features to label Encode :<br>
- Education
- Age Group

Features to One Hot Encode: <br>
- Marital_Status
- Parent

In [None]:
# Ordinal (label) encoding
# Education and AgeGroup are ordered categories, so each is encoded by its
# rank. An alphabetical label encoding would rank them 'D3' < 'S1' < 'S2' <
# 'S3' < 'SMA' and 'Middled-aged Adults' < 'Old Adults' < 'Young Adults',
# which does not reflect the real order.
education_order = ['SMA', 'D3', 'S1', 'S2', 'S3']
age_group_order = ['Young Adults', 'Middled-aged Adults', 'Old Adults']

# `.codes` is the position of each value in the category list; a value that is
# not in the list is encoded as -1.
dfp_slctd['Education'] = pd.Categorical(
    dfp_slctd['Education'], categories=education_order, ordered=True
).codes
dfp_slctd['AgeGroup'] = pd.Categorical(
    dfp_slctd['AgeGroup'], categories=age_group_order, ordered=True
).codes

# One-hot encoding of the unordered categories.
ms_encoded = pd.get_dummies(dfp_slctd['Marital_Status'], prefix='Status').astype(int)
dfp_slctd = pd.concat([dfp_slctd, ms_encoded], axis=1)

parent_encoded = pd.get_dummies(dfp_slctd['Parent'], prefix='Parent').astype(int)
dfp_slctd = pd.concat([dfp_slctd, parent_encoded], axis=1)

# The source columns are redundant once their dummy columns exist.
dfp_slctd = dfp_slctd.drop(columns=['Marital_Status', 'Parent'])

#### Standardization

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the encoded features, then standardize those same features.
scaler = StandardScaler()
scaler.fit(dfp_slctd)
scaled_data = scaler.transform(dfp_slctd)

# Wrap the scaled array in a frame that reuses the original column names.
scaled_dfp = pd.DataFrame(data=scaled_data, columns=dfp_slctd.columns)
