In [77]:
import numpy as np
import pandas as pd

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [78]:
# Read the tab-separated campaign file, using the ID column as the row index.
filepath = '/kaggle/input/customer-personality-analysis/'
all_data = pd.read_csv(
    filepath + 'marketing_campaign.csv',
    sep='\t',
    index_col='ID',
)
all_data.head(10)

In [79]:
# Count how many customers carry each raw Marital_Status label.
all_data.Marital_Status.value_counts()

In [80]:
# Kids = children at home + teenagers at home.
all_data['Kids'] = all_data['Kidhome'].add(all_data['Teenhome'])
all_data.head()

In [81]:
# The two source columns are now summarised by 'Kids'.
all_data = all_data.drop(['Kidhome', 'Teenhome'], axis=1)

In [82]:
# Age relative to the year 2015 (2015 - Year_Birth), replacing the birth year.
all_data['Age'] = all_data['Year_Birth'].rsub(2015)
all_data = all_data.drop('Year_Birth', axis=1)

In [83]:
# Preview the frame after the Kids/Age feature engineering above.
all_data.head()

In [84]:
# Collapse Marital_Status into two groups: 'Married'/'Together' -> 'Relationship',
# every other label -> 'Single'. The raw column is dropped afterwards.
in_relationship = all_data['Marital_Status'].isin(['Married', 'Together'])
all_data = (
    all_data
    .assign(is_married=np.where(in_relationship, 'Relationship', 'Single'))
    .drop(columns=['Marital_Status'])
)

In [85]:
# Parse the day-month-year enrolment strings into datetimes and discard the raw text column.
enrolled_on = pd.to_datetime(all_data['Dt_Customer'], format="%d-%m-%Y")
all_data = all_data.assign(Dt_parsed=enrolled_on).drop(columns=['Dt_Customer'])

In [86]:
# Enroll_days = whole days between the enrolment date and the 2015-12-31 reference date.
basedate = pd.Timestamp('2015-12-31')
time_enrolled = basedate - all_data['Dt_parsed']
all_data = all_data.assign(Enroll_days=time_enrolled.dt.days).drop(columns=['Dt_parsed'])

In [87]:
# Reduce Education to two levels; any label not listed below becomes 'Undergrad'.
# NOTE(review): 'Graduation' and '2n Cycle' are grouped with the postgraduate
# labels here — confirm that this is the intended split.
postgrad_labels = ['PhD', 'Master', '2n Cycle', 'Graduation']
is_postgrad = all_data['Education'].isin(postgrad_labels)
all_data = all_data.assign(Education=np.where(is_postgrad, 'Postgrad', 'Undergrad'))

In [88]:
# Remove the two Z_* columns from the feature set.
all_data = all_data.drop(['Z_CostContact', 'Z_Revenue'], axis=1)

## See correlations of all features before clustering

In [89]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlations of the numeric features, computed once and reused.
# The string-valued columns (Education, is_married) are excluded explicitly:
# pandas >= 2.0 raises on non-numeric data instead of silently dropping it.
corr_matrix = all_data.select_dtypes(include='number').corr()

plt.figure(figsize=(20, 15))
# Mask the upper triangle (diagonal included) because it mirrors the lower one.
mask = np.triu(np.ones(corr_matrix.shape))
sns.heatmap(corr_matrix, annot=True, center=0, cmap='RdBu', mask=mask);

### Observations from correlation map:
#### 1) Amount of spending on the different categories (wine, meat, fruit, etc.) are all positively correlated, and the correlations among them are quite even ==> Combine these columns into 1 column "Expenses". The same goes for NumPurchases columns.
#### 2) The "AcceptedCmp" columns are very similar to each other. Aggregating them might be more helpful ==> Combine them into 1 column "Total accepted Cmp".
#### 3) "Complain" and "Recency" are not informative.

In [90]:
# Column groups that are each collapsed into a single aggregate feature.
spend_cols = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
campaign_cols = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'Response']
purchase_cols = ['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']

# skipna=False keeps the NaN-propagating behaviour of a chained `+`.
all_data['Expenses'] = all_data[spend_cols].sum(axis=1, skipna=False)
all_data['TotalAcceptedCmp'] = all_data[campaign_cols].sum(axis=1, skipna=False)
all_data['NumPurchases'] = all_data[purchase_cols].sum(axis=1, skipna=False)

# Drop the component columns plus 'Complain' and 'Recency'.
all_data = all_data.drop(columns=spend_cols + campaign_cols + purchase_cols + ['Complain', 'Recency'])
all_data.head(10)

In [91]:
# Correlations of the numeric features after aggregation, computed once and
# reused. The string-valued columns (Education, is_married) are excluded
# explicitly: pandas >= 2.0 raises on non-numeric data instead of dropping it.
corr_matrix = all_data.select_dtypes(include='number').corr()

plt.figure(figsize=(20, 15))
# Mask the upper triangle (diagonal included) because it mirrors the lower one.
mask = np.triu(np.ones(corr_matrix.shape))
sns.heatmap(corr_matrix, annot=True, center=0, cmap='RdBu', mask=mask);

In [92]:
import plotly.express as px
# Bar chart of Expenses against Enroll_days; bars are coloured by Enroll_days as well.
fig = px.bar(
    all_data,
    x='Enroll_days',
    y='Expenses',
    color='Enroll_days',
)
fig.show()

In [93]:
all_data.info()
# Select the numeric columns by dtype rather than by position. The previous
# positional slice only skipped the first column and still kept the
# string-valued is_married column, which makes the corrwith() call in the next
# cell fail on pandas >= 2.0 (non-numeric columns are no longer dropped silently).
cont_features = all_data.select_dtypes(include='number')

In [94]:
import matplotlib
# Bar chart of each feature's correlation with Expenses.
background_color = "#f6f5f5"
colors = ["#2f5586", "#f6f5f5", "#2f5586"]
colormap = matplotlib.colors.LinearSegmentedColormap.from_list("", colors)

# Single-axes figure with a shared background colour.
fig = plt.figure(figsize=(12, 8), facecolor=background_color)
gs = fig.add_gridspec(1, 1)
ax0 = fig.add_subplot(gs[0, 0])
ax0.set_facecolor(background_color)
ax0.text(-1.1, 1.25, 'Correlation of Continuous Features with Target', fontsize=20, fontweight='bold')

# One correlation value per column of cont_features, measured against Expenses.
chart_df = cont_features.corrwith(all_data['Expenses']).to_frame(name='corr')
sns.barplot(x=chart_df.index, y=chart_df['corr'], ax=ax0, color='mediumblue', zorder=3, linewidth=2)

# Thin grid lines on both axes, drawn behind the bars (zorder 0 < 3).
for axis_name in ('x', 'y'):
    ax0.grid(which='major', axis=axis_name, zorder=0, linewidth=0.4)
ax0.set_ylabel('')

# Keep only the bottom spine visible.
for side in ("top", "right", 'left'):
    ax0.spines[side].set_visible(False)

plt.show()


#### Summary:
#### 1) Expenses is highly correlated with Total Number of Purchases (0.75), Income (0.67), and moderately correlated with Total Accepted Campaigns.
#### 2) Expenses is negatively correlated with Number of Kids and Number of Web Visits.
#### 3) Not-so-relevant factors: Age, Enroll history.

## Clustering

### one-hot encoding

In [95]:
# X = all_data.copy()
# X = pd.get_dummies(X)
# X.head()
# from sklearn.model_selection import train_test_split

# X_train, X_valid = train_test_split(all_data,train_size=0.8, test_size=0.2,random_state=0)

### Label Encoding

In [96]:
# Work on a copy so the encoding/imputation steps below leave all_data untouched.
X = all_data.copy()
X.head()

In [97]:
from sklearn.preprocessing import LabelEncoder

# Replace the two string columns with integer codes. The same encoder object is
# re-fitted for each column, so afterwards it only holds the classes of the last one.
label_encoder = LabelEncoder()
for col in ('Education', 'is_married'):
    X[col] = label_encoder.fit_transform(X[col])

In [98]:
# Per-column count of missing values in X; only columns with at least one are printed.
missing_val_count_by_column = X.isna().sum()
has_missing = missing_val_count_by_column > 0
print(missing_val_count_by_column[has_missing])

### simple imputing

In [99]:
from sklearn.impute import SimpleImputer
# Fill missing values with each column's median, then restore the column
# labels and row index that the imputer's array output drops.
imputer = SimpleImputer(strategy='median')
X_imputed = pd.DataFrame(
    imputer.fit_transform(X),
    columns=X.columns,
    index=X.index,
)

In [100]:
# Continue with the imputed frame under the name X.
X = X_imputed
X.head()

In [101]:
# Share of income that is spent: Expenses divided by Income.
X['Expense_Income_Ratio'] = X['Expenses'].div(X['Income'])

In [102]:
# Preview X with the new Expense_Income_Ratio column.
X.head()

In [103]:
# Summary statistics of the expense/income ratio for customers with Income <= 65000.
X.loc[X['Income'] <= 65000, 'Expense_Income_Ratio'].describe()

In [104]:
# Purchases per unit of income: NumPurchases divided by Income.
X['NumPur_Income_Ratio'] = X['NumPurchases'].div(X['Income'])
X.head()

In [105]:
# Summary statistics of the purchases/income ratio for customers with Income <= 65000.
X.loc[X['Income'] <= 65000, 'NumPur_Income_Ratio'].describe()

In [106]:
# Expense/income ratio against income, one point per customer, coloured by the
# number of purchases. Axis and colorbar labels are set so the figure can be
# read on its own.
plt.figure(figsize=(15, 8))
plt.scatter(X['Income'], X['Expense_Income_Ratio'], s=5, c=X['NumPurchases'])
cbar = plt.colorbar()
cbar.set_label('NumPurchases')
plt.xlabel('Income')
plt.ylabel('Expense_Income_Ratio')
# Restrict both axes to a fixed window; points outside it are not shown.
plt.xlim(0, 110000)
plt.ylim(0, 0.04);

In [107]:
# Discard the purchases/income ratio again; only Expense_Income_Ratio is kept as a feature.
X = X.drop('NumPur_Income_Ratio', axis=1)
X.head()

In [108]:
# Call info() — without the parentheses the cell only echoes the bound method
# instead of printing the dtype / non-null summary.
X.info()

### standard scaling (only numerical columns)

In [109]:
# Show the current feature columns of X.
X.columns

In [110]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# numer_cols = ['Income', 'Expenses', 'NumPurchases',
#               'NumWebVisitsMonth', 'Kids', 'Age', 'Enroll_days','TotalAcceptedCmp','Expense_Income_Ratio']#,'Recency']

# categ_cols = ['Education', 'is_married']#,'AcceptedCmp3','Complain']

In [111]:
# numer = X[numer_cols]
# categ = X[categ_cols]
# numer.head()

In [112]:
# numer_scaled = pd.DataFrame(StandardScaler().fit_transform(numer))
# numer_scaled.columns = numer.columns
# numer_scaled.head()

In [113]:
# X = pd.merge(numer_scaled, categ, how = 'left', left_index=True, right_index=True)#, axis=1)#, join='inner')
# #X = X_scaled
# X.head()

In [114]:
# Standardise every column of X. Only the column labels are restored on the
# result; the row index is not passed, so X gets a default integer index from here on.
X_scaled = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns)
X = X_scaled
X.head()

In [115]:
# Exclude Age and Enroll_days from the clustering features.
X = X.drop(['Age', 'Enroll_days'], axis=1)

In [116]:
# (rows, columns) of the matrix passed to K-Means below.
X.shape

### Build clusters.

#### Elbow methods to check how many clusters are needed.

In [117]:
from sklearn.cluster import KMeans
# Elbow method: fit K-Means for k = 1..24 and record the within-cluster sum of
# squares (inertia) of each fit.
cluster_counts = range(1, 25)
wcss = []
for i in cluster_counts:
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=50).fit(X)
    wcss.append(kmeans.inertia_)

plt.plot(cluster_counts, wcss, color="mediumblue", marker='*')
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [118]:
# Final model with two clusters and a fixed seed.
kmeans = KMeans(n_clusters=2, random_state=42)
# fit_predict() fits the model and returns the labels in one pass; a separate
# fit() beforehand would only run the whole clustering a second time.
clusters = kmeans.fit_predict(X)
kmeans.cluster_centers_

In [119]:
# Attach each row's K-Means label to X as a new 'Cluster' column.
# NOTE(review): X is modified in place, so from here on it holds the label
# alongside the features — drop 'Cluster' before treating X as a feature matrix.
X["Cluster"] = clusters
X.head()

In [120]:
from sklearn import metrics
labels = kmeans.labels_
# Score on the features only. X gained a 'Cluster' column in the previous cell;
# leaving the labels inside the matrix being scored leaks them into the
# distances and inflates the result.
metrics.silhouette_score(X.drop(columns=['Cluster'], errors='ignore'), labels, metric='euclidean')

In [121]:
# Score on the features only: the 'Cluster' label column stored in X must not
# be part of the data the clustering is evaluated on.
metrics.calinski_harabasz_score(X.drop(columns=['Cluster'], errors='ignore'), labels)

In [122]:
# Score on the features only: the 'Cluster' label column stored in X must not
# be part of the data the clustering is evaluated on.
metrics.davies_bouldin_score(X.drop(columns=['Cluster'], errors='ignore'), labels)

In [123]:
# Call info() — without the parentheses the cell only echoes the bound method
# instead of printing the dtype / non-null summary.
X.info()

### Simple visualization (w/ selected features)
#### By changing feature1 and feature2, the personality and spending behaviors of different segments can be visualized.

In [124]:
# Scatter of two (scaled) features with one colour per cluster label.
feature1, feature2 = 'Income', 'Expense_Income_Ratio'
colors = {
    0: 'steelblue',
    1: 'salmon',
    2: "gray",
    3: 'forestgreen',
    4: 'magenta',
}
point_colors = X['Cluster'].map(colors)

plt.figure(figsize=(15, 8))
plt.scatter(X[feature1], X[feature2], c=point_colors, alpha=0.5)
plt.xlabel(feature1, fontsize=15)
plt.ylabel(feature2, fontsize=15)
# Fixed view window on the scaled axes.
plt.xlim(-2, 3)
plt.ylim(-1, 2)

In [127]:
# Scatter of two (scaled) features with one colour per cluster label.
feature1, feature2 = 'Income', 'Expenses'
colors = {
    0: 'steelblue',
    1: 'salmon',
    2: "gray",
    3: 'forestgreen',
    4: 'magenta',
}
point_colors = X['Cluster'].map(colors)

plt.figure(figsize=(15, 8))
plt.scatter(X[feature1], X[feature2], c=point_colors, alpha=0.5)
plt.xlabel(feature1, fontsize=15)
plt.ylabel(feature2, fontsize=15)
# Only the x-range is fixed; the y-range is left to matplotlib.
plt.xlim(-2, 5)

In [133]:
# Scatter of two (scaled) features with one colour per cluster label.
feature1, feature2 = 'NumPurchases', 'Expense_Income_Ratio'
colors = {
    0: 'steelblue',
    1: 'salmon',
    2: "gray",
    3: 'forestgreen',
    4: 'magenta',
}
point_colors = X['Cluster'].map(colors)

plt.figure(figsize=(15, 8))
plt.scatter(X[feature1], X[feature2], c=point_colors, alpha=0.5)
plt.xlabel(feature1, fontsize=15)
plt.ylabel(feature2, fontsize=15)
# Fixed view window on the scaled axes.
plt.xlim(-2, 3)
plt.ylim(-1, 2)

### Brief Summary:
#### 1) Customers can be segmented into 2-3 clusters for practical purposes.
#### 2) If segmented into 2 groups -- Salmon-colored customers have lower income, fewer purchases, (slightly) more kids, (slightly) less education, and accepted fewer campaign discounts. Blue-colored customers are the opposite.
#### 3) Lower-income customers (< 50000) have a fixed spending budget based on their income, while higher-income customers' expenses are not bound to income and vary a lot. So there's a lot more room to do marketing on this segment.
#### 4) For lower-income customers, expense increases linearly with the number of purchases. But making higher-income customers shop more times does not increase how much they spend. So instead, we can focus on making higher-income customers spend more on a single purchase. Like providing them site-wide coupons instead of discounts on specific items.

Side note:  
Income boundary ~ 50,000 for lower-income customers. A transition zone between 40,000 and 60,000.  
Lower-income: higher-income ~ 2:1 in customer counts.   
Lower-income expenses : higher-income expenses ~ 1:2. So 1 higher-income person spends almost 4 times more money than 1 lower-income customer.  
  
Company should focus more on higher-income customers since they spend the most among customers.   
Potential strategies --  
1) Retention: For existing customers, a. conduct surveys to better understand their preferences, like the brands they like, what aspects of goods they care about the most, etc. b. predict what they'll like for targeted promotions (recommender system) c. loyalty program.  
2) Acquisition: Attract more high-income customers: referral program, introduce their friends; targeted sign-up bonus (free/discounted gifts this group will like).

### Use PCA to visualize

In [38]:
from sklearn.decomposition import PCA

In [39]:
# #plotX is a DataFrame containing 5000 values sampled randomly from X
# plotX = pd.DataFrame(np.array(X.sample(5000)))

# #Rename plotX's columns since it was briefly converted to an np.array above
# plotX.columns = X.columns
# The sampling above is disabled: plot every row instead. This binds plotX to
# the same object as X (no copy is made).
plotX = X
plotX.head()

In [40]:
# One PCA model per target dimensionality: 1, 2 and 3 principal components.
pca_1d, pca_2d, pca_3d = (PCA(n_components=n_dims) for n_dims in (1, 2, 3))

In [41]:
# PCA is fitted on the features only, so the cluster label column is removed first.
pca_input = plotX.drop(columns=["Cluster"])

# Single principal component, for the 1-D view.
PCs_1d = pd.DataFrame(pca_1d.fit_transform(pca_input))

# Two principal components, for the 2-D view.
PCs_2d = pd.DataFrame(pca_2d.fit_transform(pca_input))

# Three principal components, for the 3-D view.
PCs_3d = pd.DataFrame(pca_3d.fit_transform(pca_input))

In [42]:
# Column naming scheme "PC<k>_<n>d": the k-th principal component of the
# projection built for the n-dimensional visualization.
for pcs_frame, component_names in (
    (PCs_1d, ["PC1_1d"]),
    (PCs_2d, ["PC1_2d", "PC2_2d"]),
    (PCs_3d, ["PC1_3d", "PC2_3d", "PC3_3d"]),
):
    pcs_frame.columns = component_names

In [43]:
# Append the 1-D, 2-D and 3-D principal-component columns to the feature frame.
# join='inner' keeps only the index labels that are present in every frame.
# NOTE(review): presumably all four frames share the same default integer index,
# so no rows are lost here — confirm.
plotX = pd.concat([plotX,PCs_1d,PCs_2d,PCs_3d], axis=1, join='inner')
plotX.head()

In [44]:
# Constant column used as the y-coordinate when plotting the 1-D projection.
plotX["dummy"] = 0

In [45]:
# Split plotX into one sub-DataFrame per cluster label; each is plotted as its own trace.
# NOTE(review): the K-Means cell in this notebook uses n_clusters=2, in which
# case cluster2 is empty — confirm whether a third cluster is expected here.
cluster0, cluster1, cluster2 = (
    plotX[plotX["Cluster"] == cluster_id] for cluster_id in (0, 1, 2)
)

In [46]:
#plotly imports
import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [47]:
# Set up plotly's notebook mode before any iplot() call so that figures render
# in the notebook's output cells.
init_notebook_mode(connected=True)

In [48]:
# 1-D view: first principal component on the x-axis, constant "dummy" value on
# the y-axis, one marker trace per cluster.
cluster_styles = [
    (cluster0, "Cluster 0", 'rgba(255, 128, 255, 0.8)'),
    (cluster1, "Cluster 1", 'rgba(255, 128, 2, 0.8)'),
    (cluster2, "Cluster 2", 'rgba(0, 255, 200, 0.8)'),
]

data = [
    go.Scatter(
        x=frame["PC1_1d"],
        y=frame["dummy"],
        mode="markers",
        name=label,
        marker={"color": rgba},
        text=None,
    )
    for frame, label, rgba in cluster_styles
]
# Individual trace names, in cluster order (0, 1, 2).
trace1, trace2, trace3 = data

title = "Visualizing Clusters in One Dimension Using PCA"

layout = {
    "title": title,
    "xaxis": {"title": 'PC1', "ticklen": 5, "zeroline": False},
    "yaxis": {"title": '', "ticklen": 5, "zeroline": False},
}

fig = {"data": data, "layout": layout}

iplot(fig)

In [49]:
# 2-D view: first two principal components, one marker trace per cluster.
cluster_styles = [
    (cluster0, "Cluster 0", 'rgba(255, 128, 255, 0.8)'),
    (cluster1, "Cluster 1", 'rgba(255, 128, 2, 0.8)'),
    (cluster2, "Cluster 2", 'rgba(0, 255, 200, 0.8)'),
]

data = [
    go.Scatter(
        x=frame["PC1_2d"],
        y=frame["PC2_2d"],
        mode="markers",
        name=label,
        marker={"color": rgba},
        text=None,
    )
    for frame, label, rgba in cluster_styles
]
# Individual trace names, in cluster order (0, 1, 2).
trace1, trace2, trace3 = data

title = "Visualizing Clusters in Two Dimensions Using PCA"

layout = {
    "title": title,
    "xaxis": {"title": 'PC1', "ticklen": 5, "zeroline": False},
    "yaxis": {"title": 'PC2', "ticklen": 5, "zeroline": False},
}

fig = {"data": data, "layout": layout}

iplot(fig)

In [50]:
# 3-D view: first three principal components, one Scatter3d trace per cluster.
cluster_styles = [
    (cluster0, "Cluster 0", 'rgba(255, 128, 255, 0.8)'),
    (cluster1, "Cluster 1", 'rgba(255, 128, 2, 0.8)'),
    (cluster2, "Cluster 2", 'rgba(0, 255, 200, 0.8)'),
]

data = [
    go.Scatter3d(
        x=frame["PC1_3d"],
        y=frame["PC2_3d"],
        z=frame["PC3_3d"],
        mode="markers",
        name=label,
        marker=dict(color=rgba),
        text=None,
    )
    for frame, label, rgba in cluster_styles
]
# Individual trace names, in cluster order (0, 1, 2).
trace1, trace2, trace3 = data

title = "Visualizing Clusters in Three Dimensions Using PCA"

# 3-D traces take their axes from layout.scene; top-level layout.xaxis/yaxis
# only affect 2-D cartesian plots, so the axis titles were never shown. The
# settings are moved under `scene` and the missing z-axis title is added.
layout = dict(title = title,
              scene = dict(
                  xaxis = dict(title= 'PC1',ticklen= 5,zeroline= False),
                  yaxis = dict(title= 'PC2',ticklen= 5,zeroline= False),
                  zaxis = dict(title= 'PC3',ticklen= 5,zeroline= False)
              )
             )

fig = dict(data = data, layout = layout)

iplot(fig)