In [1]:
import numpy as np
import pandas as pd

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

In [2]:
# Load the tab-separated campaign file, using the customer ID column as the index.
filepath = '/kaggle/input/customer-personality-analysis/'
all_data = pd.read_csv(
    f'{filepath}marketing_campaign.csv',
    sep='\t',
    index_col='ID',
)
all_data.head(10)

In [3]:
# Frequency of each marital-status label.
all_data['Marital_Status'].value_counts()

In [4]:
# Total number of children at home: young kids plus teenagers.
all_data['Kids'] = all_data['Kidhome'].add(all_data['Teenhome'])
all_data.head()

In [5]:
# The two source columns are now summarised by `Kids`.
all_data = all_data.drop(['Kidhome', 'Teenhome'], axis='columns')

In [6]:
# Replace the birth year with an age computed relative to the year 2015.
all_data = (
    all_data
    .assign(Age=2015 - all_data['Year_Birth'])
    .drop(columns=['Year_Birth'])
)

In [7]:
# Preview the frame after the Kids and Age columns replaced their source columns.
all_data.head()

In [8]:
# Binary flag: 1 for 'Married' or 'Together', 0 for every other status.
has_partner = all_data['Marital_Status'].isin(['Married', 'Together'])
all_data = (
    all_data
    .drop(columns=['Marital_Status'])
    .assign(is_married=has_partner.astype('int64'))
)

In [9]:
# Parse the day-month-year enrolment date string into a datetime column.
all_data = (
    all_data
    .assign(Dt_parsed=pd.to_datetime(all_data['Dt_Customer'], format="%d-%m-%Y"))
    .drop(columns=['Dt_Customer'])
)

In [10]:
# Days each customer has been enrolled, measured up to the reference date.
basedate = pd.Timestamp('2015-12-31')
days_enrolled = (basedate - all_data['Dt_parsed']).dt.days
all_data = all_data.drop(columns=['Dt_parsed']).assign(Enroll_days=days_enrolled)

In [11]:
# Collapse the education levels into two groups.
# NOTE(review): 'Graduation' is grouped with the postgraduate labels here - confirm this is intended.
higher_levels = ['PhD', 'Master', '2n Cycle', 'Graduation']
is_higher = all_data['Education'].isin(higher_levels)
all_data = all_data.assign(
    Education=is_higher.map({True: 'Postgrad', False: 'Undergrad'})
)

In [12]:
# Remove the two Z_* columns; they are not used in the rest of the analysis.
all_data = all_data.drop(['Z_CostContact', 'Z_Revenue'], axis='columns')

## See correlations of all features before clustering

In [13]:
import matplotlib.pyplot as plt
import seaborn as sns

# Correlate numeric/boolean columns only: `Education` still holds strings at
# this point, and a plain `DataFrame.corr()` raises on such columns in recent
# pandas releases. The matrix is computed once and reused for mask and plot.
corr_matrix = all_data.select_dtypes(include=['number', 'bool']).corr()

plt.figure(figsize=(20, 15))
# Mask the upper triangle (diagonal included) because it duplicates the lower one.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(corr_matrix, annot=True, center=0, cmap='RdBu', mask=mask);

### Observations from correlation map:
#### 1) The amounts spent on the different categories (wine, meat, fruit, etc.) are all positively correlated, and the correlations among them are fairly even ==> Combine these columns into one column, "Expenses". The same goes for the "Num...Purchases" columns, which are combined into "NumPurchases".
#### 2) The "AcceptedCmp" columns are very similar to each other, so aggregating them is likely more helpful ==> Combine them into one column, "TotalAcceptedCmp".
#### 3) "Complain" and "Recency" are not informative ==> Drop them.

In [14]:
# Column groups that are each collapsed into a single total.
spend_cols = ['MntWines', 'MntFruits', 'MntMeatProducts',
              'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
campaign_cols = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3',
                 'AcceptedCmp4', 'AcceptedCmp5', 'Response']
purchase_cols = ['NumDealsPurchases', 'NumWebPurchases',
                 'NumCatalogPurchases', 'NumStorePurchases']

# skipna=False keeps the semantics of chained `+`: a missing value in any
# component makes the total missing as well.
all_data['Expenses'] = all_data[spend_cols].sum(axis=1, skipna=False)
all_data['TotalAcceptedCmp'] = all_data[campaign_cols].sum(axis=1, skipna=False)
all_data['NumPurchases'] = all_data[purchase_cols].sum(axis=1, skipna=False)

# Drop the component columns together with 'Complain' and 'Recency'.
all_data = all_data.drop(
    columns=spend_cols + campaign_cols + purchase_cols + ['Complain', 'Recency']
)
all_data.head(10)

In [15]:
# Correlate numeric/boolean columns only: `Education` is a string column, and a
# plain `DataFrame.corr()` raises on such columns in recent pandas releases.
# The matrix is computed once and reused for both the mask and the plot.
reduced_corr = all_data.select_dtypes(include=['number', 'bool']).corr()

plt.figure(figsize=(20, 15))
# Mask the upper triangle (diagonal included) because it duplicates the lower one.
mask = np.triu(np.ones_like(reduced_corr, dtype=bool))
sns.heatmap(reduced_corr, annot=True, center=0, cmap='RdBu', mask=mask);

In [16]:
import plotly.express as px
# Bar chart of Expenses against Enroll_days, coloured by Enroll_days.
fig = px.bar(
    data_frame=all_data,
    x='Enroll_days',
    y='Expenses',
    color='Enroll_days',
)
fig.show()

In [17]:
all_data.info()
# Select the numeric/boolean columns by dtype rather than by position.
# The previous positional slice (`iloc[:, 1:]`) only worked while the single
# string column (`Education`) happened to be the first column of the frame.
cont_features = all_data.select_dtypes(include=['number', 'bool'])

In [18]:
import matplotlib
# Styling constants for the chart.
background_color = "#f6f5f5"
colors = ["#2f5586", "#f6f5f5","#2f5586"]
colormap = matplotlib.colors.LinearSegmentedColormap.from_list("", colors)

# Single-panel figure on a light background.
fig = plt.figure(figsize=(12, 8), facecolor=background_color)
gs = fig.add_gridspec(1, 1)
ax0 = fig.add_subplot(gs[0, 0])
ax0.set_facecolor(background_color)
ax0.text(-1.1, 1.25, 'Correlation of Continuous Features with Target', fontsize=20, fontweight='bold')

# Correlation of every column in `cont_features` with total expenses.
chart_df = cont_features.corrwith(all_data['Expenses']).to_frame(name='corr')
sns.barplot(x=chart_df.index, y=chart_df['corr'], ax=ax0, color='mediumblue', zorder=3, edgecolor='black', linewidth=2)

# Faint grid on both axes, drawn behind the bars (zorder 0 < 3).
for grid_axis in ('x', 'y'):
    ax0.grid(which='major', axis=grid_axis, zorder=0, color='#EEEEEE', linewidth=0.4)
ax0.set_ylabel('')

# Keep only the bottom spine.
for s in ("top", "right", "left"):
    ax0.spines[s].set_visible(False)

plt.show()


## Clustering

### one-hot encoding

In [19]:
# One-hot encode the non-numeric columns on a copy, leaving `all_data` untouched.
X = pd.get_dummies(all_data.copy())
X.head()
# from sklearn.model_selection import train_test_split

# X_train, X_valid = train_test_split(all_data,train_size=0.8, test_size=0.2,random_state=0)

In [20]:
# Count the missing values per column of X and print only the affected columns.
missing_val_count_by_column = X.isna().sum()
has_missing = missing_val_count_by_column.gt(0)
print(missing_val_count_by_column.loc[has_missing])

### simple imputing

In [21]:
from sklearn.impute import SimpleImputer
# Fill missing values with each column's median; the imputer returns a bare
# array, so the column names and index of X are re-attached when wrapping it.
imputer = SimpleImputer(strategy='median')
X_imputed = pd.DataFrame(
    imputer.fit_transform(X),
    columns=X.columns,
    index=X.index,
)
#X_valid = pd.DataFrame(imputer.fit_transform(X_valid))

In [22]:
# Continue under the name X with the imputed frame.
# Note: this binds X to the same object as X_imputed; it is not a copy.
X = X_imputed
X.head()

### min-max scaling (only numerical columns)

In [23]:
# List the column names of X (used to pick the numeric / categorical groups).
X.columns

In [24]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Numeric feature columns (candidates for scaling).
numer_cols = [
    'Income',
    'Expenses',
    'NumPurchases',
    'NumWebVisitsMonth',
    'Kids',
    'Age',
    'Enroll_days',
    'TotalAcceptedCmp',
]  # 'Recency' intentionally left out

# Binary / one-hot columns that are kept as they are.
categ_cols = [
    'Education_Undergrad',
    'Education_Postgrad',
    'is_married',
]  # 'AcceptedCmp3' and 'Complain' intentionally left out

In [25]:
# Split X into its numeric and binary/one-hot parts.
numer, categ = X.loc[:, numer_cols], X.loc[:, categ_cols]
numer.head()

In [26]:
# Rescale every numeric column to the [0, 1] range.
# The scaler returns a bare array, so both the column names AND the original
# row index are re-attached. Without the index the frame would get a fresh
# 0..n-1 RangeIndex and no longer line up with `categ` / `X` in any
# index-based operation such as `pd.concat(..., axis=1)`.
numer_scaled = pd.DataFrame(
    MinMaxScaler().fit_transform(numer),
    columns=numer.columns,
    index=numer.index,
)
numer_scaled.head()

In [27]:
# NOTE(review): the recombination below is disabled, so `numer_scaled` is not
# merged back into X here. If it is re-enabled, first confirm that
# `numer_scaled` and `categ` share the same index: `join='inner'` with axis=1
# keeps only the index labels present in both frames.
#X = pd.concat([numer_scaled, categ], axis=1, join='inner')
#X.head()

### Build clusters.

#### Elbow method to check how many clusters are needed.

In [28]:
from sklearn.cluster import KMeans
# Fit K-means for k = 1..24 and record the within-cluster sum of squares
# (inertia) of each fit.
candidate_ks = range(1, 25)
wcss = []
for k in candidate_ks:
    kmeans = KMeans(n_clusters=k, init='k-means++', random_state=50).fit(X)
    wcss.append(kmeans.inertia_)

# Plot inertia against k to look for the "elbow".
plt.plot(candidate_ks, wcss, color="mediumblue", marker='*')
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [29]:
# Fit the final 6-cluster model, label every row of X and show the centroids.
kmeans = KMeans(n_clusters=6, random_state=0).fit(X)
clusters = kmeans.predict(X)
kmeans.cluster_centers_

In [30]:
# Attach each row's K-means label as a new "Cluster" column.
# NOTE(review): this mutates X in place, so from here on X carries the label
# next to the features; any step that treats X as a pure feature matrix has
# to drop this column first.
X["Cluster"] = clusters
X.head()

In [31]:
from sklearn import metrics
labels = kmeans.labels_
# Score on the features only. By this point X may already contain the
# "Cluster" label column, and leaving it in would leak the labels into the
# distance computation. errors='ignore' keeps the cell working when the
# column has not been added yet.
metrics.silhouette_score(
    X.drop(columns=['Cluster'], errors='ignore'),
    labels,
    metric='euclidean',
)

In [32]:
# Score on the features only: the "Cluster" label column must not be part of
# the data the clustering is evaluated on (errors='ignore' tolerates its absence).
metrics.calinski_harabasz_score(X.drop(columns=['Cluster'], errors='ignore'), labels)

In [33]:
# Score on the features only: the "Cluster" label column must not be part of
# the data the clustering is evaluated on (errors='ignore' tolerates its absence).
metrics.davies_bouldin_score(X.drop(columns=['Cluster'], errors='ignore'), labels)

### Use PCA to visualize

In [34]:
from sklearn.decomposition import PCA

In [35]:
# #plotX is a DataFrame containing 5000 values sampled randomly from X
# plotX = pd.DataFrame(np.array(X.sample(5000)))

# #Rename plotX's columns since it was briefly converted to an np.array above
# plotX.columns = X.columns

# The sampling above is disabled, so every row of X is used for plotting.
# Note: this binds plotX to the same object as X; it is not a copy.
plotX = X
plotX.head()

In [36]:
# One PCA model per target dimensionality (1, 2 and 3 principal components).
pca_1d, pca_2d, pca_3d = (PCA(n_components=n_dims) for n_dims in (1, 2, 3))

In [37]:
# Features fed to PCA: everything in plotX except the cluster label.
pca_input = plotX.drop(columns=["Cluster"])

# Each projection is wrapped in a DataFrame that KEEPS the row index of plotX.
# PCA returns a bare array; without an explicit index the frames would get a
# fresh 0..n-1 RangeIndex, and joining them to plotX on the index would then
# drop rows and pair the remaining ones with the wrong components.

#This DataFrame holds the single principal component (1-D visualization)
PCs_1d = pd.DataFrame(pca_1d.fit_transform(pca_input), index=pca_input.index)

#This DataFrame contains the two principal components (2-D visualization)
PCs_2d = pd.DataFrame(pca_2d.fit_transform(pca_input), index=pca_input.index)

#And this DataFrame contains the three principal components (3-D visualization)
PCs_3d = pd.DataFrame(pca_3d.fit_transform(pca_input), index=pca_input.index)

In [38]:
# Name the columns "PC<k>_<n>d": the k-th principal component of the PCA that
# was fitted for the n-dimensional visualization (e.g. "PC2_3d").
for pcs_frame, n_dims in ((PCs_1d, 1), (PCs_2d, 2), (PCs_3d, 3)):
    pcs_frame.columns = [f"PC{k}_{n_dims}d" for k in range(1, n_dims + 1)]

In [39]:
# Append the principal-component columns to plotX, side by side (axis=1).
# join='inner' aligns on the row index and keeps only the index labels that
# are present in every one of the four frames.
plotX = pd.concat([plotX,PCs_1d,PCs_2d,PCs_3d], axis=1, join='inner')
plotX.head()

In [40]:
# Constant column of zeros; it serves as the y-value of the 1-D plot.
plotX["dummy"] = 0

In [41]:
# Row subsets of plotX, one per cluster label, for clusters 0, 1 and 2.
cluster0, cluster1, cluster2 = (
    plotX.loc[plotX["Cluster"].eq(cluster_label)]
    for cluster_label in (0, 1, 2)
)

In [42]:
#plotly imports
import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [43]:
#This is needed so we can display plotly plots properly
# (connected=True is passed so plotly runs in its "connected" notebook mode.)
init_notebook_mode(connected=True)

In [44]:
#Instructions for building the 1-D plot

# One trace is built per cluster label actually present in plotX, so every
# cluster is drawn (the model above is fitted with 6 clusters, not 3).
# The first three colours are the ones previously used for clusters 0-2;
# colours repeat if there are more clusters than palette entries.
cluster_colors = [
    'rgba(255, 128, 255, 0.8)',
    'rgba(255, 128, 2, 0.8)',
    'rgba(0, 255, 200, 0.8)',
    'rgba(99, 110, 250, 0.8)',
    'rgba(239, 85, 59, 0.8)',
    'rgba(171, 99, 250, 0.8)',
]

data = []
for position, cluster_id in enumerate(sorted(plotX["Cluster"].unique())):
    members = plotX[plotX["Cluster"] == cluster_id]
    data.append(go.Scatter(
        x = members["PC1_1d"],
        y = members["dummy"],  # constant, so all points sit on one line
        mode = "markers",
        name = f"Cluster {int(cluster_id)}",
        marker = dict(color = cluster_colors[position % len(cluster_colors)]),
        text = None))

title = "Visualizing Clusters in One Dimension Using PCA"

layout = dict(title = title,
              xaxis= dict(title= 'PC1',ticklen= 5,zeroline= False),
              yaxis= dict(title= '',ticklen= 5,zeroline= False)
             )

fig = dict(data = data, layout = layout)

iplot(fig)

In [45]:
#Instructions for building the 2-D plot

# One trace is built per cluster label actually present in plotX, so every
# cluster is drawn (the model above is fitted with 6 clusters, not 3).
# The first three colours are the ones previously used for clusters 0-2;
# colours repeat if there are more clusters than palette entries.
cluster_colors = [
    'rgba(255, 128, 255, 0.8)',
    'rgba(255, 128, 2, 0.8)',
    'rgba(0, 255, 200, 0.8)',
    'rgba(99, 110, 250, 0.8)',
    'rgba(239, 85, 59, 0.8)',
    'rgba(171, 99, 250, 0.8)',
]

data = []
for position, cluster_id in enumerate(sorted(plotX["Cluster"].unique())):
    members = plotX[plotX["Cluster"] == cluster_id]
    data.append(go.Scatter(
        x = members["PC1_2d"],
        y = members["PC2_2d"],
        mode = "markers",
        name = f"Cluster {int(cluster_id)}",
        marker = dict(color = cluster_colors[position % len(cluster_colors)]),
        text = None))

title = "Visualizing Clusters in Two Dimensions Using PCA"

layout = dict(title = title,
              xaxis= dict(title= 'PC1',ticklen= 5,zeroline= False),
              yaxis= dict(title= 'PC2',ticklen= 5,zeroline= False)
             )

fig = dict(data = data, layout = layout)

iplot(fig)

In [46]:
#Instructions for building the 3-D plot

# One trace is built per cluster label actually present in plotX, so every
# cluster is drawn (the model above is fitted with 6 clusters, not 3).
# The first three colours are the ones previously used for clusters 0-2;
# colours repeat if there are more clusters than palette entries.
cluster_colors = [
    'rgba(255, 128, 255, 0.8)',
    'rgba(255, 128, 2, 0.8)',
    'rgba(0, 255, 200, 0.8)',
    'rgba(99, 110, 250, 0.8)',
    'rgba(239, 85, 59, 0.8)',
    'rgba(171, 99, 250, 0.8)',
]

data = []
for position, cluster_id in enumerate(sorted(plotX["Cluster"].unique())):
    members = plotX[plotX["Cluster"] == cluster_id]
    data.append(go.Scatter3d(
        x = members["PC1_3d"],
        y = members["PC2_3d"],
        z = members["PC3_3d"],
        mode = "markers",
        name = f"Cluster {int(cluster_id)}",
        marker = dict(color = cluster_colors[position % len(cluster_colors)]),
        text = None))

title = "Visualizing Clusters in Three Dimensions Using PCA"

# 3-D traces are drawn inside a `scene`, so their axes are configured under
# `layout.scene`; top-level `xaxis`/`yaxis` only affect 2-D cartesian plots.
layout = dict(title = title,
              scene = dict(
                  xaxis= dict(title= 'PC1',ticklen= 5,zeroline= False),
                  yaxis= dict(title= 'PC2',ticklen= 5,zeroline= False),
                  zaxis= dict(title= 'PC3',ticklen= 5,zeroline= False)
              )
             )

fig = dict(data = data, layout = layout)

iplot(fig)