# link to data description `https://sda.berkeley.edu/sdaweb/docs/scfcomb2019/DOC/hcbkx01.htm#1.HEADING`

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

from dash.dependencies import Input, Output
from dash import dcc, html

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from scipy.stats.mstats import trimmed_var


In [2]:
from jupyter_dash import JupyterDash
# JupyterDash.infer_jupyter_proxy_config()

Building a dashboard app for customer segmentation with #NETWORTH less than $2 Million  

In [3]:
def wrangle(filepath):
    
    df = pd.read_csv(filepath)
    
    mask = (df["TURNFEAR"] == 1) & (df["NETWORTH"] < 2e6)
    df_fear = df[mask]
    
    return df_fear

In [4]:
df = wrangle("SCFP2019.csv")
print("df shape:", df.shape)
df.head()

df shape: (4418, 351)


Unnamed: 0,YY1,Y1,WGT,HHSEX,AGE,AGECL,EDUC,EDCL,MARRIED,KIDS,...,NWCAT,INCCAT,ASSETCAT,NINCCAT,NINC2CAT,NWPCTLECAT,INCPCTLECAT,NINCPCTLECAT,INCQRTCAT,NINCQRTCAT
5,2,21,3790.476607,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,4,2,2
6,2,22,3798.868505,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,3,2,2
7,2,23,3799.468393,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,4,2,2
8,2,24,3788.076005,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,4,2,2
9,2,25,3793.066589,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,4,2,2


In [5]:
app = JupyterDash(__name__)

In [6]:
app.layout = html.Div(
    [
        # Application title header
        html.H1("Survey of Consumer Finances"),
        # Bar chart element
        html.H2("High Variance Features"),
        # Bar chart Graph
        dcc.Graph(id= "bar-chart"),
        
        dcc.RadioItems(options = [
            {"label" : "trimmed", "value" : True},
            {"label" : "not trimmed", "value" : False}
        ],
                        value = True,
                        id = "trim-button"
                      ),
        html.H2("K Means Clustering"),
        html.H3("Number Of Clusters(k)"),
        dcc.Slider(min = 2, max = 12, step = 1, value =2, id = "k-slider"),
        html.Div(id = "metrics"),
        # PCA Scatter Plot
        dcc.Graph(id = "pca-scatter")
    ]


)

In [7]:
def get_high_var_features(trimmed = True, return_feat_names = True):

    """Returns the five highest-variance features of ``df``.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, calculates trimmed variance, removing bottom and top 10%
        of observations.

    return_feat_names : bool, default=False
        If ``True``, returns feature names as a ``list``. If ``False``
        returns ``Series``, where index is feature names and values are
        variances.
    """
    # calculate variance
    if trimmed:
        top_five_features = (
                df.apply(trimmed_var).sort_values().tail(5)
        )
    else:
        top_five_features = df.var().sort_values().tail(5)
        
    # extract names
    if return_feat_names:
        top_five_features = top_five_features.index.tolist()
    
    return top_five_features

In [8]:
@app.callback(
    Output("bar-chart", "figure"), Input("trim-button", "value")
    )
def serve_bar_chart(trimmed = True):

    """Returns a horizontal bar chart of five highest-variance features.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, calculates trimmed variance, removing bottom and top 10%
        of observations.
    """
    # Get Features
    top_five_features = get_high_var_features(trimmed=trimmed, return_feat_names=False)
    
    # Build Bar Chart
    fig = px.bar(
        x = top_five_features, 
        y = top_five_features.index,
        orientation = "h"
    )
    fig.update_layout(xaxis_title = "Variance", yaxis_title = "Feature")
    return fig

In [9]:
serve_bar_chart(trimmed = True)

In [10]:
def get_model_metrics(trimmed = True, k = 2, return_metrics = False):

    """Build ``KMeans`` model based on five highest-variance features in ``df``.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, calculates trimmed variance, removing bottom and top 10%
        of observations.

    k : int, default=2
        Number of clusters.

    return_metrics : bool, default=False
        If ``False`` returns ``KMeans`` model. If ``True`` returns ``dict``
        with inertia and silhouette score.

    """
    # Get Feature
    features = get_high_var_features(trimmed=trimmed, return_feat_names=True)
    # Create feature matrix
    X = df[features]
    # Build mode
    model = make_pipeline(
            StandardScaler(),
            KMeans(n_clusters=k, random_state= 42, n_init=10)
    )
    # Fit Model
    model.fit(X)
    if return_metrics:
        # calculate inertia
        i = model.named_steps["kmeans"].inertia_
        # calculate silhouette score
        ss = silhouette_score(X, model.named_steps["kmeans"].labels_)
        # put results into dictionary
        metrics = {
            "inertia" : round(i),
            "silhouette": round(ss, 3)
        }
        # return dictionary to user
        return metrics
    
    return model

In [11]:
get_model_metrics(trimmed = True, k = 3, return_metrics=True)

{'inertia': 7191, 'silhouette': 0.704}

In [12]:
@app.callback(
    Output("metrics", "children"),
    Input("trim-button", "value"),
    Input("k-slider", "value")
)
def serve_metrics(trimmed = True, k = 2):

    """Returns list of ``H3`` elements containing inertia and silhouette score
    for ``KMeans`` model.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, calculates trimmed variance, removing bottom and top 10%
        of observations.

    k : int, default=2
        Number of clusters.
    """
    # Get metrics
    metrics = get_model_metrics(trimmed = trimmed, k = k, return_metrics= True)

    # add metrics to html elements
    text = [
        html.H3(f"Inertia : {metrics['inertia']}"),
        html.H3(f"Silhouette Score : {metrics['silhouette']}")
    ]
    return text

In [13]:
serve_metrics(k = 20)

[H3('Inertia : 1456'), H3('Silhouette Score : 0.497')]

In [14]:
def get_pca_labels(trimmed= True, k = 2):

    """
    ``KMeans`` labels.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, calculates trimmed variance, removing bottom and top 10%
        of observations.

    k : int, default=2
        Number of clusters.
    """
    # Get Features
    features = get_high_var_features(trimmed=trimmed, return_feat_names=True)
    X = df[features]
    
    # Build Transformer
    transformer = PCA(n_components=2, random_state= 42)
    
    # transform our data
    X_t = transformer.fit_transform(X)
    X_pca = pd.DataFrame(X_t, columns = ["PC1", "PC2"])
    # Add labels
    model = get_model_metrics(trimmed = trimmed, k = k, return_metrics= False)
    X_pca["labels"] = model.named_steps["kmeans"].labels_.astype(str)
    X_pca.sort_values("labels", inplace = True)
    
    
    return X_pca

In [15]:
get_pca_labels().tail()

Unnamed: 0,PC1,PC2,labels
1570,-229796.419844,-14301.836873,1
1571,-229805.583716,-14250.840322,1
1572,-229814.747589,-14199.843771,1
1611,-213724.57142,-39060.460885,1
4417,334191.956229,-186450.064242,1


In [16]:
@app.callback(
    Output("pca-scatter", "figure"),
    Input("trim-button", "value"),
    Input("k-slider", "value")
)
def serve_scatter_plot(trimmed = True, k = 2):

    """Build 2D scatter plot of ``df`` with ``KMeans`` labels.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, calculates trimmed variance, removing bottom and top 10%
        of observations.

    k : int, default=2
        Number of clusters.
    """
    # Create scatter plot of `PC2` vs `PC1`
    fig =  px.scatter(
        data_frame = get_pca_labels(trimmed + trimmed, k = k),
        x = "PC1",
        y = "PC2",
        color = "labels",
        title = "PCA Representation of Clusters"
    )
    fig.update_layout(xaxis_title = "PC1", yaxis_title = "PC2")
    
    return fig

In [17]:
serve_scatter_plot(trimmed = True, k = 3)

In [18]:
app.run_server(host="localhost", mode="external")

Dash is running on http://localhost:8050/

Dash app running on http://localhost:8050/
