In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from scipy.stats.mstats import trimmed_var

In [2]:
def wrangle(filepath):
    
    df = pd.read_csv(filepath)
    
    mask = df["TURNFEAR"] == 1
    df_fear = df[mask]
    
    return df_fear

In [3]:
df = wrangle("SCFP2019.csv")
print("df shape:", df.shape)
df.head()

df shape: (4623, 351)


Unnamed: 0,YY1,Y1,WGT,HHSEX,AGE,AGECL,EDUC,EDCL,MARRIED,KIDS,...,NWCAT,INCCAT,ASSETCAT,NINCCAT,NINC2CAT,NWPCTLECAT,INCPCTLECAT,NINCPCTLECAT,INCQRTCAT,NINCQRTCAT
5,2,21,3790.476607,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,4,2,2
6,2,22,3798.868505,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,3,2,2
7,2,23,3799.468393,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,4,2,2
8,2,24,3788.076005,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,4,2,2
9,2,25,3793.066589,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,4,2,2


In [4]:
top_ten_var = df.var().sort_values().tail(10)
top_ten_var

FIN          1.112439e+13
NONACTBUS    2.352287e+13
ACTBUS       1.165610e+14
KGBUS        1.880054e+14
KGTOTAL      2.062122e+14
BUS          2.299970e+14
NHNFIN       2.343946e+14
NFIN         2.584721e+14
NETWORTH     3.053394e+14
ASSET        3.166168e+14
dtype: float64

In [5]:
fig = px.bar(
    x = top_ten_var,
    y = top_ten_var.index,
    title = "SCF: High Variance Features"
)
fig.update_layout(xaxis_title = "Variance",
                 yaxis_title = "Feature")
fig.show()

In [6]:
fig = px.box(
    data_frame = df,
    x = "NHNFIN",
    title = "Distribution of Non Home Non Finanacial Assets"
)
fig.update_layout(xaxis_title = "Value [$]")
fig.show()

In [7]:
top_ten_trimmed_var = df.apply(trimmed_var, limits = (0.1, 0.1)).sort_values().tail(10)
top_ten_trimmed_var

KGTOTAL     6.672366e+08
HOMEEQ      1.221363e+09
NH_MORT     1.881556e+09
MRTHEL      1.996548e+09
PLOAN1      2.081083e+09
DEBT        4.020928e+09
NETWORTH    6.770975e+09
HOUSES      7.256238e+09
NFIN        1.334526e+10
ASSET       2.019591e+10
dtype: float64

In [8]:
fig = px.bar(
    x = top_ten_trimmed_var,
    y = top_ten_trimmed_var.index,
    title = "SCF: High Variance Features"
)
fig.update_layout(xaxis_title = "Trimmed Variance",
                 yaxis_title = "Feature")
fig.show()

In [9]:
high_var_cols = top_ten_trimmed_var.tail(5).index.tolist()
high_var_cols

['DEBT', 'NETWORTH', 'HOUSES', 'NFIN', 'ASSET']

In [10]:
X = df[high_var_cols]
print("X shape: ", X.shape)
X.head()

X shape:  (4623, 5)


Unnamed: 0,DEBT,NETWORTH,HOUSES,NFIN,ASSET
5,12200.0,-6710.0,0.0,3900.0,5490.0
6,12600.0,-4710.0,0.0,6300.0,7890.0
7,15300.0,-8115.0,0.0,5600.0,7185.0
8,14100.0,-2510.0,0.0,10000.0,11590.0
9,15400.0,-5715.0,0.0,8100.0,9685.0


In [11]:
X_summary = X.aggregate(["mean", "std"]).astype(int)
X_summary

Unnamed: 0,DEBT,NETWORTH,HOUSES,NFIN,ASSET
mean,126943,1418463,190798,1238763,1545407
std,696321,17473964,1365501,16077069,17793728


In [12]:
ss = StandardScaler()
X_scaled_data = ss.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled_data, columns = X.columns)
print("X_scaled shape: ", X_scaled.shape)
X_scaled.head()

X_scaled shape:  (4623, 5)


Unnamed: 0,DEBT,NETWORTH,HOUSES,NFIN,ASSET
0,-0.164803,-0.081569,-0.139743,-0.076817,-0.086552
1,-0.164229,-0.081454,-0.139743,-0.076668,-0.086417
2,-0.160351,-0.081649,-0.139743,-0.076712,-0.086457
3,-0.162075,-0.081328,-0.139743,-0.076438,-0.086209
4,-0.160207,-0.081512,-0.139743,-0.076556,-0.086316


In [13]:
X_scaled_summary = X_scaled.aggregate(["mean", "std"]).astype(int)
X_scaled_summary

Unnamed: 0,DEBT,NETWORTH,HOUSES,NFIN,ASSET
mean,0,0,0,0,0
std,1,1,1,1,1


In [14]:
n_clusters = range(2, 20)
inertia_errors = []
silhouette_scores = []

for k in n_clusters:
    model = make_pipeline(
        StandardScaler(),
        KMeans(n_clusters= k, random_state= 42, n_init=10)
    )
    model.fit(X)
    
    inertia_errors.append(model.named_steps["kmeans"].inertia_)
    silhouette_scores.append(silhouette_score(X, model.named_steps["kmeans"].labels_))
    print("inertia: ", inertia_errors[:5])
    print("silouette scores: ", silhouette_scores[:5])

inertia:  [7789.322200524557]
silouette scores:  [0.9926495190563632]
inertia:  [7789.322200524557, 3498.194354908089]
silouette scores:  [0.9926495190563632, 0.9859234178309856]
inertia:  [7789.322200524557, 3498.194354908089, 1926.0552997317973]
silouette scores:  [0.9926495190563632, 0.9859234178309856, 0.9752632913468436]
inertia:  [7789.322200524557, 3498.194354908089, 1926.0552997317973, 1070.1267437019674]
silouette scores:  [0.9926495190563632, 0.9859234178309856, 0.9752632913468436, 0.9530646375479627]
inertia:  [7789.322200524557, 3498.194354908089, 1926.0552997317973, 1070.1267437019674, 738.5478980328585]
silouette scores:  [0.9926495190563632, 0.9859234178309856, 0.9752632913468436, 0.9530646375479627, 0.9111091653617736]
inertia:  [7789.322200524557, 3498.194354908089, 1926.0552997317973, 1070.1267437019674, 738.5478980328585]
silouette scores:  [0.9926495190563632, 0.9859234178309856, 0.9752632913468436, 0.9530646375479627, 0.9111091653617736]
inertia:  [7789.32220052455

In [15]:
fig = px.line(
    x = n_clusters,
    y = inertia_errors,
    title = "K Means Model: Inertia Errors Vs. n_clusters"
)
fig.update_layout(xaxis_title = "Number of Clusters",
                 yaxis_title = "Inertia Errors")
fig.show()

In [16]:
fig = px.line(
    x = n_clusters,
    y = silhouette_scores,
    title = "K Means Model: Silhouette Score Vs. n_clusters"
)
fig.update_layout(xaxis_title = "Number of Clusters",
                 yaxis_title = "Silhouette Scores")
fig.show()

In [17]:
final_model = make_pipeline(
        StandardScaler(),
        KMeans(n_clusters= 4, random_state=42, n_init=10)
)
final_model.fit(X)

In [18]:
labels = final_model.named_steps["kmeans"].labels_
print(labels[:5])

[0 0 0 0 0]


In [19]:
xgb = X.groupby(labels).mean()
xgb

Unnamed: 0,DEBT,NETWORTH,HOUSES,NFIN,ASSET
0,84813.45,319608.5,105136.6,257540.6,404421.9
1,5091600.0,478452800.0,15114000.0,446141400.0,483544400.0
2,2040775.0,52158290.0,3832000.0,40797150.0,54199070.0
3,18384100.0,123478600.0,34484000.0,137295300.0,141862700.0


In [20]:
fig = px.bar(
    xgb,
    barmode = "group",
    title = "Mean Household Finances by Cluster"
)
fig.update_layout(xaxis_title = "Cluster",
                 yaxis_title = "Value [$]")
fig.show()

In [21]:
pca = PCA(n_components=2, random_state=42)
X_t = pca.fit_transform(X)
X_pca = pd.DataFrame(X_t, columns = (["PC1", "PC2"]))
X_pca.head()

Unnamed: 0,PC1,PC2
0,-2437454.0,-64260.754991
1,-2433535.0,-64628.385762
2,-2436314.0,-65931.736547
3,-2428004.0,-65636.329883
4,-2432048.0,-66156.526393


In [22]:
fig = px.scatter(
    data_frame = X_pca,
    x = "PC1",
    y = "PC2",
    color = labels.astype(str),
    title = "PCA Representation of Clusters"
)
fig.update_layout(xaxis_title = "PC1",
                 yaxis_title = "PC2")
fig.show()