In [2]:
# Required imports
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

## Load the Data Into a Pandas DataFrame

In [3]:
# Read in the CSV file as a Pandas Dataframe
ccinfo_default_df = pd.read_csv(
    Path("../Resources/ccinfo_transformed.csv")
)

ccinfo_default_df.head()

Unnamed: 0,limit_bal,marriage,age,bill_amt,pay_amt,default,other,post-grad,primary,secondary,customer_segments
0,-1.117341,1,24,-0.660703,-0.542779,1,0,0,0,1,2
1,-0.349942,0,26,-0.63637,-0.463994,1,0,0,0,1,2
2,-0.580162,0,34,-0.416808,-0.354013,0,0,0,0,1,0
3,-0.887121,1,37,-0.080152,-0.402077,0,0,0,0,1,0
4,-0.887121,1,57,-0.396855,0.523771,0,0,0,0,1,1


In [4]:
# Plot the clusters using the "limit_bal" and "age" columns
ccinfo_default_df.hvplot.scatter(
    x="limit_bal",
    y="age",
    by="customer_segments"
)

In [5]:
# Plot the clusters using the "bill_amt" and "pay_amt" columns
ccinfo_default_df.hvplot.scatter(
    x="bill_amt",
    y="pay_amt",
    by="customer_segments"
)

## Use PCA to reduce the number of factors 

In [6]:
# Import the PCA module
from sklearn.decomposition import PCA

In [7]:
# Instantiate the PCA instance and declare the number of PCA variables
pca = PCA(n_components=2)

In [8]:
# Fit the PCA model on the transformed credit card DataFrame
ccinfo_pca = pca.fit_transform(ccinfo_default_df)

# Review the first 5 rows of list data
ccinfo_pca[:5]

array([[-11.4106317 ,  -1.19426208],
       [ -9.424725  ,  -0.75732157],
       [ -1.33620686,  -0.69534399],
       [  1.67884463,  -0.76676318],
       [ 21.58943237,  -0.9373152 ]])

## PCA explained variance ratio

In [9]:
# Calculate the PCA explained variance ratio
pca.explained_variance_ratio_

array([0.95017303, 0.01898131])

## Creating the PCA DataFrame

In [13]:
# Create the PCA DataFrame
ccinfo_pca_df = pd.DataFrame(
    ccinfo_pca,
    columns=["PCA1", "PCA2"]
)

# Review the PCA DataFrame
ccinfo_pca_df.head()

Unnamed: 0,PCA1,PCA2
0,-11.410632,-1.194262
1,-9.424725,-0.757322
2,-1.336207,-0.695344
3,1.678845,-0.766763
4,21.589432,-0.937315


## Incorporating the PCA DataFrame into the elbow method

In [14]:
# Create a a list to store inertia values and the values of k
inertia = []
k = list(range(1, 11))

# Append the value of the computed inertia from the `inertia_` attribute of teh KMeans model instance
for i in k:
    k_model = KMeans(n_clusters=i, random_state=1)
    k_model.fit(ccinfo_pca_df)
    inertia.append(k_model.inertia_)

# Define a DataFrame to hold the values for k and the corresponding inertia
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

# Review the DataFrame
df_elbow.head()

Unnamed: 0,k,inertia
0,1,439527.527904
1,2,141348.205462
2,3,71950.112226
3,4,47104.159733
4,5,34598.959866


In [15]:
# Plot the Elbow Curve
df_elbow.hvplot.line(
    x="k", 
    y="inertia", 
    title="Elbow Curve", 
    xticks=k
)

## Segmention of the PCA data with Kmeans 

In [16]:
# Define the model with 3 clusters
model = KMeans(n_clusters=3, random_state=0)

# Fit the model
model.fit(ccinfo_pca_df)

# Make predictions
k_3 = model.predict(ccinfo_pca_df)

# Create a copy of the PCA DataFrame
ccinfo_pca_predictions_df = ccinfo_pca_df.copy()

# Add a class column with the labels
ccinfo_pca_predictions_df["customer_segments"] = k_3

In [17]:
# Plot the clusters
ccinfo_pca_predictions_df.hvplot.scatter(
    x="PCA1",
    y="PCA2",
    by="customer_segments"
)