In [62]:
# Initial imports
import pandas as pd
from sklearn.cluster import KMeans
import plotly.express as px
import hvplot.pandas

In [63]:
# Preprocessing data with Pandas
# Read the raw shopping CSV (ISO-8859-1 encoded) and preview the first rows.
# NOTE(review): file_path is an absolute path on the original author's machine;
# point it at your own copy of shopping_data.csv before running.
file_path = "/Users/annettedblackburn/Desktop/Data_Analytics_Bootcamp/Module 18 - Unsupervised Machine Learning and Cryptocurrencies/Module/shopping_data.csv"
df_shopping = pd.read_csv(file_path, encoding="ISO-8859-1")
df_shopping.head()

Unnamed: 0,CustomerID,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,Yes,19.0,15000,39.0
1,2,Yes,21.0,15000,81.0
2,3,No,20.0,16000,6.0
3,4,No,23.0,16000,77.0
4,5,No,31.0,17000,40.0


In [64]:
# Show the dtype of every column to spot non-numeric data
df_shopping.dtypes

CustomerID                  int64
Card Member                object
Age                       float64
Annual Income               int64
Spending Score (1-100)    float64
dtype: object

In [65]:
# Non-null count per column; columns with a lower count contain missing values
df_shopping.count()

CustomerID                203
Card Member               201
Age                       201
Annual Income             203
Spending Score (1-100)    202
dtype: int64

In [66]:
# List the column names of the raw DataFrame
df_shopping.columns

Index(['CustomerID', 'Card Member', 'Age', 'Annual Income',
       'Spending Score (1-100)'],
      dtype='object')

In [67]:
# Column dtypes again; no cell between the earlier dtypes check and this one
# modifies df_shopping, so the result is the same.
df_shopping.dtypes

CustomerID                  int64
Card Member                object
Age                       float64
Annual Income               int64
Spending Score (1-100)    float64
dtype: object

In [68]:
# Find null values
# Count the nulls of every column in one pass, then report them column by column
null_counts = df_shopping.isnull().sum()
for column, null_count in null_counts.items():
    print(f"Column {column} has {null_count} null values.")

ColumnCustomerID has 0 null values.
ColumnCard Member has 2 null values.
ColumnAge has 2 null values.
ColumnAnnual Income has 0 null values.
ColumnSpending Score (1-100) has 1 null values.


In [69]:
# Drop null rows
# dropna() with default arguments removes every row that has at least one
# missing value; count() then shows the remaining non-null count per column.
df_shopping = df_shopping.dropna()
df_shopping.count()

CustomerID                200
Card Member               200
Age                       200
Annual Income             200
Spending Score (1-100)    200
dtype: int64

In [70]:
# Count rows that are exact duplicates of an earlier row and report the total
duplicate_count = df_shopping.duplicated().sum()
print(f"Duplicate entries: {duplicate_count}")

Duplicate entries: 0


In [71]:
# Remove the CustomerID column
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this
# cell safe to re-run: a second run is a no-op rather than a KeyError.
df_shopping = df_shopping.drop(columns=["CustomerID"], errors="ignore")
df_shopping.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,Yes,19.0,15000,39.0
1,Yes,21.0,15000,81.0
2,No,20.0,16000,6.0
3,No,23.0,16000,77.0
4,No,31.0,17000,40.0


In [72]:
# Transform string column
def change_string(member):
    """Encode card membership as an integer flag.

    Returns 1 when ``member`` equals the string "Yes" exactly, and 0 for
    every other value.
    """
    return 1 if member == "Yes" else 0

# Encode "Card Member" as 1/0 by running change_string on every value
df_shopping["Card Member"] = df_shopping["Card Member"].map(change_string)
df_shopping.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15000,39.0
1,1,21.0,15000,81.0
2,0,20.0,16000,6.0
3,0,23.0,16000,77.0
4,0,31.0,17000,40.0


In [73]:
# Rescale annual income by dividing every value by 100 (e.g. 15000 becomes 150.0)
df_shopping["Annual Income"] = df_shopping["Annual Income"].div(100)
df_shopping.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,150.0,39.0
1,1,21.0,150.0,81.0
2,0,20.0,160.0,6.0
3,0,23.0,160.0,77.0
4,0,31.0,170.0,40.0


In [74]:
# Save the cleaned DataFrame to CSV; index=False leaves the row index out of the file.
# NOTE(review): file_path is an absolute path on the original author's machine.
file_path = "/Users/annettedblackburn/Desktop/Data_Analytics_Bootcamp/Module 18 - Unsupervised Machine Learning and Cryptocurrencies/Module/shopping_data_cleaned.csv"
df_shopping.to_csv(file_path, index=False)

In [86]:
# Load data
# Read shopping_data_cleaned.csv into a separate DataFrame and preview the
# first 10 rows.
# NOTE(review): file_path is an absolute path on the original author's machine.
file_path = "/Users/annettedblackburn/Desktop/Data_Analytics_Bootcamp/Module 18 - Unsupervised Machine Learning and Cryptocurrencies/Module/shopping_data_cleaned.csv"
cleaned_df_shopping = pd.read_csv(file_path)
cleaned_df_shopping.head(10)

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,150.0,39.0
1,1,21.0,150.0,81.0
2,0,20.0,160.0,6.0
3,0,23.0,160.0,77.0
4,0,31.0,170.0,40.0
5,0,22.0,170.0,76.0
6,0,35.0,180.0,6.0
7,0,23.0,180.0,94.0
8,1,64.0,190.0,3.0
9,0,30.0,190.0,72.0


In [76]:
# Scatter plot of annual income against spending score, before any clustering
cleaned_df_shopping.hvplot.scatter(x="Annual Income", y="Spending Score (1-100)")

In [77]:
# Function to cluster and plot dataset
def test_cluster_amount(df, clusters):
    model = KMeans(n_clusters=clusters, random_state=5)
    model

    # Fitting model
    model.fit(df)

    # Add a new class column to df_iris
    df["class"] = model.labels_

In [78]:
# Cluster into 2 groups (writes the labels to the "class" column), then plot
# income against spending score with one series per cluster label
test_cluster_amount(cleaned_df_shopping, 2)
cleaned_df_shopping.hvplot.scatter(x="Annual Income", y="Spending Score (1-100)", by="class")

In [80]:
# 3D view of the clusters: income, spending score and age on the axes, with
# marker colour and symbol both taken from the "class" column
fig = px.scatter_3d(
    cleaned_df_shopping,
    x="Annual Income",
    y="Spending Score (1-100)",
    z="Age",
    color="class",
    symbol="class",
    width=800,
)
# Place the legend at x=0, y=1
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()

In [82]:
# Other cluster counts
# Re-cluster the same frame into 7 groups; "class" is overwritten with the new labels.
# NOTE(review): "class" from the previous run is still a column of the frame
# when it is passed in here.
test_cluster_amount(cleaned_df_shopping, 7)
cleaned_df_shopping.hvplot.scatter(x="Annual Income", y="Spending Score (1-100)", by="class")

In [85]:
# 3D view of the current clustering: income, spending score and age on the
# axes, with marker colour and symbol both taken from the "class" column
fig = px.scatter_3d(
    cleaned_df_shopping,
    x="Annual Income",
    y="Spending Score (1-100)",
    z="Age",
    color="class",
    symbol="class",
    width=800,
)
# Place the legend at x=0, y=1
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()