# Task 1.1: Data understanding

In [None]:
#Used for displaying plots below the cell
%matplotlib inline
import math
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt

from collections import defaultdict
from scipy.stats.stats import pearsonr

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

from sklearn.decomposition import PCA

In [None]:
# Load the tab-separated dataset; the first column of the file becomes the row index.
df = pd.read_csv('customer_supermarket.csv', sep='\t', index_col=0)

In [None]:
# Preview the first records of the raw data frame.
df.head()

The dataset seems to contain data about the shopping habits of the customers of a grocery store chain.  
Each row represents an object purchased:  
- BasketID: identifies a batch of items bought during the same shopping session  
- BasketDate: date in which the shopping session took place  
- Sale: represents the value of the item, we need to figure out if it refers to a single item or the item*quantity
- CustomerID: identifies a unique customer
- CustomerCountry: represents the country in which the purchase took place
- ProdID: identifies a unique product for sale
- ProdDescr: describes the product
- Qta: number of items bought with id ProdID

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Total number of records.
len(df.index)

Only ProdDescr and CustomerID contain null values.

In [None]:
# Summary statistics of the columns pandas currently treats as numeric.
df.describe()

The statistics regarding the CustomerID are meaningless since the assignment of an ID is usually done progressively and without having any additional information on the customer.  
We need to fix the data type situation in order to get a better understanding of the data set.

## Data type conversion  
Let's start by checking out the data type that pandas assigns to the attributes, in order to get an idea of the potential problems.

In [None]:
# dtypes assigned by read_csv.
df.dtypes

In [None]:
# dtypes pandas would pick with convert_dtypes(); the result is only displayed, df is not modified.
df.convert_dtypes().dtypes

### CustomerID

CustomerID got converted to a reasonable data type while the others became a generic "string".  
However there is no point in having CustomerID as an int64.

In [None]:
# CustomerID is an identifier, not a quantity: store it as a generic object column.
df = df.astype({"CustomerID": "object"})

### BasketDate
Let's convert the BasketDate type from String to datetime, just in case we need to perform some analysis that requires ordinal data.

In [None]:
# Parse BasketDate into datetime64 so that the .dt accessors used later are available.
# NOTE(review): no format/dayfirst is given, so pandas infers the date layout — confirm
# against the raw file that day and month are not being swapped for ambiguous dates.
df.BasketDate = pd.to_datetime(df.BasketDate)

### Sale

The "Sale" attribute is considered a generic object while it should be recognised as a float.  
Let's see why.

In [None]:
# Python type of every Sale value.
df.Sale.map(type)

In [None]:
# Display the raw Sale column.
df.Sale

It seems that Sale uses a comma instead of a point to separate the decimal part, so it is considered a "str" instead of a "float64".  
Let's replace the commas in "Sale" with dots in order to have them be recognised as float64 by pandas.

In [None]:
def _comma_to_dot(text):
    """Return ``text`` with every decimal comma turned into a dot."""
    return text.replace(',', '.')

df["Sale"] = df["Sale"].map(_comma_to_dot)

In [None]:
# The strings now use a decimal point, so they can be parsed as floats.
df["Sale"] = df["Sale"].astype(np.float64)

Sale is now correctly identified as a float64.

## Data exploration

### Exploration data frame
Used for exploration purposes but not necessarily useful for clustering.  
Initialised with some additional features that could prove useful.

In [None]:
# Working copy used throughout the data understanding phase; df itself is left untouched.
df_expl = df.copy()

# 0/1 indicators: is the record's Qta / Sale strictly positive?
df_expl["QtaPositive"] = (df_expl["Qta"] > 0).astype("int64")
df_expl["SalePositive"] = (df_expl["Sale"] > 0).astype("int64")

df_expl.head()

### BasketID

#### Regarding the different types of BasketID
Let's check why BasketID is not considered an int64 like CustomerID.

In [None]:
# Boolean mask: True where BasketID cannot be parsed as a number.
basket_as_number = pd.to_numeric(df["BasketID"], errors="coerce")
nonNumSeries = basket_as_number.isna()
# Show a few of the records with a non-numeric BasketID.
df.loc[nonNumSeries].head()

In [None]:
# Distinct first characters of the non-numeric BasketIDs.
non_numeric_ids = df.loc[nonNumSeries, "BasketID"]
non_numeric_ids.str[:1].unique()

It seems that a good chunk of the BasketID values start with a "C" and some with "A" instead of being just numbers.  

In [None]:
# Records whose BasketID starts with "C".
starts_with_c = df["BasketID"].str[0].eq("C")
basket_c_df = df.loc[starts_with_c]
len_basket_c = basket_c_df.shape[0]
print(f"Records starting with 'C' (Size: {len_basket_c}):\n")
basket_c_df.head(5)

In [None]:
# Records whose BasketID starts with "A".
starts_with_a = df["BasketID"].str[0].eq("A")
basket_a_df = df.loc[starts_with_a]
len_basket_a = basket_a_df.shape[0]
print(f"Records starting with 'A' (Size: {len_basket_a}):\n")
basket_a_df.head(10)

There seems to be a strong correlation between the "C" and a negative quantity, this could indicate a customer that asked for a refund.  

There is also some interesting correlation between the "A" start and a ProdDescr containing "Adjust bad debt", maybe the "A" stands for adjust and since the CustomerID in both cases is NaN this could be an operation that concerns only the management of the shop and not something that concerns the customers (which is our primary objective).  
These records, however, are too few to be meaningful, they skew too much the characteristics of the sale data (outliers) and they don't concern the activities of the customers.

Let's try to add a "BasketID type A" and "BasketID type C" binary attribute (0/1) and see if there are correlations.

In [None]:
# First character of every BasketID ("A", "C" or a digit).
basket_prefix = df_expl["BasketID"].str.get(0)
is_type_a = basket_prefix == "A"
is_type_c = basket_prefix == "C"

# 0/1 indicator columns for the two lettered BasketID types.
df_expl["BasketIDTypeA"] = is_type_a.astype("int64")
df_expl["BasketIDTypeC"] = is_type_c.astype("int64")

# BasketID with the leading type letter removed where there is one.
df_expl["NewBasketID"] = df_expl["BasketID"].where(
    ~(is_type_a | is_type_c),
    df_expl["BasketID"].str.slice(start=1),
)

# Correlate the numeric columns only: df_expl also holds string/datetime columns,
# which DataFrame.corr() no longer drops silently (it raises on pandas >= 2.0).
df_expl.select_dtypes(include="number").corr()

The BasketID of type C has a strong negative correlation with the sign of Qta.

In [None]:
# Distinct product descriptions appearing in type C baskets.
type_c_rows = df["BasketID"].str[0].eq("C")
df.loc[type_c_rows, "ProdDescr"].unique()

What could this mean for the C type? Probably indicates discounts/refunds.

In [None]:
# The cast only succeeds if every remaining BasketID is purely numeric.
df_expl = df_expl.astype({"NewBasketID": "int64"})
df_expl.info()

We notice that there are no more anomalies inside BasketID since it can be now converted to int64.

In [None]:
# BasketID is an identifier: keep it as a string now that the numeric check has passed.
df_expl = df_expl.astype({"NewBasketID": "string"})

Let's check if we now have less unique BasketIDs in our records, after removing the letter that identifies the type from the BasketID attribute.

In [None]:
# Compare the number of distinct ids before and after stripping the type letter.
n_original_ids = len(df_expl["BasketID"].unique())
n_current_ids = len(df_expl["NewBasketID"].unique())
print(f'The original number of unique BasketIDs is: {n_original_ids}')
print(f'The current number of unique BasketIDs is: {n_current_ids}')

The number is the same, therefore each BasketID of type A or C didn't merge with pre-existing shopping sessions.  
We can therefore replace the old naming scheme with the new one which doesn't contain letters.

In [None]:
# Overwrite BasketID with the letter-free ids and discard the helper column.
df_expl = (
    df_expl
    .assign(BasketID=df_expl["NewBasketID"])
    .drop(columns="NewBasketID")
)

#### Regarding the merger of type C records into standard records
It could prove useful to take into account the BasketDate and see if it would make sense to merge the type C records with the ones referencing the same item in a previous order from the same customer. (TODO)

### BasketDate
Let's see how the entries are distributed over time.

In [None]:
# One date per shopping session: the first record of every BasketID.
basket_dates = df.groupby(by=["BasketID"]).nth(0)["BasketDate"]

# Sturges' rule applied to the number of plotted baskets
# (the bin count was previously derived from the number of records, which is larger).
k = math.ceil(math.log(len(basket_dates), 2) + 1)
basket_dates.hist(bins=k, figsize=(10,5))
plt.title('Distribution of BasketIDs with respect to years')
plt.show()

The number of transactions increases month by month.

In [None]:
# Weekday numbers (0 = Monday ... 6 = Sunday) that actually occur in the data.
days = sorted(df["BasketDate"].dt.weekday.unique())
# Weekday number -> weekday name.
days_dict = dict(zip([0,1,2,3,4,5,6], ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]))

basketday = df.groupby(by=["BasketID"]).nth(0)["BasketDate"].dt.weekday #Series containing the day of the week of each BasketID
# Sort by weekday number first, then map to names, so the names stay in calendar order.
basketday = basketday.sort_values().transform(lambda x: days_dict[x])
# One unit-width bin per weekday present in the data.
basketday.hist(bins=np.arange(0, len(days)+1, 1), figsize=(10,5))
plt.title('Distribution of BasketIDs with respect to weekday')
plt.show()

Saturday and Sunday seem to be the least active days for the shops.

In [None]:
# Month numbers (1-12) that actually occur in the data.
months = sorted(df["BasketDate"].dt.month.unique())
# Month number -> month name.
months_dict = dict(zip([1,2,3,4,5,6, 7, 8, 9, 10, 11, 12], 
                       ["January", "February", "March", "April", 
                        "May", "June", "July", "August", 
                        "September", "October", "November", "December"]))

basketmonth = df.groupby(by=["BasketID"]).nth(0)["BasketDate"].dt.month #Series containing the month of each BasketID
# Sort by month number first, then map to names, so the names stay in calendar order.
basketmonth = basketmonth.sort_values().transform(lambda x: months_dict[x])
# One unit-width bin per month present in the data.
basketmonth.hist(bins=np.arange(0, len(months)+1, 1), figsize=(15,5))
plt.title('Distribution of BasketIDs with respect to month')
plt.show()

The sales peak happens in November and the least amount of sales occurs in December, presumably the customers plan their Christmas shopping in advance.

In [None]:
# Sale and Qta of every record plotted against its BasketDate, side by side.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))
fig.subplots_adjust(hspace=0.2, wspace=0.2)

for ax, column in zip(axes, ['Sale', 'Qta']):
    ax.scatter(df['BasketDate'], df[column], color='g', marker='*', label='Data')
    ax.set_xlabel('BasketDate')
    ax.set_ylabel(column)
    ax.tick_params(axis='x', labelrotation=90)  # vertical date labels

plt.show()

### Sale
We need to figure out if the Sale value refers to the cost of a single item or cost of item * Qta

In [None]:
# Look at a few records ordered by ProdID to compare Sale across different Qta values.
df.sort_values(by="ProdID").head()

It seems that Sale doesn't change if the Qta is changed.

In [None]:
# Correlate the numeric columns only: df also holds string/datetime columns,
# which DataFrame.corr() no longer drops silently (it raises on pandas >= 2.0).
df.select_dtypes(include="number").corr()

There doesn't seem to be a correlation in general between Sale and Qta; we can therefore conclude that Sale is the cost of a single item.

In [None]:
#Visualize the Sale distribution: histogram on the left, boxplot on the right
fig = plt.figure(figsize=(15, 5)) 
fig_dims = (1, 2)
fig.subplots_adjust(hspace=0.2, wspace=0.2)

plt.subplot2grid(fig_dims, (0, 0))
k = math.ceil(math.log(len(df["Sale"]), 2) + 1) #Sturge's rule
# Here k is used as the upper bin edge, not as the bin count: the bins are the
# unit-width intervals between 0 and k-1, so Sale values outside that range are not drawn.
df["Sale"].hist(bins=np.arange(0, k, 1))

plt.subplot2grid(fig_dims, (0, 1))
df.boxplot(column=["Sale"])
plt.show()

As expected the vast majority of Sale values are small.  
However, we need to check for 0 values, since they don't make sense in the context of Sale and should therefore be considered missing values.

In [None]:
# Number of records whose Sale is exactly 0.
# (DataFrame.size would return rows * columns and overstate the count.)
(df["Sale"] == 0).sum()

Almost a quarter of the Sale values are 0, this needs to be fixed in the Data Preparation phase.

### CustomerID

#### Regarding null CustomerIDs
Let's see why the number of non-null CustomerID entries is so low and if there are any interesting properties to be found.

In [None]:
# 0/1 indicator: is the CustomerID of the record missing?
df_expl["CustomerIDNull"] = df_expl["CustomerID"].isna().astype("int64")

# Correlate the numeric columns only: df_expl also holds string/datetime columns,
# which DataFrame.corr() no longer drops silently (it raises on pandas >= 2.0).
df_expl.select_dtypes(include="number").corr()["CustomerIDNull"]

No interesting correlation.  
Let's check if we can retrieve some missing CustomerIDs by using the records referencing the same BasketID.

In [None]:
def _has_null_and_known_customer(basket):
    """True when a basket mixes records with and without a CustomerID."""
    customer = basket["CustomerID"]
    return bool(customer.isna().any() and customer.notna().any())

# Number of records belonging to such baskets
# (len counts rows; DataFrame.size would count rows * columns).
len(df_expl.groupby(by="BasketID").filter(_has_null_and_known_customer))

There seems to be no intersection between records with the same BasketID but different CustomerIDNull value.

### Customer country

In [None]:
# Number of distinct baskets (orders) per country, shown as a bar chart.
basketid_country = df.groupby("CustomerCountry")["BasketID"].nunique()
basketid_country.plot.bar()
plt.show()

The majority of the operations take place in the United Kingdom.  
It could, however, be interesting to take into account the revenue by country and see which country is more profitable relative to its number of orders.

In [None]:
countryList = df["CustomerCountry"].sort_values().unique()

# Value of every record (unit price * quantity), kept as a separate Series so that
# df does not need a temporary column that has to be dropped again.
record_value = df["Sale"] * df["Qta"]
sale_by_country = record_value.groupby(df["CustomerCountry"]).sum()

# One row per country, built with a single groupby instead of a per-country loop.
# basketid_country (number of distinct baskets per country) comes from the previous cell.
country_df = pd.DataFrame({
    "Country": countryList,
    "TotalSale": sale_by_country.reindex(countryList).to_numpy(),
    "TotalOrders": basketid_country.reindex(countryList).to_numpy(dtype="float64"),
})

country_df["AvgSalePerOrder"] = country_df["TotalSale"]/country_df["TotalOrders"]

country_df.sort_values(by="AvgSalePerOrder", ascending=False).head(10)

In [None]:
# "Country" is a string column, so correlate the numeric columns only
# (DataFrame.corr() raises on non-numeric columns on pandas >= 2.0).
country_df.select_dtypes(include="number").corr()

### ProdID

#### ProdID interpretation  
Let's find out why this wasn't converted to a number.

In [None]:
# Records with ProdIDs containing only numbers
only_digits = df["ProdID"].str.isnumeric()
df.loc[only_digits, ["ProdID", "ProdDescr"]].value_counts()

In [None]:
# Records with ProdIDs containing only letters
only_letters = df["ProdID"].str.isalpha()
df.loc[only_letters, ["ProdID", "ProdDescr"]].value_counts()

In [None]:
# Records whose ProdID is a number followed by exactly one trailing letter
last_is_letter = df["ProdID"].str[-1:].str.isalpha()
rest_is_number = df["ProdID"].str[:-1].str.isnumeric()
term_letter_prodid = df.loc[last_is_letter & rest_is_number]
term_letter_prodid[["ProdID", "ProdDescr"]].sort_values(by="ProdID").value_counts()

The letters seem to represent different variations of the same item.  

Given the diversity and lack of structure of the ProdIDs, as can be seen in the different types listed above, there doesn't seem to be any interesting information to obtain for now. (TODO?)

#### Regarding ProdIDs in type C BasketID records
We would also like to get an idea of the percentage of ProdID referenced in type C records that are also referenced in standard type records.

In [None]:
# Row masks for type C records and for standard (neither A nor C) records.
in_type_c = df_expl["BasketIDTypeC"] == 1
in_standard = (df_expl["BasketIDTypeC"] == 0) & (df_expl["BasketIDTypeA"] == 0)

prodid_c_set = df_expl.loc[in_type_c, "ProdID"].sort_values().unique()
prodid_std_set = df_expl.loc[in_standard, "ProdID"].sort_values().unique()

# Share of the products seen in type C records that also appear in standard records.
intersection_size = len(set(prodid_c_set).intersection(prodid_std_set))

intersection_size/len(prodid_c_set)

We can therefore say that the vast majority of items referenced in C type records are also referenced in standard records, further strengthening the discount/refund hypothesis for C records.

### Qta

#### Regarding missing values

In [None]:
k = math.ceil(math.log(len(df["Qta"]), 2) + 1) #Sturge's rule
# Here k is used as the upper bin edge, not as the bin count: the bins are the
# unit-width intervals between 0 and k-1, so Qta values outside that range
# (including all negative ones) are not drawn.
df["Qta"].hist(bins=np.arange(0, k))
plt.show()

Let's check for 0 values.

In [None]:
# Number of records whose Qta is exactly 0.
# (DataFrame.size would return rows * columns rather than a record count.)
(df["Qta"] == 0).sum()

There are no records with Qta equal to 0, so we can assume that there are no records with missing values in the feature Qta.

#### Regarding negative Qta and type C BasketID

In [None]:
# Correlation of every numeric column with the sign of Qta. Only numeric columns are
# used because DataFrame.corr() raises on string/datetime columns on pandas >= 2.0.
df_expl.select_dtypes(include="number").corr()["QtaPositive"]

As noted in the BasketID section there is a strong correlation between the sign of Qta and a BasketID of type C.  
Let's see if there is some interesting distribution in the remaining negative quantities.

In [None]:
# Records with a negative quantity that do not belong to a type C basket.
negative_qta = df_expl["Qta"] < 0
not_type_c = df_expl["BasketIDTypeC"] == 0
neg_not_c = df_expl.loc[negative_qta & not_type_c]
neg_not_c.head()

Let's check if the trend of Sale equal 0 continues throughout the subset of records.

In [None]:
# Summary statistics of Sale for the negative-quantity records that are not type C.
neg_not_c["Sale"].describe()

It does.  
Let's check if all CustomerIDs in the subset are Null.

In [None]:
# CustomerIDNull is a 0/1 flag, so a minimum of 1 means every record in the subset lacks a CustomerID.
neg_not_c.describe()["CustomerIDNull"]

They are all Null, as can be deduced by the min value.  
It might be a good idea to remove this data in the Data preparation phase, since we don't care about records that do not describe a customer's behaviour.  
This way we will also have a correlation of 1 between the BasketID class C and negative quantities.

#### Regarding negative Qta and standard type BasketID

In [None]:
# Distinct product descriptions among the negative-quantity records that are not type C.
neg_not_c["ProdDescr"].unique()

Seems to indicate items that became unsellable for various reasons and got removed from the stocks.

In [None]:
# Summary statistics of the whole subset.
neg_not_c.describe()

Notice that they have no Sale value.

# Task 1.2: Data preparation

## Data cleanup

Remove type A BasketID entries since, as noted above, they don't specify a CustomerID and are concerned with the grocery store chain debt.

In [None]:
# Index labels of the records whose BasketID starts with "A", then drop them.
type_a_labels = df.index[df["BasketID"].str.get(0) == "A"]
df = df.drop(index=type_a_labels)

Remove records with negative Qta that do not reference a type C BasketID since they do not reference any CustomerID.  
Now all records with negative Qta are of type C.

In [None]:
# Index labels of the negative-quantity records that are not type C, then drop them.
not_type_c = df["BasketID"].str.get(0) != "C"
negative_qta = df["Qta"] < 0
df = df.drop(index=df.index[not_type_c & negative_qta])

Since we don't seem to have a way to retrieve the CustomerID in case it is missing let's delete all entries without CustomerID.

In [None]:
# Index labels of the records without a CustomerID, then drop them.
missing_customer_labels = df.index[df["CustomerID"].isna()]
df = df.drop(index=missing_customer_labels)

## Missing values

The missing values will be replaced by the median of the Sale values obtained by grouping with respect to ProdID.

In [None]:
def _fill_zero_sale(sale):
    """Replace the zero (i.e. missing) prices of one product with the median of its known prices.

    The median is taken over the non-zero values only, so the missing entries do not
    pull the imputed price towards 0. If a product has no non-zero price at all there
    is nothing to impute from and the values are returned unchanged.
    """
    known_median = sale[sale != 0].median()
    if pd.isna(known_median):
        return sale
    return sale.mask(sale == 0, known_median)

# The previous call passed method='ffill' together with value=...; when a value is given
# the method is ignored, and the keyword is deprecated in recent pandas releases.
df['Sale'] = df.groupby(['ProdID'])['Sale'].transform(_fill_zero_sale)

## Remove outliers
Using the IQR method.

In [None]:
# Quartiles of the two numeric features that are filtered below. The columns are
# selected explicitly because df also holds string/datetime columns, on which
# DataFrame.quantile() raises on pandas >= 2.0.
Q1 = df[["Sale", "Qta"]].quantile(0.25)
Q3 = df[["Sale", "Qta"]].quantile(0.75)
IQR = Q3 - Q1

Remove the outliers in Sale.

In [None]:
# Boxplot of Sale before (left) and after (right) removing the IQR outliers.
fig, (ax_before, ax_after) = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))
fig.subplots_adjust(hspace=0.2, wspace=0.2)

df.boxplot("Sale", ax=ax_before)

# Keep only the records within [Q1 - 1.5*IQR, Q3 + 1.5*IQR] (bounds included).
sale_low = Q1["Sale"] - 1.5*IQR["Sale"]
sale_high = Q3["Sale"] + 1.5*IQR["Sale"]
df = df.loc[df["Sale"].between(sale_low, sale_high)]

df.boxplot("Sale", ax=ax_after)
plt.show()

Remove the outliers in Qta.

In [None]:
# Boxplot of Qta before (left) and after (right) removing the IQR outliers.
fig, (ax_before, ax_after) = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))
fig.subplots_adjust(hspace=0.2, wspace=0.2)

df.boxplot("Qta", ax=ax_before)

# Keep only the records within [Q1 - 1.5*IQR, Q3 + 1.5*IQR] (bounds included).
qta_low = Q1["Qta"] - 1.5*IQR["Qta"]
qta_high = Q3["Qta"] + 1.5*IQR["Qta"]
df = df.loc[df["Qta"].between(qta_low, qta_high)]

df.boxplot("Qta", ax=ax_after)
plt.show()

## Customer features

Let's add some new features into the data frame.  
Starting with the requested ones.

In [None]:
# One row per customer. Records without a CustomerID were already dropped during the
# cleanup, so there is no trailing NaN to strip: slicing off the last element would
# silently discard a real customer. dropna() keeps this robust either way.
unq_cust_id = df["CustomerID"].dropna().sort_values().unique()
cust_df = pd.DataFrame(data=unq_cust_id, columns=["CustomerID"]) #Dataframe containing customer features

cust_df["CustomerID"] = cust_df["CustomerID"].astype("object")

#Total number of items bought by customer
IFeature = df.groupby(["CustomerID"]).Qta.sum().rename("I")
cust_df = cust_df.join(IFeature, on="CustomerID")

#Total number of unique items bought by customer
IuFeature = df.groupby(["CustomerID"]).ProdID.nunique().rename("Iu")
cust_df = cust_df.join(IuFeature, on="CustomerID")

#Max number of item bought by customer across all shopping sessions
BasketIDQtaSum = df.groupby(["CustomerID", "BasketID"]).Qta.sum()
ImaxFeature = BasketIDQtaSum.groupby(level="CustomerID").max().rename("Imax")
cust_df = cust_df.join(ImaxFeature, on="CustomerID")

#The Shannon entropy on the purchasing behaviour of the customer: days of the week of shopping
#Number of records per (customer, weekday)
weekday_rank = df.groupby(["CustomerID", df["BasketDate"].dt.weekday]).size()
#Probabilities are normalised within each customer so that they sum to 1 per customer;
#dividing by the grand total would make the "entropy" grow with the customer's record count
probSeries = weekday_rank / weekday_rank.groupby(level="CustomerID").transform("sum")
logSeries = np.log2(probSeries)
entropy = -1*probSeries*logSeries
EFeature = entropy.groupby(level="CustomerID").sum()
EFeature.name = "Eweekday"
cust_df = cust_df.join(EFeature, on="CustomerID")

Add some additional ones.  
customer shoppings per day (TODO?)

In [None]:
def _group_revenue(rows):
    """Total value (Sale * Qta) of the records in one group."""
    return (rows["Sale"]*rows["Qta"]).sum()

# Revenue per customer and calendar month, then the mean over the months
# in which the customer has at least one record.
customer_month = df.groupby(["CustomerID", pd.Grouper(key="BasketDate", freq="M")])
tot_sale_month = customer_month.apply(_group_revenue)

sale_by_customer = tot_sale_month.groupby(["CustomerID"])
n_month_cust = sale_by_customer.size()
AvgFeature = (sale_by_customer.sum()/n_month_cust).rename("AvgExpendMonth")

cust_df = cust_df.join(AvgFeature, on="CustomerID")

We can now safely drop the CustomerID column.

In [None]:
# The identifier is not a feature: remove it before analysis and clustering.
cust_df = cust_df.drop(columns="CustomerID")

## Feature analysis

In [None]:
# dtypes and non-null counts of the customer features.
cust_df.info()

In [None]:
# Pairwise correlation of the customer features.
cust_df.corr()

It seems that the higher the number of unique items bought by a customer the higher the variety of weekdays in which a shopping session takes place.  
A similar but weaker relation is present between the total number of items bought and Eweekday, possibly skewed by the correlation between Iu and I.  
It also seems that there is a significant correlation between the max number of items bought and the average expenditure per month of the customer.

In [None]:
# Pairwise scatter plots of the customer features.
pd.plotting.scatter_matrix(cust_df, figsize=(10,10))
plt.show()

Another interesting correlation (0.737407) is the one between the features AvgExpendMonth and Imax, this could suggest to us that the customers with the biggest expenditures tend to buy more items per session (further analysis required). (TODO)

## Remove customer related outliers

In [None]:
# Per-feature quartiles and inter-quartile range of the customer features.
Q1, Q3 = (cust_df.quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

In [None]:
# Keep a customer only if every feature lies within [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
# The bounds are fixed in advance, so combining the per-feature masks gives the same
# rows as filtering the frame one feature at a time.
within_bounds = pd.Series(True, index=cust_df.index)
for feature in cust_df.columns:
    lower = Q1[feature] - 1.5*IQR[feature]
    upper = Q3[feature] + 1.5*IQR[feature]
    within_bounds &= cust_df[feature].between(lower, upper)

cust_df = cust_df.loc[within_bounds]

Final result

In [None]:
# Pairwise scatter plots of the customer features after outlier removal.
pd.plotting.scatter_matrix(cust_df, figsize=(10,10))
plt.show()

In [None]:
# Pairwise correlation of the customer features after outlier removal.
cust_df.corr()

The correlation between Iu and Eweekday is even stronger after removing the outliers.  
In order to reduce the dimensionality of the data we will drop Eweekday given the strong similarity to Iu.

In [None]:
# Remove the Eweekday feature before clustering.
cust_df = cust_df.drop(columns="Eweekday")

# Task 2: Clustering

## Normalization  
Z-scaler

In [None]:
# Standardise every feature to zero mean and unit variance; the fitted scaler is kept
# so that cluster centres can be mapped back to the original units later.
scaler = StandardScaler()
X = scaler.fit_transform(cust_df.values)

## K-Means

Let's use the Knee method to find the best k.

In [None]:
# SSE (inertia) of K-Means for every k from 2 to max_k, used for the knee plot.
sse_list = list()
max_k = 40
for k in range(2, max_k + 1): #Starting from k=2 to k=40
    # A fixed random_state makes the centroid initialisation, and hence the curve,
    # reproducible across runs.
    kmeans = KMeans(n_clusters=k, n_init=10, max_iter=100, random_state=0)
    kmeans.fit(X)

    sse = kmeans.inertia_
    sse_list.append(sse)

In [None]:
# SSE against k; the first entry of sse_list corresponds to k=2.
k_values = [2 + offset for offset in range(len(sse_list))]
plt.plot(k_values, sse_list)
plt.xlabel('K', fontsize=22)
plt.ylabel('SSE', fontsize=22)
plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()

It seems that around K=10 ± 5 we get diminishing returns.

In [None]:
# Final model with k=10. A fixed random_state keeps the cluster labels stable across runs.
kmeans = KMeans(n_clusters=10, n_init=10, max_iter=100, random_state=0)
kmeans.fit(X)
# Cluster centres mapped back from standardised space to the original feature units.
centers = scaler.inverse_transform(kmeans.cluster_centers_)

In [None]:
# Number of points per cluster label: one unit-width bin per distinct label.
n_labels = len(set(kmeans.labels_))
bins = range(0, n_labels + 1)
hist, bins = np.histogram(kmeans.labels_, bins=bins)
# zip stops at the shorter sequence, so the final (right-most) edge is not used as a key.
dict(zip(bins, hist))

In [None]:
# Pairwise scatter plots of the customer features, coloured by K-Means cluster label.
pd.plotting.scatter_matrix(cust_df, c=kmeans.labels_, figsize=(20,20))
plt.show()

In [None]:
# One line per cluster centre across the features (original units).
plt.figure(figsize=(15, 5))
for cluster_id, center in enumerate(centers):
    plt.plot(center, marker='o', label=f'Cluster {cluster_id}')
plt.tick_params(axis='both', which='major', labelsize=10)
feature_positions = range(0, len(cust_df.columns))
plt.xticks(feature_positions, cust_df.columns, fontsize=18)
plt.legend(fontsize=10)
plt.show()


In [None]:
# Radar (spider) plot with one polygon per cluster centre.
# Number of variables, i.e. number of axes of the plot
N = len(cust_df.columns)
# Angle of each axis: the full circle divided evenly among the variables
axis_angles = [n / float(N) * 2 * math.pi for n in range(N)]

# Initialise the spider plot
plt.figure(figsize=(10, 10))
ax = plt.subplot(polar=True)

for center in centers:
    # Repeat the first point at the end so that the polygon is closed
    values = center.tolist()
    values = values + values[:1]
    angles = axis_angles + axis_angles[:1]

    # Draw one axis per variable and label it with the feature name
    plt.xticks(angles[:-1], cust_df.columns, color='grey', size=8)
    # Outline of the cluster centre
    ax.plot(angles, values, linewidth=1, linestyle='solid')
    # Translucent fill of the polygon
    ax.fill(angles, values, 'b', alpha=0.1)

### PCA analysis

In [None]:
# Project the standardised features onto their first two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X) #We need the normalised data
print(pca.explained_variance_ratio_) #Variance explained by the components

We are retaining approximately 86% of the variance.  
Let's apply the elbow method to the PCA transformed data.

In [None]:
# SSE (inertia) of K-Means on the PCA-projected data for every k from 2 to max_k.
sse_list = list()
max_k = 40
for k in range(2, max_k + 1): #Starting from k=2 to k=40
    # A fixed random_state makes the centroid initialisation, and hence the curve,
    # reproducible across runs.
    kmeans = KMeans(n_clusters=k, n_init=10, max_iter=100, random_state=0)
    kmeans.fit(X_pca)

    sse = kmeans.inertia_
    sse_list.append(sse)

In [None]:
# SSE against k for the PCA-projected data; the first entry of sse_list corresponds to k=2.
k_values = [2 + offset for offset in range(len(sse_list))]
plt.plot(k_values, sse_list)
plt.xlabel('K', fontsize=22)
plt.ylabel('SSE', fontsize=22)
plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()

The optimal value is still k=10.

In [None]:
# K-Means with k=10 on the two principal components.
# A fixed random_state keeps the cluster labels stable across runs.
pca_kmeans = KMeans(n_clusters=10, n_init=10, max_iter=100, random_state=0)
pca_kmeans.fit(X_pca)

In [None]:
# The two principal components, coloured by the cluster assigned on the PCA data.
fig, ax = plt.subplots(figsize=(10, 10))
first_component, second_component = X_pca[:, 0], X_pca[:, 1]
ax.scatter(first_component, second_component, c=pca_kmeans.labels_, edgecolor='k', s=40)
ax.set_title("PCA")
ax.set_xlabel("1st eigenvector")
ax.set_ylabel("2nd eigenvector")
plt.show()

The clusters are well separated.  
We now have a new categorical feature to help us analyse the pre-PCA dataset.