<a href="https://colab.research.google.com/github/apriandito/workshop-fmcg-2/blob/main/python/002_clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

*Muhammad Apriandito - FMCG Workshop*


---



### **Load Packages and Modules**

In [None]:
# Load Packages
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load Modules
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

### **Load Data**

In [None]:
# Read the nutrition dataset (CSV) from the workshop's GitHub repository
data_url = "https://raw.githubusercontent.com/apriandito/workshop-fmcg-2/main/data/nutrition.csv"
df = pd.read_csv(data_url)

In [None]:
# Show Data: a bare DataFrame as the last expression of a cell is rendered by the notebook
df

In [None]:
# Show Data information: DataFrame.info() prints the index range, each column's
# non-null count and dtype, and the frame's memory usage
df.info()

In [None]:
# Select the calories and saturated_fat variables for clustering.
# .copy() gives df_cluster its own data, so the in-place scaling done in the
# pre-processing step does not write into a slice of df (SettingWithCopyWarning).
df_cluster = df[['calories', 'saturated_fat']].copy()
df_cluster

In [None]:
# Scatter plot of the two selected clustering variables
sns.scatterplot(
    data=df_cluster,
    x='calories',
    y='saturated_fat',
)

#### **Pre-Processing**

In [None]:
# Set Name for StandardScaler as scaler
scaler = StandardScaler()

# Name the feature columns explicitly instead of reading df_cluster.columns:
# a later cell adds a 'cluster' column to df_cluster, and on a re-run of this
# cell that label column would otherwise be standardized as if it were a feature.
column_names = ['calories', 'saturated_fat']

# Work on an independent copy so the assignment below never writes into a
# slice of df (avoids SettingWithCopyWarning).
df_cluster = df_cluster[column_names].copy()

# Fit Standardization: fit_transform learns each column's mean and standard
# deviation and returns the rescaled values
df_cluster[column_names] = scaler.fit_transform(df_cluster[column_names])

# Rebind instead of sorting in place so the cell does not mutate shared state
df_cluster = df_cluster.sort_index()
df_cluster

#### **Search for the Optimum Number of Clusters (k)**

In [None]:
# Transform Data Frame to Numpy Array.
# Only the two feature columns are taken: a later cell adds a 'cluster' column
# to df_cluster, and re-running this cell must not feed those labels to K-Means.
np_cluster = df_cluster[['calories', 'saturated_fat']].to_numpy()

# Elbow Method: fit K-Means for k = 1..10 and record the within-cluster sum of
# squares (inertia_) of each fit. KMeans comes from the import cell at the top.
k_values = range(1, 11)
wcss = []
for i in k_values:
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(np_cluster)
    wcss.append(kmeans.inertia_)

# Visualize: the "elbow" is the k after which wcss stops dropping sharply
plt.plot(k_values, wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('wcss')
plt.show()

In [None]:
# Silhouette Method: score the K-Means labelling for k = 2..10 (the silhouette
# coefficient needs at least two clusters, hence the range starts at 2).
# random_state and n_init are pinned, using the same settings as the elbow
# search, so the printed scores are reproducible from run to run.
for n_cluster in range(2, 11):
    kmeans = KMeans(n_clusters=n_cluster, init='k-means++', max_iter=300, n_init=10, random_state=0).fit(np_cluster)
    label = kmeans.labels_
    sil_coeff = silhouette_score(np_cluster, label, metric='euclidean')
    print('For n_clusters={}, The Silhouette Coefficient is {}'.format(n_cluster, sil_coeff))

#### **K-Means Clustering**

In [None]:
# Apply the K-Means Model to the Data with k=2.
# random_state and n_init are pinned so the cluster labels are identical on
# every run; without them the initialization (and the labels) can change.
kmeans = KMeans(n_clusters=2, init='k-means++', max_iter=300, n_init=10, random_state=0)
cluster = kmeans.fit_predict(np_cluster)

In [None]:
# Add the cluster labels as a new column of the clustering frame.
# Note: df_cluster holds the standardized values at this point, not the raw
# data in df. assign() returns a new frame instead of mutating in place.
df_cluster = df_cluster.assign(cluster=cluster)
df_cluster

In [None]:
# Scatter plot of the clustered points, coloured by their assigned cluster label
sns.scatterplot(data=df_cluster, x='calories', y='saturated_fat', hue="cluster")