# **TRAINING A CLUSTER MODEL**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Load the training data from the project-relative Data directory.
# Forward slashes are used because they resolve on Windows as well as on
# Linux/macOS, whereas a backslash-separated path only works on Windows.
df = pd.read_excel('Data/Task1and2/train.xlsx')

# Remove the 'target' column so it is not part of the data used below.
df = df.drop(columns=['target'])

# Preview the first rows of the remaining columns.
print(df.head())

   T1  T2  T3  T4  T5  T6  T7  T8  T9  T10  T11  T12  T13  T14  T15  T16  T17  \
0 -70 -61 -66 -53 -51 -63 -82 -57 -76  -78  -66  -66  -61  -59  -73  -75  -63   
1 -77 -74 -71 -76 -65 -63 -66 -52 -55  -75  -72  -75  -74  -61  -64  -63  -53   
2 -53 -38 -55 -66 -62 -62 -65 -70 -62  -52  -56  -53  -66  -68  -72  -60  -68   
3 -72 -62 -59 -65 -65 -65 -78 -82 -83  -59  -84  -60  -64  -83  -69  -72  -95   
4 -67 -69 -65 -63 -59 -53 -70 -72 -71  -60  -61  -57  -54  -76  -61  -66  -71   

   T18  
0  -77  
1  -63  
2  -77  
3  -73  
4  -80  


In [21]:
from sklearn.metrics import silhouette_score

# Number of clusters
k = 180

# Cluster on the feature columns only. This cell stores its result in
# df['Cluster'], so on a re-run that column already exists; excluding it here
# keeps earlier labels from being fed back in as a feature and makes the cell
# give the same result however many times it is executed.
features = df.drop(columns=['Cluster'], errors='ignore')

# Creating KMeans model with k clusters (fixed seed for reproducibility)
kmeans = KMeans(n_clusters=k, random_state=42)
# Fitting the model to the feature data
kmeans.fit(features)

# Printing silhouette score, computed on the same features the model was fit on
silhouette_avg = silhouette_score(features, kmeans.labels_)
print(f"Silhouette Score for {k} clusters:", silhouette_avg)

# Adding (or overwriting) the column indicating the cluster for each row
df['Cluster'] = kmeans.labels_

# Printing DataFrame with cluster information
print(df)

Silhouette Score for 180 clusters: 0.6795622751155713
       T1  T2  T3  T4  T5  T6  T7  T8  T9  T10  T11  T12  T13  T14  T15  T16  \
0     -70 -61 -66 -53 -51 -63 -82 -57 -76  -78  -66  -66  -61  -59  -73  -75   
1     -77 -74 -71 -76 -65 -63 -66 -52 -55  -75  -72  -75  -74  -61  -64  -63   
2     -53 -38 -55 -66 -62 -62 -65 -70 -62  -52  -56  -53  -66  -68  -72  -60   
3     -72 -62 -59 -65 -65 -65 -78 -82 -83  -59  -84  -60  -64  -83  -69  -72   
4     -67 -69 -65 -63 -59 -53 -70 -72 -71  -60  -61  -57  -54  -76  -61  -66   
...    ..  ..  ..  ..  ..  ..  ..  ..  ..  ...  ...  ...  ...  ...  ...  ...   
36747 -68 -62 -64 -59 -59 -71 -67 -73 -73  -69  -71  -75  -56  -68  -68  -75   
36748 -56 -50 -57 -50 -59 -60 -62 -69 -58  -64  -49  -61  -58  -56  -60  -58   
36749 -42 -58 -72 -70 -75 -72 -76 -84 -84  -59  -65  -68  -65  -81  -86  -70   
36750 -50 -50 -70 -64 -64 -68 -68 -73 -79  -58  -61  -73  -67  -77  -73  -74   
36751 -66 -64 -70 -62 -59 -62 -53 -58 -64  -80  -73  -83  -67  -61

In [22]:
import plotly.express as px
from sklearn.decomposition import PCA

# Project the feature columns (everything except the cluster labels) onto
# three principal components.
component_names = ['PC1', 'PC2', 'PC3']
feature_matrix = df.drop(columns=['Cluster'])
pca = PCA(n_components=3, random_state=42)
X_pca = pca.fit_transform(feature_matrix)

# Tabulate the components and attach each row's cluster label.
pca_df = pd.DataFrame(data=X_pca, columns=component_names)
pca_df['Cluster'] = df['Cluster']

# 3D scatter of the components, coloured by cluster label.
fig = px.scatter_3d(
    pca_df,
    x='PC1',
    y='PC2',
    z='PC3',
    color='Cluster',
    opacity=0.7,
    title='PCA Visualization of Clusters',
    labels={'Cluster': 'Cluster'},
)

# Set the scene's axis titles explicitly.
axis_titles = dict(xaxis_title='PC1', yaxis_title='PC2', zaxis_title='PC3')
fig.update_layout(scene=axis_titles)

# Write the figure to an HTML file in the working directory.
fig.write_html('pca_clusters.html')