# K-Means Clustering


In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
from collections import Counter
import seaborn as sns
import os
%matplotlib inline

In [None]:
# Ask the user for file(s) via google.colab's upload helper.
# `uploaded` is indexed by file name in the next cell to get the CSV content.
from google.colab import files
uploaded = files.upload()

In [None]:
import io

# Pull the uploaded CSV payload out by file name, wrap it in an in-memory
# binary buffer and let pandas parse it into the working DataFrame.
csv_payload = uploaded['OnlineNewsPredition_Reduced.csv']
df = pd.read_csv(io.BytesIO(csv_payload))

In [None]:
# Show every column name of the loaded dataset as a plain list
df.columns.tolist()

In [None]:
# Build the feature frame: everything in df except the "shares" column

df_num = df.drop(columns=["shares"])
df_num.columns.tolist()

In [None]:
# Elbow method: fit K-Means for k = 1..9 and record each model's inertia
# (within-cluster sum of squared distances) so the curve can be plotted.
sse = []
k_rng = range(1,10)
for k in k_rng:
    # n_init is pinned (same value as the final model fitted further down) and
    # the seed is fixed, so the curve is identical on every run and does not
    # depend on the scikit-learn version's default for n_init.
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    km.fit(df_num)
    sse.append(km.inertia_)

In [None]:
# Elbow plot: inertia (sum of squared error) against the number of clusters K
plt.xlabel('K')
plt.ylabel('Sum of squared error')
plt.plot(k_rng, sse)

# Create a new dataframe to add the new cluster variable.
# .copy() makes it a real, independent frame; plain assignment would only bind
# a second name to df_num, so columns added to one would appear in the other.
df_num1 = df_num.copy()

In [None]:
# Create a KMeans object with k=4.
# n_init and random_state are given explicitly so the cluster labels are the
# same on every run (K-Means starts from random centroids, and label numbers
# are otherwise free to permute between fits).
km = KMeans(n_clusters=4, n_init=10, random_state=42)

# Fit the model to the dataset and get the cluster index of every row
y_predicted = km.fit_predict(df_num)
y_predicted

In [None]:
# Create a new dataframe to hold the cluster variable.
# .copy() is essential: `df_num_cluster = df_num` would only alias df_num, so
# adding the 'cluster' column would also add it to df_num, and every later
# fit on df_num (KMeans, PCA) would treat the cluster label as a feature.
df_num_cluster = df_num.copy()

In [None]:
# Store each row's predicted cluster index in a new 'cluster' column of
# df_num_cluster, then preview the first rows.
# NOTE(review): the assignment is in place, so any other name bound to the
# same DataFrame object sees the new column as well.

df_num_cluster['cluster']=y_predicted
df_num_cluster.head()

In [None]:
# Cluster label assigned to each data point by the fitted model `km`
km.labels_

### Check the total count of observations belonging to each cluster


In [None]:
# Tally how many observations carry each cluster label
Counter(km.labels_)

# Analysing the underlying features in clusters

In [None]:
# Refit K-Means on df_num with k clusters; the centroids of this model are
# analysed in the following cells.
k = 4
# n_init=10 runs ten random initialisations and keeps the one with the lowest
# inertia; it does NOT make the result repeatable on its own. random_state
# fixes the seed, which is what keeps the centroids from changing between runs.
km = KMeans(n_clusters=k, n_init=10, random_state=42)
km.fit(df_num)

In [None]:
# Cluster centres of the fitted model (one row per cluster)
centroids = km.cluster_centers_

# Tabulate the centres and label the columns exactly like df_num
centroids_df = pd.DataFrame(centroids)
centroids_df.columns = df_num.columns


In [None]:
# Heading line followed by the centroid table
print("Centroids:", centroids_df, sep="\n")

In [None]:
# Perform PCA to reduce dimensionality for visualization:
# fit a 2-component PCA on df_num and keep the projected coordinates of
# every row in reduced_data.
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(df_num)

## Visualizing clusters

In [None]:
# Two PCA coordinates per observation, tagged with the K-Means cluster label
results = pd.DataFrame(
    reduced_data,
    columns=['pca1', 'pca2'],
).assign(cluster=km.labels_)


In [None]:
# Set the style and color palette (one colour per cluster)
sns.set(style="whitegrid")
color_palette = sns.color_palette("husl", k)

# Scatter plot of the observations in the PCA plane, coloured by cluster
plt.figure(figsize=(10, 6))

for i in range(k):
    # Select this cluster's rows once instead of filtering per coordinate
    members = results[results['cluster'] == i]
    plt.scatter(
        members['pca1'],
        members['pca2'],
        label=f'Cluster {i+1}',
        c=[color_palette[i]],
        alpha=0.6,
        edgecolors='black',
        linewidth=1
    )

# Add centroids to the scatter plot.
# The centroids are projected with the same fitted PCA. The column-labelled
# frame is passed rather than the raw array because `pca` was fitted on a
# DataFrame, and scikit-learn warns when transform() gets input without
# feature names.
reduced_centroids = pca.transform(centroids_df)
for i in range(k):
    plt.scatter(
        reduced_centroids[i, 0],
        reduced_centroids[i, 1],
        marker='*',
        s=200,
        c=[color_palette[i]],
        label=f'Centroid {i+1}'
    )

plt.xlabel('PCA 1')
plt.ylabel('PCA 2')
plt.title('K-Means Clustering Visualization with PCA')
plt.legend()
plt.show()
