<a href="https://colab.research.google.com/github/carmenpelayo/Carmen-Pelayo/blob/main/Spherical_KMeans_with_Cosine_Distance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# "Spherical KMeans with Cosine Distance"
> Implementing kmeans with cosine distance

# Imports

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn.functional as F
from tqdm.auto import tqdm

# Algorithm
The following shows my kmeans implementation. The steps are as follows:
1. Choose `K_clusters` points from our dataset randomly and set them as our initial centroids.
2. Iterate through all datapoints and assign each point to one of the centroids.
3. Recalculate each centroid by averaging the datapoints assigned to its cluster. As an additional step compared with standard kmeans, normalize each centroid to unit length.
4. Repeat from step 2, for `epochs` iterations.

In [None]:
class SKMeans:
    """Spherical k-means: cluster the rows of a matrix around unit-length centroids.

    Each row is assigned to the centroid with which it has the largest dot
    product. Because the centroids are L2-normalised, that is the centroid
    with the highest cosine similarity to the row. Centroids are then
    recomputed as the normalised sum of their assigned rows.
    """

    def __init__(self, K_clusters, epochs = 10, alpha = 0.05):
        """Store the hyper-parameters and reset all learned state.

        Args:
            K_clusters: number of centroids to learn.
            epochs: default number of update iterations used by ``fit``.
            alpha: stored on the instance; not read by ``fit``.
        """
        self.K_clusters = K_clusters
        self.epochs = epochs
        self.W = None           # (K_clusters, d) unit-norm centroids, set by fit
        self.index = None       # best-centroid id per row of X, from the last epoch
        self.similarities = []  # mean best-centroid similarity, one entry per epoch
        self.std = []           # kept for compatibility; fit never appends to it
        self.nx = None          # share of rows assigned to each centroid, set by fit
        self.alpha = alpha


    def fit(self, X, epochs = None):
        """Run the centroid updates on ``X`` and store the result on the instance.

        The first call draws ``K_clusters`` distinct rows of ``X`` (via
        ``np.random.choice``) as initial centroids. Later calls continue from
        the centroids stored in ``self.W``, so training can be resumed.

        Args:
            X: 2-D floating-point tensor of shape (n, d), one sample per row.
                Rows are not normalised here, so the recorded similarity is
                the dot product of each row with its unit-norm centroid.
            epochs: number of iterations; ``None`` (or 0) falls back to
                ``self.epochs``.

        Raises:
            ValueError: if the resolved number of epochs is not positive.
        """

        # Initialization
        n = X.size(dim=0)
        # `is None` is required: truth-testing a multi-element tensor raises,
        # which made every second call to fit() crash.
        if self.W is None:
            idx = np.random.choice(len(X), self.K_clusters, replace=False)
            W = F.normalize(X[idx])
        else:
            W = self.W
        if not epochs:
            epochs = self.epochs
        if not epochs or epochs < 1:
            # Without at least one pass there are no assignments to store.
            raise ValueError("epochs must be a positive integer")

        # Train
        for ep in tqdm(range(epochs)):
            Z = X @ W.T                       # (n, K) similarity to every centroid
            Zx, Ix = Z.max(dim=-1)            # best similarity / centroid id per row
            # (K, n) membership mask. A row that ties exactly between centroids
            # is counted for each of them. Cast to X's dtype so float64 inputs
            # can be multiplied with it.
            S = torch.eq(Zx, Z.T).to(X.dtype)
            W = F.normalize(S @ X)            # sum members, project back to unit norm
            St = Zx.sum()/n # Similarity @ t
            self.similarities.append(St.item())
        self.index = Ix
        self.W = W
        self.nx = S.sum(dim=1)/n

# Regions

In this section, the `regions.xlsx` dataset is used to group European regions into 8 clusters. The dataset describes 270 NUTS-2 regions through 21 socio-economic parameters.

In [None]:
import pandas as pd

# Both tables live in the same workbook: the 'vectors' sheet is read into
# reg_vect and the 'info' sheet into reg_info.
regions_workbook = 'regions.xlsx'
reg_vect = pd.read_excel(regions_workbook, sheet_name='vectors')
reg_info = pd.read_excel(regions_workbook, sheet_name='info')

In [None]:
# Feature matrix: every column after the first one, converted to float64.
X = reg_vect.to_numpy()[:, 1:].astype(np.float64)

# Standardise each column (subtract its mean, divide by its standard
# deviation) and convert to float32 for clustering.
features = torch.from_numpy(X)
centered = features - features.mean(dim=0)
Xn = (centered / centered.std(dim=0)).float()

# Fit 8 clusters and plot the per-epoch similarity recorded during training.
skmeans = SKMeans(8)
skmeans.fit(Xn)
plt.figure(1)
plt.plot(skmeans.similarities)

In [None]:
# Attach the cluster id of every row; converting the tensor to a NumPy array
# hands pandas a plain integer column.
reg_vect['index'] = skmeans.index.cpu().numpy()

# Build the cluster table as a chain of new frames. The earlier version used
# in-place calls on a slice (`sort_values`) and on a selected column
# (`reg_index['Region'].replace(..., inplace=True)`); the latter is a chained
# in-place update that pandas warns about and that silently does nothing under
# Copy-on-Write.
reg_index = (
    reg_vect[['NUTS 2 Code', 'index']]
    .sort_values(by=['index'])
    .merge(reg_info, how='left', left_on='NUTS 2 Code', right_on='Region')
    .loc[:, ['index', 'Region', 'Region Name', 'geometry', 'Country Name']]
    # Treat blank region codes as missing so they are dropped together with
    # the rows that found no match in reg_info.
    .assign(Region=lambda frame: frame['Region'].replace('', np.nan))
    .dropna(subset=['Region'])
    # Rename single columns
    .rename(columns={'index' : 'Cluster', 'Region': 'NUTS_ID', 'geometry': 'center'})
)
reg_index.to_excel("reg_index.xlsx", index=False)
print(reg_index)

In [None]:
!pip install geemap
!pip install geopandas

In [None]:
import geopandas as gpd
from matplotlib import pyplot as plt

In [None]:
# Read the NUTS boundary file and keep only the rows whose LEVL_CODE is 2.
nuts = gpd.read_file('NUTS_RG_20M_2021_4326-2.json')
is_level_2 = nuts['LEVL_CODE'] == 2
nuts2 = nuts[is_level_2]

# Reload the cluster table exported earlier and join it to the shapes on the
# shared NUTS_ID column; regions without a cluster are left out (inner join).
df_vects = pd.read_excel('reg_index.xlsx')
df = pd.merge(left=nuts2, right=df_vects, how='inner', on='NUTS_ID')

# Colour every region by its cluster id.
df.plot(column='Cluster', cmap="rainbow", figsize=(15, 10))