In [208]:
from scipy.cluster.hierarchy import fclusterdata,ward
from scipy.spatial.distance import pdist
from scipy.stats import spearmanr
import numpy as np

In [49]:
X = [[0, 0], [0, 1], [1, 0],
...      [0, 4], [0, 3], [1, 4],
...      [4, 0], [3, 0], [4, 1],
...      [4, 4], [3, 4], [4, 3]]
X = np.asarray(X)

In [211]:
x = [1, 2, 3]
x_corr = [2, 4, 6]
corr, p_value = spearmanr(x, x_corr)
p_value

0.0

In [213]:
import random
import math
def normalized(ar):
    return (ar - np.average(ar))/np.std(ar)

def cosine_similarity(ar1, ar2):
    numerator = np.sum(ar1 * ar2)
    denominator = np.sqrt(np.sum(ar1 * ar1) * np.sum(ar2 * ar2))
    if denominator != 0:
        return numerator / denominator
    else:
        return 0
    
def eucldist(p0,p1):
    dist = 0.0
    for i in range(0,len(p0)):
        dist += (p0[i] - p1[i])**2
    return math.sqrt(dist)

def manhattan(p0,p1):
    dist =0.0
    for i in range(0,len(p0)):
        dist += abs(p0[i]-p1[i])
    return dist

def chebysheb(p0,p1):
    dist = 0.0
    for i in range(0,len(p0)):
        dist = max(p0[i]-p1[i],dist)
    return dist

def pearson(p0,p1):
    return 1 - cosine_similarity(normalized(p0),normalized(p1))

def spearman(p0,p1):
    corr, p_value = spearmanr(p0,p1)
    return 1 - corr
    
def fordist(metric,p0,p1):
    if metric=="euclidean":
        return eucldist(p0,p1)
    elif metric=="manhattan":
        return manhattan(p0,p1)
    elif metric=="chebyshev":
        return chebysheb(p0,p1)
    elif metric=="pearson":
        return pearson(p0,p1)
    elif metric=="spearman":
        return spearman(p0,p1)


In [243]:
def cost_function(cluster_centers,cluster,datapoints):
    n = datapoints.shape[0]
    cost = 0.0
    print(n)
    for i in range(n):
        cost += eucldist(cluster_centers[cluster[i]],datapoints[i])
    print("MSE LOSS :",format(cost/n))
    return np.sqrt(cost)/n

In [248]:
def kmeans(k,datapoints,metric):

    d = len(datapoints[0]) 
    
    #Limit our iterations
    Max_Iterations = 30
    i = 0
    
    cluster = [0] * len(datapoints)
    prev_cluster = [-1] * len(datapoints)
    
    #Randomly Choose Centers for the Clusters
    cluster_centers = []
    for i in range(0,k):
        new_cluster = []
        #for i in range(0,d):
        #    new_cluster += [random.randint(0,10)]
        cluster_centers += [random.choice(datapoints)]
        
        
        #Sometimes The Random points are chosen poorly and so there ends up being empty clusters
        #In this particular implementation we want to force K exact clusters.
        #To take this feature off, simply take away "force_recalculation" from the while conditional.
        force_recalculation = False
    
    while (cluster != prev_cluster) and (i < Max_Iterations) :
        
        prev_cluster = list(cluster)
        force_recalculation = False
        i += 1
        #print(i)
        #Update Point's Cluster Alligiance
        for p in range(0,len(datapoints)):
            min_dist = float("inf")
            
            #Check min_distance against all centers
            for c in range(0,len(cluster_centers)):
                
                dist = fordist(metric,datapoints[p],cluster_centers[c])
                
                if (dist < min_dist):
                    min_dist = dist  
                    cluster[p] = c   # Reassign Point to new Cluster
        
        
        #Update Cluster's Position
        for k in range(0,len(cluster_centers)):
            new_center = [0] * d
            members = 0
            for p in range(0,len(datapoints)):
                if (cluster[p] == k): #If this point belongs to the cluster
                    for j in range(0,d):
                        new_center[j] += datapoints[p][j]
                    members += 1
            
            for j in range(0,d):
                if members != 0:
                    new_center[j] = new_center[j] / float(members) 
                
                #This means that our initial random assignment was poorly chosen
                #Change it to a new datapoint to actually force k clusters
                else: 
                    new_center = random.choice(datapoints)
                    force_recalculation = True
                    #print ("Forced Recalculation...")
                    
            
            cluster_centers[k] = new_center
    
        
    #print ("======== Results ========")
    #print ("Clusters", cluster_centers)
    print ("Iterations",i)
    #print ("Assignments", cluster)
    
    return cluster_centers,cluster

In [245]:
filename = '/Users/r17935avinash/Downloads/AirQualityUCI.csv'
data = pd.read_csv(filename)
data= data[['CO_GT', 'PT08_S1_CO', 'NMHC_GT', 'C6H6_GT',
       'PT08_S2_NMHC', 'Nox_GT', 'PT08_S3_Nox', 'NO2_GT', 'PT08_S4_NO2',
       'PT08_S5_O3', 'T', 'RH', 'AH']]
data = data.values

In [246]:
cluster_centers,cluster = kmeans(20,data,"euclidean")
rmse_loss = cost_function(cluster_centers,cluster,data)
print("The RMSE Loss is",format(rmse_loss))

Iterations 30
9357
MSE LOSS : 260.79738404066404
The RMSE Loss is 0.1669487995830879


In [247]:
cluster_centers,cluster = kmeans(20,data,"manhattan")
rmse_loss = cost_function(cluster_centers,cluster,data)
print("The RMSE Loss is",format(rmse_loss))

Iterations 30
9357
MSE LOSS : 264.6138805451314
The RMSE Loss is 0.16816592331619706


In [249]:
cluster_centers,cluster = kmeans(20,data,"chebyshev")
rmse_loss = cost_function(cluster_centers,cluster,data)
print("The RMSE Loss is",format(rmse_loss))

Iterations 30
9357
MSE LOSS : 473.63091078434445
The RMSE Loss is 0.22498403652639778


In [250]:
cluster_centers,cluster = kmeans(20,data,"pearson")
rmse_loss = cost_function(cluster_centers,cluster,data)
print("The RMSE Loss is",format(rmse_loss))

  


Iterations 30
9357
MSE LOSS : 300.35335101651754
The RMSE Loss is 0.1791628350755872


In [252]:
cluster_centers,cluster = kmeans(20,data,"spearman")
rmse_loss = cost_function(cluster_centers,cluster,data)
print("The RMSE Loss is",format(rmse_loss))

  c /= stddev[:, None]
  c /= stddev[None, :]
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Iterations 26
9357
MSE LOSS : 399.8957677214262
The RMSE Loss is 0.2067307542628519
