Content:
    1. Simple K-Means Cluster on 2D data
    2. General K-Means Cluster that works with data of any dimension
    3. Simple Image Compression (don't use a large image file)
    4. Optimized Image Compression

In [1]:
import random
import math
from tqdm import tnrange, tqdm_notebook
from matplotlib import pyplot
from pprint import pprint
from functools import lru_cache
def euclidian_distance(x1,y1,x2,y2):
    """Return the straight-line distance between the 2D points (x1, y1) and (x2, y2)."""
    delta_x = x1 - x2
    delta_y = y1 - y2
    squared_length = delta_y**2 + delta_x**2
    return math.sqrt(squared_length)

def vector_addiction(vector1, vector2):
    """Add two vectors element by element.

    Pairs are formed with ``zip``, so the result is as long as the shorter input.
    """
    totals = []
    for left, right in zip(vector1, vector2):
        totals.append(left + right)
    return totals

def scalar_multiplication(vector, scalar):
    """Return a new list holding every component of ``vector`` multiplied by ``scalar``."""
    scaled = []
    for component in vector:
        scaled.append(scalar * component)
    return scaled

def index_where_equals(vector, value):
    """Return, in ascending order, every index at which ``vector`` holds ``value``."""
    matches = []
    for position, item in enumerate(vector):
        if item == value:
            matches.append(position)
    return matches

def random_list(no,minimum=0,maximum=100):
    """Return ``no`` random integers, each drawn from [minimum, maximum] inclusive."""
    values = []
    for _ in range(no):
        values.append(random.randint(minimum, maximum))
    return values

def general_euclidian_distance(vector1, vector2):
    """Return the Euclidean distance between two vectors of any dimension.

    Components are paired with ``zip``, so extra components of the longer
    vector are ignored.
    """
    squared_total = sum((a - b)**2 for a, b in zip(vector1, vector2))
    return math.sqrt(squared_total)

def get_column(matrix, column):
    """Return the entry at position ``column`` of every row in ``matrix``, as a list."""
    return [row[column] for row in matrix]

# 1. Simple K-Means Cluster on 2D data

In [None]:
class KMeans:
    """Minimal K-Means clustering for 2D points supplied as separate x and y lists.

    Attributes set while fitting:
        x, y: the coordinate lists passed to ``fit`` / ``slow_fit``.
        cluster_center: list of ``[x, y]`` centres, one per cluster.
        cluster_assignment: for every point, the index of its nearest centre.
    """

    # Plot styling per cluster index; reused cyclically when there are more
    # clusters than entries so that plotting never raises IndexError.
    _COLORS = ['r', 'b', 'g', 'k', 'm']
    _MARKERS = ['P','v','*','s','X']

    # Prefix for the figures written by ``visualize``; override it on the class
    # or on an instance to save somewhere else.
    output_dir = '/Users/ankushchoubey/Desktop/'

    def __init__(self,clusters = 2):
        """Store the number of clusters to look for."""
        self.clusters = clusters

    def _random_centers(self):
        """Draw one random integer centre per cluster inside the data's bounding box.

        ``random.randint`` is used, so the coordinates must be integers.
        """
        x_min, x_max = min(self.x), max(self.x)
        # The y range comes from the y data (it was previously taken from x).
        y_min, y_max = min(self.y), max(self.y)
        return [
            [random.randint(x_min, x_max), random.randint(y_min, y_max)]
            for _ in range(self.clusters)
        ]

    def fit(self ,x,y, iterations=5):
        """Cluster the points (x[i], y[i]).

        Starts from random centres, then alternates assignment and relocation
        ``iterations`` times, saving a figure after every step through
        ``visualize``. A final assignment leaves ``cluster_assignment``
        consistent with the final centres.
        """
        self.x = x
        self.y = y
        self.cluster_center = self._random_centers()

        for i in range(iterations):
            self.assign_cluster()
            self.visualize(str(i)+'assign')
            self.relocate_clusters()
            self.visualize(str(i)+'relocate')

        self.assign_cluster()

    def slow_fit(self, x,y):
        """Store the data and pick random initial centres, without iterating."""
        self.x = x
        self.y = y
        self.cluster_center = self._random_centers()

    def nearest_cluster(self,x,y):
        """Return the index of the centre closest to (x, y).

        Ties go to the lowest index; returns None when there are no centres.
        """
        min_cluster = None
        min_distance = None

        for index, (cluster_x, cluster_y) in enumerate(self.cluster_center):
            distance = euclidian_distance(cluster_x, cluster_y, x, y)
            # Compare against None explicitly: a distance of exactly 0 is falsy
            # and must not be mistaken for "no distance recorded yet".
            if min_distance is None or distance < min_distance:
                min_distance = distance
                min_cluster = index

        return min_cluster

    def assign_cluster(self):
        """Recompute and return the nearest-centre index of every point."""
        self.cluster_assignment = [
            self.nearest_cluster(px, py) for px, py in zip(self.x, self.y)
        ]
        return self.cluster_assignment

    def relocate_clusters(self):
        """Move every non-empty cluster's centre to the mean of its points.

        Centres of clusters that currently own no point are left untouched.
        Returns the updated list of centres.
        """
        sums = {}
        counts = {}

        # Use the fitted data, not whatever globals named x and y happen to exist.
        for cluster_no, px, py in zip(self.cluster_assignment, self.x, self.y):
            sums[cluster_no] = vector_addiction(sums.get(cluster_no, [0, 0]), [px, py])
            counts[cluster_no] = counts.get(cluster_no, 0) + 1

        for cluster_no, total in sums.items():
            self.cluster_center[cluster_no] = scalar_multiplication(total, 1/counts[cluster_no])

        return self.cluster_center

    def visualize(self, save_name = None):
        """Scatter-plot the points coloured by cluster, plus each centre.

        When ``save_name`` is truthy it is used as the plot title and the
        figure is saved as ``output_dir + save_name + '.png'``.
        """
        graph = pyplot
        graph.clf()

        for index, (center_x, center_y) in enumerate(self.cluster_center):
            members = index_where_equals(self.cluster_assignment, index)
            color = self._COLORS[index % len(self._COLORS)]
            marker = self._MARKERS[index % len(self._MARKERS)]

            graph.scatter([self.x[p] for p in members], [self.y[p] for p in members], c=color)
            graph.scatter([center_x], [center_y], c=color, marker=marker)

        if save_name:
            # Call pyplot.title; assigning to it would replace the function.
            graph.title(save_name)
            graph.savefig(self.output_dir + str(save_name) + '.png')

In [None]:
# Toy 2D data set: 10 points with coordinates in [0, 40], 10 points with
# coordinates in [80, 100], and 20 points spread over [0, 1000].
no_of_points = 10
x = random_list(no_of_points,0,40) + random_list(no_of_points,80,100) + random_list(no_of_points+10,0,1000)
y = random_list(no_of_points,0,40) + random_list(no_of_points,80,100) + random_list(no_of_points+10,0,1000)


print(x)
print(y)
# Look for 3 clusters over 10 iterations; fit() also saves a figure after
# every assignment and relocation step.
engine = KMeans(3)
engine.fit(x,y, 10)

# Render figures inline, then draw the final clustering (nothing is saved
# because no save_name is passed).
%matplotlib inline
engine.visualize()

# 2. General K-Means Cluster that works with data of any dimension

In [2]:
class General_KMeans:
    """K-Means clustering for data of any dimension.

    Data layout: ``X`` is a list with one inner list per dimension, i.e.
    ``X[d][p]`` is coordinate ``d`` of point ``p``.

    Attributes set while fitting:
        min_values, max_values: per-dimension minimum and maximum.
        no_of_dimension: number of dimensions (``len(X)``).
        cluster_center: list of centres, one coordinate list per cluster.
        cluster_assignment: for every point, the index of its nearest centre.

    ``fit`` deletes ``self.X`` and ``self.transpose`` when it finishes, so
    ``assign_cluster``, ``relocate_clusters`` and ``visualize`` only work
    while ``fit`` is running.
    """

    # Plot styling per cluster index; reused cyclically when there are more
    # clusters than entries so that plotting never raises IndexError.
    _COLORS = ['r', 'b', 'g', 'k', 'm']
    _MARKERS = ['P','v','*','s','X']

    # Prefix for the figures written by ``visualize``; override it on the class
    # or on an instance to save somewhere else.
    output_dir = '/Users/ankushchoubey/Desktop/'

    def __init__(self,clusters = 2):
        """Store the number of clusters to look for."""
        self.clusters = clusters

    def initial_cluster_center(self, k):
        """Return ``k`` random centres inside the per-dimension [min, max] box.

        ``random.randint`` is used, so the bounds must be integers.
        """
        return [
            [random.randint(low, high) for low, high in zip(self.min_values, self.max_values)]
            for _ in range(k)
        ]

    def fit(self,X,iterations=5):
        """Cluster the points described by ``X`` (one list per dimension).

        Alternates assignment and relocation ``iterations`` times, followed by
        a final assignment. Intermediate figures are saved only for 2D data.
        """
        self.X = X

        # Point-major view of the data: one coordinate list per point.
        self.transpose = [self.get_column(X,i) for i in range(len(X[0]))]

        self.min_values = [min(column) for column in X]
        self.max_values = [max(column) for column in X]

        print('min/max',self.min_values, self.max_values)
        self.no_of_dimension = len(X)
        can_visualise = self.no_of_dimension == 2
        self.cluster_center = self.initial_cluster_center(self.clusters)

        print('no of dimensions',self.no_of_dimension)

        for i in tqdm_notebook(range(iterations)):
            self.assign_cluster()

            if can_visualise:
                self.visualize(str(i)+'assign')

            self.relocate_clusters()

            if can_visualise:
                self.visualize(str(i)+'relocate')

        self.assign_cluster()
        if not can_visualise:
            print('Can only visualise 2d data')

        # Drop the references to the training data once the centres are known.
        del self.X
        del self.transpose

    def slow_fit(self, x,y):
        """Store 2D data given as separate x/y lists and pick random centres.

        Unlike ``fit`` this works on ``x``/``y`` rather than ``X`` and only
        initialises the centres; no iteration is performed.
        """
        self.x = x
        self.y = y

        x_min, x_max = min(x), max(x)
        # The y range comes from the y data (it was previously taken from x).
        y_min, y_max = min(y), max(y)

        self.cluster_center = [
            [random.randint(x_min, x_max), random.randint(y_min, y_max)]
            for _ in range(self.clusters)
        ]

    def nearest_cluster(self,point):
        """Return the index of the centre closest to ``point``.

        Ties go to the lowest index; returns None when there are no centres.
        """
        min_cluster = None
        min_distance = None

        for index, center in enumerate(self.cluster_center):
            distance = general_euclidian_distance(center, point)
            # Compare against None explicitly: a distance of exactly 0 is falsy
            # and must not be mistaken for "no distance recorded yet".
            if min_distance is None or distance < min_distance:
                min_distance = distance
                min_cluster = index

        return min_cluster

    def get_column(self,matrix, column):
        """Return the entry at position ``column`` of every row in ``matrix``."""
        return [row[column] for row in matrix]

    def assign_cluster(self):
        """Recompute and return the nearest-centre index of every point."""
        # self.transpose already holds every point, so there is no need to
        # rebuild each one from the per-dimension columns.
        self.cluster_assignment = [
            self.nearest_cluster(point)
            for point in tqdm_notebook(self.transpose, desc='assignming cluster centers')
        ]
        return self.cluster_assignment

    def relocate_clusters(self):
        """Move every non-empty cluster's centre to the mean of its points.

        Centres of clusters that currently own no point are left untouched.
        Returns the updated list of centres.
        """
        sums = {}
        counts = {}

        pairs = zip(self.cluster_assignment, self.transpose)
        # zip() has no length, so tell the progress bar how many items to expect.
        for cluster_no, point in tqdm_notebook(pairs, desc='relocating cluters',
                                               total=len(self.cluster_assignment)):
            if cluster_no not in sums:
                sums[cluster_no] = [0 for _ in range(self.no_of_dimension)]
                counts[cluster_no] = 0
            sums[cluster_no] = vector_addiction(sums[cluster_no], point)
            counts[cluster_no] += 1

        for cluster_no, total in sums.items():
            self.cluster_center[cluster_no] = scalar_multiplication(total, 1/counts[cluster_no])

        return self.cluster_center

    def visualize(self, save_name = None):
        """Scatter-plot 2D data coloured by cluster, plus each centre.

        Does nothing but print a notice for data that is not 2D. When
        ``save_name`` is truthy it is used as the plot title and the figure is
        saved as ``output_dir + save_name + '.png'``.
        """
        if self.no_of_dimension !=2:
            print('Can only visualise 2')
            return

        graph = pyplot
        graph.clf()

        for index, center in enumerate(self.cluster_center):
            members = index_where_equals(self.cluster_assignment, index)
            color = self._COLORS[index % len(self._COLORS)]
            marker = self._MARKERS[index % len(self._MARKERS)]

            graph.scatter([self.X[0][p] for p in members], [self.X[1][p] for p in members], c=color)
            graph.scatter([center[0]], [center[1]], c=color, marker=marker)

        if save_name:
            # Call pyplot.title; assigning to it would replace the function.
            graph.title(save_name)
            graph.savefig(self.output_dir + str(save_name) + '.png')

In [None]:
# check it step by step
# Reuses the x and y lists built for the 2D example above, passed as one list
# per dimension, and looks for 5 clusters with the default number of iterations.
engine_general = General_KMeans(5)

engine_general.fit([x,y])
# Bare expression in the middle of a cell: it is evaluated but not displayed.
engine_general.cluster_assignment
pprint(engine_general.cluster_center)
# Index of the cluster centre closest to the point (1, 2).
print(engine_general.nearest_cluster([1,2]))

# 3. Simple Image Compression (don't use a large image file)

In [None]:
# Palette size for the compressed image.
# NOTE(review): no later cell shown here reads this variable (compress() is
# called with a literal 16) — confirm whether it is still needed.
no_of_colors = 16

In [3]:
from PIL import Image

In [4]:
# Input images, opened from hard-coded absolute paths; adjust these before
# running the notebook on another machine.
image_large = Image.open('/Users/ankushchoubey/Downloads/large.jpg')
#image.show()
image_small = Image.open('/Users/ankushchoubey/Downloads/image.jpg')

In [37]:
def compress(im, colors, iterations):
    """Return a copy of ``im`` reduced to at most ``colors`` colours.

    The pixel values are clustered with ``General_KMeans`` and every pixel is
    replaced by its nearest cluster centre, with each channel truncated to an
    int.

    Args:
        im: image object providing ``getdata()``, ``mode`` and ``size``; every
            pixel must unpack into exactly three channel values.
        colors: number of clusters, i.e. the size of the resulting palette.
        iterations: number of K-Means iterations to run.

    Returns:
        A new image with the same mode and size as ``im``.
    """
    # Materialise the pixel data once and reuse it for both passes.
    pixels = list(im.getdata())

    x_l,y_l,z_l = [],[],[]
    for x,y,z in tqdm_notebook(pixels, desc='getting image data'):
        x_l.append(x)
        y_l.append(y)
        z_l.append(z)

    engine = General_KMeans(colors)
    engine.fit([x_l,y_l,z_l],iterations=iterations)

    # Identical pixel values always map to the same centre, so look each
    # distinct colour up only once.
    replacement = {}
    new_pixel_values = []
    for pixel in tqdm_notebook(pixels):
        if pixel not in replacement:
            nearest_cluster = engine.nearest_cluster(list(pixel))
            cluster_point = engine.cluster_center[nearest_cluster]
            replacement[pixel] = tuple(int(channel) for channel in cluster_point)
        new_pixel_values.append(replacement[pixel])

    im2 = Image.new(im.mode, im.size)
    im2.putdata(new_pixel_values)
    return im2

In [38]:
# Reduce the small image to 16 colours using 5 K-Means iterations.
a = compress(image_small,16,5)


min/max [0, 0, 0] [255, 255, 255]
no of dimensions 3


KeyboardInterrupt: 

In [None]:
# Preview the compressed image, then write it to a hard-coded absolute path.
a.show()
a.save('/Users/ankushchoubey/Downloads/compressed.jpg')

# 4. Optimized Image Compression

In [33]:
def min_max_scaling(x, x_min, x_max,new_x_min,new_x_max):
    """Linearly map ``x`` from the range [x_min, x_max] onto [new_x_min, new_x_max].

    Raises ZeroDivisionError when ``x_max`` equals ``x_min``.
    """
    offset = x - x_min
    target_span = new_x_max - new_x_min
    source_span = x_max - x_min
    return offset * target_span / source_span + new_x_min
@lru_cache(maxsize=256)
def scale8_bit(no):
    """Return ``no * 6 / 256 * 36`` truncated to an int (memoised for up to 256 inputs).

    For inputs 0..255 the result lies in 0..215.
    """
    sixths = no * 6 / 256
    return int(sixths * 36)

In [55]:
class ImageCompress:
    """Colour quantisation that trains K-Means on a frequency-weighted sample.

    Instead of feeding every pixel to K-Means, ``fit`` takes the image's
    distinct colours, scales their occurrence counts down by ``quality`` and
    trains on that smaller, weighted set.
    """

    # Both caches are keyed on (self, args); fit() clears them before and after use.
    @lru_cache()
    def min_max_scaling(self,x):
        """Map an occurrence count onto the scaled range configured by ``fit``."""
        return (x - self.mini)*self.product_factor + self.range_start

    @lru_cache(maxsize=520800)
    def nearest_color(self, r,g,b):
        """Return the cluster centre nearest to (r, g, b) as a tuple of ints."""
        cluster_at_index = self.kmeans.nearest_cluster([r,g,b])
        cluster_point = self.kmeans.cluster_center[cluster_at_index]
        return tuple([int(i) for i in cluster_point])

    def _configure_scaling(self, counts, quality):
        """Set the attributes that ``min_max_scaling`` reads, from ``counts``.

        Counts are mapped linearly from [min, max] onto [0, max * quality].
        When every count is identical that mapping is undefined (division by
        zero), so each count is mapped to 1 instead: every colour contributes
        exactly one training sample.
        """
        self.mini = min(counts)
        self.maxi = max(counts)

        self.range_start = 0
        self.range_end = self.maxi*quality

        spread = self.maxi - self.mini
        if spread == 0:
            self.product_factor = 0
            self.range_start = 1
        else:
            self.product_factor = (self.range_end-self.range_start)/spread

    def fit(self, image,no_of_colors,iterations, quality=0.1):
        """Return a copy of ``image`` reduced to at most ``no_of_colors`` colours.

        Args:
            image: image object providing ``size``, ``mode``, ``getcolors()``
                and ``getdata()``; colours must unpack into three channels.
            no_of_colors: number of K-Means clusters (palette size).
            iterations: number of K-Means iterations.
            quality: factor applied to the largest occurrence count to get the
                upper end of the scaled counts; smaller values mean fewer
                training samples.

        Returns:
            A new image with the same mode and size as ``image``.

        Raises:
            ValueError: if scaling leaves no training sample at all.
        """
        # A previous run that was interrupted midway leaves entries behind
        # that no longer match the current scaling or model.
        self.min_max_scaling.cache_clear()
        self.nearest_color.cache_clear()

        width, height = image.size
        colors = image.getcolors(width*height)

        no = get_column(colors,0)
        self._configure_scaling(no, quality)

        scaled_version = [
            int(self.min_max_scaling(i))
            for i in tqdm_notebook(no, desc='Scaling Values')
        ]
        self.min_max_scaling.cache_clear()

        x_l,y_l,z_l=[],[],[]
        for i in tqdm_notebook(range(len(scaled_version)), desc='getting image data'):
            repeats = scaled_version[i]
            if repeats <= 0:
                continue

            # Each colour is added as many times as its scaled count.
            x,y,z = colors[i][1]
            x_l.extend([scale8_bit(x)]*repeats)
            y_l.extend([scale8_bit(y)]*repeats)
            z_l.extend([scale8_bit(z)]*repeats)

        if not x_l:
            raise ValueError(
                'no training samples left after scaling the colour counts; '
                'use a larger quality value'
            )

        self.kmeans = General_KMeans(no_of_colors)
        self.kmeans.fit([x_l,y_l,z_l],iterations=iterations)
        del x_l, y_l, z_l

        # NOTE(review): the model is trained on scale8_bit()-scaled channels,
        # but pixels are matched below using their raw channel values and the
        # resulting centres are written back unchanged — confirm this is intended.
        new_pixel_values = [
            self.nearest_color(i[0],i[1],i[2])
            for i in tqdm_notebook(list(image.getdata()))
        ]
        self.nearest_color.cache_clear()

        im2 = Image.new(image.mode, image.size)
        im2.putdata(new_pixel_values)

        return im2

In [56]:
# Reduce the large image to 32 colours with 2 K-Means iterations; quality
# scales down the per-colour sample counts used for training.
compressor = ImageCompress()
compressed_image = compressor.fit(image_large,no_of_colors=32,iterations=2, quality=0.025)





min/max [0, 0, 0] [215, 215, 215]
no of dimensions 3






Can only visualise 2d data





In [None]:
# NOTE(review): `image` is not defined in any cell shown here (only
# image_large and image_small are) — confirm which image was meant.
image.getpalette()

In [57]:
# Write the result to a hard-coded absolute path.
# NOTE(review): compressed_image was produced from image_large, although the
# file name says "small" — confirm the intended name.
compressed_image.save('/Users/ankushchoubey/Downloads/fast_compressed_small.jpg')