In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext
import numpy as np

## Spark Session

In [2]:
spark = SparkSession.builder.appName('clustering').getOrCreate()
sc=spark.sparkContext

## Read Data

In [4]:
rdd= sc.textFile('../../Dataset/Google-Playstore-RDD.csv')
header= rdd.first()
rdd= rdd.filter(lambda x: x!=header)
rdd.take(1)

['Gakondo,com.ishakwe.gakondo,Adventure,0.0,0.0,10+,10,15,True,0.0,USD,10M,7.1 and up,Jean Confident Irénée NIYIZIBYOSE,https://beniyizibyose.tk/#/,jean21101999@gmail.com,Feb 26  2020,Feb 26  2020,Everyone,https://beniyizibyose.tk/projects/,False,False,False,2021-06-15 20:19:35']

# K-mean Clustering

## Features 
- Rating, Maximum Installs, Size

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.cluster import KMeans

In [6]:
k=4

In [7]:
def compute_closest_centroid(x, y, z,centroids):
    features = [x, y, z]

    # Calculate the minimum distance between each point and each centroid
    distances = []
    for centroid in centroids:
        distance = 0
        for i in range(len(features)):
            distance += (features[i] - centroid[i]) ** 2
        distances.append(distance ** 0.5)

    # Return the index with the smallest distance
    return distances.index(min(distances))


centroids= [[0.00e+00, 2.21e+02, 1.70e-06],\
             [4.1000e+00, 3.6002e+04, 8.8000e-06],\
             [2.6000e+00, 1.8539e+04, 1.1000e+01]]

### Size column

In [8]:
def convert_to_bytes(size_str):
    suffixes = {'G': 1000000000, 'M': 1000000, 'k': 1000}
    try:
        for suffix, multiplier in suffixes.items():
            if suffix in size_str:
                return float(size_str[:-1]) * multiplier
    except:
        return 0.0

## Apply Map-Reduce


In [9]:
'''
map => compute distance between each point and the centroids 
produce  key- value pair <index of mean with min- distance, value of the features>
reduce => compute the new means for each cluster

'''

'\nmap => compute distance between each point and the centroids \nproduce  key- value pair <index of mean with min- distance, value of the features>\nreduce => compute the new means for each cluster\n\n'

In [10]:
new_centroids =[]
M=20
i=0

# key: index of the mean with min- distance
# value: (Rating, Maximum Installs, Size) 
while i < M:
    i+=1
    
    final_result= rdd.filter(lambda x: x.split(',')[11]!='Varies with device' and x.split(',')[11]!='' and x.split(',')[7]!='' and x.split(',')[3]!='')\
            .map(lambda x: (float(x.split(',')[3]),int(x.split(',')[7]),convert_to_bytes(x.split(',')[11])) )\
            .map(lambda x:(compute_closest_centroid(x[0],x[1],x[2],centroids),(x[0],x[1],x[2])))\
            .mapValues(lambda x: (x[0],x[1],x[2],1,1,1))\
            .reduceByKey(lambda x,y: (x[0]+y[0],x[1]+y[1],x[2]+y[2],x[3]+1,x[4]+1,x[5]+1))\
            .mapValues(lambda x: (x[0]/x[3],x[1]/x[4],x[2]/x[5]))
            
    new_centroids= [item[1] for item in np.array(final_result.collect())]
    print("New: ",new_centroids)
    print("Old: ", centroids)
    if centroids != new_centroids :
        centroids = new_centroids
    else:
        break

  new_centroids= [item[1] for item in np.array(final_result.collect())]


New:  [(29.54839038020183, 16029.299091906996, 125176422.39896218), (78.19141468296509, 30453273.7142755, 442015875.7380799), (48.61199385793295, 123733.78833007299, 641534556.6248081)]
Old:  [[0.0, 221.0, 1.7e-06], [4.1, 36002.0, 8.8e-06], [2.6, 18539.0, 11.0]]
New:  [(43.77262534282059, 3853324.37005037, 383060194.0102533), (29.491891891891886, 61950299.567567565, 3359918918.918919), (17.770833333333332, 19726987.625, 3776833333.3333335)]
Old:  [(29.54839038020183, 16029.299091906996, 125176422.39896218), (78.19141468296509, 30453273.7142755, 442015875.7380799), (48.61199385793295, 123733.78833007299, 641534556.6248081)]
New:  [(43.77720368826996, 3877312.3117198492, 384907471.4060414)]
Old:  [(43.77262534282059, 3853324.37005037, 383060194.0102533), (29.491891891891886, 61950299.567567565, 3359918918.918919), (17.770833333333332, 19726987.625, 3776833333.3333335)]
New:  [(43.77720368826996, 3877312.3117198492, 384907471.4060414)]
Old:  [(43.77720368826996, 3877312.3117198492, 384907

In [20]:
clusters= final_result.mapValues(lambda x: (round(x[0],2),int(x[1]),round(x[2]/1000000,2)))\
                    .groupByKey().mapValues(list)

In [21]:
collected_clusters=clusters.collect()
print(collected_clusters)

[(0, [(43.77720368826996, 3877312.3117198492, 384907471.4060414)])]


In [None]:
collected_clusters = np.array(collected_clusters)

In [14]:
clusters_it=  [item[1] for item in collected_clusters]
print(clusters_it[0])


[(43.78, 3877312, 384.91)]


In [15]:
actual_clusters=[]
for i in clusters_it:
   actual_clusters.append([x[0] for x in i])