## Author: Adam Kim
## Date:   9/24/2017
## Title:  CSI 4352, Assignment 3, Clustering by K-Means

This program was completed for CSI 4352 (Introduction to Data Mining).

Initialize constants.

In [None]:
import os        as myOS    # get current directory
import csv       as myCSV   # handle initial file input
# Number of time-point columns (dimensions) read for every gene row.
DIMENSION_COUNT = 12
# Number of consecutive gene rows averaged into each initial centroid.
CLUSTER_SIZE = 50

This function finds the current cluster of a given gene.<br>
The clusters data structure is a map from clusterID to a gene list.<br>
A helper function is therefore needed to find the cluster of which a particular gene is a member.<br>

In [None]:
def fetchCurrentOwner(member,clusters):
    """Return the ID of the cluster whose gene list contains ``member``.

    Args:
        member: Gene identifier to look up.
        clusters: Mapping of cluster ID -> list of gene identifiers.

    Returns:
        The key of the first cluster (in the mapping's iteration order)
        whose gene list contains ``member``.

    Raises:
        ValueError: If ``member`` is not in any cluster's gene list.
    """
    # Walk the mapping's actual keys instead of assuming they are exactly
    # 0..len(clusters)-1; a gap in the IDs would otherwise raise KeyError
    # before the owning cluster is reached.
    for cluster_id, genes_in_cluster in clusters.items():
        if member in genes_in_cluster:
            return cluster_id

    # Raised explicitly rather than asserted so the check still runs when
    # Python is started with -O (which strips assert statements).
    raise ValueError("gene %r is not a member of any cluster" % (member,))

This function handles building the initial centroid list.<br>
This function creates 10 centroids, with each centroid having 12 dimensions. <br>
For column c in each centroid, this represents the average of 50 gene times in that column.<br>

This function will return 2 objects:<br>
An array of centroids will be returned for later usage.<br>
A map of geneId's to associated times will be returned for later usage.<br>

In [None]:
def initalize_centroids(filename, dimension_count=None, cluster_size=None):
    """Read the gene table and build the initial centroids.

    The file is parsed as tab-delimited text. Every non-empty cell of a row
    is converted to ``float``; the row's 0-based position in the file is used
    as the gene ID. Rows that contain no non-empty cell get no entry.

    Genes are then walked in file order and grouped into consecutive runs of
    ``cluster_size``. Each complete run yields one centroid: the per-dimension
    mean of the run, rounded to 2 decimal places. A trailing run shorter than
    ``cluster_size`` yields no centroid.

    Args:
        filename: Path of the tab-delimited input file.
        dimension_count: Number of leading values of each gene used for the
            centroids. Defaults to the module constant ``DIMENSION_COUNT``.
        cluster_size: Number of consecutive genes averaged per centroid.
            Defaults to the module constant ``CLUSTER_SIZE``.

    Returns:
        A tuple ``(centroids, geneToTimeList)`` where ``centroids`` is a list
        of lists of rounded means and ``geneToTimeList`` maps gene ID (row
        number) to that row's list of floats.

    Raises:
        ValueError: If ``cluster_size`` is smaller than 1, or a cell cannot
            be parsed as a float.
        IndexError: If a gene has fewer than ``dimension_count`` values.
    """
    if dimension_count is None:
        dimension_count = DIMENSION_COUNT
    if cluster_size is None:
        cluster_size = CLUSTER_SIZE
    if cluster_size < 1:
        raise ValueError("cluster_size must be at least 1")

    # geneToTimeList maps gene (row number) -> [t1, ..., tN]
    geneToTimeList = {}

    # The context manager closes the input file even if parsing fails.
    with open(filename, newline='') as handle:
        reader = myCSV.reader(handle, delimiter='\t', quotechar='"')
        for row_number, row in enumerate(reader):
            times = [float(cell) for cell in row if cell]
            if times:
                geneToTimeList[row_number] = times

    centroids = []
    running_sums = [0.0] * dimension_count

    # Count genes by position rather than looking them up by row number, so
    # a skipped (blank) row in the file does not cause a KeyError here.
    for position, timeList in enumerate(geneToTimeList.values()):

        for dim in range(dimension_count):
            running_sums[dim] += timeList[dim]

        # A run is complete on its cluster_size-th gene.
        if position % cluster_size == cluster_size - 1:
            # To match output with hint given in announcement, intermediate
            # calculations were rounded to 2 dec places.
            centroids.append(
                [round(total / float(cluster_size), 2) for total in running_sums]
            )
            running_sums = [0.0] * dimension_count

    return (centroids, geneToTimeList)



			

Given the centroid list, calculate the clusters.
<br>Using Manhattan distance, every gene is assigned to the cluster whose centroid is closest to that gene.
<br>Return the clusters map, which maps each clusterID (the same as its centroidID) to the list of genes associated with that cluster.

In [None]:
def initalize_clusters(myCentroids, geneToTimeList):
    """Assign every gene to the cluster of its nearest centroid.

    Distance is the Manhattan distance (sum of absolute per-dimension
    differences) between the gene's values and the centroid. On ties the
    centroid with the lowest index wins.

    Args:
        myCentroids: List of centroids; each centroid is a list of numbers.
        geneToTimeList: Mapping of gene ID -> list of numbers.

    Returns:
        A dict mapping cluster ID (the centroid's index in ``myCentroids``)
        to the list of gene IDs assigned to it. Every centroid index is
        present; a centroid that attracted no gene maps to an empty list.

    Raises:
        ValueError: If ``myCentroids`` is empty while genes are present.
        IndexError: If a gene has more values than a centroid.
    """
    myClusters = {}

    for gene, timeList in geneToTimeList.items():

        # One distance per centroid, in centroid order.
        distToCentroid = [
            sum(abs(centroid[dim] - value) for dim, value in enumerate(timeList))
            for centroid in myCentroids
        ]

        # list.index returns the first minimum, i.e. the lowest centroid ID.
        winner_index = distToCentroid.index(min(distToCentroid))
        myClusters.setdefault(winner_index, []).append(gene)

    # Give centroids that won no gene an (empty) entry, so code that indexes
    # the map by every centroid ID does not hit a KeyError. Done after the
    # assignment loop so the key order of non-empty clusters is unchanged.
    for cluster_id in range(len(myCentroids)):
        myClusters.setdefault(cluster_id, [])

    return myClusters

myCentroids is list, indexed 0-9, containing [u1,u2...,u12] time dim means
<br>geneTimeMap maps genes to assoc time lists, indexed 0-499 -> [t1...t12]
<br>myClusters maps index, 0-9, to genes, 0 -> g1,g3,g55,etc assoc with cluster
<br>This function recalculates centroids.
<br>This is done by calculating means for each of 12 different times of genes within the associated cluster.
<br>Once complete, this function updates centroids for genes within its cluster.

In [None]:
def recalculate_centroids(myCentroids, geneTimeMap, myClusters):
    """Replace each centroid with the mean of the genes in its cluster.

    For every centroid index, the values of all genes listed under that
    index in ``myClusters`` are averaged per dimension and rounded to 2
    decimal places. ``myCentroids`` is updated in place.

    A cluster that is empty (or has no entry in ``myClusters``) has no mean;
    its centroid is left unchanged instead of dividing by zero.

    Args:
        myCentroids: List of centroids (lists of numbers); modified in place.
            Each new centroid has as many dimensions as the one it replaces.
        geneTimeMap: Mapping of gene ID -> list of numbers.
        myClusters: Mapping of cluster ID (centroid index) -> list of gene IDs.

    Returns:
        None.

    Raises:
        KeyError: If a clustered gene is missing from ``geneTimeMap``.
        IndexError: If a gene has more values than its cluster's centroid.
    """
    for cluster_id, old_centroid in enumerate(myCentroids):

        genesInCluster = myClusters.get(cluster_id)

        if not genesInCluster:
            # Nothing to average: keep the previous centroid.
            continue

        # Per-dimension sums over all genes of this cluster.
        totals = [0] * len(old_centroid)

        for gene in genesInCluster:
            for dim, value in enumerate(geneTimeMap[gene]):
                totals[dim] += value

        clusterSize = float(len(genesInCluster))

        # To match output with hint given in announcement,
        # intermediate calculations were rounded to 2 dec places.
        myCentroids[cluster_id] = [round(total / clusterSize, 2) for total in totals]

This function recalculates clusters AFTER centroids are updated
<br>Each gene is assigned to cluster whose centroid it is closest to via the manhattan distance
<br>This function returns a boolean which is FALSE if NO gene is assigned to a different cluster after centroids are updated, or is TRUE if a gene is moved to a different cluster after all centroids are updated.

In [None]:
def recalculate_clusters(myCentroids, geneTimeMap, myClusters):
    """Move every gene to the cluster of its nearest centroid.

    Distance is the Manhattan distance between the gene's values and each
    centroid; on ties the centroid with the lowest index wins. A gene whose
    nearest centroid differs from its current cluster is removed from the
    old cluster's list and appended to the new one. ``myClusters`` is
    modified in place.

    Args:
        myCentroids: List of centroids; each centroid is a list of numbers.
        geneTimeMap: Mapping of gene ID -> list of numbers.
        myClusters: Mapping of cluster ID (centroid index) -> list of gene
            IDs; modified in place. A winning cluster with no entry yet is
            created on demand.

    Returns:
        True if at least one gene changed cluster, otherwise False.

    Raises:
        ValueError: If a gene and a centroid have different numbers of
            dimensions, or a gene in ``geneTimeMap`` is in no cluster.
    """
    # Build the gene -> owning-cluster index once instead of scanning every
    # cluster list for every gene. Each gene is looked up exactly once and
    # only moves at that moment, so the index never goes stale before use.
    owner_of = {}
    for cluster_id, genes in myClusters.items():
        for gene in genes:
            owner_of.setdefault(gene, cluster_id)

    changeDetected = False

    for gene, timeList in geneTimeMap.items():

        distToCentroid = []

        for currCentroid in myCentroids:

            if len(timeList) != len(currCentroid):
                raise ValueError(
                    "gene %r has %d dimensions but centroid has %d"
                    % (gene, len(timeList), len(currCentroid))
                )

            distToCentroid.append(
                sum(abs(c - t) for c, t in zip(currCentroid, timeList))
            )

        # list.index returns the first minimum, i.e. the lowest centroid ID.
        winner_index = distToCentroid.index(min(distToCentroid))

        if gene not in owner_of:
            raise ValueError("gene %r is not a member of any cluster" % (gene,))

        current_index = owner_of[gene]

        if winner_index != current_index:
            changeDetected = True

            myClusters[current_index].remove(gene)
            # setdefault: the winning cluster may not have an entry yet.
            myClusters.setdefault(winner_index, []).append(gene)
            owner_of[gene] = winner_index

    return changeDetected

Driver

In [None]:
def main():
    """Run the clustering and report the final clusters.

    Builds the initial centroids and clusters from the input file, then
    alternates centroid and cluster recalculation until a pass moves no
    gene. Each final cluster is printed and written to
    ``output-AdamKim.txt`` as ``<size> : {<sorted gene IDs>}``, ordered by
    ascending cluster size.
    """
    #(1)  For this assignment, initialize centroid means by row number
    #(2)  Compute the mean point of the objects in each cluster as a centroids
    # NOTE(review): get_filename is not defined in the visible part of this
    # file - confirm it is provided elsewhere.
    myCentroids, geneToTimes = initalize_centroids( get_filename() )

    #(3)  Assign each object to the nearest centroid and generate k new clusters
    myClusters = initalize_clusters(myCentroids, geneToTimes)

    #(4)  Repeat (2) and (3), until NO change of the objects in EACH cluster
    changeFound = True
    while changeFound:
        recalculate_centroids(myCentroids, geneToTimes, myClusters)
        changeFound = recalculate_clusters(myCentroids,geneToTimes,myClusters)

    # These following lines ensure that output follows exact specification
    #   in assignment instructions:
    #   'In an output file, show each cluster at each line
    #       starting with the size of the cluster,
    #   'for example, "6: {1, 24, 56, 139, 285, 471}".'
    # sorted() is stable, so clusters of equal size keep the map's order.
    finalResults = sorted(
        ([len(genes), sorted(genes)] for genes in myClusters.values()),
        key=lambda entry: entry[0],
    )

    # The context manager closes the output file even if a write fails.
    with open(myOS.path.abspath("output-AdamKim.txt"),'w') as outFile:
        for size, genes in finalResults:
            # Render the Python list with braces, e.g. "3 : {1, 4, 9}".
            sentence = (str(size)+' : ' + str(genes)).replace('[','{').replace(']','}')
            print('')
            print(sentence)
            outFile.write('\n' + sentence + '\n')
            print('')

################################################################################

# Run the clustering driver only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()