# Gene Expression Clustering by K-Means
## Adam Kim

*This program was completed for CSI 4352 (Introduction to Data Mining).*

In [None]:
import os        as myOS    # get current directory
import csv       as myCSV   # handle initial file input
# Number of expression measurements (columns) used per gene.
DIMENSION_COUNT = 12
# Number of consecutive genes averaged together to seed one initial centroid.
CLUSTER_SIZE = 50

*For a given gene, this function returns the index of the cluster that currently contains it.*

In [None]:
def fetchCurrentOwner(member,clusters):
    """Return the index of the cluster that currently contains ``member``.

    Args:
        member: Gene identifier to look up.
        clusters: Either a dict mapping cluster index -> list of gene ids
            (the indices do not have to be contiguous), or a list whose
            positions are the cluster indices.

    Returns:
        The index of the first cluster, in ascending index order, whose
        member list contains ``member``.

    Raises:
        AssertionError: If ``member`` is not present in any cluster.
    """
    if isinstance(clusters, dict):
        # Walk the keys that actually exist: a cluster that never received
        # a gene has no entry, so 0..len-1 is not guaranteed to be valid.
        candidate_indices = sorted(clusters)
    else:
        candidate_indices = range(len(clusters))

    for index in candidate_indices:
        if member in clusters[index]:
            return index

    # Raised explicitly (rather than via ``assert``) so that the check is
    # not stripped when Python runs with -O.
    raise AssertionError(
        'gene ' + repr(member) + ' is not assigned to any cluster'
    )

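*This function reads the gene expression file and creates the initial centroids: one centroid per consecutive group of 50 genes (10 centroids for 500 genes), with each centroid having 12 dimensions.*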

In [None]:
def initalize_centroids(filename):
    """Read gene expression rows and build the initial centroids.

    The file is parsed as tab-delimited text. Each row is one gene; its
    identifier is the zero-based row number in the file. Empty fields are
    skipped, and a row with no non-empty fields is not stored at all.

    One centroid is produced for every complete run of CLUSTER_SIZE stored
    genes (in file order): the per-dimension mean of the first
    DIMENSION_COUNT values of those genes, rounded to 2 decimal places.
    Genes left over after the last complete run do not produce a centroid.

    Args:
        filename: Path of the tab-delimited input file.

    Returns:
        A tuple ``(centroids, geneToTimeList)`` where ``centroids`` is a
        list of lists of DIMENSION_COUNT floats and ``geneToTimeList`` maps
        row number -> list of floats parsed from that row.

    Raises:
        ValueError: If a non-empty field cannot be parsed as a float.
        IndexError: If a stored row has fewer than DIMENSION_COUNT values.
    """
    # geneToTimeList maps genes -> [t1,...,t12]
    geneToTimeList = {}

    # The with-block closes the file even if parsing a value fails.
    with open(filename, newline='') as handle:

        csvHandle = myCSV.reader(handle, delimiter='\t', quotechar='"')

        for rowCount, row in enumerate(csvHandle):

            values = [float(point) for point in row if point]

            if values:
                geneToTimeList[rowCount] = values

    # centroids stores one point in DIMENSION_COUNT-space per gene group
    centroids = []

    # Running per-dimension sums for the group currently being aggregated.
    dimensionHeap = [0.0] * DIMENSION_COUNT

    # Iterate by position rather than by key so that a blank row in the
    # file (which leaves a gap in the row numbers) does not break the walk.
    for position, timeList in enumerate(geneToTimeList.values()):

        for j in range(DIMENSION_COUNT):
            dimensionHeap[j] += timeList[j]

        # A group is complete after every CLUSTER_SIZE-th gene.
        if (position + 1) % CLUSTER_SIZE == 0:

            # To match output with hint given in announcement,
            # intermediate calculations were rounded to 2 dec places.
            meanList = [
                round(singleDim / float(CLUSTER_SIZE), 2)
                for singleDim in dimensionHeap
            ]

            centroids.append(meanList)

            dimensionHeap = [0.0] * DIMENSION_COUNT

    return (centroids, geneToTimeList)

*This function assigns every gene to a cluster whose centroid is closest to that gene.*  

*Proximity calculation uses Manhattan (L1) distance.*  


In [None]:
def initalize_clusters(myCentroids, geneToTimeList):
    """Assign every gene to the cluster of its nearest centroid.

    Distance is the Manhattan (L1) distance between the gene's values and
    the centroid. When several centroids are equally close, the one with
    the lowest index wins.

    Args:
        myCentroids: List of centroids, each a list of numbers.
        geneToTimeList: Dict mapping gene id -> list of numbers.

    Returns:
        Dict mapping centroid index -> list of gene ids, in the order the
        genes were visited. A centroid that attracts no gene has no entry.
    """
    assignments = {}

    for gene_id, profile in geneToTimeList.items():

        # L1 distance from this gene to each centroid, in centroid order.
        distances = []

        for centroid in myCentroids:

            total = 0

            for dim, value in enumerate(profile):
                total += abs(centroid[dim] - value)

            distances.append(total)

        nearest = distances.index(min(distances))

        assignments.setdefault(nearest, []).append(gene_id)

    return assignments

*This function recalculates the centroid for each cluster.*

*The new centroid of a cluster is the per-dimension average of its member genes, rounded to two decimal places.*


In [None]:
def recalculate_centroids(myCentroids, geneTimeMap, myClusters):
    """Replace each centroid with the mean of its cluster's member genes.

    ``myCentroids`` is updated in place. Each new centroid has the same
    number of dimensions as the centroid it replaces, and every mean is
    rounded to 2 decimal places. A cluster that has no members (or no
    entry at all in ``myClusters``) keeps its previous centroid, because
    there is nothing to average.

    Args:
        myCentroids: List of centroids (lists of numbers); modified in place.
        geneTimeMap: Dict mapping gene id -> list of numbers.
        myClusters: Mapping of centroid index -> list of member gene ids.

    Returns:
        None.
    """
    for i, oldCentroid in enumerate(myCentroids):

        try:
            genesInCluster = myClusters[i]
        except (KeyError, IndexError):
            # No gene was ever assigned to this centroid.
            genesInCluster = []

        if not genesInCluster:
            # Averaging zero genes would divide by zero; leave as is.
            continue

        dimArray = [0] * len(oldCentroid)

        # Accumulate with an explicit loop so the floating-point summation
        # order (and therefore the rounded result) stays the same.
        for gene in genesInCluster:

            for j, value in enumerate(geneTimeMap[gene]):
                dimArray[j] += value

        clusterSize = float(len(genesInCluster))

        # To match output with hint given in announcement,
        # intermediate calculations were rounded to 2 dec places.
        myCentroids[i] = [round(total / clusterSize, 2) for total in dimArray]

*This function recalculates clusters after centroids are recalculated.*

*Each gene is assigned to the cluster whose centroid it is closest to, measured by the Manhattan (L1) distance.*

*Returns FALSE if no gene is assigned to a different cluster.*

*Returns TRUE  if at least one gene is moved to a different cluster.*


In [None]:
def recalculate_clusters(myCentroids, geneTimeMap, myClusters):
    """Move every gene to the cluster of its nearest centroid.

    Distance is the Manhattan (L1) distance. When several centroids are
    equally close, the one with the lowest index wins. ``myClusters`` is
    updated in place.

    Args:
        myCentroids: List of centroids, each a list of numbers.
        geneTimeMap: Dict mapping gene id -> list of numbers.
        myClusters: Dict mapping centroid index -> list of member gene ids.

    Returns:
        True if at least one gene was moved to a different cluster,
        False if every gene stayed where it was.
    """
    changeDetected = False

    for gene, timeList in geneTimeMap.items():

        # L1 distance from this gene to each centroid, in centroid order.
        distToCentroid = []

        for currCentroid in myCentroids:

            assert(len(timeList)==len(currCentroid))

            ManhattanDist = 0

            for j, value in enumerate(timeList):
                ManhattanDist += abs( currCentroid[j] - value )

            distToCentroid.append(ManhattanDist)

        winner_index = distToCentroid.index(min(distToCentroid))

        current_index = fetchCurrentOwner(gene,myClusters)

        if winner_index != current_index:

            changeDetected = True

            myClusters[current_index].remove(gene)

            # The winning cluster may have no entry yet if no gene has been
            # assigned to it so far; create it instead of raising KeyError.
            myClusters.setdefault(winner_index, []).append(gene)

    return changeDetected

Driver for this assignment.

This program clusters genes according to expressions measured at 12 different time intervals.

To do this, k-means clustering was implemented from scratch; the instructor required K=10 for this program.

In [None]:
def main():
    """Cluster the genes with k-means and report the final clusters.

    Steps:
      1. Build the initial centroids from the input file.
      2. Assign every gene to its nearest centroid.
      3. Alternate between recomputing centroids and reassigning genes
         until no gene changes cluster.
      4. Print each cluster as "<size> : {<sorted gene ids>}", smallest
         cluster first, and write the same lines to output-AdamKim.txt.

    NOTE(review): get_filename() is not defined in the visible part of this
    file; presumably it is provided elsewhere -- confirm.
    """
    #(1)  For this assignment, initialize centroid means by row number

    #(2)  Compute the mean point of the objects in each cluster as a centroids

    myCentroids, geneToTimes = initalize_centroids( get_filename() )

    #(3)  Assign each object to the nearest centroid and generate k new clusters
    myClusters = initalize_clusters(myCentroids, geneToTimes)

    #(4)  Repeat (2) and (3), until NO change of the objects in EACH cluster

    changeFound = True
    while changeFound:
        recalculate_centroids(myCentroids, geneToTimes, myClusters)
        changeFound = recalculate_clusters(myCentroids,geneToTimes,myClusters)

    # One [size, sorted members] entry per cluster, ordered by size.
    finalResults = [
        [len(myClusters[index]), sorted(myClusters[index])]
        for index in myClusters
    ]

    finalResults.sort(key = lambda k: k[0])

    # Output to text file according to assignment requirements.
    # The with-block closes the file even if writing fails part-way.
    with open(myOS.path.abspath("output-AdamKim.txt"),'w') as outFile:

        for result in finalResults:
            sentence = str(result[0])+' : ' + str(result[1])
            # The assignment asks for set-style braces instead of brackets.
            formatted = sentence.replace('[','{').replace(']','}')
            print('')
            print(formatted)
            outFile.write('\n' + formatted + '\n')
            print('')

################################################################################

# Run the clustering driver only when this file is executed as a script.
if __name__ == "__main__":
    main()

_Program Output:_

The following lines ensure that the output follows the exact specification in the assignment instructions:

In an output file, show each cluster at each line starting with the size of the cluster,

for example, "6: {1, 24, 56, 139, 285, 471}".


In [None]:
2 : {345, 473}

4 : {424, 453, 454, 455}

5 : {491, 496, 497, 498, 499}

8 : {342, 343, 344, 350, 351, 357, 360, 361}

17 : {293, 306, 307, 308, 309, 310, 311, 314, 315, 316, 317, 320, 321, 378, 380,
381, 429}

18 : {394, 407, 409, 410, 417, 418, 420, 421, 422, 423, 428, 438, 439, 440, 447,
451, 452, 474}

19 : {245, 246, 326, 327, 330, 331, 332, 476, 480, 483, 485, 486, 487, 488, 490,
492, 493, 494, 495}

54 : {333, 335, 336, 337, 338, 339, 340, 341, 346, 347, 348, 349, 352, 353, 354,
355, 356, 358, 359, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 374, 391,
392, 411, 412, 414, 415, 416, 419, 457, 458, 459, 461, 463, 464, 465, 466, 467,
468, 469, 470, 471, 472, 477, 489}

69 : {100, 283, 284, 290, 291, 294, 295, 302, 303, 304, 305, 312, 313, 319, 322,
323, 324, 325, 372, 373, 375, 376, 377, 379, 382, 383, 384, 385, 386, 387, 388,
389, 390, 393, 395, 396, 397, 398, 400, 401, 402, 403, 404, 405, 406, 408, 413,
425, 426, 427, 430, 431, 432, 433, 434, 435, 436, 437, 441, 442, 443, 444, 445,
446, 448, 449, 450, 462, 475}

304 : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 101,
102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133,
134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165,
166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181,
182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197,
198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213,
214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229,
230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 247,
248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263,
264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279,
280, 281, 282, 285, 286, 287, 288, 289, 292, 296, 297, 298, 299, 300, 301, 318,
328, 329, 334, 399, 456, 460, 478, 479, 481, 482, 484}