# Gene Expression Clustering by DBSCAN Algorithm
## Adam Kim

*This program was completed for CSI 4352 (Introduction to Data Mining).*

*This function builds the gene -> feature lookup table.*

*The table is a list indexed by gene: the gene index is the position (key), and the 12 data points for that gene are the value.*

In [None]:
def init_DB(filename):
    """Load the gene expression table from a tab-delimited file.

    Every row of the file becomes one list of floats. Empty fields in a
    row are skipped rather than parsed.

    Args:
        filename: Path of the tab-delimited text file to read.

    Returns:
        A list with one entry per row (gene), each entry being the list
        of float values read from that row.

    Raises:
        AssertionError: If a row does not hold exactly DIMS values, or
            the file does not hold exactly NUMOBS rows.
        ValueError: If a non-empty field cannot be converted to float.
    """

    geneDB = []

    # The context manager closes the file even when parsing or one of
    # the sanity checks below fails part-way through.
    with open(filename,newline='') as handle:

        csvHandle = myCSV.reader(handle,delimiter='\t',quotechar='"')

        for row in csvHandle:

            # keep only non-empty fields
            newRow = [float(point) for point in row if point]

            assert(len(newRow) == DIMS)

            geneDB.append(newRow)

    assert(len(geneDB) == NUMOBS)

    return geneDB

*This function builds distance matrix for genes.*

*Each entry is the Manhattan (L1) distance between the expression profiles of a pair of genes.*

*The distance of a gene to itself is stored as 0.0 on the diagonal, so for any positive EPSILON a gene is always counted inside its own radius.*

In [None]:
def init_distance_matrix(geneDB):
    """Build the square matrix of pairwise Manhattan distances.

    Args:
        geneDB: List of genes, each a list of numeric values.

    Returns:
        A list of rows where entry [i][j] is manhattan(geneDB[i],
        geneDB[j]) for i != j and 0.0 on the diagonal.

    Raises:
        AssertionError: If a row, or the matrix itself, does not have
            exactly NUMOBS entries.
    """

    distMatrix = []

    for rootIdx, rootTimes in enumerate(geneDB):

        # A gene compared with itself is given distance 0.0 directly.
        row = [
            0.0 if rootIdx == otherIdx else manhattan(rootTimes,otherTimes)
            for otherIdx, otherTimes in enumerate(geneDB)
        ]

        assert(len(row) == NUMOBS) # check rows

        distMatrix.append(row)

    assert(len(distMatrix) == NUMOBS) # check columns

    return distMatrix

*This function calculates the Manhattan (L1) distance between two genes.*

In [None]:
def manhattan(timeListA,timeListB):
    """Return the Manhattan (L1) distance between two genes.

    Args:
        timeListA: Sequence of DIMS numeric values for the first gene.
        timeListB: Sequence of DIMS numeric values for the second gene.

    Returns:
        The sum of the absolute coordinate differences, as a float.

    Raises:
        AssertionError: If timeListA does not have DIMS entries or the
            two sequences differ in length.
    """

    assert( len(timeListA) == DIMS)
    assert( len(timeListA) == len(timeListB) )

    total = 0.0

    # Accumulate coordinate by coordinate, left to right, so the
    # floating-point rounding of the running total stays the same.
    for pos in range(DIMS):
        total += abs( timeListA[pos] - timeListB[pos] )

    return total

This implements the DBSCAN Algorithm.

DBSCAN is a density-based clustering algorithm.

In [None]:
def DBSCAN(eps,minPoints,distMatrix,geneDB):
    """Cluster the genes with the DBSCAN algorithm.

    Genes are visited in index order. An unvisited gene whose EPSILON
    region holds at least minPoints genes seeds a new cluster, which is
    grown by generate(); any other unvisited gene is recorded as an
    outlier (generate() may later move it into a cluster).

    Args:
        eps: Search radius passed to scanRegion.
        minPoints: Minimum region size for a gene to count as a core.
        distMatrix: Pairwise distance matrix indexed by gene.
        geneDB: The gene list; only its length is used here.

    Returns:
        A tuple (clusters, outliers): clusters is a list of lists of
        gene indices, outliers is a list of gene indices.

    Raises:
        AssertionError: If some gene is still unvisited at the end.
    """

    # Size everything from the data that was passed in, so the loop
    # bound and the pool of unvisited genes can never disagree.
    numGenes = len(geneDB)

    clusters = [] # list of lists
    outliers = [] # list of ints

    # A set gives O(1) membership tests and removals; generate() only
    # uses `in` and `.remove()` on it, which sets support as well.
    notVisited = set(range(numGenes))

    for gene in range(numGenes):

        if gene not in notVisited:
            continue # already claimed while growing an earlier cluster

        notVisited.remove(gene)

        nb = scanRegion(gene, eps, distMatrix)

        if isCore(nb,minPoints): # returns true if gene is a core

            newCl = generate(gene,nb,eps,minPoints,distMatrix,notVisited,outliers)

            clusters.append(newCl)

        else: # outlier found here

            outliers.append(gene)

    assert( not notVisited )

    return clusters,outliers

This function returns true if a point has at least MINPOINTS points within an EPSILON radius (the point itself included) and returns false otherwise.

In [None]:
 def isCore(neighborList, minPoints):
     """Tell whether a gene is a core point.

     Args:
         neighborList: The genes found in the region around the gene.
         minPoints: Minimum number of neighbors required for a core.

     Returns:
         True when neighborList has at least minPoints entries,
         otherwise False.
     """

     return len(neighborList) >= minPoints

This function generates a cluster from a start gene by iteratively crawling the network of genes until no more cores are detected within an EPSILON radius of the genes on the outer edge of the cluster.

In [None]:
def generate(root,neighbors,eps,minPoints,distMatrix,notVisited,outliers):
    """Grow one cluster outward from a core gene.

    Args:
        root: The first core gene discovered; it seeds the cluster.
        neighbors: Genes inside the root's region. Used as the work
            queue and extended in place as further cores are found.
        eps: Search radius.
        minPoints: Minimum region size for a gene to count as a core.
        distMatrix: Pairwise distance matrix indexed by gene.
        notVisited: Collection of genes not yet visited; genes claimed
            here are removed from it.
        outliers: List of genes previously marked as outliers; genes
            claimed here are removed from it.

    Returns:
        The list of genes in the new cluster, starting with root.
    """

    members = [root]

    cursor = 0

    # The queue can grow while it is scanned, so walk it by position
    # and re-read its length on every pass.
    while cursor < len(neighbors):

        candidate = neighbors[cursor]
        cursor += 1

        if candidate in outliers:

            # Former outlier within reach of the cluster: claim it, but
            # do not expand from it.
            outliers.remove(candidate)
            members.append(candidate)
            continue

        if candidate not in notVisited:
            continue # neither an outlier nor unvisited: nothing to do

        members.append(candidate)
        notVisited.remove(candidate)

        reachable = scanRegion(candidate,eps, distMatrix)

        # Only a core gene pushes the frontier further out.
        if isCore(reachable,minPoints):
            safeMerge(neighbors,reachable)

    return members

This function implements the union function for lists.

In [None]:
def safeMerge(masterList,proposedList):
    """Append to masterList every item of proposedList it lacks.

    The merge happens in place; each missing item is added once, at
    the end of masterList.

    Args:
        masterList: List that receives the new items.
        proposedList: Candidate items to merge in.

    Returns:
        None.
    """

    newcomers = set(proposedList) - set(masterList)

    masterList.extend(newcomers) # add to end

This function scans an EPSILON radius about a point and returns all points within said radius.

N.B. this includes the point itself, so scanRegion function will always return at least one point.

In [None]:
def scanRegion(i, epsilon, distMatrix):
    """List the genes lying strictly within epsilon of gene i.

    Args:
        i: Index of the gene at the centre of the region.
        epsilon: Search radius; a gene qualifies when its distance is
            strictly smaller than this value.
        distMatrix: Pairwise distance matrix indexed by gene.

    Returns:
        The qualifying gene indices in ascending order.
    """

    return [idx for idx, dist in enumerate(distMatrix[i]) if dist < epsilon]

Driver for DBSCAN Algorithm.

In [None]:
def main():
    """Run DBSCAN for every (epsilon, minPoints) pair and report results.

    For each pair the cluster sizes are printed to the console and the
    cluster memberships are written to a text file named
    'e<epsilon>m<minPoints>.txt'. Gene indices are reported 1-based.

    Raises:
        AssertionError: If the genes written to a file do not add up to
            the number of genes loaded.
    """

    filename = fetch_filename()
    geneDB = init_DB(filename)
    distMatrix = init_distance_matrix(geneDB)

    # list for epsilon and minPoints
    epsList   = [1,2,3,4]
    minPtsList = [2,4,6]

    for epsilon in epsList:

        for minPoints in minPtsList:

            clusters,outliers = DBSCAN(epsilon,minPoints,distMatrix,geneDB)

            # correct off by one: report genes with 1-based indices
            clusters = [[gene + 1 for gene in cluster] for cluster in clusters]
            outliers = [gene + 1 for gene in outliers]

            # Smallest cluster first; sorted once and reused for both
            # the console summary and the output file.
            orderedClusters = sorted(clusters,key=len)

            print('\n')
            print('For Epsilon   =',epsilon)
            print('For minPoints =',minPoints)
            print('')

            print('OUTLIERS\t:\t',len(outliers))

            for i,cluster in enumerate(orderedClusters):

                print('Cluster',i,'\t:\t',len(cluster))

            myFilename = 'e' + str(epsilon) + 'm' + str(minPoints) + '.txt'

            safetyCheck = 0

            # The context manager closes the output file even if
            # formatting or writing one of the lines fails.
            with open(myOS.path.abspath(myFilename),'w') as outFile:

                # print outliers to output file, each as its own
                # single-member group

                for outlier in outliers:

                    cluster = [outlier]

                    sentence = str(len(cluster)) + ' : ' + stringify(cluster)

                    safetyCheck += len(cluster) # check

                    outFile.write(sentence + '\n')
                    outFile.write('\n')

                # print clusters to output file

                for cluster in orderedClusters:

                    sentence = str(len(cluster)) + ' : ' + stringify(cluster)

                    safetyCheck += len(cluster) # check

                    outFile.write(sentence + '\n')
                    outFile.write('\n')

            # Every gene must have been written exactly once, either as
            # an outlier or as a member of one cluster.
            assert(safetyCheck == len(geneDB))

# Run the parameter sweep only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

The output below was generated using EPSILON = 4 and MINPOINTS = 4.

In [None]:
395 : {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 20, 22, 23, 24, 
       25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 41, 43, 
       44, 45, 46, 47, 49, 51, 52, 54, 55, 56, 57, 58, 59, 61, 62, 63, 64, 
       66, 68, 72, 73, 74, 75, 79, 81, 82, 83, 86, 87, 88, 89, 90, 91, 92, 
       93, 94, 95, 96, 97, 98, 99, 100, 104, 105, 107, 108, 109, 110, 111, 
       112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 126, 127, 129, 
       131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144,
       145, 146, 147, 148, 149, 150, 151, 152, 153, 155, 156, 157, 158, 159, 
       160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 
       174, 175, 176, 177, 178, 179, 180, 181, 183, 184, 185, 186, 187, 188, 
       189, 190, 192, 193, 194, 195, 196, 197, 198, 199, 200, 203, 204, 205, 
       206, 207, 208, 213, 214, 215, 217, 218, 219, 220, 221, 222, 223, 224, 
       225, 226, 227, 228, 229, 230, 231, 233, 234, 235, 236, 237, 238, 239, 
       240, 241, 242, 243, 244, 245, 248, 249, 250, 254, 255, 256, 257, 258, 
       259, 261, 262, 264, 265, 266, 267, 268, 273, 277, 281, 288, 289, 290, 
       293, 298, 299, 300, 301, 400, 482, 130, 260, 263, 13, 14, 270, 18, 19, 
       275, 21, 278, 154, 40, 42, 48, 50, 53, 60, 65, 67, 69, 70, 71, 76, 77,
       78, 80, 209, 210, 84, 85, 212, 106, 251, 252, 123, 124, 128, 232, 201, 
       202, 253, 182, 216, 125, 191, 287, 279, 269, 272, 211, 286, 103, 457, 
       461, 271, 302, 297, 102, 280, 292, 101, 295, 426, 401, 404, 374, 483, 
       480, 274, 305, 283, 479, 291, 296, 378, 284, 319, 320, 434, 462, 463, 
       373, 443, 444, 414, 282, 460, 366, 359, 385, 435, 387, 303, 313, 314, 
       323, 285, 409, 386, 402, 445, 390, 485, 306, 406, 338, 478, 405, 449, 
       393, 396, 437, 407, 442, 446, 415, 458, 364, 413, 403, 368, 412, 436, 
       392, 459, 360, 389, 427, 377, 380, 383, 384, 438, 304, 324, 325, 450, 
       398, 355, 399, 388, 431, 420, 363, 416, 365, 451, 369, 357, 417, 341,
       397, 432, 309, 326, 394, 342, 433, 356, 367, 337, 350}

4 : {410, 411, 419, 452}

12 : {307, 308, 312, 322, 317, 310, 311, 321, 316, 315, 381, 294}
    
1 : {246}

1 : {247}

1 : {276}

1 : {318}

1 : {327}

1 : {328}

1 : {329}

1 : {330}

1 : {331}

1 : {332}

1 : {333}

1 : {334}

1 : {335}

1 : {336}

1 : {339}

1 : {340}

1 : {343}

1 : {344}

1 : {345}

1 : {346}

1 : {347}

1 : {348}

1 : {349}

1 : {351}

1 : {352}

1 : {353}

1 : {354}

1 : {358}

1 : {361}

1 : {362}

1 : {370}

1 : {371}

1 : {372}

1 : {375}

1 : {376}

1 : {379}

1 : {382}

1 : {391}

1 : {395}

1 : {408}

1 : {418}

1 : {421}

1 : {422}

1 : {423}

1 : {424}

1 : {425}

1 : {428}

1 : {429}

1 : {430}

1 : {439}

1 : {440}

1 : {441}

1 : {447}

1 : {448}

1 : {453}

1 : {454}

1 : {455}

1 : {456}

1 : {464}

1 : {465}

1 : {466}

1 : {467}

1 : {468}

1 : {469}

1 : {470}

1 : {471}

1 : {472}

1 : {473}

1 : {474}

1 : {475}

1 : {476}

1 : {477}

1 : {481}

1 : {484}

1 : {486}

1 : {487}

1 : {488}

1 : {489}

1 : {490}

1 : {491}

1 : {492}

1 : {493}

1 : {494}

1 : {495}

1 : {496}

1 : {497}

1 : {498}

1 : {499}

1 : {500}


