In [None]:
'''
AUTHOR: Adam Kim
TITLE:  Assignment 5, Clustering by DBSCAN
DATE:   October 23, 2017
'''

import os        as myOS    # get current directory
import csv       as myCSV   # handle initial file input

# Number of time points every gene row must contain (checked in init_DB
# and manhattan).
DIMS = 12
# Number of gene rows the input file must contain (checked in init_DB and
# init_distance_matrix, and used to size the DBSCAN bookkeeping).
NUMOBS = 500

In [None]:
'''
This function returns a list of genes, where the index is the
    row number and each value is a list containing 12 gene times
'''

def init_DB(filename):
    '''
    Load the gene table from a tab-delimited file.

    Arguments:
        filename -- path of the tab-delimited input file

    Returns:
        A list with one entry per row of the file, in file order.  Each
        entry is a list of DIMS floats.  Empty fields (for example the
        one produced by a trailing tab) are skipped.

    Raises:
        AssertionError -- a row does not hold exactly DIMS values, or the
                          file does not hold exactly NUMOBS rows.
        ValueError     -- a non-empty field cannot be parsed as a float.
    '''

    geneDB = []

    # The context manager closes the file even when parsing or one of the
    # checks below fails.
    with open(filename, newline='') as handle:

        csvHandle = myCSV.reader(handle, delimiter='\t', quotechar='"')

        for row in csvHandle:

            # Keep only the non-empty fields of the row.
            newRow = [float(point) for point in row if point]

            assert(len(newRow) == DIMS)

            geneDB.append(newRow)

    assert(len(geneDB) == NUMOBS)

    return geneDB

In [None]:
'''
This function returns the distance matrix.

Row i contains the distances from gene i to genes 0 to NUMOBS-1.

Col i contains the distances from genes 0 to NUMOBS-1 to gene i.

The distance matrix has 0s on its diagonal because entry (i, i) is the
distance from gene i to itself.

'''

def init_distance_matrix(geneDB):
    '''
    Build the full pairwise Manhattan distance matrix for the genes.

    Arguments:
        geneDB -- list of gene rows (each a list of DIMS numbers)

    Returns:
        A list of rows where entry [i][j] is the Manhattan distance
        between gene i and gene j.  Diagonal entries are 0.0.

    Raises:
        AssertionError -- a row or the matrix itself does not have
                          exactly NUMOBS entries.
    '''

    distMatrix = []

    for rootIdx, rootTimes in enumerate(geneDB):

        # A gene is at distance 0 from itself; everything else is measured.
        row = [
            0.0 if rootIdx == otherIdx else manhattan(rootTimes, otherTimes)
            for otherIdx, otherTimes in enumerate(geneDB)
        ]

        distMatrix.append(row)

        assert(len(row) == NUMOBS) # check rows

    assert(len(distMatrix) == NUMOBS) # check columns

    return distMatrix

In [None]:
'''
This function returns manhattan distance between 2 genes.
The arguments are lists containing times for 2 genes.
'''

def manhattan(timeListA,timeListB):
    '''
    Return the Manhattan (L1) distance between two genes.

    Arguments:
        timeListA -- list of DIMS times for the first gene
        timeListB -- list of DIMS times for the second gene

    Returns:
        The sum of the absolute coordinate differences, as a float.

    Raises:
        AssertionError -- either list does not hold exactly DIMS values.
    '''

    assert( len(timeListA) == DIMS)
    assert( len(timeListA) == len(timeListB) )

    # Plain running total, added left to right, starting from 0.0.
    total = 0.0

    for idx in range(DIMS):
        total += abs( timeListA[idx] - timeListB[idx] )

    return total

In [None]:
def DBSCAN(eps,minPoints,distMatrix,geneDB):
    '''
    Cluster the genes with DBSCAN.

    Arguments:
        eps        -- search radius handed to scanRegion
        minPoints  -- minimum neighborhood size for a gene to be a core
        distMatrix -- pairwise distance matrix, indexed by gene number
        geneDB     -- list of gene rows; only its length is used here

    Returns:
        (clusters, outliers) where clusters is a list of lists of gene
        indices and outliers is a list of gene indices that ended up in
        no cluster.

    Raises:
        AssertionError -- some gene was left unvisited at the end.
    '''

    # Size the bookkeeping from the data actually passed in, so it always
    # agrees with the loop bound below.
    numGenes = len(geneDB)

    clusters = [] # list of lists

    outliers = [] # list of ints

    # A set gives O(1) membership tests and removals.  generate() only
    # uses `in` and `.remove()` on this collection.
    notVisited = set(range(numGenes))

    for gene in range(numGenes):

        if gene not in notVisited:
            # Already absorbed into a cluster by an earlier generate() call.
            continue

        notVisited.remove(gene)

        nb = scanRegion(gene, eps, distMatrix)

        if isCore(nb,minPoints): # returns true if gene is a core

            # generate() removes the genes it claims from notVisited and
            # from outliers in place.
            newCl = generate(gene,nb,eps,minPoints,distMatrix,notVisited,outliers)

            clusters.append(newCl)

        else: # outlier found here (may later be absorbed as a border gene)

            outliers.append(gene)

    assert( not notVisited )

    return clusters,outliers

In [None]:
'''
isCore returns true if a point has minPoints points within an epsilon radius
and returns false otherwise
'''
        
def isCore(neighborList, minPoints):
    '''
    Tell whether a neighborhood is large enough to make its gene a core.

    Arguments:
        neighborList -- list of gene indices found around the gene
        minPoints    -- minimum neighborhood size required

    Returns:
        True when neighborList holds at least minPoints entries,
        False otherwise.
    '''

    return len(neighborList) >= minPoints

In [None]:
def generate(root,neighbors,eps,minPoints,distMatrix,notVisited,outliers):
    '''
    Grow one cluster outward from a core gene.

    Arguments:
        root       -- index of the first core gene discovered
        neighbors  -- gene indices in root's radius; used as the work
                      queue and extended in place as more cores are found
        eps        -- search radius
        minPoints  -- minimum number of points to be considered core
        distMatrix -- pairwise distance matrix
        notVisited -- collection of unvisited gene indices; genes claimed
                      here are removed from it in place
        outliers   -- list of gene indices currently marked as outliers;
                      genes claimed here are removed from it in place

    Returns:
        The list of gene indices in the new cluster, root first.
    '''

    newCluster = [root]

    # Index-based loop on purpose: safeMerge appends to `neighbors` while
    # it is being scanned, and the loop must reach those new entries too.
    j = 0

    while j < len(neighbors):

        neighborGene = neighbors[j]

        if neighborGene in outliers:

            # Previously labelled an outlier: it joins the cluster but is
            # not expanded further.
            outliers.remove(neighborGene)
            newCluster.append(neighborGene)

        elif neighborGene in notVisited:

            newCluster.append(neighborGene)
            notVisited.remove(neighborGene)

            localNeighbors = scanRegion(neighborGene,eps, distMatrix)

            if isCore(localNeighbors,minPoints):

                # A core neighbor extends the frontier with its own
                # neighborhood (duplicates are filtered by safeMerge).
                safeMerge(neighbors,localNeighbors)

        # Anything else was already visited and claimed (this includes
        # root itself), so it is skipped.

        j+=1

    return newCluster

In [None]:
'''
During cluster generation, make sure that no nodes are pushed onto the stack
that are already there.  Also ensure that neighbor nodes are pushed onto the
end of the stack and not the beginning.
'''

def safeMerge(masterList,proposedList):
    '''
    Append to masterList every item of proposedList it does not yet hold.

    Arguments:
        masterList   -- list that is extended in place (at its end)
        proposedList -- candidate items; duplicates are collapsed

    Returns:
        None.  masterList is modified in place.  New items are appended
        in the iteration order of the set difference.
    '''

    alreadyPresent = set(masterList)

    newcomers = set(proposedList) - alreadyPresent

    masterList.extend(newcomers) # add to end

In [None]:
'''
Scan an epsilon radius around a point and return all points within radius.
IMPORTANT: this includes the point itself, so scanRegion will always return
 at least itself
'''

def scanRegion(i, epsilon, distMatrix):
    '''
    Collect the genes lying strictly inside an epsilon radius of gene i.

    Arguments:
        i          -- index of the gene at the center of the scan
        epsilon    -- search radius (the comparison is strict: dist < epsilon)
        distMatrix -- pairwise distance matrix; row i is scanned

    Returns:
        The list of column indices of row i whose distance is below
        epsilon, in increasing index order.
    '''

    return [
        otherIdx
        for otherIdx, dist in enumerate(distMatrix[i])
        if dist < epsilon
    ]

In [None]:
# Driver for the DBSCAN algorithm.

def main():
    '''
    Run DBSCAN for every (epsilon, minPoints) combination and report it.

    For each combination a summary (outlier count and the size of each
    cluster, smallest cluster first) is printed, and a file named
    e<epsilon>m<minPoints>.txt is written in the current directory.  The
    file lists every outlier as a cluster of size 1, followed by the
    clusters in increasing size, one "<size> : {members}" line each,
    separated by blank lines.  Gene numbers in the output are 1-based.
    '''

    # NOTE(review): fetch_filename is not defined in the visible part of
    # this file; it is assumed to be provided elsewhere.
    filename = fetch_filename()
    geneDB = init_DB(filename)
    distMatrix = init_distance_matrix(geneDB)

    # list for epsilon and minPoints
    epsList   = [1,2,3,4]
    minPtsList = [2,4,6]

    for epsilon in epsList:

        for minPoints in minPtsList:

            clusters,outliers = DBSCAN(epsilon,minPoints,distMatrix,geneDB)

            # DBSCAN works with 0-based gene indices; report them 1-based.
            clusters = [[gene + 1 for gene in cluster] for cluster in clusters]
            outliers = [gene + 1 for gene in outliers]

            # Sorted once and reused for both the console and the file.
            clustersBySize = sorted(clusters,key=len)

            print('\n')
            print('For Epsilon   =',epsilon)
            print('For minPoints =',minPoints)
            print('')

            print('OUTLIERS\t:\t',len(outliers))

            for i, cluster in enumerate(clustersBySize):

                print('Cluster',i,'\t:\t',len(cluster))

            myFilename = 'e' + str(epsilon) + 'm' + str(minPoints) + '.txt'

            # The context manager closes the output file even if a write
            # fails part-way through.
            with open(myOS.path.abspath(myFilename),'w') as outFile:

                # each outlier is written as its own cluster of size 1
                for outlier in outliers:

                    cluster = [outlier]

                    sentence = str(len(cluster)) + ' : ' + stringify(cluster)

                    outFile.write(sentence + '\n')
                    outFile.write('\n')

                # then the real clusters, smallest first
                for cluster in clustersBySize:

                    sentence = str(len(cluster)) + ' : ' + stringify(cluster)

                    outFile.write(sentence + '\n')
                    outFile.write('\n')

################################################################################

'''
Force output to match specifications.
'''

def stringify(mylist):
    '''
    Render a list the way the output specification wants it.

    Arguments:
        mylist -- the list to render

    Returns:
        str(mylist) with every '[' replaced by '{' and every ']'
        replaced by '}'.
    '''

    # One pass over the text swaps both bracket characters.
    bracketsToBraces = str.maketrans('[]', '{}')

    return str(mylist).translate(bracketsToBraces)

################################################################################

# Run main() only when this file is executed as a script, not when it is
# imported as a module.

if __name__ == "__main__":
    main()