In [45]:
 import math

In [202]:
"""
Cluster class for Module 3
"""

import math


class Cluster:
    """
    Class for creating and merging clusters of counties
    """
    
    def __init__(self, fips_codes, horiz_pos, vert_pos, population, risk):
        """
        Create a cluster based the models a set of counties' data
        """
        self._fips_codes=[]
        self._fips_codes.append(fips_codes) 
        self._horiz_center = horiz_pos
        self._vert_center = vert_pos
        self._total_population = population
        self._averaged_risk = risk
        
        
    def __repr__(self):
        """
        String representation assuming the module is "alg_cluster".
        """
        rep = "alg_cluster.Cluster("
        rep += str(self._fips_codes) + ", "
        rep += str(self._horiz_center) + ", "
        rep += str(self._vert_center) + ", "
        rep += str(self._total_population) + ", "
        rep += str(self._averaged_risk) + ")"
        return rep


    def fips_codes(self):
        """
        Get the cluster's set of FIPS codes
        """
        return self._fips_codes
    
    def horiz_center(self):
        """
        Get the averged horizontal center of cluster
        """
        return self._horiz_center
    
    def vert_center(self):
        """
        Get the averaged vertical center of the cluster
        """
        return self._vert_center
    
    def total_population(self):
        """
        Get the total population for the cluster
        """
        return self._total_population
    
    def averaged_risk(self):
        """
        Get the averaged risk for the cluster
        """
        return self._averaged_risk
   
        
    def copy(self):
        """
        Return a copy of a cluster
        """
        copy_cluster = Cluster(list(self._fips_codes), self._horiz_center, self._vert_center,
                               self._total_population, self._averaged_risk)
        return copy_cluster


    def distance(self, other_cluster):
        """
        Compute the Euclidean distance between two clusters
        """
        vert_dist = self._vert_center - other_cluster.vert_center()
        horiz_dist = self._horiz_center - other_cluster.horiz_center()
        return math.sqrt(vert_dist ** 2 + horiz_dist ** 2)
        
    def merge_clusters(self, other_cluster):
        """
        Merge one cluster into another
        The merge uses the relatively populations of each
        cluster in computing a new center and risk
        
        Note that this method mutates self
        """
        if len(other_cluster.fips_codes()) == 0:
            return self
        else:
            self._fips_codes.append(other_cluster.fips_codes())
 
            # compute weights for averaging
            self_weight = float(self._total_population)                        
            other_weight = float(other_cluster.total_population())
            self._total_population = self._total_population + other_cluster.total_population()
            self_weight /= self._total_population
            other_weight /= self._total_population
                    
            # update center and risk using weights
            self._vert_center = self_weight * self._vert_center + other_weight * other_cluster.vert_center()
            self._horiz_center = self_weight * self._horiz_center + other_weight * other_cluster.horiz_center()
            self._averaged_risk = self_weight * self._averaged_risk + other_weight * other_cluster.averaged_risk()
            return self

    def cluster_error(self, data_table):
        """
        Input: data_table is the original table of cancer data used in creating the cluster.
        
        Output: The error as the sum of the square of the distance from each county
        in the cluster to the cluster center (weighted by its population)
        """
        # Build hash table to accelerate error computation
        fips_to_line = {}
        for line_idx in range(len(data_table)):
            line = data_table[line_idx]
            fips_to_line[line[0]] = line_idx
        
        # compute error as weighted squared distance from counties to cluster center
        total_error = 0
        counties = self.fips_codes()
        for county in counties:
            line = data_table[fips_to_line[county]]
            singleton_cluster = Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
            singleton_distance = self.distance(singleton_cluster)
            total_error += (singleton_distance ** 2) * singleton_cluster.total_population()
        return total_error
            
        
            

        
    
    
            

In [133]:
DIRECTORY = "http://commondatastorage.googleapis.com/codeskulptor-assets/"

In [134]:
DATA_3108_URL = DIRECTORY + "data_clustering/unifiedCancerData_3108.csv"

In [135]:
url=DATA_3108_URL 
import urllib.request
def load_data(url):
    with urllib.request.urlopen(url) as url :
        s=url.read()
        data=s.decode('utf8')
        data=data.split('\r\n')
        data_tokens=[line.split(", ")for  line in data]
        return [[tokens[0], float(tokens[1]), float(tokens[2]), int(tokens[3]), float(tokens[4])] 
                for tokens in data_tokens]

In [136]:
point1=load_data(url)[1]
print(point1)
point2=load_data(url)[2]
print(point2)

['01067', 741.064829551, 454.67645286, 16310, 3.4e-05]
['01061', 730.413538241, 465.838757711, 25764, 3.8e-05]


In [203]:
c1=Cluster(point1[0],point1[1],point1[2],point1[3],point1[4])
c1.__repr__()

"alg_cluster.Cluster(['01067'], 741.064829551, 454.67645286, 16310, 3.4e-05)"

In [204]:
c2=Cluster(point2[0],point2[1],point2[2],point2[3],point2[4])
c2.__repr__()

"alg_cluster.Cluster(['01061'], 730.413538241, 465.838757711, 25764, 3.8e-05)"

In [205]:
c1.distance(c2)

15.42877364397901

In [240]:
c3=c1.merge_clusters(c2)

In [207]:
c1._fips_codes

['01067', ['01061']]

In [231]:
import  math
def slow_closest_pair(cluster_list):
    minimum_point=(math.inf,-1,-1)
    for  index1 in range(0,len(cluster_list)):
        for index2 in  range(0,len(cluster_list)):
            #if cluster_list[index1]._fips_codes!=cluster_list[index2]._fips_codes:
            if index1<index2:
                
                if cluster_list[index1].distance(cluster_list[index2])<minimum_point[0]:
                    minimum_point=(cluster_list[index1].distance(cluster_list[index2]),index1,index2)
        
    return minimum_point

In [232]:
minimum_point=(math.inf,-1,-1)

In [242]:
cluster_list=[c1,c2,c3]

In [243]:
slow_closest_pair(cluster_list)

(0.0, 0, 2)

In [237]:
c1.__repr__()

"alg_cluster.Cluster(['01067', ['01061']], 734.5425148361917, 461.5116865478158, 42074, 3.6449398678518803e-05)"

In [244]:
c2.__repr__()

"alg_cluster.Cluster(['01061'], 730.413538241, 465.838757711, 25764, 3.8e-05)"

In [239]:
c1.distance(c2)

5.980969200296992

In [245]:
c3

alg_cluster.Cluster(['01067', ['01061'], ['01061']], 732.974382624179, 463.1550525292463, 67838, 3.7038297119608485e-05)

In [247]:
c1.distance(c3)

0.0