# In [1]:  (Jupyter notebook cell prompt left over from export)
import unittest

from nose.tools import *

from py_valuenormalization import MyPriorityQueue
from py_valuenormalization import HierarchicalClustering
from py_valuenormalization import HybridClustering
from py_valuenormalization import SimMeasureNotSupportedException
from py_valuenormalization.value_normalization_misc import Utils

class HybridClusteringTests(unittest.TestCase):
    def setUp(self):
        self.vals = ['Haldis restaurant', 'Haldis cafe', 'Indian cuisine', 'India house', 'Bombay bazaar', 'Tulsi', 'Amber restaurant']
        self.vals2 = ['Arrow Shed', 'Arrow','Arta','Arte italica','cooper cooler','cooper caseys','caseys','florida','floris','heineken','highland','metra','metro']
        self.vals3 = ['Apple Incorporated', 'Apple', 'Apple Inc', 'Amazon', 'Juniper Networks', 'Mathworks', 'Matlab', 'Cisco Networks', 'A10 networks', 'Zendesk', 'Zenith']
        self.vals4 = ['Raghu', 'Ram', 'John', 'Johnson', 'Sorenson', 'Sorensen']
        self.vals5 = ['Dept of computersciences', 'Computer sciences department', 'Johnson street', 'Johnson st', 'Wall street', 'Univ avenue', 'University of wisc madison']
        self.vals6 = ['University of wisc madison', 'Michigan st', 'Michigan ann arbor', 'UW Madison', 'UM ann arbor', 'UW', 'UNL']
        self.val_to_clustid_map = {}
        self.costmodel = Utils.get_default_cost_model()
    
    def test_vals3(self):
        self.hybhac = HybridClustering(self.vals3, self.costmodel)
        
        self.dists = self.hybhac.calc_dists('3gram Jaccard')
        self.assertAlmostEqual(min(self.dists.values()), 0.590909090)
        
        #testcase to check with default settings -> sim_measure - 3gram Jaccard, thr = 0.7, linkage = single, max_clust_size = length(inputs)
        dend_hist = self.hybhac.shotgun_create_dendrogram()
        
        self.assertEqual(dend_hist[0][1][0], {1: ['Zenith'],
 2: ['Zendesk'],
 3: ['Matlab'],
 4: ['Mathworks'],
 5: ['Juniper Networks'],
 6: ['Cisco Networks'],
 7: ['Apple Incorporated'],
 8: ['Apple Inc'],
 9: ['Apple'],
 10: ['Amazon'],
 11: ['A10 networks']})      
        self.assertEqual(dend_hist[0][1][1], [])
        
        self.assertEqual(dend_hist[0][2][0], {1: ['Zenith'],
 2: ['Zendesk'],
 3: ['Matlab'],
 4: ['Mathworks'],
 5: ['Juniper Networks'],
 6: ['Cisco Networks'],
 7: ['Apple Incorporated', 'Apple Inc'],
 9: ['Apple'],
 10: ['Amazon'],
 11: ['A10 networks']})
        self.assertEqual(dend_hist[0][2][1], [((['Apple Incorporated'], ['Apple Inc']), 0.5909090909090908)])
        
        self.assertEqual(dend_hist[0][3][0], {1: ['Zenith'],
 2: ['Zendesk'],
 3: ['Matlab'],
 4: ['Mathworks'],
 5: ['Juniper Networks', 'Cisco Networks', 'A10 networks'],
 7: ['Apple Incorporated', 'Apple Inc', 'Apple'],
 10: ['Amazon']})
        self.assertEqual(dend_hist[0][3][1], [((['Apple Incorporated'], ['Apple Inc']), 0.5909090909090908),
 ((['Apple Incorporated', 'Apple Inc'], ['Apple']), 0.6153846153846154),
 ((['Juniper Networks'], ['Cisco Networks']), 0.64),
 ((['Juniper Networks', 'Cisco Networks'], ['A10 networks']),
  0.6956521739130435)])
        
        
        #checking continue_from_dendrogram for all maxclustsizes from 1 to max of dend_hist
        
        valmap = self.hybhac.shotgun_lambdahac_continue_from_dendrogram(1)
        self.assertEqual(valmap, {'A10 networks': 11,
 'Amazon': 10,
 'Apple': 9,
 'Apple Inc': 8,
 'Apple Incorporated': 7,
 'Cisco Networks': 6,
 'Juniper Networks': 5,
 'Mathworks': 4,
 'Matlab': 3,
 'Zendesk': 2,
 'Zenith': 1})
        
        valmap = hybhac.shotgun_lambdahac_continue_from_dendrogram(2)
        self.assertEqual(valmap, {'A10 networks': 11,
 'Amazon': 10,
 'Apple': 9,
 'Apple Inc': 7,
 'Apple Incorporated': 7,
 'Cisco Networks': 5,
 'Juniper Networks': 5,
 'Mathworks': 4,
 'Matlab': 3,
 'Zendesk': 2,
 'Zenith': 1})
        
        valmap = hybhac.shotgun_lambdahac_continue_from_dendrogram(3)
        self.assertEqual(valmap, {'A10 networks': 5,
 'Amazon': 10,
 'Apple': 7,
 'Apple Inc': 7,
 'Apple Incorporated': 7,
 'Cisco Networks': 5,
 'Juniper Networks': 5,
 'Mathworks': 4,
 'Matlab': 3,
 'Zendesk': 2,
 'Zenith': 1})
        
        ##testcase to check the best clusters returned
        (clusters, maxclustsize) = hybhac.cluster()
        self.assertEqual(clusters, {'A10 networks': ['A10 networks', 'Cisco Networks', 'Juniper Networks'],
 'Amazon': ['Amazon'],
 'Apple': ['Apple', 'Apple Inc', 'Apple Incorporated'],
 'Mathworks': ['Mathworks'],
 'Matlab': ['Matlab'],
 'Zendesk': ['Zendesk'],
 'Zenith': ['Zenith']})
        
        self.assertEqual(maxclustsize, 3)
        
        
    def test_vals(self):
        self.hybhac = HybridClustering(self.vals, self.costmodel)
        
        self.dists = self.hybhac.calc_dists('Jaro-Winkler')
        self.assertAlmostEqual(min(self.dists.values()), 0.1077922077)
        
        #testcase to check with  sim_measure - Jaro Winkler, thr = 0.5, linkage = single, max_clust_size = 5
        dend_hist = self.hybhac.shotgun_create_dendrogram(sim_measure_str = 'Jaro-Winkler', thr = 0.5, max_clust_size = 5)
        
        self.assertEqual(dend_hist[0][1][0], {1: ['Tulsi'],
 2: ['Indian cuisine'],
 3: ['India house'],
 4: ['Haldis restaurant'],
 5: ['Haldis cafe'],
 6: ['Bombay bazaar'],
 7: ['Amber restaurant']})      
        self.assertEqual(dend_hist[0][1][1], [])
        
        self.assertEqual(dend_hist[0][2][0], {1: ['Tulsi'],
 2: ['Indian cuisine', 'India house'],
 4: ['Haldis restaurant', 'Haldis cafe'],
 6: ['Bombay bazaar'],
 7: ['Amber restaurant']})
        self.assertEqual(dend_hist[0][2][1], [((['Indian cuisine'], ['India house']), 0.10779220779220777),
 ((['Haldis restaurant'], ['Haldis cafe']), 0.15270350564468216)]))
        
        self.assertEqual(dend_hist[0][4][0], {1: ['Tulsi'],
 2: ['Indian cuisine', 'India house'],
 4: ['Haldis restaurant', 'Haldis cafe', 'Amber restaurant'],
 6: ['Bombay bazaar']})
        self.assertEqual(dend_hist[0][4][1], [((['Indian cuisine'], ['India house']), 0.10779220779220777),
 ((['Haldis restaurant'], ['Haldis cafe']), 0.15270350564468216),
 ((['Haldis restaurant', 'Haldis cafe'], ['Amber restaurant']),
  0.3430258467023174)])
        
        self.assertEqual(dend_hist[0][5][0], {1: ['Tulsi'],
 2: ['Indian cuisine',
  'India house',
  'Haldis restaurant',
  'Haldis cafe',
  'Amber restaurant'],
 6: ['Bombay bazaar']})
        self.assertEqual(dend_hist[0][5][1], [((['Indian cuisine'], ['India house']), 0.10779220779220777),
 ((['Haldis restaurant'], ['Haldis cafe']), 0.15270350564468216),
 ((['Haldis restaurant', 'Haldis cafe'], ['Amber restaurant']),
  0.3430258467023174),
 ((['Indian cuisine', 'India house'],
   ['Haldis restaurant', 'Haldis cafe', 'Amber restaurant']),
  0.41414141414141425)])
           
        
        #checking continue_from_dendrogram for all maxclustsizes from 1 to max of dend_hist
        
        valmap = self.hybhac.shotgun_lambdahac_continue_from_dendrogram(1)
        self.assertEqual(valmap, {'Amber restaurant': 7,
 'Bombay bazaar': 6,
 'Haldis cafe': 5,
 'Haldis restaurant': 4,
 'India house': 3,
 'Indian cuisine': 2,
 'Tulsi': 1})
        
        valmap = hybhac.shotgun_lambdahac_continue_from_dendrogram(2)
        self.assertEqual(valmap, {'Amber restaurant': 6,
 'Bombay bazaar': 6,
 'Haldis cafe': 4,
 'Haldis restaurant': 4,
 'India house': 2,
 'Indian cuisine': 2,
 'Tulsi': 1})
        
        valmap = hybhac.shotgun_lambdahac_continue_from_dendrogram(3)
        self.assertEqual(valmap, {'Amber restaurant': 4,
 'Bombay bazaar': 6,
 'Haldis cafe': 4,
 'Haldis restaurant': 4,
 'India house': 2,
 'Indian cuisine': 2,
 'Tulsi': 1})
        
        valmap = hybhac.shotgun_lambdahac_continue_from_dendrogram(4)
        self.assertEqual(valmap, {'Amber restaurant': 4,
 'Bombay bazaar': 6,
 'Haldis cafe': 4,
 'Haldis restaurant': 4,
 'India house': 2,
 'Indian cuisine': 2,
 'Tulsi': 1})
        
        
        valmap = hybhac.shotgun_lambdahac_continue_from_dendrogram(5)
        self.assertEqual(valmap, {'Amber restaurant': 2,
 'Bombay bazaar': 6,
 'Haldis cafe': 2,
 'Haldis restaurant': 2,
 'India house': 2,
 'Indian cuisine': 2,
 'Tulsi': 1})
        
        ##testcase to check the best clusters returned
        (clusters, maxclustsize) = hybhac.cluster()
        self.assertEqual(clusters, {'Amber restaurant': ['Amber restaurant',
  'Haldis cafe',
  'Haldis restaurant',
  'India house',
  'Indian cuisine'],
 'Bombay bazaar': ['Bombay bazaar'],
 'Tulsi': ['Tulsi']})
        
        self.assertEqual(maxclustsize, 5)
        
        