-
Notifications
You must be signed in to change notification settings - Fork 0
/
Part3.py
104 lines (89 loc) · 3.59 KB
/
Part3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import time
import math
#Counts how many times each unique topic occurs across all documents of a given clustering method
def extractUniqueTopics(clusteringMethod):
    """Count how often each topic occurs across the given documents.

    Args:
        clusteringMethod: Iterable of documents; each one is indexed with
            ``'topics'`` to obtain an iterable of topics.

    Returns:
        A dict mapping every topic seen to its number of occurrences,
        with keys in first-seen order.
    """
    topicCounts = {}
    for entry in clusteringMethod:
        for label in entry['topics']:
            # Missing topics start from zero before being incremented.
            topicCounts[label] = topicCounts.get(label, 0) + 1
    return topicCounts
#Gets all of the documents in each cluster
def getDocsByCluster(clustering):
    """Group documents by the clusters they belong to.

    Documents whose ``'topics'`` value equals the empty string are skipped
    and are not counted.

    Args:
        clustering: Iterable of documents; each one is indexed with
            ``'topics'`` and ``'clusters'``.

    Returns:
        A tuple ``(docsPerCluster, labelledDocs)`` where ``docsPerCluster``
        maps each cluster to the list of documents assigned to it and
        ``labelledDocs`` is the number of documents that were not skipped.
    """
    # NOTE(review): the topic check is assumed to guard the cluster loop as
    # well as the counter -- confirm against the original indentation.
    docsPerCluster = {}
    labelledDocs = 0
    for entry in clustering:
        if entry['topics'] == '':
            continue
        labelledDocs += 1
        for clusterId in entry['clusters']:
            docsPerCluster.setdefault(clusterId, []).append(entry)
    return docsPerCluster, labelledDocs
#Removes clusters that contain more documents than the given proportion of all documents
def cleanupClusters(clusterList, proportion, docCount):
    """Drop clusters holding more than ``proportion * docCount`` documents.

    The mapping is modified in place and the same object is returned.

    Args:
        clusterList: Dict mapping each cluster to its list of documents.
        proportion: Fraction of ``docCount`` above which a cluster is removed.
        docCount: Document total the proportion is applied to.

    Returns:
        ``clusterList`` itself, without the oversized clusters.
    """
    limit = proportion * docCount
    # Collect the keys first: a dict cannot be resized while it is iterated.
    oversized = [key for key, members in clusterList.items() if len(members) > limit]
    for key in oversized:
        del clusterList[key]
    return clusterList
#Gets the number of documents of each topic in a cluster
def getTopicCountByCluster(cluster):
    """Tally the topics of the documents in one cluster.

    Args:
        cluster: Iterable of documents; each one is indexed with
            ``'topics'`` to obtain an iterable of topics.

    Returns:
        A dict mapping each topic to the number of times it appears
        across the cluster's documents.
    """
    tally = {}
    everyTopic = (label for member in cluster for label in member['topics'])
    for label in everyTopic:
        tally[label] = tally.get(label, 0) + 1
    return tally
#Returns the topics with the highest and second highest number of documents
def getMaxAndSecondMaxTopics(clusterTopics):
    """Find the two topics with the largest counts.

    Args:
        clusterTopics: Dict mapping each topic to its count.

    Returns:
        A tuple ``(maxKey, maxCount, secondMaxKey, secondMaxCount)``.
        A slot that no topic fills (counts must be greater than zero to
        qualify) keeps its placeholder of ``''`` and ``0``.
    """
    leader = ('', 0)
    runnerUp = ('', 0)
    for name, count in clusterTopics.items():
        if count > leader[1]:
            # The previous leader is demoted to second place.
            leader, runnerUp = (name, count), leader
        elif count > runnerUp[1]:
            runnerUp = (name, count)
    return leader[0], leader[1], runnerUp[0], runnerUp[1]
#Calculates the similarity at each cluster by using the topics
#Needs input from getTopicCountByCluster Function
#Maximum Topic Count/ 2nd highest topic Count * [log2(Total Number of documents in cluster)+1]
def calculateSimilarityAtOneCluster(cluster, numDocsInCluster):
    """Score one cluster from its topic counts.

    The score is ``maxCount * (log2(numDocsInCluster) + 1) / secondMaxCount``,
    where the two counts come from ``getMaxAndSecondMaxTopics``.

    Args:
        cluster: Dict mapping each topic in the cluster to its count.
        numDocsInCluster: Number of documents in the cluster.

    Returns:
        The similarity score for the cluster.

    Raises:
        ValueError: If ``numDocsInCluster`` is not positive (log2 domain).
    """
    _, maxCount, _, secondMaxCount = getMaxAndSecondMaxTopics(cluster)
    # With no runner-up topic, divide by 1 instead of 0.
    if secondMaxCount == 0:
        secondMaxCount = 1
    # math.log2 is more accurate than math.log(x, 2), notably at powers of two.
    return maxCount * (math.log2(numDocsInCluster) + 1) / secondMaxCount
#Sums the similarities of all remaining clusters, then normalizes by the number of documents that have topics
def evaluate(clusteringMethod, proportion=0.5):
    """Compute the average topic-based similarity score of a clustering.

    Documents are grouped by cluster, clusters holding more than
    ``proportion`` of the counted documents are discarded, the per-cluster
    similarities are summed, and the sum is divided by the number of
    counted documents.

    Args:
        clusteringMethod: Iterable of documents, as accepted by
            ``getDocsByCluster``.
        proportion: Fraction of the counted documents above which a cluster
            is discarded. Defaults to 0.5.

    Returns:
        The summed cluster similarity divided by the document count, or
        ``0.0`` when no document was counted.
    """
    clustersWithDocuments, docCount = getDocsByCluster(clusteringMethod)
    if docCount == 0:
        # Nothing to score; avoids dividing by zero below.
        return 0.0
    keptClusters = cleanupClusters(clustersWithDocuments, proportion, docCount)
    totalSimilarity = sum(
        calculateSimilarityAtOneCluster(getTopicCountByCluster(docs), len(docs))
        for docs in keptClusters.values()
    )
    return totalSimilarity / docCount
def part3(singleClustering, completeClustering):
    """Evaluate both clusterings, print their scores and the elapsed time.

    Args:
        singleClustering: Documents clustered with single linkage, as
            accepted by ``evaluate``.
        completeClustering: Documents clustered with complete linkage, as
            accepted by ``evaluate``.

    Returns:
        A tuple ``(singleEvalScore, completeEvalScore)``.
    """
    # perf_counter is monotonic, so elapsed time is unaffected by clock changes.
    startTime = time.perf_counter()
    print("Executing code for Part 3...\n")

    print("Evaluating Single and Complete Link Clustering...")
    singleEvalScore = evaluate(singleClustering)
    completeEvalScore = evaluate(completeClustering)
    print(f'Single Linkage Score: {singleEvalScore}')
    print(f'Complete Linkage Score: {completeEvalScore}')
    evalTime = round(time.perf_counter() - startTime, 3)
    print(f"Time: {evalTime} seconds")

    print('\nPart 3 Complete')
    print(f"Execution Time: {round(time.perf_counter() - startTime, 3)} seconds\n")
    return singleEvalScore, completeEvalScore