<a href="https://colab.research.google.com/github/anandraiyer/access_forums_eval/blob/main/OmegaScore.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import pandas as pd
import numpy as np
import math
from itertools import combinations

In [9]:
def get_all_posts(s1,s2={'E':{}}):
  """Return the set of every post that appears in any cluster of the given clusterings.

  Input :
    s1 : clustering, a dictionary whose values are the clusters (iterables of posts).
    s2 : (optional) second clustering in the same format. The default holds a
         single empty cluster, so it contributes no posts.
  Output :
    A set with all the posts found in the clusters of s1 and s2.
  """
  universe = set()
  for clustering in (s1, s2):
    for members in clustering.values():
      # set(...) accepts any iterable cluster (set, list, tuple, empty dict, ...).
      universe |= set(members)
  return universe

In [10]:
def create_contingency_table_for_omega_index(s1,s2):
  """Build the contingency table used to calculate the omega index.

  Every unordered pair of posts is classified by (j, k), where j is the number
  of clusters in s1 that contain both posts and k is the number of clusters in
  s2 that contain both posts. table[j][k] counts the pairs with that (j, k).

  Input :
    s1 : clustering, a dictionary whose values are the clusters (containers of posts).
    s2 : clustering, a dictionary whose values are the clusters (containers of posts).
  Output (as a tuple) :
    table     : numpy array of shape (J+1, K+1) with the pair counts.
    sum_rows  : numpy array with the sum of each row of the table.
    sum_cols  : numpy array with the sum of each column of the table.
    agreement : list with the diagonal elements table[i][i] for i = 0..min(J, K).
    J         : number of clusters in s1.
    K         : number of clusters in s2.
  """
  posts = get_all_posts(s1,s2)
  J = len(s1)
  K = len(s2)

  # A pair can co-occur in at most J clusters of s1 and K clusters of s2,
  # hence the (J+1) x (K+1) shape (index 0 = "never together").
  table = np.zeros((J+1, K+1), dtype=int)
  for a, b in combinations(posts, 2):
    j = sum(1 for members in s1.values() if (a in members) and (b in members))
    k = sum(1 for members in s2.values() if (a in members) and (b in members))
    table[j, k] += 1

  # Row and column sums of the contingency table.
  sum_rows = np.sum(table, axis = 1)
  sum_cols = np.sum(table, axis = 0)
  # Diagonal cells: pairs on which both clusterings agree about the co-occurrence count.
  agreement = [int(table[i, i]) for i in range(min(J, K)+1)]

  return (table,sum_rows,sum_cols,agreement,J,K)

In [36]:
def get_omega_score(s1,s2,verbose=True):
  """Omega index: extension of the Adjusted Rand Index for soft (overlapping) clusters.

  Refer : https://drive.google.com/file/d/1Mm8TI8870uxhVFuzJjw41UjQMjAiOSf7/view?usp=sharing

  Input :
    s1 : clustering, a dictionary representing a set of sets.
    s2 : clustering, a dictionary representing a set of sets.
    verbose : (optional) when True (default) print the contingency table and the
              intermediate indices; pass False to compute silently.
  Output :
    omega_score : similarity between clusterings s1 and s2, rounded to two
                  decimals. Negative values are clamped to 0.0.
                  When the index is undefined (0/0) 1.0 is returned, following
                  the usual Adjusted Rand Index convention: this happens when
                  there are fewer than two posts, or when every pair falls in
                  the same diagonal cell (a trivial, perfectly matching case).
  Raises :
    ValueError : if the row and column totals of the contingency table differ.
  """
  # Get table, rowsum, colsum, agreement and J and K indices from s1 and s2 using helper function.
  table, rowsum, colsum,agreement,J,K = create_contingency_table_for_omega_index(s1,s2)
  rowN = int(np.sum(rowsum))
  colN = int(np.sum(colsum))
  if rowN != colN:
    # Both totals count the same set of pairs; a mismatch means the table is corrupt.
    raise ValueError('Inconsistent contingency table: row total %d != column total %d' % (rowN, colN))
  N = rowN  # total number of post pairs
  if verbose:
    print('Contingency Table,', 'Row Sums,', 'Column Sums,', 'Agreement,', 'N,', 'J,', 'K')
    print(table, rowsum, colsum,agreement,N,J,K)

  if N == 0:
    # Fewer than two posts: there is no pair to disagree on.
    return 1.0

  min_jk = min(J,K) #Minimum from J and K for upper bound on Summation
  diagonal = range(min_jk+1)
  # Sum as Python integers first and divide once, to limit floating point error.
  #Calculate Unadjusted Rand Index
  unadjusted_rand_index = sum(int(agreement[i]) for i in diagonal) / N
  #Calculate Expected Rand Index
  expected_rand_index = sum(int(rowsum[i]) * int(colsum[i]) for i in diagonal) / (N * N)
  #Calculate Omega Index
  denominator = 1 - expected_rand_index
  if denominator == 0:
    # All pairs sit in one diagonal cell, so observed agreement is also 1 (0/0).
    omega_score = 1.0
  else:
    omega_score = (unadjusted_rand_index - expected_rand_index)/denominator
  if verbose:
    print('Unadjusted Rand Index',unadjusted_rand_index)
    print('Expected Rand Index',expected_rand_index)
    print('Unadjusted Rand Index - Expected Rand Index = ', unadjusted_rand_index - expected_rand_index)
    print('1 - expected_rand_index = ',denominator)
    print('Omega Score : ', omega_score)
  if omega_score < 0 :
    #No similarity between s1 and s2
    omega_score = 0.0
  return round(omega_score,2)

In [37]:
# Worked example from the paper; the expected Omega score is 0.71.
gold = {'C1':{1,2,3,4},'C2':{4,5,6,7},'C3':{8,9},'C4':{10}}
pred = {'D1':{1,2,3,4},'D2':{3,4,5,6,7},'D3':{8,9,10}}
omega = get_omega_score(gold,pred)
# Pin the published value so a regression in the scoring code fails loudly.
assert omega == 0.71, omega

Contingency Table, Row Sums, Column Sums, Agreement, N, J, K
[[27  5  0  0]
 [ 0 12  1  0]
 [ 0  0  0  0]
 [ 0  0  0  0]
 [ 0  0  0  0]] [32 13  0  0  0] [27 17  1  0] [27, 12, 0, 0] 45 4 3
Unadjusted Rand Index 0.8666666666666667
Expected Rand Index 0.5358024691358025
Unadjusted Rand Index - Expected Rand Index =  0.33086419753086416
1 - expected_rand_index =  0.46419753086419746
Omega Score :  0.7127659574468085


In [38]:
# Show the rounded Omega score of the gold-vs-pred example computed in the previous cell.
print(omega)

0.71


In [39]:
# Sanity check: a clustering compared with itself must score 1.0.
omega = get_omega_score(gold,gold)
assert omega == 1.0, omega

Contingency Table, Row Sums, Column Sums, Agreement, N, J, K
[[32  0  0  0  0]
 [ 0 13  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0  0  0]] [32 13  0  0  0] [32 13  0  0  0] [32, 13, 0, 0, 0] 45 4 4
Unadjusted Rand Index 1.0
Expected Rand Index 0.5891358024691358
Unadjusted Rand Index - Expected Rand Index =  0.41086419753086423
1 - expected_rand_index =  0.41086419753086423
Omega Score :  1.0


In [40]:
# Show the rounded Omega score of the gold-vs-gold comparison computed in the previous cell.
print(omega)

1.0


In [41]:
# Sanity check: a prediction made of one empty cluster carries no information,
# so the score must be 0.0. set() is used because {} is an empty dict, not a set.
omega = get_omega_score(gold,{'E1':set()})
assert omega == 0.0, omega

Contingency Table, Row Sums, Column Sums, Agreement, N, J, K
[[32  0]
 [13  0]
 [ 0  0]
 [ 0  0]
 [ 0  0]] [32 13  0  0  0] [45  0] [32, 0] 45 4 1
Unadjusted Rand Index 0.7111111111111111
Expected Rand Index 0.7111111111111111
Unadjusted Rand Index - Expected Rand Index =  0.0
1 - expected_rand_index =  0.28888888888888886
Omega Score :  0.0


In [35]:
# Show the rounded Omega score of the gold-vs-empty-clustering comparison computed in the previous cell.
print(omega)

0.0
