<a href="https://colab.research.google.com/github/anandraiyer/access_forums_eval/blob/main/testing_framework.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import math
from itertools import combinations
from itertools import chain
from itertools import product
from itertools import starmap
from functools import partial
import networkx as nx

In [2]:
# The ground truth (annotated forum posts) is stored here.
# NOTE(review): the path looks like a Colab Google Drive mount point - presumably Drive has to be mounted before this cell runs.
data = pd.read_csv('/content/drive/MyDrive/final.csv')

In [3]:
data.columns

Index(['Thread', 'DateTime', 'Author', 'Post', 'ParentPosts', 'PostID',
       'ThreadID', 'AuthorID', 'OriginID', 'DialogAct', 'ParentID_List'],
      dtype='object')

In [4]:
# Keep only the columns that the thread/graph helpers below read from the global `df`.
df = data[['Thread','ThreadID','PostID','DateTime','Author','Post','ParentID_List']]

In [5]:
# Map (thread ID, position of the post inside its thread) -> author.
# Positions follow the row order of `df` within each thread.
# One groupby pass replaces the previous full-frame scan for every integer in
# range(max(ThreadID)+1), most of which matched no rows at all.
thread_pid_author_dic = {}
for tid, thread_rows in df.groupby('ThreadID'):
  if tid < 0:
    continue  # the range-based scan never visited negative IDs; keep that behaviour
  for position, author in enumerate(thread_rows['Author']):
    thread_pid_author_dic[(int(tid), position)] = author

In [6]:
def get_conversation_dag(tid):
  """Build the reply graph of one thread from the global `df`.

  Nodes are post IDs and each edge (parent, child) records that `child`
  replies to `parent`. Use the returned graph to compute networkx metrics
  downstream.

  ParentID_List is read as a string:
    "-1"     -> the post gets no incoming edge;
    "[]"     -> the parent falls back to the previous post of the thread
                (to the first post when the row index is 0 or 1, which makes
                the very first row point at itself);
    "[<id>]" -> a single parent ID.
  NOTE(review): a list holding several IDs would fail in int(); presumably the
  data never contains one - confirm.

  Args:
    tid: value matched against the 'ThreadID' column.

  Returns:
    nx.DiGraph for the thread.
  """
  thread_rows = df[df['ThreadID']==tid]
  postids = list(thread_rows['PostID'])
  parentids = list(thread_rows['ParentID_List'])

  G = nx.DiGraph()
  # Register post IDs (not post texts) as nodes so that the node set matches
  # the edge endpoints; otherwise every post text becomes a stray isolated node.
  G.add_nodes_from(postids)

  edges = []
  for i, (parent, child) in enumerate(zip(parentids, postids)):
    if parent == "-1":
      continue
    if parent == "[]":
      if i > 1:
        parent = "["+str(postids[i-1])+"]"
      else:
        parent = "["+str(postids[0])+"]"
    edges.append((int(parent[1:-1]), child))
  G.add_edges_from(edges)
  return G

In [7]:
def get_subthreads(tid):
  """Return every root-to-leaf path (sub-thread) of a thread as a list of lists.

  The reply graph comes from get_conversation_dag(tid); a root is a node with
  no incoming edge and a leaf is a node with no outgoing edge. Each returned
  path is the list of nodes visited from a root to a leaf.

  Args:
    tid: value matched against the 'ThreadID' column of the global `df`.

  Returns:
    A list of paths. Threads with fewer than two posts yield one single-post
    path per post (an empty list for an unknown thread), so the result is
    always a list of lists, which is what get_list_subthreads expects.
  """
  postids = list(df[df['ThreadID']==tid]['PostID'])
  if len(postids) < 2:
    # Previously the flat ID list was returned here, so callers that treat
    # every element as a path received bare IDs.
    return [[pid] for pid in postids]

  G = get_conversation_dag(tid)
  roots = [n for n,d in G.in_degree() if d==0]
  leaves = [n for n,d in G.out_degree() if d==0]

  subthreads = []
  for root, leaf in product(roots, leaves):
    subthreads.extend(nx.all_simple_paths(G, root, leaf))
  return subthreads

In [8]:
def get_list_subthreads(cluster_name,threadID):
  """Render each sub-thread of `threadID` as '<cluster_name><index>:<ids>'.

  The ids part is the string form of the path with its first and last
  character stripped and all commas removed.
  """
  return [
    cluster_name+str(index)+":"+str(path)[1:-1].replace(',','')
    for index, path in enumerate(get_subthreads(threadID))
  ]

In [9]:
def get_n(s1,s2={'E':{}}):
  """Size of the dataset D: number of distinct points over all clusters of s1 and s2.

  Both arguments map cluster name -> iterable of points. The default s2 holds
  one empty cluster, so calling with s1 alone counts the points of s1 only.
  """
  members = set().union(*s1.values())
  members = members.union(*s2.values())
  return(len(members))

In [10]:
def get_length_clustering(s):
  """For a clustering s = {name: cluster, ...} return [len(cluster), ...] in dict order."""
  return([len(cluster) for cluster in s.values()])

In [11]:
def get_points(s):
  """Return the set of all points that appear in any cluster of the clustering s.

  Note: the next cell redefines get_points with an optional second clustering,
  so after running the notebook top to bottom that later definition is the one in effect.
  """
  return(set().union(*s.values()))

In [12]:
def get_points(s1,s2={'E':{}}):
  """Return the set of all points that appear in any cluster of s1 or of s2.

  Both arguments map cluster name -> iterable of points. The default s2 holds
  one empty cluster, so calling with s1 alone returns the points of s1.
  """
  points = set().union(*s1.values())
  points = points.union(*s2.values())
  return(points)

In [13]:
def create_contingency_table(gold, auto):
  """Scaled contingency table between two (possibly overlapping) clusterings.

  Refer https://en.wikipedia.org/wiki/Rand_index#The_contingency_table
  https://people.eng.unimelb.edu.au/baileyj/papers/yanglei.pdf
  Generalization for Soft Clusters : http://derektanderson.com/pdfs/05482124.pdf

  Cell (i, j) starts as |gold_i & auto_j|. The whole table is then multiplied
  by phi = n / n_max, where n = get_n(gold) and n_max is the sum of all raw
  cells, so the scaled table sums to n. A raw table that sums to 0 is not
  guarded against (phi divides by n_max).

  Args:
    gold, auto: dicts mapping cluster name -> set of points.

  Returns:
    (table, sum_rows, sum_cols, names_gold, names_auto), where the sums are
    taken over the scaled table and the name lists follow dict order.
  """
  names_gold = list(gold.keys())
  names_auto = list(auto.keys())

  raw_counts = np.array([
    [len(gold_members.intersection(auto_members)) for auto_members in auto.values()]
    for gold_members in gold.values()
  ])
  raw_row_sums = np.sum(raw_counts, axis = 1)

  #Generalization for soft clusterings
  n = get_n(gold)
  n_max = np.sum(raw_row_sums)
  phi = n/n_max

  table = np.multiply(phi,raw_counts)
  sum_rows = np.sum(table, axis = 1)
  sum_cols = np.sum(table, axis = 0)
  return table, sum_rows, sum_cols, names_gold, names_auto

In [14]:
def get_information_entropy(s1,s2):
  """Return (H_u, H_v): base-2 entropies of the row and column marginals.

  The marginals come from create_contingency_table(s1, s2) and are divided by
  total = get_n(s1); marginals that are not > 0 contribute nothing.
  """
  _, row_mass, col_mass, _, _ = create_contingency_table(s1,s2)
  total = get_n(s1)

  def _entropy(masses):
    h = 0.0
    for mass in masses:
      if mass > 0:
        h = h - ((mass/total) * math.log(mass/total,2))
    return h

  H_u = _entropy(row_mass)
  H_v = _entropy(col_mass)
  return (H_u,H_v)

In [15]:
def get_mi(s1,s2):
  """Mutual information (base 2) computed from the scaled contingency table.

  Sums (c_ij / total) * log2(c_ij * total / (row_i * col_j)) over the cells
  with c_ij > 0, where total = get_n(s1).
  """
  table, row_mass, col_mass, _, _ = create_contingency_table(s1,s2)
  total = get_n(s1)
  I_uv = 0.0
  for i, row in enumerate(table):
    for j, cell in enumerate(row):
      if cell > 0:
        I_uv = I_uv + ((cell / total) * math.log((cell * total) / (row_mass[i] * col_mass[j]), 2.0))
  return(I_uv)

In [16]:
def get_joint_entropy(s1,s2):
  """Joint entropy (base 2) of the scaled contingency table.

  Sums -(c_ij / total) * log2(c_ij / total) over the cells with c_ij > 0,
  where total = get_n(s1).
  """
  table, _, _, _, _ = create_contingency_table(s1,s2)
  total = get_n(s1)
  H_uv = 0.0
  for row in table:
    for cell in row:
      if cell > 0:
        H_uv = H_uv - ((cell / total) * math.log(cell / total, 2.0))
  return(H_uv)

In [17]:
def get_vi(s1,s2):
  """Variation of Information, computed as joint entropy minus mutual information.

  Refer : Original Paper : https://sites.stat.washington.edu/mmp/Papers/compare-colt.pdf
  Refer : Code from Kummerfeld : https://github.com/jkkummerfeld/irc-disentanglement/blob/master/tools/evaluation/conversation-eval.py
  Refer : *IMP* Adjustment for Soft Clusterings -> https://people.eng.unimelb.edu.au/baileyj/papers/yanglei.pdf

  H(U,V) - I(U;V) is algebraically the same as H(U) + H(V) - 2*I(U;V).
  """
  joint_entropy = get_joint_entropy(s1,s2)
  mutual_info = get_mi(s1,s2)
  return(joint_entropy - mutual_info)

In [18]:
def get_one_minus_scaled_vi(s1,s2):
  """Return 1 - VI / log2(n), rounded to 3 decimals.

  n is the sum of the row marginals of the scaled contingency table. The
  division is not guarded, so log2(n) == 0 (n == 1) is not handled.
  """
  _, row_mass, _, _, _ = create_contingency_table(s1,s2)
  vi = get_vi(s1,s2)
  scale = math.log(np.sum(row_mass), 2.0)
  return round(1 - (vi / scale),3)

In [19]:
def get_normalized_vi(s1,s2):
  """Normalized VI: 1 - I(U;V) / H(U,V), rounded to 3 decimals.

  A zero joint entropy is not guarded against.
  """
  joint_entropy = get_joint_entropy(s1,s2)
  mutual_info = get_mi(s1,s2)
  return (round(1 - (mutual_info / joint_entropy),3))

In [20]:
def get_normalized_information_distance(s1,s2):
  """Normalized information distance: 1 - I(U;V) / max(H(U), H(V)), rounded to 3 decimals.

  A zero denominator (both marginal entropies 0) is not guarded against.
  """
  mutual_info = get_mi(s1,s2)
  entropy_u, entropy_v = get_information_entropy(s1,s2)
  larger_entropy = max(entropy_u,entropy_v)
  return (round(1 - (mutual_info / larger_entropy),3))

In [21]:
def get_nmi_joint(s1,s2):
  """NMI normalized by the joint entropy: I(U;V) / H(U,V), rounded to 3 decimals.

  A zero joint entropy is not guarded against.
  """
  joint_entropy = get_joint_entropy(s1,s2)
  mutual_info = get_mi(s1,s2)
  return (round(mutual_info / joint_entropy,3))

In [22]:
def get_nmi_max(s1,s2):
  """NMI normalized by the larger marginal entropy: I(U;V) / max(H(U), H(V)), rounded to 3 decimals.

  A zero denominator is not guarded against.
  """
  entropy_u, entropy_v = get_information_entropy(s1,s2)
  mutual_info = get_mi(s1,s2)
  return (round(mutual_info / max(entropy_u,entropy_v),3))

In [23]:
def get_nmi_sum(s1,s2):
  """NMI normalized by the sum of marginal entropies: I(U;V) / (H(U) + H(V)), rounded to 3 decimals.

  Note there is no factor 2 in the numerator, so this ratio is I / (H_u + H_v)
  exactly as written. A zero denominator is not guarded against.
  """
  entropy_u, entropy_v = get_information_entropy(s1,s2)
  mutual_info = get_mi(s1,s2)
  return (round(mutual_info / (entropy_u + entropy_v),3))

In [24]:
def get_nmi_sqrt(s1,s2):
  """NMI normalized by the geometric mean of the marginal entropies, rounded to 3 decimals.

  Computes I(U;V) / sqrt(H(U) * H(V)). A zero denominator (either entropy 0)
  is not guarded against.
  """
  entropy_u, entropy_v = get_information_entropy(s1,s2)
  mutual_info = get_mi(s1,s2)
  geometric_mean = math.sqrt(entropy_u * entropy_v)
  return (round(mutual_info / geometric_mean,3))

In [25]:
def get_nmi_min(s1,s2):
  """NMI normalized by the smaller marginal entropy: I(U;V) / min(H(U), H(V)), rounded to 3 decimals.

  A zero denominator (either entropy 0) is not guarded against.
  """
  entropy_u, entropy_v = get_information_entropy(s1,s2)
  mutual_info = get_mi(s1,s2)
  return (round(mutual_info / min(entropy_u,entropy_v),3))

In [26]:
def overlap(s1,s2):
  """Percentage (0-100) of the distinct elements of s1 that also occur in s2.

  The measure is asymmetric: it is normalized by the size of s1 only.

  Args:
    s1, s2: iterables of hashable points.

  Returns:
    100 * |s1 & s2| / |s1| as a float, or 0.0 when s1 has no elements
    (previously this case raised ZeroDivisionError).
  """
  a = set(s1)
  if not a:
    return 0.0
  b = set(s2)
  return (len(a.intersection(b)) / len(a)) * 100

In [27]:
def get_one_to_one(s1,s2):
  """One-to-one overlap between gold clusters s1 and predicted clusters s2.

  https://aclanthology.org/P08-1095.pdf

  Builds a bipartite graph with one 'Gold<name>' node per cluster of s1 and
  one 'Pred<name>' node per cluster of s2, weights every gold/pred edge with
  overlap(gold, pred), takes a maximum-weight matching and returns the sum of
  the matched overlaps divided by the number of gold clusters.

  Cluster names must be strings (they are concatenated with the prefixes).
  """
  gold_clusters = {'Gold'+name: members for name, members in s1.items()}
  pred_clusters = {'Pred'+name: members for name, members in s2.items()}

  B = nx.Graph()
  B.add_nodes_from(list(gold_clusters), bipartite=0)
  B.add_nodes_from(list(pred_clusters), bipartite=1)
  for gold_name, gold_members in gold_clusters.items():
    for pred_name, pred_members in pred_clusters.items():
      B.add_edge(gold_name, pred_name, weight = overlap(gold_members,pred_members))

  oto = 0.0
  for a,b in nx.algorithms.matching.max_weight_matching(B):
    # Ignore any pair whose two ends carry the same prefix (Gold-Gold or Pred-Pred).
    if a[0:4] == b[0:4] and a[0:4] in ('Gold','Pred'):
      continue
    # The matching returns unordered pairs; put the gold node first.
    if a[0:4] != 'Gold':
      a, b = b, a
    oto = oto + overlap(gold_clusters.get(a),pred_clusters.get(b))
  return oto/len(gold_clusters)

In [28]:
# Toy check of get_one_to_one: the best matching pairs C1-D1, C2-D2 and one of C3/C4 with D3
# (100% overlap each), so the score is 300 / 4 gold clusters = 75.0.
s1 = {'C1':{1,2,3,4},'C2':{4,5,6,7},'C3':{8,9},'C4':{10}}
s2 = {'D1':{1,2,3,4},'D2':{3,4,5,6,7},'D3':{8,9,10}}
get_one_to_one(s1,s2)

75.0

In [29]:
def create_contingency_table_for_omega_index(s1,s2):
  """Pair-level contingency table used by the Omega index.

  For every unordered pair of points (taken over the union of s1 and s2),
  count in how many clusters of s1 the two points appear together (j) and in
  how many clusters of s2 (k). table[j][k] is the number of pairs with those
  two counts.

  Args:
    s1, s2: dicts mapping cluster name -> set of points.

  Returns:
    (table, sum_rows, sum_cols, agreement, J, K) where table is a
    (J+1) x (K+1) integer array, J = len(s1), K = len(s2), and agreement is
    the list of diagonal cells table[d][d] for d = 0..min(J, K).

  Raises:
    ValueError: if the number of generated pairs is not n*(n-1)/2. The old
      code called an undefined PRINT here, which surfaced as a NameError.
  """
  n = get_n(s1,s2)
  all_points = get_points(s1,s2)
  all_pairs = list(combinations(all_points,2))
  if len(all_pairs) != (n*(n-1.0)/2.0):
    raise ValueError("ERROR: number of point pairs does not equal n*(n-1)/2")

  J = len(s1.items())
  K = len(s2.items())

  # Tally pairs straight into the table. The earlier version rebuilt a set of
  # pairs with union() for every single pair, which was quadratic in the
  # number of pairs.
  table = np.zeros((J+1, K+1), dtype=int)
  for (a,b) in all_pairs:
    j = sum(1 for members in s1.values() if (a in members) and (b in members))
    k = sum(1 for members in s2.values() if (a in members) and (b in members))
    table[j][k] += 1

  agreement = [int(table[d][d]) for d in range(min(J,K)+1)]
  sum_rows = np.sum(table, axis = 1)
  sum_cols = np.sum(table, axis = 0)

  return (table,sum_rows,sum_cols,agreement,J,K)

In [30]:
def get_omega_score(s1,s2):
  """Omega index between two (possibly overlapping) clusterings.

  Extension of Adjusted Rand Index
  Valid for overlapping clusters
  Refer : http://dx.doi.org/10.1207/s15327906mbr2302_6

  omega = (observed - expected) / (1 - expected), where `observed` is the
  fraction of point pairs that are placed together in the same number of
  clusters by s1 and by s2, and `expected` is the chance level of that
  agreement computed from the table marginals.

  Returns:
    The score rounded to 2 decimals; negative scores (no agreement) are
    reported as 0. Returns 1.0 in the two degenerate cases where the formula
    is 0/0: there are no point pairs at all, or every pair falls in one and
    the same diagonal cell (so expected == 1 and agreement is complete).
    Those cases used to produce nan.

  Raises:
    ValueError: if the row and column totals of the table differ. The old
      code printed a message and then failed on the undefined name N.
  """
  table, rowsum, colsum,agreement,J,K = create_contingency_table_for_omega_index(s1,s2)
  rowN = np.sum(rowsum)
  colN = np.sum(colsum)
  if rowN != colN:
    raise ValueError('Error: row total %s != column total %s for %s vs %s' % (rowN, colN, s1, s2))
  N = rowN
  if N == 0:
    # Fewer than two points: no pair can disagree.
    return 1.0

  min_jk = min(J,K)
  unadjusted_rand_index = 0.0
  for i in range(min_jk+1):
    unadjusted_rand_index = unadjusted_rand_index + (agreement[i] / N)

  expected_rand_index = 0.0
  for i in range(min_jk+1):
    expected_rand_index = expected_rand_index + ((rowsum[i] * colsum[i]) / (N * N))

  if expected_rand_index == 1:
    # Both marginals sit entirely on the same index, hence every pair agrees.
    return 1.0

  omega_score = (unadjusted_rand_index - expected_rand_index)/(1 - expected_rand_index)
  if omega_score < 0 :
    omega_score = 0
    #No agreement between s1 and s2
  return round(omega_score,2)

In [31]:
def create_shen_precision_table(s1, s2):
  """Precision table: cell (i, j) = |s1_i & s2_j| / column_sum_j.

  Definition : https://www.microsoft.com/en-us/research/wp-content/uploads/2006/01/p35-shen.pdf (pg 40)

  The column sums come from create_contingency_table(s1, s2), i.e. from the
  scaled table, while the numerator is the raw intersection size. A column
  sum equal to 0 gives a 0 cell.
  """
  _, _, col_sums, _, _ = create_contingency_table(s1,s2)

  rows = []
  for gold_members in s1.values():
    row = []
    for b, pred_members in enumerate(s2.values()):
      n_j = col_sums[b]
      if n_j == 0:
        row.append(0)
      else:
        row.append(len(gold_members.intersection(pred_members))/n_j)
    rows.append(row)
  return np.array(rows)

In [32]:
def create_shen_recall_table(s1, s2):
  """Recall table: cell (i, j) = |s1_i & s2_j| / row_sum_i.

  Definition : https://www.microsoft.com/en-us/research/wp-content/uploads/2006/01/p35-shen.pdf (pg 40)

  The row sums come from create_contingency_table(s1, s2), i.e. from the
  scaled table, while the numerator is the raw intersection size. A row sum
  equal to 0 gives a row of 0 cells.
  """
  _, row_sums, _, _, _ = create_contingency_table(s1,s2)

  rows = []
  for a, gold_members in enumerate(s1.values()):
    n_i = row_sums[a]
    row = []
    for pred_members in s2.values():
      if n_i == 0:
        row.append(0)
      else:
        row.append(len(gold_members.intersection(pred_members))/n_i)
    rows.append(row)
  return np.array(rows)

In [33]:
def create_shen_F_table(s1, s2):
  """For every cluster of s1, find the cluster of s2 with the highest F score.

  Definition : https://www.microsoft.com/en-us/research/wp-content/uploads/2006/01/p35-shen.pdf (pg 40)

  F(i, j) is the harmonic mean 2*P*R / (P + R) of the precision and recall
  cells, or 0 when P + R == 0.

  Returns:
    A list with one (argmax_j, max_j F(i, j)) tuple per cluster of s1, in
    dict order.
  """
  prec_table = create_shen_precision_table(s1,s2)
  recall_table = create_shen_recall_table(s1,s2)

  best_per_row = []
  for a in range(len(s1)):
    f_row = []
    for b in range(len(s2)):
      p = prec_table[a][b]
      r = recall_table[a][b]
      if (p + r) != 0:
        f_row.append((2 * p * r)/(p + r))
      else:
        f_row.append(0)
    #Select Max_{j} F(i,j) for every row i
    best_per_row.append((np.argmax(f_row),max(f_row)))
  return best_per_row

In [34]:
def get_shen_f1(s1,s2):
  """Shen F1 between gold clusters s1 and predicted clusters s2.

  Refer : https://github.com/jkkummerfeld/irc-disentanglement/blob/master/tools/evaluation/conversation-eval.py
  Definition : https://www.microsoft.com/en-us/research/wp-content/uploads/2006/01/p35-shen.pdf (pg 40)

  Each gold cluster i contributes row_sum_i * max_j F(i, j); the total is
  divided by n (the sum of the column sums), capped at 1.0 and rounded to 3
  decimals.

  A predicted cluster is credited only once: when several gold clusters have
  the same best match, only the first of them (in dict order) contributes.
  NOTE(review): the cited definition sums over every gold cluster without this
  de-duplication - confirm it is intended.

  Fix: each kept F value is now weighted by the row sum of its own gold
  cluster. Before, the kept values were re-indexed from 0 after
  de-duplication and could be multiplied by another cluster's row sum.
  """
  contingency,rows_sums,col_sums,gold_name,auto_name = create_contingency_table(s1,s2)
  best_matches = create_shen_F_table(s1,s2)
  n = np.sum(col_sums)

  sum_f = 0.0
  claimed = set()
  for row, (best_j, best_f) in enumerate(best_matches):
    best_j = int(best_j)
    if best_j in claimed:
      continue
    claimed.add(best_j)
    sum_f = sum_f + (rows_sums[row] * best_f)

  sum_f = sum_f / n
  if sum_f > 1:
    sum_f = 1.0
  return(round(sum_f,3))

In [35]:
# Raw prediction files, one per model run. They are read without a header row; the parsing cells below use column 0 of each frame.
model1_test = pd.read_csv('/content/drive/MyDrive/inference.forum.test.out', header=None) #2019 ACL Model by Kummerfeld et al. trained on IRC
model2_test = pd.read_csv('/content/drive/MyDrive/inference.forum.test.1.out', header=None) #2019 ACL Model by Kummerfeld et al. trained on Forum
model3_test = pd.read_csv('/content/drive/MyDrive/inference-forum.ptr.out', header=None) #2020 EMNLP Model by Yu et al. trained on IRC
model4_test = pd.read_csv('/content/drive/MyDrive/train.forum.ptr.test.3.out', header=None) #2020 EMNLP Model by Yu et al. trained on Forum
model5_test = pd.read_csv('/content/drive/MyDrive/inference-date.out', header=None) #2020 EMNLP Model by Yu et al. trained on Forum date features changed
model6_test = pd.read_csv('/content/drive/MyDrive/inference-segmentation.forum.test.out', header=None) #2020 EMNLP Model by Yu et al. trained on Forum Segmentation and Removed punctuation
model7_test = pd.read_csv('/content/drive/MyDrive/infer-forum-contraction.out', header=None) #2020 EMNLP Model by Yu et al. trained on Forum. NOTE(review): this comment used to repeat model6's description; the file name suggests a contraction-handling variant - confirm
model8_test = pd.read_csv('/content/drive/MyDrive/inference-measure.out', header=None) #2020 EMNLP Model by Yu et al. New Forum Features


In [36]:
# Parse model1's raw output into one (ThreadID, P, C) row per predicted link.
# For each line: the thread ID is the 4th dot-separated token before the first ':';
# after the ':', the first space-separated token is the child and the second the parent.
threadID_list = []
parent_list = []
child_list = []
for line in model1_test[0]:
  if line[0]=='#':
    continue  # lines starting with '#' carry no link
  fields = line.split(':')
  thread_token = fields[0].split('.')[3]
  link = fields[1].split(' ')
  parent_token = link[1]
  child_token = link[0]
  threadID_list.append(int(thread_token))
  parent_list.append(int(parent_token))
  child_list.append(int(child_token))
preds = pd.DataFrame()
for column, values in (('ThreadID', threadID_list), ('P', parent_list), ('C', child_list)):
  preds[column] = values
# Drop self-links, then order the links and renumber the rows.
preds = (
  preds[preds['P']!=preds['C']]
  .sort_values(['ThreadID','P','C'])
  .reset_index(drop=True)
)

In [37]:
preds #predictions by model1 (Kummerfeld trained on IRC, inference prediction on our Forum test dataset) -> post id is relative for the thread

Unnamed: 0,ThreadID,P,C
0,66,4,5
1,66,6,12
2,69,8,10
3,69,8,12
4,69,8,15
...,...,...,...
10679,22614,10,16
10680,22614,11,20
10681,22614,13,14
10682,22614,17,21


In [38]:
# Parse model2's raw output into one (ThreadID, P, C) row per predicted link.
# For each line: the thread ID is the 4th dot-separated token before the first ':';
# after the ':', the first space-separated token is the child and the second the parent.
threadID2_list = []
parent2_list = []
child2_list = []
for line in model2_test[0]:
  if line[0]=='#':
    continue  # lines starting with '#' carry no link
  fields = line.split(':')
  thread_token = fields[0].split('.')[3]
  link = fields[1].split(' ')
  parent_token = link[1]
  child_token = link[0]
  threadID2_list.append(int(thread_token))
  parent2_list.append(int(parent_token))
  child2_list.append(int(child_token))
preds2 = pd.DataFrame()
for column, values in (('ThreadID', threadID2_list), ('P', parent2_list), ('C', child2_list)):
  preds2[column] = values
# Drop self-links, then order the links and renumber the rows.
preds2 = (
  preds2[preds2['P']!=preds2['C']]
  .sort_values(['ThreadID','P','C'])
  .reset_index(drop=True)
)

In [39]:
preds2 #predictions by model1 (Kummerfeld trained on Forum dataset, inference prediction on our Forum test dataset) -> post id is relative for the thread

Unnamed: 0,ThreadID,P,C
0,66,0,1
1,66,1,2
2,66,2,3
3,66,2,7
4,66,3,4
...,...,...,...
19061,22614,18,19
19062,22614,18,20
19063,22614,18,22
19064,22614,20,21


In [40]:
# Parse model3's raw output into one (ThreadID, P, C) row per predicted link.
# For each line: the thread ID is the 3rd dot-separated token before the first ':';
# after the ':', the first space-separated token is the child and the second the parent.
threadID3_list = []
parent3_list = []
child3_list = []
for line in model3_test[0]:
  if line[0]=='#':
    continue  # lines starting with '#' carry no link
  fields = line.split(':')
  thread_token = fields[0].split('.')[2]
  link = fields[1].split(' ')
  parent_token = link[1]
  child_token = link[0]
  threadID3_list.append(int(thread_token))
  parent3_list.append(int(parent_token))
  child3_list.append(int(child_token))
preds3 = pd.DataFrame()
for column, values in (('ThreadID', threadID3_list), ('P', parent3_list), ('C', child3_list)):
  preds3[column] = values
# Drop self-links, then order the links and renumber the rows.
preds3 = (
  preds3[preds3['P']!=preds3['C']]
  .sort_values(['ThreadID','P','C'])
  .reset_index(drop=True)
)

In [41]:
preds3 #predictions on Pointer Network model by Yu et al. 

Unnamed: 0,ThreadID,P,C
0,66,4,5
1,66,4,6
2,66,6,12
3,66,9,10
4,69,4,5
...,...,...,...
10706,22614,11,20
10707,22614,12,16
10708,22614,13,14
10709,22614,13,15


In [42]:
# Parse model4's raw output into one (ThreadID, P, C) row per predicted link.
# For each line: the thread ID is the 3rd dot-separated token before the first ':';
# after the ':', the first space-separated token is the child and the second the parent.
threadID4_list = []
parent4_list = []
child4_list = []
for line in model4_test[0]:
  if line[0]=='#':
    continue  # lines starting with '#' carry no link
  fields = line.split(':')
  thread_token = fields[0].split('.')[2]
  link = fields[1].split(' ')
  parent_token = link[1]
  child_token = link[0]
  threadID4_list.append(int(thread_token))
  parent4_list.append(int(parent_token))
  child4_list.append(int(child_token))
preds4 = pd.DataFrame()
for column, values in (('ThreadID', threadID4_list), ('P', parent4_list), ('C', child4_list)):
  preds4[column] = values
# Drop self-links, then order the links and renumber the rows.
preds4 = (
  preds4[preds4['P']!=preds4['C']]
  .sort_values(['ThreadID','P','C'])
  .reset_index(drop=True)
)

In [43]:
preds4

Unnamed: 0,ThreadID,P,C
0,66,1,2
1,66,2,3
2,66,4,5
3,66,5,6
4,66,6,7
...,...,...,...
16907,22614,18,19
16908,22614,19,20
16909,22614,20,21
16910,22614,21,22


In [44]:
# Parse model5's raw output into one (ThreadID, P, C) row per predicted link.
# For each line: the thread ID is the 3rd dot-separated token before the first ':';
# after the ':', the first space-separated token is the child and the second the parent.
threadID5_list = []
parent5_list = []
child5_list = []
for line in model5_test[0]:
  if line[0]=='#':
    continue  # lines starting with '#' carry no link
  fields = line.split(':')
  thread_token = fields[0].split('.')[2]
  link = fields[1].split(' ')
  parent_token = link[1]
  child_token = link[0]
  threadID5_list.append(int(thread_token))
  parent5_list.append(int(parent_token))
  child5_list.append(int(child_token))
preds5 = pd.DataFrame()
for column, values in (('ThreadID', threadID5_list), ('P', parent5_list), ('C', child5_list)):
  preds5[column] = values
# Drop self-links, then order the links and renumber the rows.
preds5 = (
  preds5[preds5['P']!=preds5['C']]
  .sort_values(['ThreadID','P','C'])
  .reset_index(drop=True)
)

In [45]:
preds5

Unnamed: 0,ThreadID,P,C
0,66,0,1
1,66,0,6
2,66,1,2
3,66,1,3
4,66,1,5
...,...,...,...
18975,22614,17,19
18976,22614,18,20
18977,22614,18,22
18978,22614,19,21


In [46]:
# Parse model6's raw output into one (ThreadID, P, C) row per predicted link.
# For each line: the thread ID is the 3rd dot-separated token before the first ':';
# after the ':', the first space-separated token is the child and the second the parent.
threadID6_list = []
parent6_list = []
child6_list = []
for line in model6_test[0]:
  if line[0]=='#':
    continue  # lines starting with '#' carry no link
  fields = line.split(':')
  thread_token = fields[0].split('.')[2]
  link = fields[1].split(' ')
  parent_token = link[1]
  child_token = link[0]
  threadID6_list.append(int(thread_token))
  parent6_list.append(int(parent_token))
  child6_list.append(int(child_token))
preds6 = pd.DataFrame()
for column, values in (('ThreadID', threadID6_list), ('P', parent6_list), ('C', child6_list)):
  preds6[column] = values
# Drop self-links, then order the links and renumber the rows.
preds6 = (
  preds6[preds6['P']!=preds6['C']]
  .sort_values(['ThreadID','P','C'])
  .reset_index(drop=True)
)

In [47]:
preds6

Unnamed: 0,ThreadID,P,C
0,66,0,1
1,66,1,2
2,66,2,3
3,66,3,4
4,66,5,6
...,...,...,...
18334,22614,14,19
18335,22614,14,20
18336,22614,14,23
18337,22614,20,21


In [48]:
# Parse model7's raw output into one (ThreadID, P, C) row per predicted link.
# For each line: the thread ID is the 3rd dot-separated token before the first ':';
# after the ':', the first space-separated token is the child and the second the parent.
threadID7_list = []
parent7_list = []
child7_list = []
for line in model7_test[0]:
  if line[0]=='#':
    continue  # lines starting with '#' carry no link
  fields = line.split(':')
  thread_token = fields[0].split('.')[2]
  link = fields[1].split(' ')
  parent_token = link[1]
  child_token = link[0]
  threadID7_list.append(int(thread_token))
  parent7_list.append(int(parent_token))
  child7_list.append(int(child_token))
preds7 = pd.DataFrame()
for column, values in (('ThreadID', threadID7_list), ('P', parent7_list), ('C', child7_list)):
  preds7[column] = values
# Drop self-links, then order the links and renumber the rows.
preds7 = (
  preds7[preds7['P']!=preds7['C']]
  .sort_values(['ThreadID','P','C'])
  .reset_index(drop=True)
)

In [49]:
preds7

Unnamed: 0,ThreadID,P,C
0,66,0,1
1,66,0,3
2,66,0,4
3,66,0,5
4,66,0,6
...,...,...,...
18975,22614,15,19
18976,22614,15,20
18977,22614,15,22
18978,22614,15,23


In [50]:
# Parse model8's raw output into one (ThreadID, P, C) row per predicted link.
# For each line: the thread ID is the 3rd dot-separated token before the first ':';
# after the ':', the first space-separated token is the child and the second the parent.
threadID8_list = []
parent8_list = []
child8_list = []
for line in model8_test[0]:
  if line[0]=='#':
    continue  # lines starting with '#' carry no link
  fields = line.split(':')
  thread_token = fields[0].split('.')[2]
  link = fields[1].split(' ')
  parent_token = link[1]
  child_token = link[0]
  threadID8_list.append(int(thread_token))
  parent8_list.append(int(parent_token))
  child8_list.append(int(child_token))
preds8 = pd.DataFrame()
for column, values in (('ThreadID', threadID8_list), ('P', parent8_list), ('C', child8_list)):
  preds8[column] = values
# Drop self-links, then order the links and renumber the rows.
preds8 = (
  preds8[preds8['P']!=preds8['C']]
  .sort_values(['ThreadID','P','C'])
  .reset_index(drop=True)
)

In [51]:
preds8

Unnamed: 0,ThreadID,P,C
0,66,0,1
1,66,0,3
2,66,0,4
3,66,0,13
4,66,1,2
...,...,...,...
18975,22614,13,20
18976,22614,13,21
18977,22614,13,22
18978,22614,13,23


In [52]:
#1000 uniformly random chosen threads with atleast 10 messages / posts.
test_threads = set([66,69,73,79,83,84,93,100,109,113,115,129,134,135,140,147,148,150,154,158,164,166,173,179,180,198,206,223,224,226,228,255,259,260,274,279,284,301,304,306,311,328,336,348,366,367,371,383,384,405,406,407,410,417,420,427,429,436,442,453,459,460,466,476,500,506,507,512,515,516,521,531,532,542,554,556,559,564,569,578,586,589,594,596,600,603,611,621,623,625,629,632,642,644,650,651,652,654,661,680,687,688,699,701,713,715,717,723,726,742,743,750,769,772,785,788,791,794,809,816,818,821,828,829,833,842,845,846,854,856,865,873,881,885,891,896,901,903,910,912,913,915,919,925,935,939,953,960,973,983,985,988,992,1021,1025,1031,1037,1056,1073,1106,1107,1117,1119,1124,1192,1197,1216,1246,1249,1275,1287,1305,1327,1346,1349,1357,1406,1408,1410,1412,1415,1434,1452,1486,1508,1514,1518,1530,1566,1584,1595,1597,1598,1606,1618,1631,1632,1653,1670,1682,1683,1686,1700,1736,1759,1762,1776,1777,1778,1779,1795,1802,1803,1810,1815,1826,1829,1834,1839,1843,1846,1858,1863,1867,1876,1889,1893,1894,1895,1904,1908,1912,1913,1915,1923,1931,1938,1944,1948,1949,1950,1954,1955,1976,1984,1987,1993,1999,2001,2014,2017,2018,2023,2025,2036,2046,2047,2052,2063,2065,2070,2073,2076,2094,2099,2119,2140,2148,2180,2183,2184,2202,2203,2208,2209,2215,2224,2228,2229,2245,2274,2294,2307,2318,2319,2335,2344,2347,2354,2358,2359,2372,2378,2383,2396,2398,2400,2402,2409,2410,2415,2416,2422,2430,2436,2446,2447,2472,2494,2499,2500,2503,2517,2521,2535,2558,2561,2577,2578,2597,2604,2612,2614,2616,2617,2618,2625,2637,2653,2666,2689,2741,2761,2762,2767,2773,2775,2785,2789,2799,2800,2824,2836,2837,2840,2851,2852,2853,2854,2863,2864,2870,2875,2883,2885,2893,2894,2896,2902,2912,2925,2927,2928,2932,2935,2942,2945,2946,2954,2966,2979,2982,2992,2996,3003,3004,3007,3015,3016,3040,3044,3047,3048,3050,3057,3058,3059,3060,3072,3090,3095,3107,3118,3125,3127,3128,3133,3137,3151,3158,3160,3166,3172,3205,3207,3215,3220,3221,3230,3235,3237,3275,3282,3286,3287,3289,3294,3330,3332,3333,3336,3344,3347,3360,3361,3378,3402,3415,
3418,3420,3451,3453,3475,3476,3479,3489,3491,3517,3522,3525,3530,3542,3543,3549,3565,3570,3576,3593,3594,3612,3641,3644,3646,3655,3659,3668,3669,3679,3721,3722,3745,3752,3757,3758,3769,3772,3775,3796,3797,3799,3802,3806,3812,3819,3834,3838,3843,3847,3859,3866,3885,3889,3906,3908,3912,3915,3918,3919,3921,3923,3926,3927,3943,3946,3953,3963,3967,3970,3971,3973,12104,12107,12111,12113,12115,21159,12119,12120,12122,12124,12125,12127,21164,21166,21167,12133,12135,21170,21172,12137,12139,21176,21179,12159,21186,21189,12166,21194,21195,21197,12171,12174,21204,21207,21209,12181,12182,21212,21214,12191,21216,21217,21220,12196,21225,21228,21233,12203,12204,12206,21237,21241,12212,12213,12214,12216,12223,21262,21266,21269,21270,21271,12235,21274,21278,21288,12249,21295,21297,21299,12252,12258,12261,21321,21323,21330,21331,21332,21333,21339,21347,21350,21358,21364,21373,12301,21381,21383,12307,12317,21398,12318,12319,12320,12322,21404,21405,12327,21412,21414,12332,12335,21421,21426,21428,21440,21442,12346,21450,21457,12353,21465,21468,21469,21470,21474,12359,21475,21476,21481,21486,21493,21494,21497,21500,21503,21506,21507,12366,21511,21515,21517,21521,21527,21528,21532,21533,21535,21538,21541,21544,12390,21549,21550,21552,21556,21558,21564,12416,21567,21568,21569,12423,21583,21584,21592,21599,21602,12432,12436,21604,21605,21620,21623,21628,21631,21634,21638,21640,12452,21641,21642,21645,21647,12458,12460,21649,21650,21651,21652,21659,21664,12468,21668,21669,21673,21678,12475,21685,21689,21699,21704,21707,21709,12489,21719,21720,12494,12498,21731,12507,21738,21741,12512,12514,21745,21750,21755,21756,21758,21762,21766,21768,12518,21775,21778,21779,21780,21781,12523,21787,21788,21792,12528,21793,21797,12530,21798,12537,21803,12540,12541,21805,21808,21811,21813,12548,12549,21818,21821,21824,21828,21829,21832,21833,12562,21838,21839,21841,21848,21849,21853,21854,21855,21866,21868,12570,12571,12575,21878,12583,21884,21886,21889,21890,21891,12589,21893,21897,21898,12595,21905,21906,21
907,21912,21914,21915,21925,21928,21930,12597,21935,21941,21942,21945,21949,21950,21954,21955,21956,21958,21959,21960,12611,12614,12615,21972,21975,21978,12617,21981,21982,21985,21993,12627,12632,21996,22000,12641,12642,22004,22006,22008,22009,22018,22019,22022,12654,22025,22028,22032,12656,22033,22036,22041,22042,22043,12664,22046,22047,22048,22052,22056,12671,12676,22064,22067,22068,12680,12682,12684,12687,22070,22071,22078,22081,12693,12694,22086,12701,22095,22102,22106,12713,22108,22110,22111,22112,22114,22117,22120,22121,22124,22131,22134,12723,22135,12726,22146,22157,12728,22163,22169,22172,22179,22181,22183,22194,22200,22203,12749,22216,22218,22219,22223,22227,12757,22230,22236,22237,22238,22241,22242,22243,22246,22249,22250,22255,22260,12772,22264,22267,12775,22272,22276,22277,22282,22283,22284,22286,12780,12781,22293,22305,22310,12785,22325,12791,12792,12794,22342,12796,22358,22359,22361,22362,22363,22370,22373,12815,22380,22382,12817,22386,22387,22388,12823,22391,22392,22393,22395,22396,22397,12824,12828,22403,12839,22406,22412,22413,12843,12845,22421,22423,22428,22429,12854,22443,12858,22445,12868,12869,12870,22466,22472,22484,22485,12889,22497,22498,22499,22500,22522,22524,22526,12912,22535,22540,22550,22556,22558,12928,12931,12933,22567,22568,22569,12938,22580,12941,22583,12952,12958,12960,22603,12961,12965,12967,12968,22614])

In [53]:
print(len(test_threads)) #Check 1000 threads
# Collect the gold reply edges of every test thread, expressed with post numbers
# that are relative to the thread (rank of the PostID inside its thread).
thread_list = []
parent_list = []
child_list = []
post_sn = {}
for tid in sorted(test_threads):
  if tid == '':
    continue

  thread_rows = df[df['ThreadID']==int(tid)]
  #Dictionary to get relative post ID given thread ID and post ID
  for rank, post_id in enumerate(sorted(thread_rows['PostID'])):
    post_sn[(int(tid),int(post_id))] = int(rank)

  dag = get_conversation_dag(int(tid))
  for src, dst in dag.edges():
    thread_list.append(int(tid))
    # .get() yields None when an edge endpoint is not a PostID of this thread.
    rel_parent = post_sn.get((tid,src))
    rel_child = post_sn.get((tid,dst))
    parent_list.append(rel_parent)
    child_list.append(rel_child)

1000


In [54]:
# Gold links as (ThreadID, P, C) rows, in the same layout as the model prediction frames.
gold = pd.DataFrame()
for column, values in (('ThreadID', thread_list), ('P', parent_list), ('C', child_list)):
  gold[column] = values
# Drop self-links, then order the links and renumber the rows.
gold = (
  gold[gold['P']!=gold['C']]
  .sort_values(['ThreadID','P','C'])
  .reset_index(drop=True)
)

In [55]:
gold #ground truth of forum test dataset

Unnamed: 0,ThreadID,P,C
0,66,0,1
1,66,0,3
2,66,1,2
3,66,1,5
4,66,3,4
...,...,...,...
19061,22614,17,18
19062,22614,17,23
19063,22614,19,20
19064,22614,19,22


In [56]:
def get_set_of_sets(s1,name):
  """Turn a frame of links into one clustering (dict of sub-thread sets) per test thread.

  For every thread in the global `test_threads` (ascending order) the rows of
  s1 with that ThreadID are read as directed edges P -> C. Every simple path
  from a node with no incoming edge to a node with no outgoing edge becomes
  one cluster, stored as a set of nodes under the key name + running index.

  Args:
    s1: DataFrame with 'ThreadID', 'P' and 'C' columns.
    name: prefix for the cluster keys.

  Returns:
    A list with one dict per test thread; a thread without links gives {}.
  """
  clusterings = []
  for tid in sorted(list(test_threads)):
    links = s1[s1['ThreadID']==int(tid)]
    parents = list(links['P'])
    children = list(links['C'])

    G = nx.DiGraph()
    G.add_nodes_from(list(set(parents).union(set(children))))
    G.add_edges_from([(int(p),int(c)) for p, c in zip(parents, children)])

    roots = [n for n,d in G.in_degree() if d==0]
    leaves = [n for n,d in G.out_degree() if d==0]

    clusters = {}
    for root, leaf in product(roots, leaves):
      for path in nx.all_simple_paths(G, root, leaf):
        clusters[name+str(len(clusters))] = set(path)
    clusterings.append(clusters)
  return clusterings

In [57]:
# One clustering list per source; the second argument is the prefix used for the cluster keys.
gt = get_set_of_sets(gold,'G') #Ground Truth
pr = get_set_of_sets(preds,'AC') #Model 1 ACL 2019
pr2 = get_set_of_sets(preds2,'AF') #Model 2 ACL 2019
pr3 = get_set_of_sets(preds3,'EC') #Model 3 EMNLP 2020
pr4 = get_set_of_sets(preds4,'EF') #Model 4 EMNLP 2020
pr5 = get_set_of_sets(preds5,'DF') #Model 5 EMNLP 2020 Date Feature
pr6 = get_set_of_sets(preds6,'WSF') #Model 6 EMNLP 2020 Word Segmentation No Special Chars
pr7 = get_set_of_sets(preds7,'CWF') #Model 7 EMNLP 2020 (parsed from infer-forum-contraction.out)
pr8 = get_set_of_sets(preds8,'NF') #Model 8 EMNLP 2020 New Forum Features

In [58]:
# One row per test thread: its ID, the gold clustering and each model's clustering.
# The clustering lists were built over sorted(test_threads), so they line up with TID.
all_models_pr_vs_gt = pd.DataFrame()
for column, values in (
  ('TID', sorted(list(test_threads))),
  ('GT', gt),
  ('PR', pr),
  ('PR2', pr2),
  ('PR3', pr3),
  ('PR4', pr4),
  ('PR5', pr5),
  ('PR6', pr6),
  ('PR7', pr7),
  ('PR8', pr8),
):
  all_models_pr_vs_gt[column] = values

In [59]:
preds8

Unnamed: 0,ThreadID,P,C
0,66,0,1
1,66,0,3
2,66,0,4
3,66,0,13
4,66,1,2
...,...,...,...
18975,22614,13,20
18976,22614,13,21
18977,22614,13,22
18978,22614,13,23


In [60]:
all_models_pr_vs_gt[all_models_pr_vs_gt['TID']==2052]

Unnamed: 0,TID,GT,PR,PR2,PR3,PR4,PR5,PR6,PR7,PR8
257,2052,"{'G0': {0, 1}, 'G1': {0, 2, 3, 4, 5, 6, 7, 8},...","{'AC0': {0, 2}, 'AC1': {0, 4, 6}, 'AC2': {0, 8...","{'AF0': {0, 1, 2}, 'AF1': {0, 3, 4, 5, 6, 7, 8...","{'EC0': {0, 1}, 'EC1': {0, 2}, 'EC2': {0, 4, 5...","{'EF0': {0, 1, 2, 3}, 'EF1': {4, 5, 6, 7, 8}, ...","{'DF0': {0, 1, 2}, 'DF1': {0, 1, 3, 4}, 'DF2':...","{'WSF0': {0, 1, 2, 3, 4, 5, 6, 7, 8}, 'WSF1': ...","{'CWF0': {0, 1, 2}, 'CWF1': {0, 1, 3, 4, 5, 6,...","{'NF0': {0, 1}, 'NF1': {0, 2}, 'NF2': {0, 4}, ..."


In [61]:
def get_metrics(s1,s2,t):
  """Score every predicted clustering against its gold clustering.

  Args:
    s1: iterable of gold clusterings, one per thread.
    s2: iterable of predicted clusterings, aligned with ``s1``.
    t: iterable of thread ids, aligned with ``s1``.

  Returns:
    (metrics, errors) where
      * metrics is a DataFrame with one row per thread holding the thread id,
        both clusterings and the seven scores;
      * errors is the tuple
        (NMI_error, NID_error, OM_error, OVO_error, SF1_error) of row indices
        whose score fell outside [0, 1].

  When either clustering of a pair is empty, every score for that row is 0.
  """
  s1 = list(s1)
  s2 = list(s2)
  t = list(t)

  one_minus_scaled_VI_list = []
  NVI_list = []
  NMI_list = []
  NID_list = []
  omega_list = []
  one_one_list = []
  shen_f1_list = []

  OMSVI_error = []
  NVI_error = []
  NMI_error = []
  NID_error = []
  OM_error = []
  OVO_error = []
  SF1_error = []

  for i in range(len(s1)):
    if len(s1[i])>0 and len(s2[i])>0:
      one_minus_scaled_VI = get_one_minus_scaled_vi(s1[i],s2[i])
      NVI = get_normalized_vi(s1[i],s2[i])
      # Stored under the 'NMI' column as one minus the helper's value.
      NMI = 1 - get_nmi_joint(s1[i],s2[i])
      NID = get_normalized_information_distance(s1[i],s2[i])
      OM = get_omega_score(s1[i],s2[i])
      OVO = get_one_to_one(s1[i],s2[i])
      SF1 = get_shen_f1(s1[i],s2[i])
    else:
      # Nothing to compare: score the row as all zeros.
      one_minus_scaled_VI = NVI = NMI = NID = OM = OVO = SF1 = 0

    # Record the row index of every score outside [0, 1]. The one-to-one
    # score is checked too; previously OVO_error was returned but never filled.
    checks = (
      (one_minus_scaled_VI, OMSVI_error),
      (NVI, NVI_error),
      (NMI, NMI_error),
      (NID, NID_error),
      (OM, OM_error),
      (OVO, OVO_error),
      (SF1, SF1_error),
    )
    for value, bucket in checks:
      if value < 0.0 or value > 1.0:
        bucket.append(i)

    one_minus_scaled_VI_list.append(one_minus_scaled_VI)
    NVI_list.append(NVI)
    NMI_list.append(NMI)
    NID_list.append(NID)
    omega_list.append(OM)
    one_one_list.append(OVO)
    shen_f1_list.append(SF1)

  # NOTE: OMSVI_error and NVI_error are collected but are not part of the
  # returned tuple; its 5-element shape is kept for existing callers.
  errors = (NMI_error, NID_error, OM_error, OVO_error, SF1_error)

  metrics = pd.DataFrame()
  metrics['TID'] = t
  metrics['Gold Cluster'] = s1
  metrics['Model Predictions'] = s2
  metrics['1-Scaled VI'] = one_minus_scaled_VI_list
  metrics['1-Normalized VI'] = NVI_list
  metrics['NMI'] = NMI_list
  metrics['NID'] = NID_list
  metrics['Omega'] = omega_list
  metrics['one_to_one'] = one_one_list
  metrics['shen_F1'] = shen_f1_list
  return metrics,errors

In [None]:
metrics,e1 = get_metrics(all_models_pr_vs_gt['GT'], all_models_pr_vs_gt['PR'],all_models_pr_vs_gt['TID'])

  after removing the cwd from sys.path.


In [None]:
metrics2,e2 = get_metrics(all_models_pr_vs_gt['GT'], all_models_pr_vs_gt['PR2'],all_models_pr_vs_gt['TID'])

In [None]:
metrics3,e3 = get_metrics(all_models_pr_vs_gt['GT'], all_models_pr_vs_gt['PR3'],all_models_pr_vs_gt['TID'])

In [None]:
metrics4,e4 = get_metrics(all_models_pr_vs_gt['GT'], all_models_pr_vs_gt['PR4'],all_models_pr_vs_gt['TID'])

In [None]:
metrics5,e5 = get_metrics(all_models_pr_vs_gt['GT'], all_models_pr_vs_gt['PR5'],all_models_pr_vs_gt['TID'])

In [None]:
metrics6,e6 = get_metrics(all_models_pr_vs_gt['GT'], all_models_pr_vs_gt['PR6'],all_models_pr_vs_gt['TID'])

In [None]:
metrics7,e7 = get_metrics(all_models_pr_vs_gt['GT'], all_models_pr_vs_gt['PR7'],all_models_pr_vs_gt['TID'])

In [None]:
metrics8,e8 = get_metrics(all_models_pr_vs_gt['GT'], all_models_pr_vs_gt['PR8'],all_models_pr_vs_gt['TID'])

In [None]:
metrics8.describe()

In [None]:
metrics3.describe() #model3 EMNLP2020 Pointer Network Pretrained

In [None]:
metrics.describe() #model 1 ACL 2019 Pretrained 

In [None]:
# Show full cell contents (no truncation) when displaying frames.
# None is the supported "no limit" value; -1 is deprecated and rejected by newer pandas.
pd.set_option('display.max_colwidth', None)

In [None]:
metrics8[metrics8['Omega']<0]

In [None]:
a = metrics8[['TID','Gold Cluster','Model Predictions','1-Normalized VI','one_to_one','Omega','shen_F1']]

In [None]:
b = metrics3[['TID','Model Predictions','1-Normalized VI','one_to_one','Omega','shen_F1']]

In [None]:
# Join the metrics8 slice (a) with the metrics3 slice (b) on thread id.
# Columns present in both slices get the suffix '_E20_OurData' (from a) or
# '_E20_PreTrain' (from b); 'Gold Cluster' exists only in a and keeps its name.
analysis = a.merge(b, on='TID',suffixes=('_E20_OurData', '_E20_PreTrain'))

In [None]:
analysis.columns

In [None]:
df[df['ThreadID']==22603].reset_index()

In [None]:
a[a['TID']==22614]

In [None]:
a['Gold Cluster'][999]

In [None]:
a['Model Predictions'][999]

In [None]:
b['Model Predictions'][999]

In [None]:
b[b['TID']==22614]

In [None]:
def get_avg_jaccard_similarity(s):
  """Mean pairwise Jaccard similarity between the subthreads of one thread.

  Args:
    s: dict mapping a subthread key to the set of post ids in that subthread.

  Returns:
    The mean of |A & B| / |A | B| over all unordered pairs of subthreads, or
    0.0 when there are fewer than two subthreads. A pair of two empty sets
    scores 0 (the same convention get_avg_overlap_coefficient uses) instead
    of raising ZeroDivisionError.
  """
  set_list = list(s.values())
  jaccard = []
  for i,j in combinations(set_list,2):
    union_size = len(i.union(j))
    # An empty union means both sets are empty; avoid dividing by zero.
    j_sim = len(i.intersection(j))/union_size if union_size != 0 else 0
    jaccard.append(j_sim)
  if len(jaccard)>0:
    ans = np.mean(jaccard)
  else:
    ans = 0.0
  return (ans)

In [None]:
def get_avg_overlap_coefficient(s):
  """Mean pairwise overlap coefficient between the subthreads of one thread.

  ``s`` maps subthread keys to sets of post ids. For every unordered pair the
  score is |A & B| / min(|A|, |B|), or 0 when the smaller set is empty. The
  result is the mean over all pairs, or 0.0 when there are no pairs.
  """
  subthreads = list(s.values())
  scores = []
  for first, second in combinations(subthreads, 2):
    smaller = min(len(first), len(second))
    if smaller == 0:
      scores.append(0)
    else:
      scores.append(len(first.intersection(second)) / smaller)
  return np.mean(scores) if len(scores) > 0 else 0.0

In [None]:
def get_avg_unique_authors_per_subthread(s,tid):
  """Average number of distinct authors per subthread of thread ``tid``.

  ``s`` maps subthread keys to sets of post positions. Authors are looked up
  in the module-level ``thread_pid_author_dic`` under the key (tid, position);
  positions with no entry, or with an empty author, are ignored. Returns 0
  when ``s`` has no subthreads.
  """
  unique_counts = []
  for members in s.values():
    seen_authors = set()
    for position in members:
      author = thread_pid_author_dic.get((tid, position), '')
      if len(author) > 0:
        seen_authors.add(author)
    unique_counts.append(len(seen_authors))
  if len(unique_counts) == 0:
    return 0
  return np.mean(unique_counts)

In [None]:
# Floored average number of distinct authors per subthread, for the gold
# clustering (G), the metrics8 model (U) and the metrics3 model (T).
# NOTE: a later cell rebinds G to a set, so store these lists into `analysis`
# before running that cell.
G = []
U = []
T = []

# Walk the rows once instead of re-filtering `analysis` by TID three times per
# row. This assumes TID is unique in `analysis`, as the per-TID lookup did.
for tid, gold_clusters, our_clusters, pretrain_clusters in zip(
    analysis['TID'],
    analysis['Gold Cluster'],
    analysis['Model Predictions_E20_OurData'],
    analysis['Model Predictions_E20_PreTrain']):
  G.append(math.floor(get_avg_unique_authors_per_subthread(gold_clusters,tid)))
  U.append(math.floor(get_avg_unique_authors_per_subthread(our_clusters,tid)))
  T.append(math.floor(get_avg_unique_authors_per_subthread(pretrain_clusters,tid)))

In [None]:
analysis['Gold_Authors'] = G
analysis['OurModel_Authors'] = U
analysis['PreTrainModel_Authors'] = T

In [None]:
# Per-thread descriptive statistics for the gold clustering and both models.
# Each clustering is a dict {subthread key: set of post ids}.
_cluster_columns = (
    ('Gold', 'Gold Cluster'),
    ('OurModel', 'Model Predictions_E20_OurData'),
    ('PreTrainModel', 'Model Predictions_E20_PreTrain'),
)
_cluster_stats = (
    ('Number_SubThreads', len),
    # Size of the largest subthread. Iterate the dict *values*: iterating the
    # dict itself yields the keys ('G0', ...) and would measure key-name length.
    ('Longest_SubThreads', lambda x: max((len(sub) for sub in x.values()), default = 0)),
    ('Number_Posts', get_n),
    ('Jaccard', get_avg_jaccard_similarity),
    ('Overlap', get_avg_overlap_coefficient),
)
# Statistic in the outer loop so the resulting column order is unchanged.
for _stat, _fn in _cluster_stats:
  for _model, _column in _cluster_columns:
    analysis[_model + '_' + _stat] = analysis[_column].apply(_fn)

In [None]:
analysis.describe()

In [None]:
qualitative = [66,69,73,83,84,22569,22580,22583,22603,22614]

In [None]:
analysis[analysis['TID'].isin(qualitative)].describe()

In [None]:
analysis[analysis['one_to_one_E20_OurData']<analysis['one_to_one_E20_PreTrain']].describe()

In [None]:
analysis[analysis['one_to_one_E20_OurData']==analysis['one_to_one_E20_PreTrain']].describe()

In [None]:
analysis[analysis['one_to_one_E20_OurData']>analysis['one_to_one_E20_PreTrain']].describe()

In [None]:
analysis[analysis['1-Normalized VI_E20_OurData']<analysis['1-Normalized VI_E20_PreTrain']].describe()

In [None]:
analysis[analysis['1-Normalized VI_E20_OurData']==analysis['1-Normalized VI_E20_PreTrain']].describe()

In [None]:
analysis[analysis['1-Normalized VI_E20_OurData']>analysis['1-Normalized VI_E20_PreTrain']].describe()

In [None]:
analysis[analysis['Omega_E20_OurData']<analysis['Omega_E20_PreTrain']].describe()

In [None]:
analysis[analysis['Omega_E20_OurData']==analysis['Omega_E20_PreTrain']].describe()

In [None]:
analysis[analysis['Omega_E20_OurData']>analysis['Omega_E20_PreTrain']].describe()

In [None]:
analysis[analysis['shen_F1_E20_OurData']<analysis['shen_F1_E20_PreTrain']].describe()

In [None]:
analysis[analysis['shen_F1_E20_OurData']==analysis['shen_F1_E20_PreTrain']].describe()

In [None]:
analysis[analysis['shen_F1_E20_OurData']>analysis['shen_F1_E20_PreTrain']].describe()

In [None]:
# Threads where the '_E20_OurData' score is strictly below the '_E20_PreTrain'
# score on all four metrics.
worst_performing_threads = set.intersection(*(
    set(analysis[analysis[metric + '_E20_OurData'] < analysis[metric + '_E20_PreTrain']]['TID'])
    for metric in ('shen_F1', 'Omega', '1-Normalized VI', 'one_to_one')
))

In [None]:
len(worst_performing_threads)

In [None]:
analysis[analysis.TID.isin(worst_performing_threads)].describe()

In [None]:
analysis.columns

In [None]:
# Threads where the '_E20_OurData' score is strictly below the '_E20_PreTrain'
# score on at least one of the four metrics.
bad_performing_threads = set().union(*(
    set(analysis[analysis[metric + '_E20_OurData'] < analysis[metric + '_E20_PreTrain']]['TID'])
    for metric in ('shen_F1', 'Omega', '1-Normalized VI', 'one_to_one')
))

In [None]:
# U0: every thread. U1..U4: threads where the '_E20_OurData' score is not
# better than (<=) the '_E20_PreTrain' score on one metric:
#   U1 = shen_F1, U2 = 1-Normalized VI, U3 = Omega, U4 = one_to_one.
U0 = set(analysis['TID'])
U1 = set(analysis[analysis['shen_F1_E20_OurData']<=analysis['shen_F1_E20_PreTrain']]['TID'])
U2 = set(analysis[analysis['1-Normalized VI_E20_OurData']<=analysis['1-Normalized VI_E20_PreTrain']]['TID'])
U3 = set(analysis[analysis['Omega_E20_OurData']<=analysis['Omega_E20_PreTrain']]['TID'])
U4 = set(analysis[analysis['one_to_one_E20_OurData']<=analysis['one_to_one_E20_PreTrain']]['TID'])

# In none of U1..U4.
A = U0 - (U1 | U2 | U3 | U4)

# In exactly one of U1..U4.
B = U1 - (U2 | U3 | U4)
C = U2 - (U1 | U3 | U4)
D = U4 - (U1 | U2 | U3)
# Only in U3: subtract the *other* three sets. The previous expression
# subtracted U3 from itself, which always produced an empty set.
E = U3 - (U1 | U2 | U4)

# In exactly two of U1..U4.
# NOTE(review): the pairs (U1, U3) and (U2, U4) are not enumerated here, so
# A..N do not cover every thread in U0 - confirm whether that is intended.
F = (U1 & U2) - (U3 | U4)
G = (U1 & U4) - (U2 | U3)
H = (U2 & U3) - (U1 | U4)
I = (U3 & U4) - (U1 | U2)

# In exactly three of U1..U4.
J = (U1 & U2 & U4) - U3
K = (U1 & U3 & U4) - U2
L = (U1 & U2 & U3) - U4
M = (U2 & U3 & U4) - U1

# In all four.
N = U1 & U2 & U3 & U4

In [None]:
print(len(A))

In [None]:
analysis[analysis.TID.isin(A)].describe()

In [None]:
print(len(B))

In [None]:
analysis[analysis.TID.isin(B)].describe()

In [None]:
print(len(C))

In [None]:
analysis[analysis.TID.isin(C)].describe()

In [None]:
print(len(D))

In [None]:
analysis[analysis.TID.isin(D)].describe()

In [None]:
print(len(E))

In [None]:
analysis[analysis.TID.isin(E)].describe()

In [None]:
print(len(F))

In [None]:
analysis[analysis.TID.isin(F)].describe()

In [None]:
print(len(G))

In [None]:
analysis[analysis.TID.isin(G)].describe()

In [None]:
print(len(H))

In [None]:
analysis[analysis.TID.isin(H)].describe()

In [None]:
print(len(I))

In [None]:
analysis[analysis.TID.isin(I)].describe()

In [None]:
print(len(J))

In [None]:
analysis[analysis.TID.isin(J)].describe()

In [None]:
print(len(K))

In [None]:
analysis[analysis.TID.isin(K)].describe()

In [None]:
print(len(L))

In [None]:
analysis[analysis.TID.isin(L)].describe()

In [None]:
print(len(M))

In [None]:
analysis[analysis.TID.isin(M)].describe()

In [None]:
print(len(N))

In [None]:
analysis[analysis.TID.isin(N)].describe()

In [None]:
analysis[analysis.TID.isin(J.union(K.union(L.union(M))))].describe()

In [None]:
analysis[analysis.TID.isin(F.union(G.union(H.union(I))))].describe()

In [None]:
analysis[analysis.TID.isin(B.union(C.union(D.union(E))))].describe()

In [None]:
analysis[analysis.TID.isin(A)].describe()

In [None]:
analysis[analysis.TID.isin(U0)].describe()

In [None]:
len(U0.intersection(A))

In [None]:
len(A)

In [None]:
len(B.union(C.union(D.union(E))))

In [None]:
len(F.union(G.union(H.union(I))))

In [None]:
len(J.union(K.union(L.union(M))))

In [None]:
len(N)

In [None]:
#IRC Data for Reference
irc_model = pd.read_csv('/content/drive/MyDrive/inference.irc.test.1.out',header=None)
irc_gold = pd.read_csv('/content/drive/MyDrive/irc_gold.out',header=None)

In [None]:
irc_model

In [None]:
irc_gold

In [None]:
# Parse the model output into (ThreadID, P, C) rows. Lines starting with '#'
# are skipped. Every other line is split on ':'; the thread id is the part of
# the 4th '/'-separated path component before the first '_', and the text
# after the ':' holds "<child> <parent> ..." separated by single spaces.
threadID_list = []
parent_list = []
child_list = []
for raw_line in irc_model[0]:
  if raw_line[0] != '#':
    fields = raw_line.split(':')
    thread_name = fields[0].split('/')[3].split('_')[0]
    link = fields[1].split(' ')
    parent_id = int(link[1])
    child_id = int(link[0])
    threadID_list.append(str(thread_name))
    parent_list.append(parent_id)
    child_list.append(child_id)

irc = pd.DataFrame({'ThreadID': threadID_list, 'P': parent_list, 'C': child_list})
# Drop self-links, then order by thread, parent, child with a fresh index.
irc = (
    irc[irc['P'] != irc['C']]
    .sort_values(['ThreadID','P','C'])
    .reset_index(drop=True)
)

In [None]:
# Keep only links whose child message index is greater than 999.
# NOTE(review): 999 is a magic threshold - presumably the first 1000 messages
# of each log are not evaluated; confirm and name the constant.
irc = irc[irc['C']>999]

In [None]:
irc

In [None]:
irc_threads = sorted(set(irc['ThreadID']))

In [None]:
irc_threads

In [None]:
def get_irc_set_of_sets(s1,name):
  """Build, for every thread in ``irc_threads``, its dict of subthreads.

  ``s1`` is a frame with columns 'ThreadID', 'P' and 'C' (one row per
  parent -> child link). For each thread a directed graph is built from those
  links, and every simple path from a node with no incoming edge to a node
  with no outgoing edge becomes one subthread, stored as the set of nodes on
  the path under the key ``name + <running index>``.

  Returns a list with one such dict per entry of ``irc_threads``, in order.
  """
  per_thread = []
  for thread_id in irc_threads:
    rows = s1[s1['ThreadID']==str(thread_id)]
    parents = list(rows['P'])
    children = list(rows['C'])
    links = [(int(parents[idx]), int(children[idx])) for idx in range(len(parents))]

    graph = nx.DiGraph()
    graph.add_nodes_from(list(set(parents).union(set(children))))
    graph.add_edges_from(links)

    sources = [node for node, degree in graph.in_degree() if degree == 0]
    sinks = [node for node, degree in graph.out_degree() if degree == 0]

    subthreads = {}
    for source, sink in product(sources, sinks):
      for path in nx.all_simple_paths(graph, source, sink):
        # Keys are numbered in discovery order: name0, name1, ...
        subthreads[name + str(len(subthreads))] = set(path)
    per_thread.append(subthreads)
  return per_thread

In [None]:
irc_pr = get_irc_set_of_sets(irc,'IP')

In [None]:
irc_gold

In [None]:
# Thread label for every row of irc_gold: each date repeated by its row count.
irc_gold_threads = [
    day
    for day, row_count in (
        ('2005-07-06', 506),
        ('2007-01-11', 520),
        ('2007-12-01', 516),
        ('2008-07-14', 528),
        ('2010-08-17', 522),
        ('2013-09-01', 532),
        ('2014-06-18', 517),
        ('2015-03-18', 517),
        ('2016-02-22', 518),
        ('2016-06-08', 511),
    )
    for _ in range(row_count)
]

In [None]:
irc_gold['Threads'] = irc_gold_threads

In [None]:
# Prefix every gold annotation line with its thread label: "<thread>:<line>".
irc_gold_th = irc_gold['Threads'] + ':' + irc_gold[0]

In [None]:
irc_gold = pd.DataFrame()
irc_gold[0]=irc_gold_th

In [None]:
irc_gold

In [None]:
# Parse the gold annotations into (ThreadID, P, C) rows. Lines starting with
# '#' are skipped. Every other line is "<thread>:<parent> <child> ...", with
# the two ids separated by a single space.
threadID_list = []
parent_list = []
child_list = []
for raw_line in irc_gold[0]:
  if raw_line[0] != '#':
    fields = raw_line.split(':')
    link = fields[1].split(' ')
    parent_id = int(link[0])
    child_id = int(link[1])
    threadID_list.append(str(fields[0]))
    parent_list.append(parent_id)
    child_list.append(child_id)

ircg = pd.DataFrame({'ThreadID': threadID_list, 'P': parent_list, 'C': child_list})
# Drop self-links, then order by thread, parent, child with a fresh index.
ircg = (
    ircg[ircg['P'] != ircg['C']]
    .sort_values(['ThreadID','P','C'])
    .reset_index(drop=True)
)

In [None]:
ircg

In [None]:
#irc_gt = get_irc_set_of_sets(ircg,'IG')

In [None]:
#irc_metrics = get_metrics(irc_gt,irc_pr)

In [None]:
#irc_metrics.describe()

In [None]:
# Ids of the threads that contain exactly 20 posts, in ascending order.
# Rows are counted per thread in a single pass instead of scanning the whole
# frame once for every possible thread id. Negative ids are excluded, as the
# former range(0, max+1) scan never visited them.
posts_per_thread = df.groupby('ThreadID')['PostID'].size()
thread20_id = sorted(
    int(tid) for tid, n_posts in posts_per_thread.items()
    if tid >= 0 and n_posts == 20
)

In [None]:
thread20_id

In [None]:
df[df['ThreadID']==173]

In [None]:
preds7[preds7['ThreadID']==828]

In [None]:
preds3[preds3['ThreadID']==828]

In [None]:
preds[preds['ThreadID']==828]

In [None]:
gold[gold['ThreadID']==828]

In [None]:
for i,j in enumerate(list(df[df['ThreadID']==828]['Post'])):
  print(i,j)

In [None]:
for i,j in enumerate(list(df[df['ThreadID']==828]['Author'])):
  print(i,j)

In [None]:
df[df['ThreadID']==173]