<a href="https://colab.research.google.com/github/anandraiyer/access_forums_eval/blob/main/testing_framework.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [62]:
import pandas as pd
import numpy as np
import math

In [2]:
# Load the full forum-post table from a hard-coded absolute path.
# NOTE(review): presumably requires Google Drive to be mounted at /content/drive first -- confirm.
data = pd.read_csv('/content/drive/MyDrive/final.csv')

In [3]:
data.columns

Index(['Thread', 'DateTime', 'Author', 'Post', 'ParentPosts', 'PostID',
       'ThreadID', 'AuthorID', 'OriginID', 'DialogAct', 'ParentID_List'],
      dtype='object')

In [4]:
# Keep a seven-column subset of the table; the graph helpers defined below read this module-level `df`.
df = data[['Thread','ThreadID','PostID','DateTime','Author','Post','ParentID_List']]

In [5]:
df.head()

Unnamed: 0,Thread,ThreadID,PostID,DateTime,Author,Post,ParentID_List
0,Testing the new Site!,1,0,2015-11-30 11:59:19.544597,Tim Ford,Thank you very much for all those who worked o...,-1
1,Netflix not accessible to blind people using a...,2,1,2015-11-30 12:05:11.019288,Tim Ford,"Hi All, For those out there who want to use N...",-1
2,Testing the new Site!,1,2,2015-11-30 12:06:01.976409,"Walker, Michael E","Hi Tim, the group is working fine. I got your ...",[0]
3,Testing the new Site!,1,3,2015-11-30 12:12:07.800324,ken lawrence,Should the JDH mail be deleted?,[0]
4,Netflix not accessible to blind people using a...,2,4,2015-11-30 12:14:27.873186,Greg Nickel,Will do…,[1]


In [6]:
from itertools import chain
from itertools import product
from itertools import starmap
from functools import partial
import networkx as nx

In [7]:
def _parse_parent_ids(raw):
  """Return the parent post IDs encoded in one ``ParentID_List`` cell.

  The cells shown by ``df.head()`` are either ``-1`` (no parent) or a
  bracketed list such as ``[0]``.  Cells with several parents are assumed to
  use the same comma-separated layout, e.g. ``[0, 3]`` -- TODO confirm
  against the full CSV.

  Raises:
    ValueError: if a token inside the cell is not an integer.
  """
  text = str(raw).strip().strip('[]')
  ids = [int(token) for token in text.split(',') if token.strip()]
  # -1 is the "no parent" marker, never a real post ID.
  return [pid for pid in ids if pid != -1]


def get_conversation_dag(tid):
  """Build the reply graph of one thread.

  Reads the rows of the module-level ``df`` whose ``ThreadID`` equals ``tid``.
  Every post becomes a node keyed by its ``PostID``; the post text, author and
  timestamp are kept as the node attributes ``post``, ``author`` and
  ``datetime``.  Each parent listed in ``ParentID_List`` contributes one
  directed edge ``parent -> child``.

  Args:
    tid: value compared against the ``ThreadID`` column.

  Returns:
    networkx.DiGraph: the reply graph; empty when no row matches ``tid``.

  Raises:
    ValueError: if a ``ParentID_List`` cell cannot be parsed as integers.
  """
  thread_rows = df[df['ThreadID'] == tid]
  columns = [thread_rows[name]
             for name in ('PostID', 'Post', 'Author', 'DateTime', 'ParentID_List')]

  G = nx.DiGraph()
  for post_id, post, author, posted_at, raw_parents in zip(*columns):
    # Nodes must be keyed by PostID because the edge endpoints are PostIDs;
    # keying them by post text would leave every post as an isolated node.
    G.add_node(post_id, post=post, author=author, datetime=posted_at)
    G.add_edges_from((parent, post_id) for parent in _parse_parent_ids(raw_parents))
  return G

In [8]:
def get_subthreads(tid):
  """List every root-to-leaf reply chain of a thread.

  The reply graph is obtained from ``get_conversation_dag(tid)``.  A root is
  a node without incoming edges, a leaf a node without outgoing edges; every
  simple path from a root to a different leaf is returned as a list of nodes.

  Args:
    tid: thread identifier forwarded to ``get_conversation_dag``.

  Returns:
    list[list]: the paths, grouped by (root, leaf) pair in the graph's node
    order.  Every path has at least two nodes.
  """
  # Reuse the shared graph builder instead of keeping a second copy of it here.
  G = get_conversation_dag(tid)
  roots = [node for node, degree in G.in_degree() if degree == 0]
  leaves = [node for node, degree in G.out_degree() if degree == 0]

  subthreads = []
  for root, leaf in product(roots, leaves):
    # An isolated node is both a root and a leaf.  Older networkx releases
    # yield nothing for source == target while newer ones yield the one-node
    # path [node]; skip the pair so the result is the same on either.
    # NOTE(review): this keeps unreplied posts out of the result -- confirm
    # that is the intended treatment of single-post threads.
    if root == leaf:
      continue
    subthreads.extend(nx.all_simple_paths(G, root, leaf))
  return subthreads

In [16]:
get_subthreads(1024)

[[5074, 5076, 5078], [5074, 5076, 5079]]

In [22]:
def get_list_subthreads(cluster_name,threadID):
  """Serialise the sub-threads of a thread as ``<name><index>:<id> <id> ...``.

  Args:
    cluster_name: prefix put in front of every index, e.g. ``'C'``.
    threadID: thread identifier passed to ``get_subthreads``.

  Returns:
    list[str]: one string per sub-thread, e.g. ``'C0:21680 21681'``; the
    index is the sub-thread's position in the ``get_subthreads`` result.
  """
  # Join the IDs explicitly rather than slicing the list's repr, which only
  # works while every ID's repr is free of quotes, commas and type wrappers.
  return [
      "{}{}:{}".format(cluster_name, index, ' '.join(str(post_id) for post_id in path))
      for index, path in enumerate(get_subthreads(threadID))
  ]

In [23]:
get_list_subthreads('C',4445)

['C0:21680 21681', 'C1:21680 21682', 'C2:21680 21683']

In [24]:
def read_clusters(filename):
    """Parse cluster lines of the form ``[dir/]label:id id ...``.

    Args:
        filename: despite the name, an iterable of text lines.  A line
            without ``:`` is filed under the most recently seen label
            (the empty string if none has been seen yet).

    Returns:
        tuple: ``(clusters, all_points)``.  ``clusters`` maps each label to
        the list of integer ID sets read under it; ``all_points`` is the set
        of ``"label:id"`` strings over every ID encountered.
    """
    groups = {}
    seen = set()
    label = ""
    for raw in filename:
        head, colon, tail = raw.rpartition(':')
        if colon:
            # The label is the text before the last ':', minus any directory part.
            label = head.split('/')[-1]
            raw = tail
        members = set(map(int, raw.split()))
        groups.setdefault(label, []).append(members)
        seen.update("{}:{}".format(label, member) for member in members)
    return groups, seen

In [82]:
# a: label -> list of ID sets; b: set of "label:id" strings (both are displayed in the next two cells).
a,b = read_clusters(get_list_subthreads('C',4445))

In [83]:
a

{'C0': [{21680, 21681}], 'C1': [{21680, 21682}], 'C2': [{21680, 21683}]}

In [84]:
b

{'C0:21680', 'C0:21681', 'C1:21680', 'C1:21682', 'C2:21680', 'C2:21683'}

In [178]:
def create_contingency_table(gold, auto):
  """Count the overlap between every gold cluster and every auto cluster.

  Args:
    gold: mapping of cluster name -> set of items; one table row each.
    auto: mapping of cluster name -> collection of items; one column each.

  Returns:
    tuple: ``(table, sum_rows, sum_cols, names_gold, names_auto)`` where
    ``table[i][j]`` is the number of items shared by the i-th gold cluster
    and the j-th auto cluster, ``sum_rows`` / ``sum_cols`` are the row and
    column totals of ``table``, and the two name lists give the row and
    column order (the dicts' iteration order).
  """
  names_gold = list(gold)
  names_auto = list(auto)
  # Allocate with an explicit 2-D shape: building the array from a list of
  # rows gives a 1-D array when `gold` is empty, and the axis=1 sum then fails.
  table = np.zeros((len(names_gold), len(names_auto)), dtype=int)
  for i, gold_items in enumerate(gold.values()):
    for j, auto_items in enumerate(auto.values()):
      table[i, j] = len(gold_items.intersection(auto_items))
  sum_rows = table.sum(axis=1)
  sum_cols = table.sum(axis=0)
  return table, sum_rows, sum_cols, names_gold, names_auto

In [179]:
def get_n(gold):
  """Return how many distinct items appear across all clusters of ``gold``.

  Args:
    gold: mapping of cluster name -> iterable of hashable items.

  Returns:
    int: size of the union of all clusters (0 for an empty mapping).
  """
  distinct = {item for members in gold.values() for item in members}
  return len(distinct)

In [180]:
def get_length_clustering(s):
  """Return the size of every cluster in ``s``.

  Args:
    s: mapping of cluster name -> sized collection of items.

  Returns:
    list[int]: one length per cluster, in the mapping's iteration order.
  """
  return [len(members) for members in s.values()]

In [172]:
# NOTE(review): as far as this notebook shows, `gold` is first assigned in a later cell
# (this cell is In[172], the assignment below is In[211]), so a fresh top-to-bottom run fails here.
get_length_clustering(gold)

[3, 3]

In [211]:
# Toy reference clustering: cluster name -> set of item IDs. The clusters overlap (0 and 1 are in all three, 3 in two).
gold = {'C1':{0,1,2},'C2':{0,1,3,4},'C3':{0,1,3,5}}

In [212]:
# Toy system clustering: cluster name -> set of item IDs. Item 0 is in both clusters.
auto = {'CC1':{0,1,2},'CC2':{0,3,4,5}}

In [184]:
# NOTE(review): this compares gold with itself, but the output recorded in the next cell has the
# columns 'CC1'/'CC2', i.e. it came from create_contingency_table(gold, auto) -- confirm which call is intended.
c,rows, cols,gn, an = create_contingency_table(gold,gold)

In [182]:
c,rows,cols,gn,an

(array([[3, 1],
        [2, 3],
        [2, 3]]),
 array([4, 5, 5]),
 array([7, 7]),
 ['C1', 'C2', 'C3'],
 ['CC1', 'CC2'])

In [275]:
def get_vi(s1,s2):
  """Score two clusterings with one minus the normalised Variation of Information.

  ``VI = H(s1, s2) - I(s1; s2)`` is computed in bits from the overlap counts
  of ``create_contingency_table(s1, s2)``.  ``N = get_n(s1)`` serves as the
  item total and the cluster sizes of ``s1`` / ``s2`` as the marginals; the
  result is normalised by ``log2(N)``.

  Args:
    s1: mapping of cluster name -> set of items.
    s2: mapping of cluster name -> set of items; assumed to cluster the same
      items as ``s1``.

  Returns:
    float: ``1 - VI / log2(N)``.  When ``N == 1`` the normaliser is zero and
    1.0 is returned, since a single item can only be clustered one way.

  Raises:
    ValueError: if ``s1`` contains no items.
  """
  total = get_n(s1)
  if total == 0:
    raise ValueError("get_vi needs at least one item in s1")
  if total == 1:
    # log2(1) == 0 would turn the normalisation below into a division by zero.
    return 1.0

  overlap, row_sums, col_sums, _, _ = create_contingency_table(s1, s2)
  sizes_1 = get_length_clustering(s1)
  sizes_2 = get_length_clustering(s2)

  H_uv = 0.0
  I_uv = 0.0
  for i in range(len(row_sums)):
    for j in range(len(col_sums)):
      num = overlap[i][j]
      if num == 0:
        continue  # an empty overlap contributes nothing to either sum
      joint = num / total
      H_uv -= joint * math.log(joint, 2)
      I_uv += joint * math.log(num * total / (sizes_1[i] * sizes_2[j]), 2)

  VI = H_uv - I_uv
  return 1 - VI / math.log(total, 2)

In [297]:
def get_one_to_one(s1,s2):
  """Return the one-to-one overlap between two clusterings, as a percentage.

  Clusters of ``s1`` and ``s2`` are paired by a maximum-weight matching on
  their overlap counts (from ``create_contingency_table``); the overlaps of
  the matched pairs are summed and divided by ``N = get_n(s1)``.

  Args:
    s1: mapping of cluster name -> set of items.
    s2: mapping of cluster name -> set of items.

  Returns:
    float: ``100 * matched_overlap / N``; 0.0 when ``s1`` contains no items.
  """
  N = get_n(s1)
  if N == 0:
    # Nothing to match, and dividing by N below would be undefined.
    return 0.0

  contingency, _, _, gold_name, auto_name = create_contingency_table(s1, s2)

  # Prefix the names so a label used in both clusterings still gives two nodes.
  left_nodes = ['Left_' + str(name) for name in gold_name]
  right_nodes = ['Right_' + str(name) for name in auto_name]

  B = nx.Graph()
  B.add_nodes_from(left_nodes, bipartite=0)
  B.add_nodes_from(right_nodes, bipartite=1)
  for i, left in enumerate(left_nodes):
    for j, right in enumerate(right_nodes):
      # Plain ints keep the matching in exact integer arithmetic; pairs with
      # no overlap cannot add weight to a matching, so they get no edge.
      overlap = int(contingency[i][j])
      if overlap:
        B.add_edge(left, right, weight=overlap)

  matches = nx.algorithms.matching.max_weight_matching(B)
  # Sum the integer overlaps first and scale once at the end.
  matched_overlap = sum(B[u][v]['weight'] for u, v in matches)
  return 100.0 * matched_overlap / N


In [287]:
gold 

{'C0': {1, 2, 3, 4}, 'C1': {5, 6}}

In [288]:
auto

{'C0': {1, 2, 5, 6}, 'C1': {3, 4}}

In [298]:
get_one_to_one(gold,gold)

100.0

In [300]:
# End-to-end check on a six-item toy example (rebinds the `gold` / `auto` names used in earlier cells).
gold = {'C0':{1,2,3,4},'C1':{5,6}}
auto = {'C0':{1,2,5,6},'C1':{3,4}}
# Note that get_vi returns 1 - VI / log2(N), not the raw VI.
VI = get_vi(gold, auto)
# Matched overlap of the best one-to-one cluster pairing, as a percentage of N.
one_one = get_one_to_one(gold,auto)

print(VI, one_one)

0.4841962570206112 66.66666666666667
