In [1]:
import pandas as pd
import numpy as np
import itertools

In [2]:
# Download the files from the links below, ,unzip them, and set the file path prefix in the cell below.
# https://lfs.aminer.cn/lab-datasets/aminerdataset/AMiner-Coauthor.zip
# https://lfs.aminer.cn/lab-datasets/aminerdataset/AMiner-Author.zip
# https://lfs.aminer.cn/lab-datasets/aminerdataset/AMiner-Paper.rar

In [3]:
file_path_prefix = "./"

# Processing Authors

In [4]:
with open(file_path_prefix + "AMiner-Author.txt") as file:
    author_lines = [line.rstrip() for line in file]

In [5]:
author_data = dict()
i = 0
while i < len(author_lines):
    while author_lines[i][:6] != "#index" and i < len(author_lines):
        i = i + 1
    author_id = author_lines[i][7:]
    name = author_lines[i+1][3:]
    author_data[author_id] = name
    i = i + 10

In [6]:
print(f"#authors: {len(author_data)}")

#authors: 1712433


# Processing Coauthorship Network

In [7]:
with open(file_path_prefix + "AMiner-Coauthor.txt") as file:
    coauthor_lines = [line.rstrip() for line in file]

In [8]:
min_papers_together = 5

In [9]:
tuples = [line.split() for line in coauthor_lines]
edges = [(x[0][1:], x[1]) for x in tuples if int(x[2]) > min_papers_together]
print(f"#edges: {len(edges)}")

#edges: 99794


In [10]:
unique_vertices = [(edge[0], author_data[edge[0]]) for edge in edges]
unique_vertices.extend([(edge[1], author_data[edge[1]]) for edge in edges])
unique_vertices = set(unique_vertices)
print(f"#vertices: {len(unique_vertices)}")

#vertices: 82569


# Processing papers


In [11]:
with open(file_path_prefix + "AMiner-Paper.txt") as file:
    paper_lines = [line.rstrip() for line in file if line[:2] == "#c" or line[:2] == "#@" or line[:6] == "#index"]

In [12]:
paper_data = []
for i in np.arange(0, len(paper_lines), step = 3):
    paper_id = paper_lines[i][7:]
    names = paper_lines[i+1][3:].split(';')
    venue = paper_lines[i+2][3:]
    paper_data.append((paper_id, names, venue))

# Finding Vertex Labels

In [13]:
venues = ["VLDB", "SIGMOD", "ICML", "NIPS"]
venue_papers = {}
for v in venues[:-1]:
    venue_papers[v] = [paper for paper in paper_data if v in paper[2]]
venue_papers["NIPS"] = [paper for paper in paper_data if "NIPS" in paper[2] or "NeurIPS" in paper[2]]

In [14]:
def count_paper(author, paper_list):
    return len([paper for paper in paper_list if author in paper[1]])

In [15]:
index_list = [int(vertex[0]) for vertex in unique_vertices]
author_list = [vertex[1] for vertex in unique_vertices]

labelled_vertices = pd.DataFrame(author_list, index = index_list, columns = ['name'])
for v in venues:
    labelled_vertices[v] = [count_paper(author, venue_papers[v]) for author in author_list]

In [16]:
published_authors = labelled_vertices[labelled_vertices.sum(axis=1) > 0]
print(f"Number of authors published at least one paper: {len(published_authors)}")

Number of authors published at least one paper: 5856


# Dominance Models

In [17]:
def remove_common_elements(listA, listB):
    common_elements = {x for x in listA if x in listB}
    return ([x for x in listA if x not in common_elements], [x for x in listB if x not in common_elements])

In [18]:
def hash_function(values):
    return values[0] | (values[1] << 7) | (values[2] << 14) | (values[3] << 21)

In [19]:
def dehash(hashed_list):
    return [hashed_list & ((1<<8)-1), (hashed_list>>7) & ((1<<8)-1), (hashed_list>>14) & ((1<<8)-1), (hashed_list>>21) & ((1<<8)-1) ]

In [20]:
def count_values(df):
    dic = {}
    for index, row in df.iterrows():
        hash_value = hash_function(row)
        if hash_value in dic:
            dic[hash_value] += 1
        else:
            dic[hash_value] = 1
    return dic

In [21]:
def bag_difference(listA, listB, df):
    groupA = df.loc[listA]
    groupB = df.loc[listB]
    
    dicA = count_values(groupA)
    dicB = count_values(groupB)
    
    for key in dicA.keys():
        if key in dicB.keys():
            dicA[key] = min(0, dicA[key] - dicB[key])
    
    result = []
    for index,row in groupA.iterrows():
        hash_value = hash_function(np.array(row))
        if hash_value in dicA and dicA[hash_value] > 0:
            result.append(index)
            dicA[hash_value] -= 1
    return result

In [22]:
def traditional_dominates(dominated_by, is_dominated, preferMin=False):
    num_dimensions = len(dominated_by)
    equal_count   = np.equal(dominated_by, is_dominated).sum()
    greater_count = np.greater(dominated_by, is_dominated).sum()

    if equal_count == num_dimensions:
        return False

    elif (not preferMin) and (equal_count + greater_count == num_dimensions):
        return True

    elif preferMin and greater_count == 0:
        return True

    return False

In [23]:
# MIN model
def dominates_min(dominating_group, dominated_group, df):
  dominating_score = np.array(df.loc[dominating_group].min())
  dominated_score = np.array(df.loc[dominated_group].min())
  return traditional_dominates(dominating_score, dominated_score)

In [24]:
#AVG model
def dominates_sum(dominating_group, dominated_group, df):
  dominating_score = np.array(df.loc[dominating_group].mean())
  dominated_score = np.array(df.loc[dominated_group].mean())
  return traditional_dominates(dominating_score, dominated_score)

In [25]:
def group_vs_vertex_general(dominating_group, vertex, df):
  for better_vertex in dominating_group:
    if( traditional_dominates(np.array(df.loc[better_vertex]),\
                              np.array(df.loc[vertex]))):
      return True
  return False

#GENERAL model
def dominates_general(dominating_group_new, dominated_group_new, df):
    dominated_group = bag_difference(dominated_group_new, dominating_group_new, df)
    dominating_group = bag_difference(dominating_group_new, dominated_group_new, df)
    if len(dominated_group) == 0 :
        if len(dominating_group) == 0 :
            return False
        else :
            return True

    for vertex in dominated_group:
        if not group_vs_vertex_general( dominating_group, vertex, df):
            return False
    return True

In [26]:
# geeksforgeeks.org/maximum-bipartite-matching
class GFG:
    def __init__(self, graph):
        self.graph = graph
        self.people = len(graph)
        self.jobs = len(graph[0])
    
    def bpm(self, u, matchR, seen):
        for v in range(self.jobs):
            if self.graph[u][v] and not seen[v]:
                seen[v] = True
                
                if matchR[v] == -1 or self.bpm(matchR[v], matchR, seen):
                    matchR[v] = u
                    return True
        return False
    
    def maxBPM(self):
        matchR = [-1] * self.jobs
        result = 0
        for i in range(self.people):
            seen = [False] * self.jobs
            
            if self.bpm(i,matchR, seen):
                result += 1
        return result

In [27]:
#PERMUTE model
def dominates_permute(dominating_group_new, dominated_group_new, df):
    dominated_group = bag_difference(dominated_group_new, dominating_group_new, df)
    dominating_group = bag_difference(dominating_group_new, dominated_group_new, df)
    
    if len(dominating_group) == 0:
        return False

    dominating_score_min = np.array(df.loc[dominating_group].min())
    dominated_score_max = np.array(df.loc[dominated_group].max())
    if (traditional_dominates(dominating_score_min, dominated_score_max)):
        return True
  
    adjacency_matrix = [[traditional_dominates(np.array(df.loc[u]), np.array(df.loc[v])) for v in dominated_group] for u in dominating_group]
    max_bipartite_matching_size = GFG(adjacency_matrix).maxBPM()
    if max_bipartite_matching_size == len(dominated_group):
        return True
    else:
        return False

# Graph Preprocessing

In [28]:
def find_unvisited_neighbours(v, edges, visited):
  neighbours = [int(e[0]) for e in edges if int(e[1]) == v and int(e[0]) not in visited]
  neighbours.extend([int(e[1]) for e in edges if int(e[0]) == v and int(e[1]) not in visited])
  return neighbours

def label_vertices_recursive(label, label_set, vertex, edges, visited):
  if vertex not in visited:
    label_set[vertex] = label
    visited.add(vertex)
    neighbours = find_unvisited_neighbours(vertex, edges, visited)
    for n in neighbours:
      label_set, visited = label_vertices_recursive(label, label_set, n, edges, visited)
  return (label_set, visited)

def get_component_ids(vertices, edges):
  component_ids = dict()
  visited_vertices = set()

  for v in vertices:
    if v not in visited_vertices:
      # visit everything reachable, label them v, and add them to visited
      component_ids, visited_vertices = label_vertices_recursive(v, component_ids, v, edges, visited_vertices)
  return component_ids

In [29]:
def get_connected_component(fixed_vertex, vertices, component_ids):
  return [v for v in vertices if component_ids[v]==component_ids[fixed_vertex]]

# k-Plex Community Models

In [30]:
def sort_by_l1(vertices, preferMax = True):
    vertices['sum'] = vertices.sum(axis = 1)
    vertices.sort_values(by='sum', ascending = not preferMax, inplace = True)
    vertices.drop(columns=['sum'], inplace = True)
    return vertices

In [31]:
def traditional_skyline(vertices):
    skyline = []
    vertices_by_l1 = sort_by_l1(vertices)

    for vertex_to_check in np.arange(0, len(vertices_by_l1)):
        dominated = False
        for vertex_that_might_dominate_it in skyline:
            if traditional_dominates(vertices_by_l1.iloc[vertex_that_might_dominate_it], vertices_by_l1.iloc[vertex_to_check]):
                dominated = True
                break
        if dominated == False:
            skyline.append(vertex_to_check)
    return vertices_by_l1.iloc[skyline]

In [32]:
def is_kcore(vertices, edges, k=3):
  filtered_vertices, filtered_edges = maximum_kcore(vertices, edges, k, 1)
  return len(filtered_vertices) == len(vertices)

In [33]:
def list_kcore(vertices, edges, component_ids, k=3, g=5):
  if len(vertices) < g:
    return []

  fixed_vertex = list(vertices.index)[0]
  indices = list(vertices.index)[1:]
  indices = get_connected_component(fixed_vertex, indices, component_ids)
  #return [group for group in itertools.combinations(indices, g) if is_kcore(vertices.loc[list(group)], edges, k)]
  result = []
  for group in itertools.combinations(indices, g-1):
    group = list(group)
    group.append(fixed_vertex)
    if is_kcore(vertices.loc[group], edges, k):
      result.append(group)
  return result

In [34]:
# max should be reached in 5 iterations we hope
def maximum_kcore(published_authors, edges, k=3, rounds=5):
  published_edges = edges
  for i in range(rounds):
    published_edges = [(u,v) for (u,v) in published_edges if int(u) in list(published_authors.index) and int(v) in list(published_authors.index)]
    flattened_edges = [int(e[0]) for e in published_edges]
    flattened_edges.extend(int(e[1]) for e in published_edges)
    degrees = pd.Series(flattened_edges).value_counts()
    filtered_vertices = list((degrees[degrees >= k]).index)
    published_authors = published_authors.loc[filtered_vertices]
  return (published_authors, published_edges)

In [35]:
def filter_by_skyline(kcores, dataframe, skyline_model = dominates_general):
  dataframe = dataframe.drop(columns=['name'])
  skyline_kcores = []

  for candidate_group in kcores:
    dominated = False
    for skyline_group in skyline_kcores:
      if (skyline_model(skyline_group, candidate_group, dataframe)):
        dominated = True
        break
    skyline_kcores = [group for group in skyline_kcores if not skyline_model(candidate_group, group, dataframe)]
    if not dominated:
      skyline_kcores.append(candidate_group)
    
  
  return skyline_kcores

In [36]:
def fast_exhaustive_kcore_listing(dataframe, edges, k=3, g=5):
  filtered_vertices = sort_by_l1(dataframe) # sorting data monotonically
  filtered_vertices, filtered_edges = maximum_kcore(filtered_vertices, edges, k)
  component_ids = get_component_ids(list(filtered_vertices.index), filtered_edges)
  all_kcores = []
  print(f"#vertices: {len(filtered_vertices)}")
  while(len(filtered_vertices) >= g):
    filtered_vertices, filtered_edges = maximum_kcore(filtered_vertices, filtered_edges, k)
    print(len(filtered_vertices))
    all_kcores.extend( list_kcore(filtered_vertices, filtered_edges, component_ids, k, g) )
    filtered_vertices = filtered_vertices.iloc[1:]
  return all_kcores

# Testing

In [37]:
def get_induced_subgraph(included_vertices, all_vertices, edges):
  subgraph_vertices = all_vertices.loc[included_vertices]
  subgraph_edges = [e for e in edges if (int(e[0]) in included_vertices and int(e[1]) in included_vertices)]
  return subgraph_vertices, subgraph_edges

### min_paper_together = 5, k = 4, g = 6

In [38]:
kcores_k4_g6 = fast_exhaustive_kcore_listing(published_authors, edges, 4, 6)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vertices['sum'] = vertices.sum(axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vertices.sort_values(by='sum', ascending = not preferMax, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


#vertices: 96
72
65
64


  degrees = pd.Series(flattened_edges).value_counts()
  degrees = pd.Series(flattened_edges).value_counts()
  degrees = pd.Series(flattened_edges).value_counts()
  degrees = pd.Series(flattened_edges).value_counts()
  degrees = pd.Series(flattened_edges).value_counts()
  degrees = pd.Series(flattened_edges).value_counts()
  degrees = pd.Series(flattened_edges).value_counts()
  degrees = pd.Series(flattened_edges).value_counts()
  degrees = pd.Series(flattened_edges).value_counts()
  degrees = pd.Series(flattened_edges).value_counts()


56
55
53
47
46
40
35
30
25
20
15
10
5


In [39]:
kcores_k4_g7 = fast_exhaustive_kcore_listing(published_authors, edges, 4, 7)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vertices['sum'] = vertices.sum(axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vertices.sort_values(by='sum', ascending = not preferMax, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


#vertices: 96
72
65
64
56
55
53
47
46
40
35
30
25
20
15
10
5


In [40]:
skyline_min = filter_by_skyline(kcores_k4_g6, published_authors, dominates_min)
len(skyline_min)

9

In [41]:
skyline_sum = filter_by_skyline(kcores_k4_g6, published_authors, dominates_sum)
len(skyline_sum)

9

In [42]:
skyline_permute = filter_by_skyline(kcores_k4_g6, published_authors, dominates_permute)
len(skyline_permute)

12

In [43]:
skyline_general = filter_by_skyline(kcores_k4_g6, published_authors, dominates_general)
len(skyline_general)

10

In [44]:
unique_to_min = [x for x in skyline_min if ((not x in skyline_sum) and (not x in skyline_general) and (not x in skyline_permute))]
unique_to_sum = [x for x in skyline_sum if ((not x in skyline_min) and (not x in skyline_general) and (not x in skyline_permute))]
unique_to_general = [x for x in skyline_general if ((not x in skyline_permute) and (not x in skyline_min) and (not x in skyline_sum))]
unique_to_permute = [x for x in skyline_permute if ((not x in skyline_min) and (not x in skyline_sum) and (not x in skyline_general))]

In [45]:
somewhat_unique_to_min = [x for x in skyline_min if ((not x in skyline_sum) or (not x in skyline_general) or (not x in skyline_permute))]
somewhat_unique_to_sum = [x for x in skyline_sum if ((not x in skyline_min) or (not x in skyline_general) or (not x in skyline_permute))]
somewhat_unique_to_general = [x for x in skyline_general if ((not x in skyline_permute) or (not x in skyline_min) or (not x in skyline_sum))]
somewhat_unique_to_permute = [x for x in skyline_permute if ((not x in skyline_min) or (not x in skyline_sum) or (not x in skyline_general))]

In [46]:
unique_to_min

[]

In [47]:
unique_to_sum

[]

In [48]:
unique_to_general

[]

In [49]:
unique_to_permute

[]

In [50]:
somewhat_unique_to_min

[[81305, 352690, 1026426, 672732, 881959, 166138],
 [20964, 518039, 382290, 1165422, 769651, 773908]]

In [51]:
somewhat_unique_to_sum

[[1022181, 657270, 1219852, 1598048, 862451, 679579],
 [657270, 1219852, 1598048, 1219947, 862451, 679579]]

In [52]:
somewhat_unique_to_general

[[1022181, 657270, 1219852, 1598048, 862451, 679579],
 [657270, 1219852, 1598048, 1219947, 862451, 679579],
 [233195, 130838, 1346058, 797259, 247780, 513870]]

In [53]:
somewhat_unique_to_permute

[[1022181, 657270, 1219852, 1598048, 862451, 679579],
 [657270, 1219852, 1598048, 1219947, 862451, 679579],
 [233195, 130838, 1346058, 797259, 247780, 513870],
 [81305, 352690, 1026426, 672732, 881959, 166138],
 [20964, 518039, 382290, 1165422, 769651, 773908]]

In [54]:
group_vertices_1, group_edges_1 = get_induced_subgraph(somewhat_unique_to_permute[0], published_authors, edges)
group_vertices_2, group_edges_2 = get_induced_subgraph(somewhat_unique_to_permute[1], published_authors, edges)
group_vertices_3, group_edges_3 = get_induced_subgraph(somewhat_unique_to_permute[2], published_authors, edges)
group_vertices_4, group_edges_4 = get_induced_subgraph(somewhat_unique_to_permute[3], published_authors, edges)
group_vertices_5, group_edges_5 = get_induced_subgraph(somewhat_unique_to_permute[4], published_authors, edges)

In [55]:
group_vertices_1

Unnamed: 0,name,VLDB,SIGMOD,ICML,NIPS
1022181,Peer Kröger,0,5,0,0
657270,Matthias Renz,2,3,0,0
1219852,Andreas Züfle,1,1,0,0
1598048,Tobias Emrich,1,1,0,0
862451,Nikos Mamoulis,21,11,0,0
679579,Hans-Peter Kriegel,13,21,2,0


In [56]:
group_vertices_2

Unnamed: 0,name,VLDB,SIGMOD,ICML,NIPS
657270,Matthias Renz,2,3,0,0
1219852,Andreas Züfle,1,1,0,0
1598048,Tobias Emrich,1,1,0,0
1219947,Thomas Bernecker,1,0,0,0
862451,Nikos Mamoulis,21,11,0,0
679579,Hans-Peter Kriegel,13,21,2,0


In [57]:
group_vertices_3

Unnamed: 0,name,VLDB,SIGMOD,ICML,NIPS
233195,Giovanni Grasso,2,0,0,0
130838,Tim Furche,2,0,0,0
1346058,Giorgio Orsi,3,1,0,0
797259,Christian Schallhart,2,0,0,0
247780,Andrew Sellers,1,0,0,0
513870,Georg Gottlob,10,24,0,0


In [58]:
group_vertices_4

Unnamed: 0,name,VLDB,SIGMOD,ICML,NIPS
81305,Riccardo Rosati,1,5,0,0
352690,M. Lenzerini,1,2,0,0
1026426,Giuseppe De Giacomo,3,6,0,0
672732,Antonella Poggi,1,2,0,0
881959,Diego Calvanese,2,7,0,0
166138,Domenico Lembo,1,3,0,0


In [59]:
group_vertices_5

Unnamed: 0,name,VLDB,SIGMOD,ICML,NIPS
20964,Anthony Nguyen,2,0,0,0
518039,Changkyu Kim,2,3,0,0
382290,Pradeep Dubey,6,4,0,0
1165422,Victor W. Lee,2,2,0,0
769651,Nadathur Satish,3,3,0,0
773908,Jatin Chhugani,4,3,0,0


In [60]:
kcores_all =  kcores_k4_g6 + kcores_k4_g7

In [61]:
skyline_min_all = filter_by_skyline(kcores_all, published_authors, dominates_min)
len(skyline_min_all)

10

In [62]:
skyline_sum_all = filter_by_skyline(kcores_all, published_authors, dominates_sum)
len(skyline_sum_all)

10

In [63]:
skyline_general_all = filter_by_skyline(kcores_all, published_authors, dominates_general)
len(skyline_general_all)

10

In [64]:
unique_to_min_all = [x for x in skyline_min_all if ((not x in skyline_sum_all) and (not x in skyline_general_all))]
unique_to_sum_all = [x for x in skyline_sum_all if ((not x in skyline_min_all) and (not x in skyline_general_all))]
unique_to_general_all = [x for x in skyline_general_all if ((not x in skyline_min_all) and (not x in skyline_sum_all))]

In [65]:
somewhat_unique_to_min_all = [x for x in skyline_min_all if ((not x in skyline_sum_all) or (not x in skyline_general_all))]
somewhat_unique_to_sum_all = [x for x in skyline_sum_all if ((not x in skyline_min_all) or (not x in skyline_general_all))]
somewhat_unique_to_general_all = [x for x in skyline_general_all if ((not x in skyline_min_all) or (not x in skyline_sum_all))]

In [66]:
unique_to_min_all

[[81305, 352690, 1026426, 672732, 881959, 166138],
 [20964, 518039, 382290, 1165422, 769651, 773908]]

In [67]:
unique_to_sum_all

[[1022181, 657270, 1219852, 1598048, 862451, 679579],
 [657270, 1219852, 1598048, 1219947, 862451, 679579]]

In [68]:
unique_to_general_all

[[233195, 130838, 1346058, 797259, 247780, 513870],
 [1022181, 657270, 1219852, 1598048, 1219947, 862451, 679579]]

In [69]:
group_vertices_1_prime, group_edges_1_prime = get_induced_subgraph(unique_to_min_all[0], published_authors, edges)
group_vertices_2_prime, group_edges_2_prime = get_induced_subgraph(unique_to_min_all[1], published_authors, edges)
group_vertices_3_prime, group_edges_3_prime = get_induced_subgraph(unique_to_sum_all[0], published_authors, edges)
group_vertices_4_prime, group_edges_4_prime = get_induced_subgraph(unique_to_sum_all[1], published_authors, edges)
group_vertices_5_prime, group_edges_5_prime = get_induced_subgraph(unique_to_general_all[0], published_authors, edges)
group_vertices_6_prime, group_edges_6_prime = get_induced_subgraph(unique_to_general_all[1], published_authors, edges)

In [70]:
group_vertices_1_prime

Unnamed: 0,name,VLDB,SIGMOD,ICML,NIPS
81305,Riccardo Rosati,1,5,0,0
352690,M. Lenzerini,1,2,0,0
1026426,Giuseppe De Giacomo,3,6,0,0
672732,Antonella Poggi,1,2,0,0
881959,Diego Calvanese,2,7,0,0
166138,Domenico Lembo,1,3,0,0


In [71]:
group_vertices_2_prime

Unnamed: 0,name,VLDB,SIGMOD,ICML,NIPS
20964,Anthony Nguyen,2,0,0,0
518039,Changkyu Kim,2,3,0,0
382290,Pradeep Dubey,6,4,0,0
1165422,Victor W. Lee,2,2,0,0
769651,Nadathur Satish,3,3,0,0
773908,Jatin Chhugani,4,3,0,0


In [72]:
group_vertices_3_prime

Unnamed: 0,name,VLDB,SIGMOD,ICML,NIPS
1022181,Peer Kröger,0,5,0,0
657270,Matthias Renz,2,3,0,0
1219852,Andreas Züfle,1,1,0,0
1598048,Tobias Emrich,1,1,0,0
862451,Nikos Mamoulis,21,11,0,0
679579,Hans-Peter Kriegel,13,21,2,0


In [73]:
group_vertices_4_prime

Unnamed: 0,name,VLDB,SIGMOD,ICML,NIPS
657270,Matthias Renz,2,3,0,0
1219852,Andreas Züfle,1,1,0,0
1598048,Tobias Emrich,1,1,0,0
1219947,Thomas Bernecker,1,0,0,0
862451,Nikos Mamoulis,21,11,0,0
679579,Hans-Peter Kriegel,13,21,2,0


In [74]:
group_vertices_5_prime

Unnamed: 0,name,VLDB,SIGMOD,ICML,NIPS
233195,Giovanni Grasso,2,0,0,0
130838,Tim Furche,2,0,0,0
1346058,Giorgio Orsi,3,1,0,0
797259,Christian Schallhart,2,0,0,0
247780,Andrew Sellers,1,0,0,0
513870,Georg Gottlob,10,24,0,0


In [75]:
group_vertices_6_prime

Unnamed: 0,name,VLDB,SIGMOD,ICML,NIPS
1022181,Peer Kröger,0,5,0,0
657270,Matthias Renz,2,3,0,0
1219852,Andreas Züfle,1,1,0,0
1598048,Tobias Emrich,1,1,0,0
1219947,Thomas Bernecker,1,0,0,0
862451,Nikos Mamoulis,21,11,0,0
679579,Hans-Peter Kriegel,13,21,2,0


In [76]:
somewhat_unique_to_min_all

[[81305, 352690, 1026426, 672732, 881959, 166138],
 [20964, 518039, 382290, 1165422, 769651, 773908]]

In [77]:
somewhat_unique_to_sum_all

[[1022181, 657270, 1219852, 1598048, 862451, 679579],
 [657270, 1219852, 1598048, 1219947, 862451, 679579]]

In [78]:
somewhat_unique_to_general_all

[[233195, 130838, 1346058, 797259, 247780, 513870],
 [1022181, 657270, 1219852, 1598048, 1219947, 862451, 679579]]

In [79]:
import itertools
def find_missing_edges(vertices, edges):
    missing_edges = [(x,y) for x,y in list(itertools.combinations(vertices, 2)) if (str(x),str(y)) not in edges and (str(y),str(x)) not in edges]
    return missing_edges

In [80]:
find_missing_edges(list(group_vertices_1.index), group_edges_1)

[(1022181, 862451)]

In [81]:
find_missing_edges(list(group_vertices_2.index), group_edges_2)

[]

In [82]:
find_missing_edges(list(group_vertices_3.index), group_edges_3)

[(1346058, 247780)]

In [83]:
find_missing_edges(list(group_vertices_4.index), group_edges_4)

[(672732, 881959)]

In [84]:
find_missing_edges(list(group_vertices_5.index), group_edges_5)

[(1165422, 769651)]

In [85]:
unique_researchers = pd.concat([group_vertices_1, group_vertices_2, group_vertices_3, group_vertices_4, group_vertices_5]).drop_duplicates()
len(unique_researchers)

25