In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from itertools import combinations
import re

## coauth-dblp

In [2]:
# Read the three tab-separated tables of the DBLP gender dataset in one pass;
# the order of the paths matches the order of the names being assigned.
author_df, general_df, affiliation_df = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (
        "../RawData/dblp-gender-tsvs/authors.tsv",
        "../RawData/dblp-gender-tsvs/general.tsv",
        "../RawData/dblp-gender-tsvs/affiliation.tsv",
    )
)

In [3]:
## Give each name a unique id
# Ids are consecutive integers starting at 0, handed out in the order in which
# each distinct name first appears in author_df.
distinct_names = pd.unique(author_df['name'])
ids = dict(zip(distinct_names, range(len(distinct_names))))

In [6]:
## Build the node-label table (one row per distinct name/gender pair) and
## write the labelled nodes to labels.csv.
label_df = (
    author_df[['name', 'gender']]
    .drop_duplicates()
    .rename(columns={'gender': 'group_code'})
)
# Map ids from label_df's own name column instead of relying on index
# alignment with author_df; the resulting values are the same.
label_df['id'] = label_df['name'].map(ids)
# Encode gender: M -> 0, F -> 1, "-" (unknown) -> -1.
# NOTE(review): any other gender value maps to NaN, and NaN passes the
# `!= -1` test below -- confirm the input only contains these three codes.
label_df['group_code'] = label_df['group_code'].map({
    "M": 0,
    "F": 1,
    "-": -1
})
# Every node seen in the raw data, labelled or not.
every_og_node = list(label_df['id'])
has_known_label = label_df['group_code'] != -1
# Nodes that have at least one row with a known gender code.
good_nodes = list(label_df[has_known_label]['id'])
# Filter on the code itself rather than on id membership in good_nodes: when a
# name occurs with both a known and an unknown gender, the id-based filter let
# the -1 row slip into labels.csv.
label_df = label_df[has_known_label]
label_df[['id','group_code']].to_csv('../Data/coauth-dblp/labels.csv', index=False)

In [7]:
# Lookup table node id -> bool: start with every node flagged False, then flip
# the labelled ("good") nodes to True.
is_node_good = dict.fromkeys(every_og_node, False)
is_node_good.update(dict.fromkeys(good_nodes, True))

In [8]:
## Generate edges and triangles with year
# Attach each paper's row from general_df (joined on the paper key `k`) to
# every author row, then translate author names to integer node ids.
full_df = pd.merge(author_df, general_df, on='k', how='left')
full_df['node_id'] = full_df['name'].map(ids)
elist = []  # (node_1, node_2, year) with node_1 < node_2
tlist = []  # (node_1, node_2, node_3, year) with ascending node ids
slist = []  # [node, node, ..., year]: one full author simplex per paper
for paper, df in full_df.groupby('k'):
    # Keep only labelled authors. dict.fromkeys drops an author listed twice
    # on the same paper (which would otherwise create self-loops) while
    # preserving the listed author order.
    all_nodes = list(dict.fromkeys(i for i in df['node_id'] if is_node_good[i]))
    if len(all_nodes) > 1:
        # The year comes from the paper's first merged row.
        year = df['year'].iloc[0]
        # combinations() emits tuples in input order, so drawing them from the
        # sorted ids gives every undirected edge/triangle one canonical
        # orientation. Without this, (a, b) and (b, a) from different papers
        # would both survive the later drop_duplicates().
        canonical_nodes = sorted(all_nodes)
        for e in combinations(canonical_nodes, 2):
            elist.append((e[0], e[1], year))
        for t in combinations(canonical_nodes, 3):
            tlist.append((t[0], t[1], t[2], year))
        # Simplices keep the listed author order, followed by the year.
        slist.append(all_nodes + [year])

In [9]:
# Edges: drop the year column and keep each node pair once.
# Passing `columns=` to the constructor (instead of assigning `.columns`
# afterwards) keeps this working when the list is empty; the old form raised
# a length-mismatch error on a zero-column frame.
e_df = pd.DataFrame(elist, columns=['node_1', 'node_2', 't'])
e_df = e_df[['node_1', 'node_2']].drop_duplicates()
e_df.to_csv('../Data/coauth-dblp/edges.csv', index=False)

# Triangles: same treatment for node triples.
t_df = pd.DataFrame(tlist, columns=['node_1', 'node_2', 'node_3', 't'])
t_df = t_df[['node_1', 'node_2', 'node_3']].drop_duplicates()
t_df.to_csv('../Data/coauth-dblp/triangles.csv', index=False)

# Simplices: one line per paper, "node,node,...,year". Rows have varying
# length and there is no header line.
with open('../Data/coauth-dblp/simplices.csv', 'w') as f:
    for item in slist:
        f.write("%s\n" % ",".join(str(i) for i in item))