### CN-Infocom Clusters

Created on 2016-04-04 by Alberto Ueda.

In [2]:
%pylab inline
%matplotlib inline

import pandas as pd
import numpy as np
import mmap
import time
import re

from unidecode import unidecode
UTF8 = 'utf-8'


class Dict(dict):
    """A ``dict`` whose lookups of absent keys evaluate to ``False``.

    ``__missing__`` only supplies the fallback value; it does not insert the
    key, so ``key in d`` is still ``False`` after a failed lookup.
    """

    def __missing__(self, key):
        # Called by dict.__getitem__ when `key` is not present.
        return False

# Disabled workaround: forcing the default encoding fixes the handling of
# accented characters, but it breaks the terminal output.
#reload(sys)
#sys.setdefaultencoding(UTF8)

# Root folder of the dataset; the file paths used by the cells below are built
# from it. Activate the line that matches the machine running the notebook.
mypath = '/mnt/hd0/alberto/Dropbox/ufmg-not-code/datasets/dblp/cn-infocom-clusters/' # UFMG
# mypath = '/home/alberto/Dropbox/ufmg-not-code/datasets/dblp/cn-infocom-clusters/' # Home

Populating the interactive namespace from numpy and matplotlib


### Loading Coauthors (Must be executed once)

In [3]:
# Filtering by number of publications: keep the authors with more than
# `min_publications` papers.
min_publications = 20
a_num_papers = pd.read_csv(mypath + '../author_num_papers.csv')
filtered_authors = a_num_papers[a_num_papers['n_papers'] > min_publications]['author'].tolist()
# The number of retained authors is used below as the largest accepted author id.
# NOTE(review): this presumes author ids are ranked by publication count - confirm.
highest_author_id = len(filtered_authors)
print ('highest_author_id: ', highest_author_id)

# Load coauthors   # full exec. time ~2min
# `max_coauthors` is kept as a tunable but is NOT applied: every coauthor on a
# row is read (see `row_end` below). It is no longer overwritten per row.
max_coauthors = 100
# highest_author_id = 170049

# Maps author id -> list of coauthor ids; absent authors read as False (see Dict).
coauthors = Dict()
with open(mypath + '../author-coauthors.tsv') as f:
    # Iterate the file lazily: the loop stops at the first author id above the
    # threshold, so the rest of the file never needs to be read.
    for k, line in enumerate(f):
        if k % 100000 == 0: print ('Processed', k , 'rows of coauthorship...')

        authors = line.split()
        if not authors: continue    # blank line: nothing to parse

        # First field is the author id, the remaining fields are the coauthors.
        author = int(authors[0])

        # Early exit; presumes the rows are sorted by ascending author id.
        if author > highest_author_id: break

        # Unlimited by default. To cap the coauthors read per row, use
        # min(len(authors), max_coauthors + 1) instead.
        row_end = len(authors)

        for i in range(1, row_end):
            coauthor = int(authors[i])

            if coauthor > highest_author_id: continue

            # The entry is created only once a coauthor passes the filter, so
            # authors without any valid coauthor stay absent from the mapping.
            coauthors.setdefault(author, []).append(coauthor)

print ('Coauthors loaded.')

highest_author_id:  79800
Processed 0 rows of coauthorship...
Coauthors loaded.


### Filtering by Venue (Bash)

### Loading Venue Authors Data

In [4]:
venue_name = 'wsdm'

# Output file with the venue's authors ranked by score, and the venue folder.
venue_top_authors_file = venue_name + '_authors.csv.sorted'
output_path = mypath + venue_name + '/'

a_p = pd.read_csv(mypath + "../authorpaper.csv", names=['author','paper'])#, nrows=17220)
all_names = pd.read_csv(mypath + "../all_names.csv")
venue_p = pd.read_csv(output_path + "papers.csv." + venue_name, header=None, names=['paper', 'conf', 'year', 'x'], sep='\t')#, nrows=17220)
print ('venue_p:\n', venue_p.head(), '\n')

# One row per (venue paper, author), enriched with the author's id and label.
venue_a = pd.merge(venue_p, a_p, how='left', on='paper')
venue_a = pd.merge(venue_a, all_names, left_on='author', right_on='id', how='left')

# Score = number of rows of `venue_a` carrying the author's id. A single
# value_counts() replaces the per-author re-filtering of the whole frame and the
# DataFrame.set_value call, which was removed in pandas 1.0.
rows_per_author = venue_a['id'].value_counts()

# .copy() so the new column is added to an independent frame, not to a slice.
venue_a_unique = venue_a[['id','label']].drop_duplicates().copy()
# Ids missing from the counts (e.g. NaN after the left merges) score 0.
venue_a_unique['score'] = venue_a_unique['id'].map(rows_per_author).fillna(0).astype(int)

venue_a_unique = venue_a_unique.sort_values(by='score', ascending=False)
venue_a_unique.to_csv(output_path + venue_top_authors_file, index=False)
print ('venue_a_unique:\n', venue_a_unique.head())

venue_p:
    paper                    conf  year    x
0   5025       conf/wsdm/LuXKY14  2014  994
1   5292     conf/wsdm/ZhangKY14  2014  994
2   5554  conf/wsdm/AgarwalLTY08  2008  994
3   5680      conf/wsdm/DingLY08  2008  994
4   5758    conf/wsdm/ZhangWWY13  2013  994 

venue_a_unique:
         id                label  score
95   11005             Yi Chang     11
494   1090           Ravi Kumar     11
858   3467      Susan T. Dumais     11
621  17346  Sreenivas Gollapudi     10
700   9326     Vanja Josifovski     10


In [6]:
venue_a.head() # Preview of the merged paper/author rows to check the author ids and labels

Unnamed: 0,paper,conf,year,x,author,id,label
0,5025,conf/wsdm/LuXKY14,2014,994,5,5,Philip S. Yu
1,5025,conf/wsdm/LuXKY14,2014,994,25868,25868,Xiangnan Kong
2,5025,conf/wsdm/LuXKY14,2014,994,76555,76555,Sihong Xie
3,5025,conf/wsdm/LuXKY14,2014,994,491641,491641,Chun-Ta Lu
4,5292,conf/wsdm/ZhangKY14,2014,994,5,5,Philip S. Yu


### Generating the Graph of Interest

In [7]:
n_top_authors = 50
levels = 2
tmp_file = 'tmp'

# Selecting the first n top authors
top_authors = pd.read_csv(output_path + venue_top_authors_file)
list_venue_top_a = top_authors['id'].head(n_top_authors).tolist()
# or by author id
# list_top = top_authors[top_authors['label'].str.contains('Edmundo')]['id'].head(n_authors).tolist()
print ("Writing coauthorship edges [", venue_name, "]...")

# Raw edge list: one line per (author, coauthor) pair met during the expansion.
# Duplicates are written here and removed after the merge with the weights.
with open(output_path + tmp_file, 'w') as f:
    visited = set()             # authors whose coauthors were already written
    all_authors_found = []      # every author id that must become a node
    new_authors = list_venue_top_a
    f.write('SOURCE,TARGET,TYPE\n')

    # range(1, levels): with levels = 2 the frontier is expanded exactly once,
    # i.e. the top authors plus their direct coauthors.
    for _level in range(1, levels):
        authors = new_authors
        new_authors = []

        for author in authors:
            # Skip authors without loaded coauthors and authors already expanded.
            if author not in coauthors or author in visited: continue
            all_authors_found.append(author)

            for coauthor in coauthors[author]:
                # Left ID always lower than Right ID
                left, right = min(author, coauthor), max(author, coauthor)
                f.write(str(left) + ',' + str(right) + ',Undirected\n')

                new_authors.append(coauthor)
                all_authors_found.append(coauthor)

            # if (author > highest_author_id): print ('Warning: the author should not be here:', author)
            visited.add(author)


# Names of the authors of interest; highlighting the top authors from the list
print ("Writing authors nodes...")
nodes = pd.read_csv(mypath + "../all_names.csv")
# .copy() so the 'top' column is added to an independent frame, not to a slice.
nodes = nodes[nodes['id'].isin(all_authors_found)].copy()
# 1 for the venue's top authors, 0 otherwise (replaces the removed `.ix` indexer).
nodes['top'] = nodes['id'].isin(list_venue_top_a).astype(int)
nodes.to_csv(output_path + "nodes.csv", index=False)

# Edge Weights (based on number of coauthorships)
print ("Assigning the correct weights to the edges...")
weights = pd.read_csv(mypath + '../coauthorships.csv', names=['SOURCE', 'TARGET', 'WEIGHT'], skiprows=1)
edges = pd.read_csv(output_path + tmp_file)

# We could apply functions to balance the weights.
# np.log is used explicitly instead of the bare `log` injected by %pylab.
weights['WEIGHT'] = np.log(weights['WEIGHT']) + 1

# Inner join on the endpoints: edges without a known weight are dropped.
edges = pd.merge(edges, weights, on=['SOURCE', 'TARGET'])
edges = edges.drop_duplicates()
edges.to_csv(output_path + 'edges.csv', index=False)

# MCL Input
print ("Generating MCL input...")
edges[['SOURCE', 'TARGET', 'WEIGHT']].to_csv(output_path + 'mcl_input.txt', index=False, header=None, sep=' ')

print ("N:", len(nodes), " E:", len(edges))
print ("Process has finished.")

Writing coauthorship edges [ wsdm ]...
Writing authors nodes...
Assigning the correct weights to the edges...
Generating MCL input...
N: 3179  E: 4891
Process has finished.


### Reweighting only

In [11]:
# Re-apply the log(x) + 1 scaling to the weights stored in 'edges.csv'; the
# result is only previewed, nothing is written back.
# NOTE(review): the graph-generation cell already stores log(x) + 1 weights in
# 'edges.csv', so this applies the transform a second time - confirm intent.
old = pd.read_csv(output_path + 'edges.csv')
# np.log is used explicitly instead of the bare `log` injected by %pylab.
old['WEIGHT'] = np.log(old['WEIGHT']) + 1
# old.to_csv(output_path + 'coauthorships_of_interest.csv.weight-log')
old.head()

Unnamed: 0,SOURCE,TARGET,TYPE,WEIGHT
0,32,211,Undirected,2.098612
1,55,211,Undirected,1.0
2,98,211,Undirected,3.397895
3,193,211,Undirected,2.098612
4,211,10458,Undirected,1.693147


### Using the MCL Output to generate Gephi Input

In [18]:
# Remember to change this accordingly
# output_path = mypath + "cn/"

nodes = pd.read_csv(output_path + "nodes.csv")

# Make sure the column exists even when the MCL output holds no usable line;
# otherwise the column selection at the end of the cell raises a KeyError.
if 'mcl_cluster' not in nodes.columns:
    nodes['mcl_cluster'] = np.nan

# Each line of the MCL output lists the author ids of one cluster; the first
# id on the line is used as the cluster identifier.
with open(output_path + 'mcl_output.txt') as f:
    for line in f:
        authors = [int(x) for x in line.split()]
        if not authors: continue    # blank line: no cluster to assign

        cluster_id = authors[0]
        # `.loc` replaces the `.ix` indexer, which was removed in pandas 1.0.
        nodes.loc[nodes['id'].isin(authors), 'mcl_cluster'] = cluster_id

print("MCL clusters assigned to authors of [", output_path, "]")

# Overwrites nodes.csv with the cluster column added.
nodes[['id', 'label', 'top', 'mcl_cluster']].to_csv(output_path + "nodes.csv", index=False)
nodes.head()

MCL clusters assigned to authors of [ /mnt/hd0/alberto/Dropbox/ufmg-not-code/datasets/dblp/cn-infocom-clusters/cn/ ]


Unnamed: 0,id,label,top,mcl_cluster
0,0,H. Vincent Poor,0,1411
1,1,Wei Wang,0,1915
2,2,Yan Zhang,0,1333
3,3,Wei Liu,0,3
4,13,Yang Yang,0,1915
