In [29]:
# Created on 2015-02-26 by Alberto Ueda
# Generates the input to ca-index from the MAS and DBLP data.
%pylab inline

import pandas as pd
import numpy as np
import mmap
import time
import re

from unidecode import unidecode

CS_AREA = 2        
UTF8 = 'utf-8'        

class Dict(dict):
    """A dict whose lookups of absent keys evaluate to False.

    The absent key is not stored by the lookup, so membership tests and
    len() are unaffected by failed lookups.
    """

    def __missing__(self, key):
        # Called by dict.__getitem__ when `key` is not present.
        return False

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy


## Step 1: MAS - Venues and Subdomains

In [24]:
# Raw MAS tables, with the columns renamed to the names the merges below use.
# The renames are reassignments (not inplace) so re-running the cell always
# starts from what is on disk.
conf     = pd.read_csv("../azure/Conference.csv").rename(columns={'ID': 'VenueOldID'})
conf_cat = pd.read_csv("../azure/Conf_Category.csv").rename(columns={'ConfCategoryID': 'CategoryID'})
jour     = pd.read_csv("../azure/Journal.csv").rename(columns={'ID': 'VenueOldID'})
jour_cat = pd.read_csv("../azure/Jour_Category.csv").rename(columns={'CJourID': 'CategoryID'})
cat      = pd.read_csv("../azure/Category.csv").rename(columns={'Name': 'SubDomainName'})
dom      = pd.read_csv("../azure/Domain.csv").rename(columns={'Name': 'DomainName'})


def _venues_with_domain_names(venues, venue_categories, subdomains, domains, venue_type):
    """Attach sub-domain and domain names to the CS_AREA venues of one kind.

    venues           -- venue table with a 'VenueOldID' column; only its first
                        three columns (by position) are used.
    venue_categories -- table with 'CategoryID', 'DomainID' and 'SubDomainID'.
    subdomains       -- table keyed by ('DomainID', 'SubDomainID').
    domains          -- table keyed by 'ID'.
    venue_type       -- value written to the 'Type' column of every row.

    Returns a new DataFrame; none of the inputs is modified.
    """
    # Left join: venues without a category row are kept here and removed by
    # the DomainID filter below (their DomainID is NaN).
    venue_domains = pd.merge(venues.iloc[:, :3], venue_categories,
                             left_on='VenueOldID',
                             right_on='CategoryID',
                             how='left')
    cs_venues = venue_domains[venue_domains['DomainID'] == CS_AREA]
    with_subdomain_names = pd.merge(cs_venues, subdomains,
                                    left_on=['DomainID', 'SubDomainID'],
                                    right_on=['DomainID', 'SubDomainID'],
                                    how='left')
    # Columns present on both sides get the 'SubDomain' / 'Domain' suffixes.
    with_domain_names = pd.merge(with_subdomain_names, domains,
                                 left_on='DomainID',
                                 right_on='ID',
                                 suffixes=['SubDomain', 'Domain'],
                                 how='left')
    with_domain_names['Type'] = venue_type
    return with_domain_names


# Conferences ('C') and journals ('J') go through the same pipeline.
conf_and_dom_names = _venues_with_domain_names(conf, conf_cat, cat, dom, 'C')
jour_and_dom_names = _venues_with_domain_names(jour, jour_cat, cat, dom, 'J')

venues_subdomains = pd.concat([conf_and_dom_names, jour_and_dom_names], ignore_index=True)
venues_subdomains.to_csv('../azure/mas-venue-subdomain.csv')

## Step 2: DBLP - CNPq Authors' Publications

In [25]:
# Time
start_time = time.time()


def _to_ascii_upper(raw_name):
    """Decode a UTF-8 byte string and return its upper-cased ASCII transliteration."""
    return unidecode(raw_name.decode(UTF8)).upper()


def _simple_name(full_name):
    """Return 'FIRST LAST' for a whitespace-separated name, or None if it has no tokens."""
    tokens = full_name.split()
    if not tokens:
        return None
    return tokens[0] + ' ' + tokens[-1]


# CNPQ Authors
cnpq_authors = pd.read_csv("../cnpq/prod-levels.csv")
cnpq_authors = cnpq_authors.reindex(columns=['Name', 'ShortName', 'CNPqLevel'])
fullnames_list = [_to_ascii_upper(x) for x in cnpq_authors['Name'].tolist()]
shortnames_list = [_to_ascii_upper(x) for x in cnpq_authors['ShortName'].tolist()]

# First + last names (names without any token are skipped)
simplenames_list = [simple for simple in (_simple_name(full) for full in fullnames_list)
                    if simple is not None]

# A DBLP author is accepted when the name equals a CNPq full name, short name
# or simple name. One set gives constant-time lookups for each author line.
known_names = set(fullnames_list) | set(shortnames_list) | set(simplenames_list)

# DBLP Authors
# Lines containing a tab are publications of the most recent author line;
# every other line is an author name.
selected_lines = []
used_simplenames = set()
copy_pubs = False

with open('../dblp/dblp-sorted.pubsfile') as dblp_file:
    for line in dblp_file:

        if '\t' in line:
            if copy_pubs:
                # Adding author's publications
                selected_lines.append(line)
            continue

        # Resetting the flag: a new author section starts here
        copy_pubs = False
        dblp_author = _to_ascii_upper(line.strip('\n'))

        # It is a name. Generating simplename for check.
        simplename = _simple_name(dblp_author)
        if simplename is None:
            # Blank line: nothing to match
            continue

        # Heuristic: when several DBLP authors share a simple name, only the
        # first one encountered is kept.
        # NOTE(review): presumably the input file is ordered so that the first
        # one is the author with more publications -- confirm.
        if simplename in used_simplenames or dblp_author not in known_names:
            continue

        # Adding author's name
        selected_lines.append(line)
        used_simplenames.add(simplename)
        copy_pubs = True

print("\n--- Finished after %.3f seconds. ---" % (time.time() - start_time))

# The output is opened only after the scan succeeded, so a failure while
# reading does not leave a truncated file behind.
with open('../dblp/cnpq-authors-dblpkeys.pubsfile', 'wb') as output:
    output.writelines(selected_lines)



--- Finished after 156.600 seconds. ---


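## Step 3: DBLP - CNPq Authors' Venues

*(No code cell is present for this step. Step 4 reads `../dblp/cnpq-all-vkeys.txt`, which no cell in this notebook writes; presumably it is produced here, outside the notebook — TODO confirm.)*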

## Step 4: DBLP - Merging DBLP Venues Info with CNPq Venues' Names

In [26]:
# Venue keys, read without a header row; the single column is named VKey.
vkeys = pd.read_csv('../dblp/cnpq-all-vkeys.txt', header=None)
vkeys.columns = ['VKey']

# DBLP venue listings: the 'Unnamed: 0' column of each file becomes VKey.
vkey_column = {'Unnamed: 0': 'VKey'}
confs = pd.read_csv('../dblp/dblp-conf-azlist2.csv')
confs.rename(columns=vkey_column, inplace=True)
journals = pd.read_csv('../dblp/dblp-journals-azlist2.csv')
journals.rename(columns=vkey_column, inplace=True)
all_dblp_venues = pd.concat([confs, journals])

# Left join on the shared column(s): every key in vkeys is kept, with NaN
# venue info when the key is in neither listing.
all_cnpq_authors_venues = vkeys.merge(all_dblp_venues, how='left')
all_cnpq_authors_venues.to_csv('../dblp/cnpq-authors-vkeys.csv')

## Step 5: DBLP & MAS - Venues Conciliation 

In [28]:
dblp = pd.read_csv('../dblp/cnpq-authors-vkeys.csv')
mas = pd.read_csv('../azure/mas-venue-subdomain.csv')

# Pre-processing tables
def normalize(x):
    """Return the lower-cased ASCII transliteration of x.

    Byte strings are decoded as UTF-8, text strings are used as they are,
    and any other value (e.g. a number) goes through str() first.
    """
    text = x if isinstance(x, (bytes, type(u''))) else str(x)
    if isinstance(text, bytes):
        # Python 2 `str` values (and the result of str(x) there) are bytes.
        text = text.decode(UTF8)
    return unidecode(text).lower()

MAS_COLUMNS = ['MASID', 'MASType', 'MASShortName', 'MASName', 'DomainName',
               'MASSubDomainName', 'DomainID', 'SubDomainID']
# MAS fields copied onto a DBLP venue when a match is found
MAS_MATCH_COLUMNS = ['MASID', 'MASName', 'MASShortName', 'MASSubDomainName']
RESULT_COLUMNS = ['DBLPVenueKey', 'MASID', 'MASSubDomainName', 'MASShortName',
                  'DBLPShortName', 'MASName', 'DBLPName']


def _normalized_or_null(series):
    """Return normalize() applied to the non-null values of series.

    Null entries are left out of the result, so assigning it back to a column
    leaves them as NaN. They are deliberately not turned into the string
    'nan', which would make two venues with a missing name compare equal.
    """
    return series[series.notnull()].apply(normalize)


def _same_type_matches(dblp_row, candidates):
    """Return the rows of candidates whose MASType equals the DBLP venue type.

    Candidates of the other type are reported on stdout and left out.
    """
    kept = []
    for _, mas_row in candidates.iterrows():
        if dblp_row['DBLPType'] != mas_row['MASType']:
            print('Distinct Type for:  %s !!!' % mas_row['MASName'])
            continue
        kept.append(mas_row)
    return kept


dblp = dblp.rename(columns={'VKey': 'DBLPVenueKey', 'NameH1': 'DBLPName',
                            'ShortNameAZList': 'DBLPShortName'})
dblp['DBLPShortName'] = _normalized_or_null(dblp['DBLPShortName'])
dblp['DBLPName'] = _normalized_or_null(dblp['DBLPName'])

dblp['DBLPType'] = dblp['DBLPVenueKey'].apply(lambda key: 'C' if key.startswith('conf/') else 'J')
dblp['DBLPMinVKey'] = dblp['DBLPVenueKey'].apply(
    lambda key: normalize(key).replace("conf/", "").replace("journals/", ""))

mas = mas.rename(columns={'VenueOldID': 'MASID', 'FullName': 'MASName', 'Type': 'MASType',
                          'ShortName': 'MASShortName', 'SubDomainName': 'MASSubDomainName'})
mas = mas.reindex(columns=MAS_COLUMNS)
mas['MASShortName'] = _normalized_or_null(mas['MASShortName'])
mas['MASName'] = _normalized_or_null(mas['MASName'])

# One dict per output row. The frame is built once at the end instead of
# being grown row by row.
records = []
dblp_found = set()

for _, dblp_row in dblp.iterrows():
    dblp_key = dblp_row['DBLPVenueKey']

    # A key that was already matched is not processed again.
    if dblp_key in dblp_found:
        continue

    # First attempt: by venue full name (1-to-N from DBLP to MAS)
    by_fullname = mas[mas['MASName'] == dblp_row['DBLPName']]
    matches = _same_type_matches(dblp_row, by_fullname)

    # Second attempt: by venue short name, only when it is unambiguous
    # (exactly one MAS venue carries that short name)
    if not matches:
        by_shortname = mas[mas['MASShortName'] == dblp_row['DBLPShortName']]
        if len(by_shortname) == 1:
            matches = _same_type_matches(dblp_row, by_shortname)

    record = dblp_row.to_dict()

    if not matches:
        # No usable match (none at all, or only venues of the other type):
        # keep the venue with empty MAS fields so every DBLP venue appears in
        # the output.
        records.append(record)
        continue

    dblp_found.add(dblp_key)
    for mas_row in matches:
        matched = dict(record)
        for column in MAS_MATCH_COLUMNS:
            matched[column] = mas_row[column]
        records.append(matched)

# Sort (stable, so the rows of one venue keep their match order)
records.sort(key=lambda rec: rec['DBLPVenueKey'])

# Writing result files
result = pd.DataFrame(records, columns=RESULT_COLUMNS)
result.to_csv('../conciliated_venues_FULL.csv')

mini_result = result[['DBLPVenueKey', 'MASID', 'MASSubDomainName']]
mini_result.to_csv('../conciliated_venues.csv')

Distinct Type for:  cluster computing !!!
Distinct Type for:  human-computer interaction !!!
Distinct Type for:  computational intelligence !!!
Distinct Type for:  intelligent data analysis !!!
Distinct Type for:  requirements engineering !!!
Distinct Type for:  operating systems review !!!
Distinct Type for:  automated software engineering !!!
Distinct Type for:  automated software engineering !!!
Distinct Type for:  computational intelligence !!!
Distinct Type for:  cluster computing !!!
Distinct Type for:  intelligent data analysis !!!
Distinct Type for:  operations research !!!
Distinct Type for:  information retrieval !!!
Distinct Type for:  language resources and evaluation !!!
Distinct Type for:  multimedia systems !!!
Distinct Type for:  parallel computing !!!
Distinct Type for:  requirements engineering !!!
Distinct Type for:  theoretical computer science !!!
Distinct Type for:  vlsi design !!!
Distinct Type for:  virtual reality !!!


## Step 6: DBLP Paper -> MAS SubArea

In [37]:
venues = pd.read_csv('../conciliated_venues.csv')

# Captures everything before the last '/' of a key containing at least two '/'
# (e.g. 'a/b/c' -> 'a/b').
VENUE_KEY_PATTERN = re.compile(r'(.*/.*)/')


def _venue_key(paper_key):
    """Return the venue part of a paper key, or "" when the pattern does not match."""
    match = VENUE_KEY_PATTERN.search(paper_key)
    return match.group(1) if match else ""


def _sort_by(frame, column):
    """Return frame sorted by column, on pandas with or without sort_values."""
    if hasattr(frame, 'sort_values'):
        return frame.sort_values(column)
    return frame.sort(column)  # older pandas only


# Adding all CNPq authors' publications: the lines that contain a tab.
# The first character and the trailing newline are dropped; rstrip() is used
# so a final line without a newline does not lose its last character.
with open('../dblp/cnpq-authors-dblpkeys.pubsfile') as dblp_file:
    lines = [line.rstrip('\n')[1:] for line in dblp_file if '\t' in line]

paper_area = pd.DataFrame({'Paper': lines}, columns=['Paper'])
paper_area['Venue'] = paper_area['Paper'].apply(_venue_key)

# Left join: papers whose venue has no conciliated entry keep a NaN area.
paper_area = pd.merge(paper_area, venues, left_on='Venue', right_on='DBLPVenueKey', how='left')
paper_area = paper_area.rename(columns={'MASSubDomainName': 'Area'})
paper_area = _sort_by(paper_area, 'Area').reset_index(drop=True)

paper_area = paper_area.reindex(columns=['Paper', 'Area'])
paper_area.to_csv('../paper-areas.csv')

## Step 7: Scholar - CNPq Authors' H-Indexes

In [39]:
# H-Index File
h = pd.read_csv('../cnpq/prod-levels.csv')
h = h.rename(columns={'Name': 'Author', 'H-index': 'HIndex'})
h = h.reindex(columns=['Author', 'HIndex'])

# Known h-indexes are written as integer strings (e.g. 12.0 -> '12').
# Rows without one are absent from the right-hand side, so they stay NaN
# after the index-aligned assignment.
has_hindex = h['HIndex'].notnull()
h['HIndex'] = h.loc[has_hindex, 'HIndex'].apply(lambda x: str(int(x)))
h.to_csv('../author-hindex.csv')