In [256]:
%pylab inline

import pandas as pd
import numpy as np

from unidecode import unidecode

class Dict(dict):
    """A ``dict`` whose lookups never raise ``KeyError``.

    Subscripting a key that is not present evaluates to ``False``; the key
    is not inserted into the mapping.
    """

    def __missing__(self, key):
        # dict.__getitem__ calls this hook whenever ``key`` is absent.
        return False

Populating the interactive namespace from numpy and matplotlib


In [409]:
# Raw input tables, read from CSV files that sit one directory above the notebook.
dblp_csv_path = '../cnpq-authors-vkeys.csv'
mas_csv_path = '../mas-venue-subdomain.csv'

dblp = pd.read_csv(dblp_csv_path)
mas = pd.read_csv(mas_csv_path)

# Pre-processing tables
def normalize(x):
    """Return an ASCII, lower-cased text form of ``x``.

    Byte strings are decoded as UTF-8, text strings are used as they are,
    and any other object (for instance a float NaN coming out of pandas) is
    first converted with ``str()``.  The text is then transliterated with
    ``unidecode`` and lower-cased.

    The original spelled the codec ``'utf-8)'``; that stray parenthesis only
    worked because codec-name lookup is lenient.  It also called ``str(x)``
    unconditionally, which raises ``UnicodeEncodeError`` on Python 2 when
    ``x`` is a non-ASCII ``unicode`` object.
    """
    text_type = type(u'')  # ``unicode`` on Python 2, ``str`` on Python 3
    value = x if isinstance(x, (bytes, text_type)) else str(x)
    if isinstance(value, bytes):
        # On Python 2 ``str`` *is* ``bytes``, so this branch covers both the
        # byte strings read from the CSV files and the result of ``str(x)``.
        value = value.decode('utf-8')
    return unidecode(value).lower()

# --- DBLP table -------------------------------------------------------------
dblp = dblp.rename(columns={'VKey': 'DBLPVenueKey',
                            'NameAZList': 'DBLPName',
                            'ShortNameAZList': 'DBLPShortName'})

# Only non-null short names are normalized; the assignment aligns on the
# index, so rows whose short name is missing keep NaN.
dblp['DBLPShortName'] = dblp['DBLPShortName'].dropna().apply(normalize)
dblp['DBLPName'] = dblp['DBLPName'].apply(normalize)

# Venue type is derived from the key prefix: 'C' for keys starting with
# 'conf/', 'J' for everything else.
dblp['DBLPType'] = dblp['DBLPVenueKey'].apply(
    lambda key: 'C' if key.startswith('conf/') else 'J')
# Normalized key with every 'conf/' and 'journals/' occurrence removed.
dblp['DBLPMinVKey'] = dblp['DBLPVenueKey'].apply(
    lambda key: normalize(key).replace("conf/", "").replace("journals/", ""))

# --- MAS table --------------------------------------------------------------
mas = mas.rename(columns={'VenueOldID': 'MASID',
                          'FullName': 'MASName',
                          'Type': 'MASType',
                          'ShortName': 'MASShortName',
                          'SubDomainName': 'MASSubDomainName'})
# ``reindex(columns=...)`` replaces the deprecated ``.ix[:, [...]]`` selection
# and, like it, yields a NaN column for any label that is not present.
mas = mas.reindex(columns=['MASID', 'MASType', 'MASShortName', 'MASName',
                           'DomainName', 'MASSubDomainName', 'DomainID',
                           'SubDomainID'])

# Same null-preserving normalization as for the DBLP names above.
mas['MASShortName'] = mas['MASShortName'].dropna().apply(normalize)
mas['MASName'] = mas['MASName'].apply(normalize)

# For Tests
# dblp = dblp.head(100)

#[dblp['DBLPName'] == 'artificial intelligence in education']

In [410]:
# Reconcile every DBLP venue with its MAS counterpart(s) and write the result.
#
# Matching strategy, applied to each DBLP row in turn:
#   1. Full name  - every MAS venue with the same normalized name AND the same
#                   type produces one output row (1-to-N).
#   2. Short name - tried only when the full name matched no MAS row at all,
#                   and accepted only when exactly one MAS venue shares the
#                   short name and its type agrees (1-to-1).
#   3. Otherwise the DBLP row is emitted without any MAS columns.
#
# Output rows are collected in a plain list and turned into a DataFrame once,
# instead of growing a DataFrame with ``append`` on every iteration (quadratic,
# and ``DataFrame.append`` no longer exists in recent pandas).
MAS_FIELDS = ['MASID', 'MASName', 'MASShortName', 'MASSubDomainName']
FULL_COLUMNS = ['DBLPVenueKey', 'MASID', 'MASSubDomainName', 'MASShortName',
                'DBLPShortName', 'MASName', 'DBLPName']
MINI_COLUMNS = ['DBLPVenueKey', 'MASID', 'MASSubDomainName']

records = []         # one dict per output row
dblp_found = Dict()  # DBLPVenueKey -> 'True' once the venue has been handled

for _, row in dblp.iterrows():
    dblp_key = row['DBLPVenueKey']

    # The same venue key can occur on several DBLP rows; handle it once.
    if dblp_found[dblp_key] == 'True':
        continue

    # First attempt: by venue full name (1-to-N from DBLP to MAS).
    name_matches = mas[mas['MASName'] == row['DBLPName']]
    for _, mas_row in name_matches.iterrows():
        if row['DBLPType'] != mas_row['MASType']:
            print('Distinct Type for:  %s !!!' % (mas_row['MASName'],))
            continue

        record = row.to_dict()
        for field in MAS_FIELDS:
            record[field] = mas_row[field]
        records.append(record)

    if len(name_matches) > 0:
        # NOTE(review): the venue is marked as handled even when every
        # full-name match was rejected for its type; in that case it produces
        # no output row and the short-name attempt is skipped. Kept as in the
        # original - confirm this is intended.
        dblp_found[dblp_key] = 'True'
        continue

    # Second attempt: by venue short name (1-to-1 from DBLP to MAS).
    # A missing (NaN) DBLP short name compares unequal to everything.
    short_matches = mas[mas['MASShortName'] == row['DBLPShortName']]
    if len(short_matches) == 1:
        mas_row = short_matches.iloc[0]
        if row['DBLPType'] != mas_row['MASType']:
            print('Distinct Type for:  %s !!!' % (mas_row['MASName'],))
            # NOTE(review): a type mismatch here also drops the venue from the
            # output instead of emitting it unmatched. Kept as in the original.
            continue

        record = row.to_dict()
        for field in MAS_FIELDS:
            record[field] = mas_row[field]
        records.append(record)
        dblp_found[dblp_key] = 'True'
        continue

    # No match: emit the DBLP row without MAS columns.
    # NOTE(review): unmatched keys are never recorded in ``dblp_found``, so a
    # key that occurs on several DBLP rows is emitted once per occurrence.
    records.append(row.to_dict())

# Sort by venue key (stable) before building the frame; this avoids the removed
# ``DataFrame.sort`` and leaves a fresh 0..n-1 index. Passing ``columns``
# selects the output columns and creates all-NaN ones for MAS fields that no
# row provided, so an empty or fully unmatched input still works.
records.sort(key=lambda record: record['DBLPVenueKey'])
result = pd.DataFrame(records, columns=FULL_COLUMNS)

# Writing result files
result.to_csv('../conciliated_venues_FULL.csv')

mini_result = result[MINI_COLUMNS]
mini_result.to_csv('../conciliated_venues.csv')

Distinct Type for:  human-computer interaction !!!
Distinct Type for:  computational intelligence !!!
Distinct Type for:  intelligent data analysis !!!
Distinct Type for:  mobile information systems !!!
Distinct Type for:  mobile information systems !!!
Distinct Type for:  language resources and evaluation !!!
Distinct Type for:  language resources and evaluation !!!
Distinct Type for:  parallel computing !!!
Distinct Type for:  pervasive and mobile computing !!!
Distinct Type for:  operating systems review !!!
Distinct Type for:  sigplan notices !!!
Distinct Type for:  computational intelligence !!!
Distinct Type for:  cluster computing !!!
Distinct Type for:  intelligent data analysis !!!
Distinct Type for:  operations research !!!
Distinct Type for:  information retrieval !!!
Distinct Type for:  language resources and evaluation !!!
Distinct Type for:  multimedia systems !!!
Distinct Type for:  parallel computing !!!
Distinct Type for:  requirements engineering !!!
Distinct Type for

In [None]:
# Preview only the first rows instead of dumping the whole reconciled table;
# the complete data is written to ../conciliated_venues_FULL.csv.
result.head(10)

In [183]:
# H-Index File: keep only the author name and H-index columns.
h = pd.read_csv('../cnpq/prod-levels.csv')
h = h.rename(columns={'Name': 'Author', 'H-index': 'HIndex'})
# ``reindex(columns=...)`` replaces the deprecated ``.ix[:, [...]]`` selection.
h = h.reindex(columns=['Author', 'HIndex'])
# Render known H-index values as integer strings (e.g. 12.0 -> '12'); the
# assignment aligns on the index, so rows without a value keep NaN.
h['HIndex'] = h['HIndex'].dropna().apply(lambda value: str(int(value)))
#h.to_csv('../author-hindex.csv')

In [252]:
#result[result.groupby('DBLPName').count() > 1]