Created by Alberto Ueda on 2017-02-17

## Loading Libs and Data

In [1]:
N_AUTHORS = None

%pylab inline

import sys
sys.path.append('../../../../../ufmg-latin/pscore/rfs/')

import rfslib.rankmodels
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt

import rfslib.datasets
dataset = rfslib.datasets.DBLPDataset('../../../../../pscore/datasets/',nauthors=N_AUTHORS)
dataset.prepare()
venuerank  = dataset.dfvenues.copy()
authorrank = dataset.dfauthors.copy()
grouprank  = dataset.dfgroups.copy()

Populating the interactive namespace from numpy and matplotlib
(1595771, 1) authors
(1595771, 1) authors loaded


In [2]:
import rfslib.pscorefactory
pscore = rfslib.pscorefactory.PScoreRankFactory(dataset,cpp=True,ranksize=100,fullrank=False)

## Filtering Venues

In [3]:
# %%time
vap = dataset.dfvenuesauthorpaper.copy()

def live_venues_in(year):
    return vap[vap.Year >= year].drop_duplicates('Vkey').Vkey.tolist()

def dead_venues_in(year):
    live = live_venues_in(year)
    dead = vap[~vap.Vkey.isin(live)].drop_duplicates('Vkey').Vkey.tolist()
    return dead

def venues_with_min_psize(min_papers):
    count = vap.drop_duplicates('PID').groupby('Vkey').PID.count()
    return count[count > min_papers].index.tolist()

def valid_vkeys(year, min_papers):
    big = set(venues_with_min_psize(min_papers))
    dead = set(dead_venues_in(year))
    
    return big.difference(dead)
    
valid_vkeys(2014, 10000)

{'conf/chi',
 'conf/globecom',
 'conf/hci',
 'conf/hicss',
 'conf/icassp',
 'conf/icc',
 'conf/icip',
 'conf/icra',
 'conf/igarss',
 'conf/interspeech',
 'conf/iros',
 'conf/iscas',
 'conf/vtc',
 'journals/amc',
 'journals/bioinformatics',
 'journals/cacm',
 'journals/corr',
 'journals/dm',
 'journals/ieicet',
 'journals/tcs',
 'journals/tit',
 'journals/tsp'}

In [4]:
%%time
valid_venues = valid_vkeys(year=2012, min_papers=100)
print len(valid_venues)

2690
CPU times: user 2.91 s, sys: 220 ms, total: 3.13 s
Wall time: 3.13 s


## Venues' Qualis, h-index and citations

In [5]:
import venue.qualis
from venue.qualis import qualis

venuerank['Qualis'] = venuerank.Vkey.map(lambda vkey: qualis.get_qualis_dblp('dblp:'+vkey))
venuerank['QualisRel'] = pd.Series()
venuerank['QualisRel'] = venuerank['Qualis'].replace(
    {'A1':7,'A2':6,'B1':5,'B2':4,'B3':3,'B4':2,'B5':1,'C ':0,np.nan:0})
print venuerank['QualisRel'].unique()

dfgsvenues = pd.read_csv('../data/dblp-gs-venues.csv',index_col=0)
dfgsvenues.rename(columns={'GSHindex':'H-Index'},inplace=True)
if 'H-Index' in venuerank.columns:
    del venuerank['H-Index']
venuerank = venuerank.reset_index().set_index('Vkey').join(dfgsvenues['H-Index']).reset_index().set_index('index')

vkey_cits = pd.read_csv('../data/vkey_cits.csv',index_col=0,names=['Vkey','Vcits']) # TODO by year
venuerank = venuerank.reset_index().set_index('Vkey')
venuerank['Citations'] = vkey_cits['Vcits']
venuerank = venuerank.reset_index().set_index('index')

venuerank.sort_values(by='H-Index',ascending=False).head()

  df['Key'] = df.Sigla.apply( lambda s : unidecode(s.strip('\xc2\xa0')).upper() )
  title = unidecode(title)


(1703, 5)
1703
(2278, 6)
(1317, 6)
(1629, 6)
[ 0.  5.  7.  6.  3.  4.  2.  1.]


Unnamed: 0_level_0,Vkey,VPsize,VPyear,Qualis,QualisRel,H-Index,Citations
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
5763,journals/nature,1.0,2007.0,,0.0,355.0,
5718,journals/science,1.0,1983.0,,0.0,311.0,2481.0
587,journals/nar,5753.0,1975.0,B1,5.0,164.0,803995.0
4714,conf/and,32.0,2009.0,,0.0,162.0,142.0
5246,conf/sci,28.0,1999.0,,0.0,136.0,84.0


In [6]:
# Returns 3 rankings: venues, authors and groups
# Calculate the rankings based on previous rankings, 
def run_pscore(params,venuerank,authorrank,grouprank,field):
    
    dranks = pscore.rank(params)
    vrank,arank,grank = dranks['venues'],dranks['authors'],dranks['groups']
    vscore,ascore,gscore = vrank['Score'],arank['Score'],grank['Score']

    # add new score to dfrank with the name 'field' 
    rfslib.rankmodels.addscore(venuerank, vscore,field)
    rfslib.rankmodels.addscore(authorrank,ascore,field)
    rfslib.rankmodels.addscore(grouprank, gscore,field)
    
    return vrank,arank,grank

def head_ranks(n=5): 
    print vrank.sort_values(by='Score',ascending=False).head(n), '\n\n', 
    print arank.sort_values(by='Score',ascending=False).head(n), '\n\n', 
    print grank.sort_values(by='Score',ascending=False).head(n)

## Ranking Functions

In [116]:
# Returns 3 rankings: venues, authors and groups
# Calculate the rankings based on previous rankings, 
def run_pscore(params,venuerank,authorrank,grouprank,field):
    
    dranks = pscore.rank(params)
    vrank,arank,grank = dranks['venues'],dranks['authors'],dranks['groups']
    vscore,ascore,gscore = vrank['Score'],arank['Score'],grank['Score']

    # add new score to dfrank with the name 'field' 
    rfslib.rankmodels.addscore(venuerank, vscore,field)
    rfslib.rankmodels.addscore(authorrank,ascore,field)
    rfslib.rankmodels.addscore(grouprank, gscore,field)

    return vrank,arank,grank

def rank_subarea(subarea, seeds, n=0, only_valid_venues=False):
    refvenues = [ dataset.dfvenues[dataset.dfvenues.Vkey==vkey].index[0] for vkey in seeds ]
    params = dict(area=1,model='P-score',year=(1940,2015),refvenues=refvenues,top_authors=200)

    run_pscore(params, venuerank, authorrank, grouprank, subarea)
    
    rankings = concat_rankings(subarea, venuerank, n, only_valid_venues)
    rankings.to_csv('output/subareas_rankings/' + subarea + '.csv')    
    
    return rankings

In [144]:
# Rank the venues using two different features and concatenate the rankings side-by-side
def concat_rankings(subarea, venuerank, n, only_valid_venues=False):

    # venuerank = venuerank[venuerank['Vkey'] == 'conf/icip'] # For quick tests
    
    pscorerank = venuerank[['Vkey', 'VPsize', 'Citations', 'Qualis', 'H-Index', subarea]
                          ].sort_values(by=subarea, ascending=False)

    # Filtering Venues by recency and min-publications
    if (only_valid_venues):
        pscorerank = pscorerank[pscorerank.Vkey.isin(valid_venues)]
    
    # P-score ranking (2nd to appear)
    pscorerank.index = range(1, pscorerank.shape[0] + 1)
    pscorerank['Rank'] = pscorerank.index
    pscorerank.rename(columns={subarea:'P-score'}, inplace=True)
    pscorerank[''] = ""
    emptycol = pscorerank['']

    # H-index ranking (same venues ranked by previous P-score) (1st to appear)
    hindexrank = pscorerank.sort_values(by='H-Index', ascending=False)
    hindexrank['Rank'] = range(1, hindexrank.shape[0] + 1)
    hindexrank.set_index('Rank', inplace=True)
    hindexrank.rename(columns={'P-score':'P-score_'}, inplace=True)
    pscorerank = pscorerank[['Rank', 'Vkey', 'Qualis', 'P-score']]
    all_rankings = pd.concat([hindexrank, pscorerank], axis=1)

    # 3rd raking: P-score / N. Publications 
    psvpranking = hindexrank
    psvpranking['PS/VP'] = psvpranking['P-score_'] / psvpranking['VPsize']
    psvpranking = psvpranking.sort_values(by='PS/VP', ascending=False)
    psvpranking.index = range(1, psvpranking.shape[0] + 1)
    psvpranking['Rank'] = psvpranking.index
    psvpranking = psvpranking[['Rank', 'Vkey', 'Qualis', 'PS/VP']]
    all_rankings = pd.concat([all_rankings, emptycol, psvpranking], axis=1)
    del all_rankings['P-score_']
    
    return all_rankings.head(n)

In [118]:
def rank(subarea_seeds):
        
    # Testing Vkeys
    keys_for_test = [vkey for sublist in subarea_seeds.values() for vkey in sublist]
    dead_venues = dead_venues_in(2010)
    big_venues = venues_with_min_psize(100)

    for vkey in keys_for_test:
        if (vkey not in dataset.dfvenues.Vkey.values):
            print 'Venue not found:', vkey         

        elif (vkey in dead_venues):
            print 'Dead venue:', vkey     

        elif (vkey not in big_venues):
            print 'Venue without a min of papers:', vkey            

        else:
            print vkey, ": OK"

    rankingsize = 1000
    subarea = subarea_seeds.items()[0][0]
    seeds = subarea_seeds.items()[0][1]
        
    return rank_subarea(subarea, seeds, rankingsize, True)#.ix[:,6:].copy()

## Rank!

In [None]:
# Main Conferences
#   'conf/cvpr'
#   'journals/pami'
#   'conf/iccv'
#   'journals/ijcv'

### conf/cvpr and journals/pami

In [140]:
subarea_seeds = {
    'Computer vision (Tests 1)':['conf/cvpr', 'journals/pami'],
}

rank(subarea_seeds).head(50)

conf/cvpr : OK
journals/pami : OK


Unnamed: 0_level_0,Vkey,VPsize,Citations,Qualis,H-Index,Unnamed: 6_level_0,Rank,Vkey,Qualis,P-score,Unnamed: 11_level_0,Rank,Vkey,Qualis,PS/VP
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,journals/nar,5753.0,803995.0,B1,164.0,,1,conf/cvpr,A1,3458.0,,1,conf/iccv,A1,0.506304
2,conf/balt,232.0,827.0,,132.0,,2,journals/pami,A1,1693.0,,2,conf/cvpr,A1,0.503055
3,conf/cvpr,6874.0,379400.0,A1,118.0,,3,conf/iccv,A1,1526.0,,3,journals/ijcv,A1,0.444699
4,journals/cce,2581.0,35527.0,,118.0,,4,conf/eccv,A1,1443.0,,4,conf/eccv,A1,0.433464
5,journals/pr,6744.0,201239.0,A1,118.0,,5,conf/icpr,A1,1144.0,,5,journals/pami,A1,0.333728
6,journals/cma,4572.0,33510.0,,118.0,,6,conf/icip,A1,1104.0,,6,conf/emmcvpr,,0.287356
7,journals/cee,1272.0,4934.0,,118.0,,7,journals/ijcv,A1,776.0,,7,conf/bmvc,A2,0.268562
8,journals/neuroimage,6390.0,202819.0,,117.0,,8,conf/bmvc,A2,680.0,,8,conf/fgr,,0.26581
9,conf/mmar,518.0,135.0,,113.0,,9,journals/pr,A1,572.0,,9,conf/wacv,B1,0.263484
10,conf/icete,769.0,1194.0,B4,94.0,,10,conf/miccai,A1,540.0,,10,journals/cviu,A1,0.236008


### conf/icip and conf/cvpr

In [145]:
subarea_seeds = {
    'Computer vision (Tests 2)':['conf/icip', 'conf/cvpr'],
}

rank(subarea_seeds).head(20)

conf/icip : OK
conf/cvpr : OK


Unnamed: 0_level_0,Vkey,VPsize,Citations,Qualis,H-Index,Unnamed: 6_level_0,Rank,Vkey,Qualis,P-score,Unnamed: 11_level_0,Rank,Vkey,Qualis,PS/VP
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,journals/nar,5753.0,803995.0,B1,164.0,,1,conf/icip,A1,4675.0,,1,conf/cvpr,A1,0.346087
2,conf/balt,232.0,827.0,,132.0,,2,conf/cvpr,A1,2379.0,,2,conf/iccv,A1,0.339748
3,journals/cma,4572.0,33510.0,,118.0,,3,conf/icassp,A1,1906.0,,3,journals/ijcv,A1,0.297994
4,journals/pr,6744.0,201239.0,A1,118.0,,4,journals/tip,A1,1357.0,,4,conf/eccv,A1,0.285972
5,conf/cvpr,6874.0,379400.0,A1,118.0,,5,conf/icmcs,B3,1070.0,,5,conf/pcs,,0.278095
6,journals/cce,2581.0,35527.0,,118.0,,6,journals/pami,A1,1049.0,,6,conf/icip,A1,0.270497
7,journals/cee,1272.0,4934.0,,118.0,,7,conf/icpr,A1,1032.0,,7,journals/tip,A1,0.265454
8,journals/neuroimage,6390.0,202819.0,,117.0,,8,conf/iccv,A1,1024.0,,8,journals/tcsv,A1,0.250746
9,conf/mmar,518.0,135.0,,113.0,,9,conf/eccv,A1,952.0,,9,conf/mmsp,B2,0.233948
10,conf/icete,769.0,1194.0,B4,94.0,,10,journals/tcsv,A1,756.0,,10,conf/emmcvpr,,0.232759


### conf/cvpr and journals/pami and conf/icip

In [141]:
subarea_seeds = {
    'Computer vision (Tests 3)':['conf/cvpr', 'journals/pami', 'conf/icip'],
}

rank(subarea_seeds).head(50)

conf/cvpr : OK
journals/pami : OK
conf/icip : OK


Unnamed: 0_level_0,Vkey,VPsize,Citations,Qualis,H-Index,Unnamed: 6_level_0,Rank,Vkey,Qualis,P-score,Unnamed: 11_level_0,Rank,Vkey,Qualis,PS/VP
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,journals/nar,5753.0,803995.0,B1,164.0,,1,conf/icip,A1,4166.0,,1,conf/cvpr,A1,0.387547
2,conf/balt,232.0,827.0,,132.0,,2,conf/cvpr,A1,2664.0,,2,conf/iccv,A1,0.383875
3,journals/cce,2581.0,35527.0,,118.0,,3,conf/icassp,A1,1640.0,,3,journals/ijcv,A1,0.330659
4,journals/cee,1272.0,4934.0,,118.0,,4,journals/pami,A1,1440.0,,4,conf/eccv,A1,0.316612
5,journals/pr,6744.0,201239.0,A1,118.0,,5,journals/tip,A1,1292.0,,5,journals/pami,A1,0.283856
6,journals/cma,4572.0,33510.0,,118.0,,6,conf/iccv,A1,1157.0,,6,journals/tip,A1,0.252739
7,conf/cvpr,6874.0,379400.0,A1,118.0,,7,conf/icpr,A1,1116.0,,7,conf/emmcvpr,,0.25
8,journals/neuroimage,6390.0,202819.0,,117.0,,8,conf/eccv,A1,1054.0,,8,conf/icip,A1,0.241046
9,conf/mmar,518.0,135.0,,113.0,,9,conf/icmcs,B3,987.0,,9,journals/tcsv,A1,0.236484
10,conf/icete,769.0,1194.0,B4,94.0,,10,journals/tcsv,A1,713.0,,10,conf/pcs,,0.23619


In [138]:
subarea_seeds = {
    'Computer vision (Tests 1)':['conf/cvpr', 'journals/pami', 'conf/iccv', 'conf/icpr', 'conf/icip'],
}

rank(subarea_seeds).head(20)

conf/cvpr : OK
journals/pami : OK
conf/iccv : OK
conf/icpr : OK
conf/icip : OK


Unnamed: 0_level_0,Vkey,VPsize,Citations,Qualis,H-Index,Unnamed: 6_level_0,Rank,Vkey,Qualis,P-score,Unnamed: 11_level_0,Rank,Vkey,Qualis,PS/VP
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,journals/nar,5753.0,803995.0,B1,164.0,,1,conf/icip,A1,3263.0,,1,conf/iccv,A1,0.451228
2,conf/balt,232.0,827.0,,132.0,,2,conf/cvpr,A1,2940.0,,2,conf/cvpr,A1,0.427699
3,journals/cce,2581.0,35527.0,,118.0,,3,conf/icpr,A1,1754.0,,3,journals/ijcv,A1,0.4
4,journals/cee,1272.0,4934.0,,118.0,,4,journals/pami,A1,1567.0,,4,conf/eccv,A1,0.367378
5,journals/pr,6744.0,201239.0,A1,118.0,,5,conf/iccv,A1,1360.0,,5,conf/emmcvpr,,0.310345
6,conf/cvpr,6874.0,379400.0,A1,118.0,,6,conf/eccv,A1,1223.0,,6,journals/pami,A1,0.30889
7,journals/cma,4572.0,33510.0,,118.0,,7,conf/icassp,A1,1143.0,,7,conf/icb,,0.28187
8,journals/neuroimage,6390.0,202819.0,,117.0,,8,journals/tip,A1,1070.0,,8,conf/fgr,,0.269763
9,conf/mmar,518.0,135.0,,113.0,,9,conf/icmcs,B3,887.0,,9,conf/bmvc,A2,0.255924
10,conf/icete,769.0,1194.0,B4,94.0,,10,journals/pr,A1,814.0,,10,conf/accv,,0.24234
