In [1]:
import numpy
import pandas

In [21]:
# Load the document -> field-of-study ("fos") pairs from a tab-separated file.
# Column names are supplied here rather than read from the file.  The third
# column ("repr") is not used by this notebook, so it is skipped at parse time
# with `usecols` instead of being loaded and then deleted; the resulting frame
# has the columns "id" and "fos".
db_doc_fos = pandas.read_csv(
    "data/dblp_doc_kw.tab",
    sep        = '\t',
    names      = [ "id", "fos", "repr" ],  # all three file columns are named so usecols can select by name
    usecols    = [ "id", "fos" ],          # "repr" is never parsed or held in memory
    dtype      = { "id": numpy.int64, "fos": str },
    low_memory = False
)
db_doc_fos

Unnamed: 0,id,fos
0,1091,Telecommunications network
1,1091,Computer science
2,1091,Mind map
3,1091,Human–computer interaction
4,1091,Multimedia
...,...,...
45029846,9990887,Computer science
45029847,99956490,Discrete mathematics
45029848,99956490,Combinatorics
45029849,99956490,Entscheidungsproblem


In [8]:
# Author table: tab-separated (id, name) rows; column names are supplied here.
db_auth = pandas.read_csv(
    "data/lia-auth.dat",
    names      = [ "id", "name" ],
    dtype      = dict( id = numpy.int64, name = str ),
    sep        = "\t",
    low_memory = False,
)
db_auth

Unnamed: 0,id,name
0,76672683,Nicolas Flavier
1,78369459,Laurence Candille
2,119492839,Christophe Servan
3,123804776,Mohamed Morchid
4,141838035,Mohammed Raiss-El-Fenni
...,...,...
85,2948377975,Alejandro Molina
86,2950895353,Ilaria Brunetti
87,2954435341,Eric SanJuan
88,2954744629,Rosa Figueiredo


In [10]:
# Field-of-study list: one string per line, stored in a single "fos" column.
db_fos = pandas.read_csv(
    "data/lia-fos.dat",
    names      = [ "fos" ],
    dtype      = dict( fos = str ),
    sep        = "\t",
    low_memory = False,
)
db_fos

Unnamed: 0,fos
0,2-opt
1,Abstraction
2,Acceleration
3,Access control
4,Access method
...,...
1412,Workload
1413,World Wide Web
1414,Xcast
1415,Zigzag


In [64]:
# Keep only the document/fos rows whose field of study appears in db_fos,
# then count the rows per field of study, most frequent first.
db_df_lia     = db_doc_fos.loc[ db_doc_fos[ "fos" ].isin( db_fos[ "fos" ] ) ]
fos_lia_freqs = (
    db_df_lia
    .groupby( "fos" )[ "fos" ]
    .count()
    .sort_values( ascending = False )
)

# Summary statistics of the per-field counts.
fos_mean         = fos_lia_freqs.mean()
fos_std          = fos_lia_freqs.std()
fos_min, fos_max = fos_lia_freqs.min(), fos_lia_freqs.max()
fos_med          = fos_lia_freqs.median()
# Quartiles use 'nearest' interpolation, so each one is an observed count.
fos_q1, fos_q3   = fos_lia_freqs.quantile( [ 0.25, 0.75 ], interpolation = "nearest" ).tolist()

In [65]:
# Frequency table first, followed by a blank gap ("\n" plus print's own newline).
print( "freqs:", fos_lia_freqs, "\n", sep = "\n" )

# One summary statistic per line; mean and std are shown with 5 decimals.
print(
    f"min : {fos_min}",
    f"max : {fos_max}",
    f"med : {fos_med}",
    f"q1  : {fos_q1}",
    f"q3  : {fos_q3}",
    f"mean: {fos_mean:.5f}",
    f"std : {fos_std:.5f}",
    sep = "\n",
)

freqs:
fos
Computer science           3318304
Artificial intelligence    1175949
Mathematics                 893778
Machine learning            426825
Computer vision             410559
                            ...   
Cell Mobility                   11
Conditioning process             9
GRBAS scale                      8
Cone of light                    3
Neology                          2
Name: fos, Length: 1412, dtype: int64
---
min : 2
max : 3318304
med : 2803.5
mean: 16016.78612
std : 103258.57139


In [77]:
# Keep the fields of study whose count lies strictly between the mean and
# one standard deviation above the mean (both bounds exclusive).
fos_lia_filtered = fos_lia_freqs[
    ( fos_lia_freqs > fos_mean ) & ( fos_lia_freqs < fos_mean + fos_std )
]
fos_lia_filtered

fos
Artificial neural network     118196
Simulation                    116470
Mathematical analysis         115153
Speech recognition            115010
The Internet                  107460
                               ...  
Normalization (statistics)     16264
A priori and a posteriori      16197
Annotation                     16171
Decision tree                  16077
Integer programming            16049
Name: fos, Length: 183, dtype: int64

In [78]:
# Show the document/fos rows whose field of study survived the frequency
# filter (i.e. is a label of fos_lia_filtered).
# A stray `fos_lia_freqs["Artificial intelligence"]` lookup used to precede
# this line; as a non-final bare expression its value was never displayed,
# so it has been dropped.
db_df_lia[ db_df_lia.fos.isin( fos_lia_filtered.index ) ]

Unnamed: 0,id,fos
8,1388,Graph
17,1674,Visualization
22,1688,Support vector machine
27,1688,Artificial neural network
40,6522,Authentication
...,...,...
45029804,998679997,Binary number
45029825,99884553,Signal processing
45029829,99884553,Communication channel
45029832,99884553,Distortion
