In [3]:
!pip install cairocffi
import cairocffi
import pandas as pd
import dask
from dask import dataframe as dd
import igraph as ig
import numpy as np
from scipy import integrate

# Directory of the MAG "advanced" dump; it is prefixed to the two file names below.
header = "/mnt/e/MAG/mag-2021-01-05/advanced/"
# Tab-separated paper / field-of-study table, given relative to the working directory.
fields_affil = "data_temp/paper_fields_of_study_byPaperID.txt"
# File names inside `header`: the parent/child field links and the per-field metadata.
fields_schema = "FieldOfStudyChildren.txt"
fields_infos = "FieldsOfStudy.txt"



In [3]:
# Paper -> field-of-study assignments, read with dask (one row per paper/field pair).
fields_affil_dd = dd.read_csv(fields_affil, header=None, sep='\t', names=['paper_id', 'field_id', 'score'])
print(fields_affil_dd.head())

# Field hierarchy: one (field_id, field_child_id) row per parent/child link.
fields_schema_dd = pd.read_csv(header+fields_schema, header=None, sep='\t', names=['field_id', 'field_child_id'])
print(fields_schema_dd.head())

# (child, parent) edges with both ids as strings, used later as graph vertex names.
# Built column-wise: the previous row-by-row iterrows() loop was slow on a large
# table and wrapped the append in a bare `except:` that would hide any error.
pairs = list(zip(
    fields_schema_dd['field_child_id'].astype(str),
    fields_schema_dd['field_id'].astype(str),
))

# Per-field metadata; only the first three columns of the file are kept and named.
fields_infos_dd = pd.read_csv(header+fields_infos, header=None, sep='\t')[[0, 1, 2]]
fields_infos_dd.columns = ['field_id', 'rank', 'normalized_name']
print(fields_infos_dd.head())

   paper_id   field_id     score
0        15   15708023  0.341603
1        15   17744445  0.314704
2        15   86532276  0.389657
3        23  141071460  0.433938
4        23  177713679  0.433339
   field_id  field_child_id
0      4250       139676723
1     12843         8105449
2     12843        71383303
3     12843       105805003
4     12843       112019140
   field_id   rank          normalized_name
0    417682  16758                night air
1   1443462  15557              immobiliser
2   1576492  13013            matrix pencil
3   2657588  13530   combinatorial topology
4   3079626   8569  quantum electrodynamics


In [None]:
# Number of rows in the paper / field-of-study table.
# NOTE(review): fields_affil_dd is a dask DataFrame (built with dd.read_csv), so len()
# presumably has to read every partition and this cell may be slow — confirm.
len(fields_affil_dd)

In [None]:
# Every field id that appears on either side of a (child, parent) edge.
nodes = {field_id for edge in pairs for field_id in edge}

# Directed graph with one vertex per field and one child -> parent edge per pair.
# The names are sorted before being assigned: iterating a set of strings gives a
# different order on every interpreter run, which would make vertex indices
# (including the hard-coded index 1077 used further down) change between runs.
fields_tree = ig.Graph(directed=True)
fields_tree.add_vertices(len(nodes))
fields_tree.vs['name'] = sorted(nodes)
fields_tree.add_edges(pairs)

In [None]:
# Roots of the hierarchy: vertices that have no outgoing (child -> parent) edge.
idxs_out0 = [
    vertex
    for vertex, out_degree in enumerate(fields_tree.degree(mode=ig.OUT))
    if out_degree == 0
]
print(len(idxs_out0))
# Vertex names hold the field ids as strings; turn them back into ints.
parents = [int(fields_tree.vs[vertex]['name']) for vertex in idxs_out0]
print(idxs_out0[:10])

In [None]:
# Show the metadata rows of the root fields (ids collected in `parents`), first 15 only.
fields_infos_dd[fields_infos_dd['field_id'].isin(parents)].head(15)

In [None]:
def path_to_parent(g, v):
    """Return the names of the root vertices reachable from vertex ``v``.

    Outgoing edges are followed from ``v`` until vertices with no outgoing
    edge ("roots") are reached. A root is its own ancestor: when ``v`` has no
    outgoing edge the result is ``[name of v]``.

    The walk is iterative and expands every vertex at most once, so a root
    that can be reached through several paths is reported a single time,
    shared ancestors are not re-walked, deep hierarchies cannot exceed the
    recursion limit, and a cycle cannot loop forever.

    Args:
        g: graph exposing ``neighbors(vertex, mode=...)`` and
            ``vs[vertex]['name']`` (a python-igraph ``Graph`` in this notebook).
        v: index of the starting vertex.

    Returns:
        list: root names, in the order a depth-first walk first reaches them.
    """
    roots = []
    seen = set()
    stack = [v]
    while stack:
        current = stack.pop()
        if current in seen:
            continue
        seen.add(current)
        successors = g.neighbors(current, mode="out")
        if not successors:
            roots.append(g.vs[current]['name'])
            continue
        # Pushed in reverse so that successors are popped in their original order.
        stack.extend(reversed(successors))
    return roots

def get_parents(g, v):
    """Return the root ancestors of vertex ``v`` in two forms.

    Args:
        g: graph passed through to ``path_to_parent``.
        v: index of the vertex to look up.

    Returns:
        tuple: ``(ids, joined)`` where ``ids`` is the list of root names
        converted to ``int`` and ``joined`` is the same names joined with ','.
    """
    root_names = path_to_parent(g, v)
    joined = ','.join(root_names)
    return [int(name) for name in root_names], joined

# A root field is its own parent: for a vertex with no outgoing edge,
# get_parents returns that vertex's own id.
# NOTE(review): 1077 is a hard-coded vertex index; which field it refers to depends on
# the vertex order chosen when the graph was built — confirm it is still the intended one.
print(fields_tree.vs[1077])
print(get_parents(fields_tree, 1077))

# Disabled export step, kept for reference: for every vertex it writes one tab-separated
# line (field name, field id, parent names, parent ids) to data/fields_of_study.csv.
# NOTE(review): it indexes fields_infos_dd with the integer column labels 0 and 2, while
# the loading cell renames the columns to field_id / rank / normalized_name — the labels
# would need updating before this is re-enabled.
# output_file = open('data/fields_of_study.csv', 'w')
# for child in range(fields_tree.vcount()):
#     parents_ids, parents_id_str = get_parents(fields_tree, child)
#     parents_names = fields_infos_dd[fields_infos_dd[0].isin(parents_ids)][2].values
#     child_mag_id = fields_tree.vs[child]['name']
#     child_name = fields_infos_dd[fields_infos_dd[0] == int(child_mag_id)][2].values
#     output_file.write('%s\t%s\t%s\t%s\n' % (child_name[0], child_mag_id, ','.join(parents_names), parents_id_str))
    
# output_file.close()

In [None]:
# Fields that are listed in MAG but do not appear in the hierarchy.
a = {int(name) for name in fields_tree.vs['name']}  # ids present in the hierarchy
b = set(fields_infos_dd['field_id'].values)  # every field id listed in MAG
c = b.difference(a)
fields_infos_dd[fields_infos_dd['field_id'].isin(c)].head(10)

In [None]:
# Load the tab-separated field table (no header row in the file) and preview it.
fields_of_study = pd.read_csv(
    'data/fields_of_study.csv',
    sep='\t',
    header=None,
    names=['field', 'field_id', 'parents', 'parents_id'],
)
fields_of_study.head()

In [None]:
# Attach the field name and parent columns from fields_of_study to every paper/field
# row; a left join on field_id, so rows without a match in fields_of_study are kept.
# NOTE(review): this rebinds fields_affil_dd, so running the cell a second time merges
# the same columns in again — re-run the loading cell first if this needs repeating.
fields_affil_dd = fields_affil_dd.merge(fields_of_study, how='left', on='field_id')
fields_affil_dd.head()

In [None]:
# fields_affil_dd.to_csv('data_temp/FOS_split/fields_papers_*.csv', header=None)

In [10]:
import json

def div(dist):
    """Effective number of categories in a weight vector, rounded to an integer.

    The weights are normalised to sum to 1 and the result is ``exp(H)``,
    where ``H = -sum(p * log(p))`` is the Shannon entropy of the normalised
    weights, rounded to the nearest integer with ``np.rint``.

    Zero weights contribute nothing to the entropy (``0 * log(0)`` is taken
    as 0) instead of turning the whole result into NaN.

    Args:
        dist: sequence of numeric weights.

    Returns:
        numpy float: the rounded effective number. ``1.0`` for an empty
        input and NaN when the weights sum to zero (no distribution exists).
    """
    weights = np.asarray(dist, dtype=float)
    if weights.size == 0:
        # Nothing to normalise; this is the value empty input has always produced.
        return np.float64(1.0)
    total = weights.sum()
    if total == 0:
        return np.float64(np.nan)
    probs = weights / total
    nonzero = probs[probs != 0]
    entropy = -np.sum(nonzero * np.log(nonzero))
    return np.rint(np.exp(entropy))


def get_fos(row):
    """Select the dominant fields of study for one author row.

    ``row['weights']`` maps field id -> weight, either as JSON text or as an
    already parsed mapping. Fields are ranked by ``(weight, field id)`` in
    ascending order and the last ``div(weights)`` of them are kept.

    Args:
        row: mapping-like object with a ``'weights'`` entry (a dict or a
            DataFrame row).

    Returns:
        str: the kept field ids joined with ',', lowest weight first. ``''``
        when the weights are missing (None or NaN), empty, or have no
        defined diversity (for example when every weight is zero).
    """
    raw = row['weights']
    if raw is None or (isinstance(raw, float) and np.isnan(raw)):
        return ''
    # Only text needs decoding; a mapping is used as it is.
    w = json.loads(raw) if isinstance(raw, (str, bytes, bytearray)) else raw
    if len(w) <= 0:
        return ''
    effective = div(list(w.values()))
    # Guard the slice below: NaN cannot be converted to int, and a count of 0
    # would make ranked[-0:] return every field instead of none.
    if not np.isfinite(effective) or effective < 1:
        return ''
    n_fields = int(effective)
    ranked = sorted((weight, field_id) for field_id, weight in w.items())
    return ','.join(field_id for _, field_id in ranked[-n_fields:])

In [8]:
# d = {'15744967': 2.772012987012987, '71924100': 0.5198051948051948, '144024400': 0.42848484848484847, '17744445': 0.2816666666666667, '138885662': 0.8715151515151516, '2908647359': 0.03333333333333333, '127413603': 0.025, '142362112': 0.06818181818181819}
# d = {'121332964': 3.276230197909395, '127413603': 5.172747353593218, '86803240': 11.53172976529026, '15744967': 6.0951308819237004, '185592680': 2.100133654984465, '71924100': 23.08702061725509, '192562407': 3.0263360180152157, '41008148': 3.6679761904761903, '33923547': 0.5297222222222222, '138885662': 0.1, '142362112': 0.027777777777777776, '162324750': 0.0625, '144024400': 0.06523809523809523, '17744445': 0.20745722531436817, '205649164': 0.016666666666666666, '127313418': 0.03333333333333333}
# get_fos decodes row['weights'] with json.loads, so it is given JSON text here
# rather than the dict itself (json.loads raises TypeError on a dict).
d = {"15744967": 0.85, "71924100": 0.15}
get_fos({'weights': json.dumps(d)})

2


'71924100,15744967'

In [11]:
import glob
# A single year is processed; the commented-out loop header is kept from an earlier version.
# for year in [2020]:
year = 2020
# Preview (at most 10) of the split files that match this year's pattern.
print(glob.glob('data/AuthorsFOS_split/authors_fos_weights_%d_final_*' % year)[:10])

# All matching files are read as one dask DataFrame: author id plus a 'weights' text column.
authors_fos = dd.read_csv(
    'data/AuthorsFOS_split/authors_fos_weights_%d_final_*' % year,
    sep='\t',
    header=None,
    names=['author_id', 'weights'],
)
print(authors_fos.head())
#     valid_authors = pd.read_csv('data/valid_authors_full.txt', header=None, names=['author_id', 'papers', 'cits'])
#     valid_authors = valid_authors['author_id'].apply(int)
#     valid_authors = valid_authors.values
#     print(valid_authors[:10])

#     authors_fos = authors_fos[authors_fos['author_id'].isin(valid_authors)]
#     print(authors_fos.head())


['data/AuthorsFOS_split/authors_fos_weights_2020_final_00000', 'data/AuthorsFOS_split/authors_fos_weights_2020_final_00001', 'data/AuthorsFOS_split/authors_fos_weights_2020_final_00002', 'data/AuthorsFOS_split/authors_fos_weights_2020_final_00003', 'data/AuthorsFOS_split/authors_fos_weights_2020_final_00004', 'data/AuthorsFOS_split/authors_fos_weights_2020_final_00005', 'data/AuthorsFOS_split/authors_fos_weights_2020_final_00006', 'data/AuthorsFOS_split/authors_fos_weights_2020_final_00007', 'data/AuthorsFOS_split/authors_fos_weights_2020_final_00008', 'data/AuthorsFOS_split/authors_fos_weights_2020_final_00009']
   author_id                                            weights
0        584  {"15744967": 2.772012987012987, "71924100": 0....
1        859                                 {"121332964": 1.0}
2        978  {"138885662": 3.64297619047619, "142362112": 3...
3       1139  {"142362112": 1.5366666666666666, "138885662":...
4       1799  {"86803240": 0.49384615384615393, "71924100":

In [12]:
# One comma-separated list of dominant field ids per author.
# meta is given as a (name, dtype) pair describing the resulting column; the
# previous meta=('str') was only the string 'str' wrapped in parentheses.
authors_fos['fos'] = authors_fos.apply(get_fos, axis=1, meta=('fos', 'object'))
print(authors_fos.head())
# print(len(authors_fos), len(authors_fos['author_id'].unique()))

# Write everything to a single tab-separated file, without header row or index
# column; the progress bar reports the (long) computation.
from dask.diagnostics import ProgressBar
with ProgressBar():
    authors_fos.to_csv('data/valid_authors_%d_fos_div_filter.csv' % year, sep='\t', header=False, index=False, single_file=True)

   author_id                                            weights  \
0        584  {"15744967": 2.772012987012987, "71924100": 0....   
1        859                                 {"121332964": 1.0}   
2        978  {"138885662": 3.64297619047619, "142362112": 3...   
3       1139  {"142362112": 1.5366666666666666, "138885662":...   
4       1799  {"86803240": 0.49384615384615393, "71924100": ...   

                                                 fos  
0              144024400,71924100,138885662,15744967  
1                                          121332964  
2  86803240,71924100,205649164,17744445,144024400...  
3  95457728,15744967,17744445,138885662,144024400...  
4                                  71924100,86803240  
[########################################] | 100% Completed |  3hr  8min 14.0s


In [None]:
import tqdm
# Copy the 'fos' column into a plain Python list, showing a progress bar.
# NOTE(review): `authors_fos_250k` and `valid_authors` are not defined anywhere in the
# visible notebook (valid_authors appears only in commented-out code), so this cell
# relies on state from cells that no longer exist — confirm the intended inputs.
fos = authors_fos_250k['fos']
X = []
for row in tqdm.tqdm(fos, total=len(valid_authors)):
    X.append(row)

In [None]:
import numpy as np
# Count how often each distinct value of X occurs and print one "value count" line each.
unique, count = np.unique(X, return_counts=True)
for u,c in zip(unique, count):
    print(u, c)

In [None]:
# Convert the distinct values to int so they can be matched against the field_id column.
unique = [int(u) for u in unique]
print(fields_infos_dd.head())

# field id -> count lookup. setdefault keeps the first count if an id repeats,
# which is what the previous per-row unique.index(...) lookup returned.
count_by_field = {}
for field_id, n in zip(unique, count):
    count_by_field.setdefault(field_id, n)

# .copy() so the new column is added to an independent frame and not to a
# filtered view of fields_infos_dd (avoids pandas' SettingWithCopyWarning).
unique_freq = fields_infos_dd[fields_infos_dd['field_id'].isin(unique)].copy()
unique_freq['count'] = unique_freq['field_id'].map(count_by_field)
print(unique_freq)

In [None]:
# Re-read the first three columns of the fields file and save them tab-separated,
# without a header row, to data/fields_names.csv.
# NOTE(review): this rebinds fields_infos_dd to a frame with integer column labels
# (0, 1, 2), replacing the earlier frame whose columns were renamed to
# field_id / rank / normalized_name; cells that index it by 'field_id' will fail if
# they are re-run after this one.
# NOTE(review): to_csv is called without index=..., so presumably the row index is
# written as an extra first column — confirm that is intended.
fields_infos_dd = pd.read_csv(header+fields_infos, header=None, sep='\t')[[0, 1, 2]]
fields_infos_dd.to_csv('data/fields_names.csv', sep='\t', header=None)