In [1]:
!pip install cairocffi
import cairocffi
import pandas as pd
import dask
from dask import dataframe as dd
import igraph as ig
import numpy as np

header = '/mnt/e/MAG/mag-2021-01-05/advanced/'
fields_affil = 'data_temp/paper_fields_of_study_byPaperID.txt'
fields_schema = 'FieldOfStudyChildren.txt'
fields_infos = 'FieldsOfStudy.txt'



In [2]:
fields_affil_dd = dd.read_csv(fields_affil, header=None, sep='\t', names=['paper_id', 'field_id', 'score'])
print(fields_affil_dd.head())

fields_schema_dd = pd.read_csv(header+fields_schema, header=None, sep='\t', names=['field_id', 'field_child_id'])
print(fields_schema_dd.head())

pairs = []
for idx,row in fields_schema_dd.iterrows():
    try:
        pairs.append((str(row['field_child_id']), str(row['field_id'])))
    except:
        print(row)
        
fields_infos_dd = pd.read_csv(header+fields_infos, header=None, sep='\t')[[0, 1, 2]]
fields_infos_dd.columns = ['field_id', 'rank', 'normalized_name']
print(fields_infos_dd.head())

   paper_id   field_id     score
0        15   15708023  0.341603
1        15   17744445  0.314704
2        15   86532276  0.389657
3        23  141071460  0.433938
4        23  177713679  0.433339
   field_id  field_child_id
0      4250       139676723
1     12843         8105449
2     12843        71383303
3     12843       105805003
4     12843       112019140
   field_id   rank          normalized_name
0    417682  16758                night air
1   1443462  15557              immobiliser
2   1576492  13013            matrix pencil
3   2657588  13530   combinatorial topology
4   3079626   8569  quantum electrodynamics


In [5]:
len(fields_affil_dd)

1444277071

In [3]:
nodes = set()
for a,b in pairs:
    nodes.add(a)
    nodes.add(b)
    
fields_tree = ig.Graph(directed=True)
fields_tree.add_vertices(len(nodes))
fields_tree.vs['name'] = list(nodes)
fields_tree.add_edges(pairs)

In [4]:
idxs_out0 = [i for i, d in enumerate(fields_tree.degree(mode=ig.OUT)) if d == 0]
print(len(idxs_out0))
parents = [int(fields_tree.vs[i]['name']) for i in idxs_out0]
print(idxs_out0[:10])

205
[652, 849, 1063, 1274, 4938, 5875, 9359, 9671, 11378, 23757]


In [6]:
# print parents ids
fields_infos_dd[fields_infos_dd['field_id'].isin(parents)].head(15)

Unnamed: 0,field_id,rank,normalized_name
9084,154821213,11471,rationalization
12158,2775930363,16876,desmarestia viridis
15380,2778037673,11888,atlanta
18012,95457728,6708,history
22470,2909617936,14840,strix aluco
23915,127313418,5790,geology
30650,2781130464,11839,rana
38553,2988118254,12551,rating system
38628,2993098308,13762,black sea region
39488,2779376305,14808,codium fragile


In [7]:
def path_to_parent(g, v):
    idxs = g.neighbors(v, mode=ig.OUT)
    if len(idxs) == 0:
        return [g.vs[v]['name']]
    else:
        temp = []
        for i in idxs:
            temp += path_to_parent(g, i)
        return temp

def get_parents(g, v):
    temp = path_to_parent(g, v)
    parents_str = ','.join(temp)
    parents = [int(x) for x in temp]
    return parents, parents_str

# o parent é parent dele mesmo
print(fields_tree.vs[1077])
print(get_parents(fields_tree, 1077))

# output_file = open('data/fields_of_study.csv', 'w')
# for child in range(fields_tree.vcount()):
#     parents_ids, parents_id_str = get_parents(fields_tree, child)
#     parents_names = fields_infos_dd[fields_infos_dd[0].isin(parents_ids)][2].values
#     child_mag_id = fields_tree.vs[child]['name']
#     child_name = fields_infos_dd[fields_infos_dd[0] == int(child_mag_id)][2].values
#     output_file.write('%s\t%s\t%s\t%s\n' % (child_name[0], child_mag_id, ','.join(parents_names), parents_id_str))
    
# output_file.close()

igraph.Vertex(<igraph.Graph object at 0x7f93e0a3e040>, 1077, {'name': '2992112591'})
([162324750, 162324750, 144133560, 162324750, 144133560, 162324750, 144133560, 162324750, 144133560, 162324750, 144133560, 162324750, 144133560, 162324750, 162324750], '162324750,162324750,144133560,162324750,144133560,162324750,144133560,162324750,144133560,162324750,144133560,162324750,144133560,162324750,162324750')


In [8]:
# áreas que não estão na hierarquia
a = set([int(t) for t in fields_tree.vs['name']]) # quem está na hierarquia
b = set(fields_infos_dd['field_id'].values) # todas as áreas listadas no MAG
c = b-a
fields_infos_dd[fields_infos_dd['field_id'].isin(c)].head(10)

Unnamed: 0,field_id,rank,normalized_name
201,2775890984,21077,nesiostrymon
214,2776023202,20757,pacific elaenia
220,2776056147,22103,coelorinchus sheni
234,2776240375,21866,chicomurex turschi
253,2776450095,21505,dimetofrine
254,2776454293,21189,macropodus hongkongensis
258,2776471341,20470,cassipourea flanaganii
264,2776516433,21343,haplotrema vancouverense
278,2776615586,19973,pillotina calotermitidis
285,2776663788,20981,pachypodium saundersii


In [9]:
fields_of_study = pd.read_csv('data/fields_of_study.csv', sep='\t', header=None, names=['field', 'field_id', 'parents', 'parents_id'])
fields_of_study.head()

Unnamed: 0,field,field_id,parents,parents_id
0,susceptibility gene,3019216649,"chemistry,biology",868032401855926808680324086803240
1,chondroitin sulfate proteoglycans,2911068277,"chemistry,biology",8680324018559268086803240
2,signal width,2910870160,political science,17744445
3,temporin b,2910597275,"chemistry,biology,medicine","86803240,185592680,86803240,86803240,86803240,..."
4,sceloporus mucronatus,2910688544,"chemistry,biology,medicine","86803240,71924100,86803240,86803240,86803240,1..."


In [10]:
fields_affil_dd = fields_affil_dd.merge(fields_of_study, how='left', on='field_id')
fields_affil_dd.head()

Unnamed: 0,paper_id,field_id,score,field,parents,parents_id
0,15,15708023,0.341603,humanities,"philosophy,art",138885662142362112
1,15,17744445,0.314704,political science,political science,17744445
2,15,86532276,0.389657,delegation,"economics,philosophy,political science,art",16232475013888566214236211217744445
3,23,141071460,0.433938,surgery,medicine,71924100
4,23,177713679,0.433339,intensive care medicine,medicine,71924100


In [None]:
# fields_affil_dd.to_csv('data_temp/FOS_split/fields_papers_*.csv', header=None)

In [6]:
import json
def top3_fos(row):
    if row['weights'] != None:
        w = json.loads(row['weights'])
        if len(w) <= 0:
            return ''
        max_key = max(w, key=lambda k: w[k])
        
        return max_key
    return ''
    
# 1960
authors_fos = dd.read_csv('data/AuthorsFOS_split/authors_weights_year_1960*', sep='\t', header=None, names=['author_id', 'weights'])
valid_authors = pd.read_csv('data/valid_authors_full.txt', header=None, names=['author_id', 'papers', 'cits'])
valid_authors = valid_authors['author_id'].apply(int)
valid_authors = valid_authors.values
print(valid_authors[:10])

authors_fos_250k = authors_fos[authors_fos['author_id'].isin(valid_authors)]
print(authors_fos_250k.head())
authors_fos_250k['fos'] = authors_fos_250k.apply(top3_fos, axis=1, meta=('str'))
print(authors_fos_250k.head())

[  36534  502104 1382709 4534694 5113799 5478707 5975687 6583431 8051503
 8650941]
     author_id                                            weights
20  2107573329  {"185592680": 0.39999999999999997, "86803240":...
23  2660115790  {"185592680": 2.0161895743145744, "86803240": ...
52     8051503  {"86803240": 0.5416666666666666, "185592680": ...
97  2421386901           {"71924100": 0.8125, "86803240": 0.1875}
98  2580168249           {"71924100": 0.8125, "86803240": 0.1875}
     author_id                                            weights        fos
20  2107573329  {"185592680": 0.39999999999999997, "86803240":...  185592680
23  2660115790  {"185592680": 2.0161895743145744, "86803240": ...   86803240
52     8051503  {"86803240": 0.5416666666666666, "185592680": ...   86803240
97  2421386901           {"71924100": 0.8125, "86803240": 0.1875}   71924100
98  2580168249           {"71924100": 0.8125, "86803240": 0.1875}   71924100


In [16]:
i

243839

In [None]:
import tqdm
fos = authors_fos_250k['fos']
X = []
for row in tqdm.tqdm(fos, total=len(valid_authors)):
    X.append(row)

In [None]:
import numpy as np
unique, count = np.unique(X, return_counts=True)
for u,c in zip(unique, count):
    print(u, c)

In [None]:
unique = [int(u) for u in unique]
print(fields_infos_dd.head())
unique_freq = fields_infos_dd[fields_infos_dd['field_id'].isin(unique)]
unique_freq['count'] = unique_freq.apply(lambda row: count[ unique.index(row['field_id']) ], axis=1)
print(unique_freq)

In [7]:
from dask.diagnostics import ProgressBar
with ProgressBar():
    authors_fos_250k.to_csv('data/valid_authors_1960_fos_full.csv', sep='\t', header=None, index=None, single_file=True)

[########################################] | 100% Completed | 44min 56.3s
