In [1]:
import os
import re

from fastparquet import write
import numpy as np
import pandas as pd
import snappy

In [2]:
!head -n 10 datasets/cit-HepTh.txt

# Directed graph (each unordered pair of nodes is saved once): Cit-HepTh.txt 
# Paper citation network of Arxiv High Energy Physics Theory category
# Nodes: 27770 Edges: 352807
# FromNodeId	ToNodeId
1001	9304045
1001	9308122
1001	9309097
1001	9311042
1001	9401139
1001	9404151


## Initialise variables

In [3]:
DATASETS_FOLDER = "datasets"
ABSTRACTS_FOLDER = "abstracts"

ABSTRACTS_FOLDER_PATH = f"{DATASETS_FOLDER}/{ABSTRACTS_FOLDER}/"

## Processing functions

In [4]:
def extract_text_from_abstract(text):
    info = {}
    fields = ['Date:', "From:", "Title:", "Authors:", "Comments:", "Subj-class:", "Journal-ref:"]
    
    for field in fields:
        match = re.search(f"[\n\r].*{field}\s*([^\n\r]*)", text, re.I)
        value = None
        if match is not None:
            value = match.group(1)
        
        info[field.replace(':', '').lower()] = value
        
    return info

ignore_emails = ['g@c']
ignore_tlds = ['g@c', '']
ignore_domains = ['c', '']

def domain_and_tld_from_email(email):
    domain = None
    tld = None
    email = email.lower()
    if '.' in email:
                        
        # Remove cases when email finishes with a period
        if email[-1] == '.':
            email = email[:-1]

        domain = email.split('@')[1].lower().strip()

        if '.ac.' in domain:
            domain = ".".join(domain.split('.')[-3:])
        else:
            domain = ".".join(domain.split('.')[-2:])

        tld  = email.split('.')[-1].lower().strip()

        if tld in ignore_tlds:
            tld = None

        if domain in ignore_domains:
            domain = None
        
    
    return domain, tld

# Code below from https://stackoverflow.com/a/40449726/9527459
def explode(df, lst_cols, fill_value=''):
    # make sure `lst_cols` is a list
    if lst_cols and not isinstance(lst_cols, list):
        lst_cols = [lst_cols]
    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)

    # calculate lengths of lists
    lens = df[lst_cols[0]].str.len()

    if (lens > 0).all():
        # ALL lists in cells aren't empty
        return pd.DataFrame({
            col:np.repeat(df[col].values, lens)
            for col in idx_cols
        }).assign(**{col:np.concatenate(df[col].values) for col in lst_cols}) \
          .loc[:, df.columns]
    else:
        # at least one list in cells is empty
        return pd.DataFrame({
            col:np.repeat(df[col].values, lens)
            for col in idx_cols
        }).assign(**{col:np.concatenate(df[col].values) for col in lst_cols}) \
          .append(df.loc[lens==0, idx_cols]).fillna(fill_value) \
          .loc[:, df.columns]

## Process paper citations

In [5]:
with open(f"{DATASETS_FOLDER}/cit-HepTh.txt", 'r') as f:
    df = pd.read_csv(f,sep='\t',skiprows=(0,1,2))
    
# Rename columns
df.columns = ['FromNodeId', 'ToNodeId']

tlds_csv = pd.read_csv(f"{DATASETS_FOLDER}/tlds.csv", header=None, index_col=0, squeeze=True).to_dict()
tlds_info = tlds_csv[1]

In [6]:
df[100:200].head(10)

Unnamed: 0,FromNodeId,ToNodeId
100,9401139,9202046
101,9401139,9202059
102,9401139,9202092
103,9401139,9203008
104,9401139,9203031
105,9401139,9204035
106,9401139,9204037
107,9401139,9204040
108,9401139,9204046
109,9401139,9205046


In [7]:
# Paper that cites most papers
out_degree = df.groupby('FromNodeId').count().sort_values('ToNodeId', ascending = False)
out_degree.head()

Unnamed: 0_level_0,ToNodeId
FromNodeId,Unnamed: 1_level_1
9905111,562
9710046,359
110055,302
210157,289
101126,274


In [8]:
# Paper cited the most -> Most influential
in_degree = df.groupby('ToNodeId').count().sort_values('FromNodeId', ascending = False)
in_degree.head()

Unnamed: 0_level_0,FromNodeId
ToNodeId,Unnamed: 1_level_1
9711200,2414
9802150,1775
9802109,1641
9407087,1299
9610043,1199


In [9]:
degrees = pd.concat([in_degree, out_degree], axis=1, sort=False)
degrees.columns = ['out_degree', 'in_degree']

degrees.head()

Unnamed: 0,out_degree,in_degree
1001,10.0,83.0
1002,43.0,36.0
1003,15.0,1.0
1004,1.0,1.0
1005,1.0,21.0


# Process paper abstracts

In [10]:
abstracts_info = {}

for dir_name in os.listdir(f"{ABSTRACTS_FOLDER_PATH}"):
    try:
        year = int(dir_name)
        
        for f_name in os.listdir(f"{ABSTRACTS_FOLDER_PATH}/{year}"):
            with open(f"{ABSTRACTS_FOLDER_PATH}/{year}/{f_name}", 'r') as f:
                abstract = f.read()
                
                # Parts of the abstract
                abstract_parts = abstract.split('\\\\')
                
                paper_description = (abstract_parts[2] if len(abstract_parts) > 1 else "").strip()
                
                # Process emails
                emails_found = re.findall(r'[\w\.-]+@[\w\.-]+', abstract)
                
                emails = []
                
                for email in emails_found:
                    email = email.lower()
                    
                    if '.' in email:
                        # Remove cases when email finishes with a period
                        if email[-1] == '.':
                            email = email[:-1]
                        
                        emails.append(email)
                
                
                key = int(f_name.replace(".abs", ""))
                
                abstracts_info[key] = {
                    "emails": list(set(emails)),
#                     "tlds": list(set(tlds)),
#                     "domains": list(set(domains)),
                    "description": paper_description
                }
                
                abstracts_info[key].update(extract_text_from_abstract(abstract))
                
                
    except ValueError:
        pass 

In [11]:
papers = pd.DataFrame.from_dict(abstracts_info, orient='index')

papers = pd.concat([papers, degrees], axis=1, sort=False)

In [12]:
papers.head(30)

Unnamed: 0,emails,description,date,from,title,authors,comments,subj-class,journal-ref,out_degree,in_degree
1001,[psa@math.duke.edu],These are notes based on lectures given at TAS...,"Sat, 1 Jan 2000 00:02:31 GMT (84kb)",Paul S. Aspinwall <psa@math.duke.edu>,"Compactification, Geometry and Duality: N=2",Paul S. Aspinwall,"82 pages, 8 figures, LaTeX2e, TASI99, refs add...",,,10.0,83.0
1002,[pope@absinthe.physics.tamu.edu],We point out that massive gauged supergravity ...,"Mon, 3 Jan 2000 22:38:03 GMT (64kb)",Chris Pope <pope@absinthe.physics.tamu.edu>,Domain Walls and Massive Gauged Supergravity P...,"M. Cvetic, H. Lu and C.N. Pope","latex file, 11 pages, 3 figures",,Class.Quant.Grav. 17 (2000) 4867-4876,43.0,36.0
1003,[kang@physics.inje.ac.kr],"Recently, Ivanov and Volovich (hep-th/9912242)...","Sat, 1 Jan 2000 06:14:51 GMT (2kb)",KANG Gungwon <kang@physics.inje.ac.kr>,"Comment on ""Metric Fluctuations in Brane Worlds""",Y.S. Myung and Gungwon Kang,"4 pages, revtex",,,15.0,1.0
1004,[adam@godel.math.missouri.edu],"Quantum fields responding to ""moving mirrors"" ...","Sat, 1 Jan 2000 19:57:21 GMT (13kb)",Adam D. Helfer <adam@godel.math.missouri.edu>,Moving Mirrors and Thermodynamic Paradoxes,Adam D. Helfer,"7 pages, Revtex with Latex2e",,Phys.Rev. D63 (2001) 025016,1.0,1.0
1005,[jfuchs@mail.desy.de],Various aspects of spaces of chiral blocks are...,"Sun, 2 Jan 2000 17:06:40 GMT (24kb)",Juergen Fuchs <jfuchs@mail.desy.de>,Bundles of chiral blocks and boundary conditio...,"J. Fuchs, C. Schweigert","18 pages, LaTeX2e; slightly extended version o...",,,1.0,21.0
1006,[buchholz@theorie.physik.uni-goettingen.de],An assessment of the present status of the the...,"Mon, 3 Jan 2000 15:12:37 GMT (18kb)",Buchholz <buchholz@theorie.physik.uni-goetting...,Questions in quantum physics: a personal view,Rudolf Haag,15 pages,,,1.0,1.0
1007,[lisheng@itp.ac.cn],By making use of the complete decomposition of...,"Mon, 3 Jan 2000 15:41:16 GMT (8kb)",Sheng Li <lisheng@itp.ac.cn>,Topological Defects in 3-d Euclidean Gravity,"Sheng Li, Yong Zhang and Zhongyuan Zhu","9 pages, Revtex",High Energy Physics - Theory; Mathematical Phy...,,,4.0
1008,[spector@hws.edu],We study some of the algebraic properties of t...,"Mon, 3 Jan 2000 16:53:57 GMT (7kb)",Donald Spector <spector@HWS.EDU>,N=0 Supersymmetry and the Non-Relativistic Mon...,Donald Spector,"9 pages, harvmac, no figures",,Phys.Lett. B474 (2000) 331-335,2.0,6.0
1009,[nayak@th.physik.uni-frankfurt.de],We compute the probabilty for the processes A ...,"Mon, 3 Jan 2000 18:37:25 GMT (7kb)","""Gouranga C. Nayak"" <nayak@th.physik.uni-frank...",Gluon Pair Production From Space-Time Dependen...,Gouranga C. Nayak and Walter Greiner,"One figure added, revised version",,,1.0,1.0
1010,[belitsky@insti.physics.sunysb.edu],We discuss the reality properties of the fermi...,"Tue, 4 Jan 2000 01:28:58 GMT (10kb)",Andrei Belitsky <belitsky@insti.physics.sunysb...,"Instantons, Euclidean supersymmetry and Wick r...","A.V. Belitsky, S. Vandoren, P. van Nieuwenhuizen","8 pages, LaTeX, typos fixed",,Phys.Lett. B477 (2000) 335-340,12.0,5.0


In [13]:
more_than_one_email =  [True if len(e) == 0 else False for e in papers.emails]
papers[more_than_one_email]

Unnamed: 0,emails,description,date,from,title,authors,comments,subj-class,journal-ref,out_degree,in_degree
207008,[],Effective string field equations with zero-con...,"Mon, 1 Jul 2002 11:31:25 GMT (1kb)",Suayyip Salim Ozkurt <salim @mail.dumlupinar.e...,On the effects of the vanishing of the non-met...,Suayyip Salim Ozkurt,2 pages,,,,2.0
9201014,[],We generalize the known method for explicit co...,"Thu, 09 Jan 92 12:02:40 SET (17kb)",BERGLUN@CERNVM,A Generalized Construction of Mirror Manifolds,"P. Berglund and T. H\""ubsch",16 pages,,Nucl.Phys. B393 (1993) 377-391,26.0,
9203036,[],A method for quantizing the bidimensional N=2 ...,"Fri, 13 Mar 92 10:34 arg (10kb)",ALDAZABAL@arib51,On the quantization of the N=2 supersymmetric ...,"G.Aldazabal, J.M.Maldacena",16 pages,,Int.J.Mod.Phys. A8 (1993) 3359-3370,3.0,
9205104,[],We re-examine the geometry and algebraic struc...,"THU, 28 MAY 1992 20:40 EXP (10kb)",J.S.Park <PJESENAT@krysucc1>,"Universal Bundle, Generalized Russian Formula ...",Jae-Suk Park,"16 pages, harvmac TeX, ESENAT-92-07, (TeXnical...",,,,1.0
9210151,[],We introduce the covariant forms for the non-A...,"THU, 29 OCT 1992 17:18 EXP (9kb)",J.-S.Park <PJESENAT@KRYSUCC1>,"Zero-Modes, Covariant Anomaly Counterparts and...",Jae-Suk Park,"10 pages, ESENAT-92-08",,,1.0,
9304092,[],We consider a self-interacting scalar field th...,21 Apr 1993 11:24:39 +0000 (15kb),"""Guido Cognola, Dipartimento di Fisica, 38050 ...",Effective Lagrangian for self-interacting scal...,"Klaus Kirsten, Guido Cognola and Luciano Vanzo","14 pages, LaTex, UTF 293",,Phys.Rev. D48 (1993) 2813-2822,10.0,
9306091,[],We rewrite the action for $QCD_2$ in the light...,18 Jun 1993 17:21:52 +0200 (9kb),"""Igor Pesando ph +45-31424284-211;fax +45-3142...",The Master Field of QCD$_2$ and the 'T Hooft E...,"M. Cavicchi, P. Di Vecchia and I. Pesando","7 pages, Latex, NORDITA-93-41",,Mod.Phys.Lett. A8 (1993) 2427-2434; Erratum-ib...,10.0,1.0
9306160,[],We introduce the notion of ortho-symplectic su...,"Wed, 30 Jun 1993 10:12 EST (9kb)","""Judy Mack, University of Rochester, 716-275-4...",Super Triple Systems and Applications to Para-...,S. Okubo,"14 pages, Preprint No. UR-1312, ER-40685-762",,,,
9308106,[],Free and self-interacting scalar fields in the...,23 Aug 1993 14:38:16 +0000 (18kb),"""Guido Cognola, Dipartimento di Fisica, 38050 ...",Free and self-interacting scalar fields in the...,"Guido Cognola, Klaus Kirsten and Luciano Vanzo","20 Pages, RevTex, UTF300",,Phys.Rev. D49 (1994) 1029-1038,31.0,4.0
9310001,[],Using differential and integral calculi on the...,01 Oct 1993 11:23:22 +0300 (11kb),"""MASUD CHAICHIAN, TEL. 358-0-1918441; FAX 358-...",Q-Deformed Path Integral,M. Chaichian and A.P. Demichev,"14 pages, Latex, HU-SEFT R 1993-10",,Phys.Lett. B320 (1994) 273-280,2.0,2.0


In [14]:
papers.loc[9201014]

emails                                                        []
description    We generalize the known method for explicit co...
date                        Thu, 09 Jan 92 12:02:40 SET   (17kb)
from                                              BERGLUN@CERNVM
title             A Generalized Construction of Mirror Manifolds
authors                              P. Berglund and T. H\"ubsch
comments                                                16 pages
subj-class                                                  None
journal-ref                       Nucl.Phys. B393 (1993) 377-391
out_degree                                                    26
in_degree                                                    NaN
Name: 9201014, dtype: object

# Enrich Paper citations

In [15]:
citations = df.join(papers[['emails']], on='FromNodeId')
citations.columns = ['FromNodeId', 'ToNodeId', 'emails_from']

citations = citations.join(papers[['emails']], on='ToNodeId')
citations.columns = ['FromNodeId', 'ToNodeId', 'emails_from', 'emails_to']

explode_columns = ['emails_from', 'emails_to']

for ec in explode_columns:
    citations = explode(citations, [ec])

citations.drop_duplicates(inplace=True)
    
citations.head()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Unnamed: 0,FromNodeId,ToNodeId,emails_from,emails_to
0,1001,9304045,psa@math.duke.edu,derrick@bolvan.ph.utexas.edu
1,1001,9308122,psa@math.duke.edu,theisen@crnvma.cern.ch
2,1001,9309097,psa@math.duke.edu,aspin@guinness.ias.edu
3,1001,9311042,psa@math.duke.edu,dmrrsn@math.duke.edu
4,1001,9401139,psa@math.duke.edu,giveon@vms.huji.ac.il


In [16]:
citations["domain_from"], citations["tld_from"] = zip(*citations["emails_from"].map(domain_and_tld_from_email))
citations["domain_to"], citations["tld_to"] = zip(*citations["emails_to"].map(domain_and_tld_from_email))

citations.head()

Unnamed: 0,FromNodeId,ToNodeId,emails_from,emails_to,domain_from,tld_from,domain_to,tld_to
0,1001,9304045,psa@math.duke.edu,derrick@bolvan.ph.utexas.edu,duke.edu,edu,utexas.edu,edu
1,1001,9308122,psa@math.duke.edu,theisen@crnvma.cern.ch,duke.edu,edu,cern.ch,ch
2,1001,9309097,psa@math.duke.edu,aspin@guinness.ias.edu,duke.edu,edu,ias.edu,edu
3,1001,9311042,psa@math.duke.edu,dmrrsn@math.duke.edu,duke.edu,edu,duke.edu,edu
4,1001,9401139,psa@math.duke.edu,giveon@vms.huji.ac.il,duke.edu,edu,huji.ac.il,il


## TLD Aggregation

In [17]:
# pd.Series([item for sublist in papers.tlds for item in sublist])

# Flatten tlds
tld_series = pd.concat([citations.tld_from, citations.tld_to], axis=0) 

# Count different values
tld_df = tld_series.value_counts().sort_index().rename_axis('tld').reset_index(name='count')

# Add description column
tld_df['tlds_description'] = tld_df['tld'].map(lambda x: tlds_info[x] if x in tlds_info else None)

In [18]:
tld_df.sort_values('count', ascending=False).head(10)

Unnamed: 0,tld,count,tlds_description
25,edu,275167,Educational establishments
71,uk,58608,United Kingdom (United Kingdom of Great Britai...
44,jp,47076,Japan
22,de,39399,Germany (Federal Republic of)
12,ch,39214,Switzerland (Swiss Confederation)
42,it,38381,Italy (Italian Republic)
30,fr,24071,France (French Republic)
39,in,16554,India (Republic of)
46,kr,15245,Korea (Republic of) [South Korea]
28,es,14166,Spain (Kingdom of)


## Domain Aggregation _a.k.a. most influential institutions / labs_

In [19]:
#pd.Series([item for sublist in papers.domains for item in sublist])

# Flatten tlds
domain_series = pd.concat([citations.domain_from, citations.domain_to], axis=0) 

# Count different values
domain_df = domain_series.value_counts().sort_index().rename_axis('domain').reset_index(name='count')

# Sorting by count
domain_df.sort_values('count', ascending=False).head(10)

Unnamed: 0,domain,count
67,cern.ch,34504
245,ias.edu,28830
478,princeton.edu,27689
513,rutgers.edu,25897
219,harvard.edu,24112
674,ucsb.edu,23308
269,infn.it,22740
54,cam.ac.uk,17095
568,stanford.edu,14941
392,mit.edu,14903


## Save Dataframes

In [20]:
# Save dataframes
write(f"{DATASETS_FOLDER}/parquet/paper_citations.pq", citations)
write(f"{DATASETS_FOLDER}/parquet/papers.pq", papers)
write(f"{DATASETS_FOLDER}/parquet/tld_aggregation.pq", tld_df)
write(f"{DATASETS_FOLDER}/parquet/domain_aggregation.pq", domain_df)

  inferred_dtype = infer_dtype(column)
  inferred_dtype = infer_dtype(column)
