In [1]:
import os
import re

from bokeh.io import output_file, output_notebook
from bokeh.models.widgets import DataTable, DateFormatter, TableColumn
from bokeh.models import ColumnDataSource
from bokeh.plotting import show
from bokeh.plotting import figure
from fastparquet import write
import numpy as np
import pandas as pd
import snappy

output_notebook()

In [2]:
!head -n 10 datasets/cit-HepTh.txt

# Directed graph (each unordered pair of nodes is saved once): Cit-HepTh.txt 
# Paper citation network of Arxiv High Energy Physics Theory category
# Nodes: 27770 Edges: 352807
# FromNodeId	ToNodeId
1001	9304045
1001	9308122
1001	9309097
1001	9311042
1001	9401139
1001	9404151


## Initialise variables

In [3]:
DATASETS_FOLDER = "datasets"
ABSTRACTS_FOLDER = "abstracts"

ABSTRACTS_FOLDER_PATH = f"{DATASETS_FOLDER}/{ABSTRACTS_FOLDER}/"

## Processing functions

In [4]:
def extract_text_from_abstract(text):
    info = {}
    fields = ['Date:', "From:", "Title:", "Authors:", "Comments:", "Subj-class:", "Journal-ref:"]
    
    for field in fields:
        match = re.search(f"[\n\r].*{field}\s*([^\n\r]*)", text, re.I)
        value = None
        if match is not None:
            value = match.group(1)
        
        info[field.replace(':', '').lower()] = value
        
    return info

ignore_emails = ['g@c']
ignore_tlds = ['g@c', '']
ignore_domains = ['c', '']

def domain_and_tld_from_email(email):
    domain = None
    tld = None
    email = email.lower()
    if '.' in email:
                        
        # Remove cases when email finishes with a period
        if email[-1] == '.':
            email = email[:-1]

        domain = email.split('@')[1].lower().strip()

        if '.ac.' in domain or '.co.' in domain or '.edu.' in domain or '.gov.' in domain or '.com.' in domain:
            domain = ".".join(domain.split('.')[-3:])
        else:
            domain = ".".join(domain.split('.')[-2:])

        tld  = email.split('.')[-1].lower().strip()

        if tld in ignore_tlds:
            tld = None

        if domain in ignore_domains:
            domain = None
        
    
    return domain, tld

# Code below from https://stackoverflow.com/a/40449726/9527459
def explode(df, lst_cols, fill_value=''):
    # make sure `lst_cols` is a list
    if lst_cols and not isinstance(lst_cols, list):
        lst_cols = [lst_cols]
    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)

    # calculate lengths of lists
    lens = df[lst_cols[0]].str.len()

    if (lens > 0).all():
        # ALL lists in cells aren't empty
        return pd.DataFrame({
            col:np.repeat(df[col].values, lens)
            for col in idx_cols
        }).assign(**{col:np.concatenate(df[col].values) for col in lst_cols}) \
          .loc[:, df.columns]
    else:
        # at least one list in cells is empty
        return pd.DataFrame({
            col:np.repeat(df[col].values, lens)
            for col in idx_cols
        }).assign(**{col:np.concatenate(df[col].values) for col in lst_cols}) \
          .append(df.loc[lens==0, idx_cols]).fillna(fill_value) \
          .loc[:, df.columns]

In [11]:
def display_df_with_bokeh(df, columns=None, range_of_records=slice(0,20), include_index=False):
    
    table_columns = []
    
    if columns is None:
        columns = {column: column for column in df}
    
    if range_of_records is not None:
        df = df[range_of_records]
    
    if include_index:
        table_columns.append(TableColumn(field=df.index.name, title=df.index.name))
    
    for field, title in columns.items():
        table_columns.append(TableColumn(field=field, title=title))
    
    
    data_table = DataTable(columns=table_columns, source=ColumnDataSource(df)) # bokeh table

    show(data_table)

## Process paper citations

In [6]:
with open(f"{DATASETS_FOLDER}/cit-HepTh.txt", 'r') as f:
    df = pd.read_csv(f,sep='\t',skiprows=(0,1,2))
    
# Rename columns
df.columns = ['FromNodeId', 'ToNodeId']

tlds_csv = pd.read_csv(f"{DATASETS_FOLDER}/tlds.csv", header=None, index_col=0, squeeze=True).to_dict()
tlds_info = tlds_csv[1]

In [7]:
display_df_with_bokeh(df)

In [8]:
# Paper that cites most papers
out_degree = df.groupby('FromNodeId').count().sort_values('ToNodeId', ascending = False)

display_df_with_bokeh(out_degree, columns={
    "FromNodeId": "Paper",
    "ToNodeId": "Papers cited"
})

In [9]:
hist, edges = np.histogram(out_degree['ToNodeId'], bins=100, range = [0, 600])

# Create the blank plot
p = figure(plot_height = 500, plot_width = 900, 
           title = 'Citations histogram',
          x_axis_label = 'Papers cited', 
           y_axis_label = 'Papers')

# Add a quad glyph
p.quad(bottom=0, top=hist, 
       left=edges[:-1], right=edges[1:],
       fill_color= 'navy', line_color='white')

# Show the plot
show(p)

In [14]:
# Paper cited the most -> Most influential
in_degree = df.groupby('ToNodeId').count().sort_values('FromNodeId', ascending = False)

display_df_with_bokeh(in_degree, columns={
    "ToNodeId": "Paper",
    "FromNodeId": "Paper citations"
})

In [15]:
hist, edges = np.histogram(in_degree['FromNodeId'], bins=100)

# Create the blank plot
p = figure(plot_height = 500, plot_width = 900, 
           title = 'Citations histogram (in_degree)',
          x_axis_label = 'Paper citations', 
           y_axis_label = 'Papers')

# Add a quad glyph
p.quad(bottom=0, top=hist, 
       left=edges[:-1], right=edges[1:],
       fill_color= 'navy', line_color='white')

# Show the plot
show(p)

In [16]:
degrees = pd.concat([in_degree, out_degree], axis=1, sort=False)
degrees.columns = ['out_degree', 'in_degree']
degrees.index.name = 'paper'

display_df_with_bokeh(degrees, include_index=True)

# Process paper abstracts

In [17]:
abstracts_info = {}

for dir_name in os.listdir(f"{ABSTRACTS_FOLDER_PATH}"):
    try:
        year = int(dir_name)
        
        for f_name in os.listdir(f"{ABSTRACTS_FOLDER_PATH}/{year}"):
            with open(f"{ABSTRACTS_FOLDER_PATH}/{year}/{f_name}", 'r') as f:
                abstract = f.read()
                
                # Parts of the abstract
                abstract_parts = abstract.split('\\\\')
                
                paper_description = (abstract_parts[2] if len(abstract_parts) > 1 else "").strip()
                
                # Process emails
                emails_found = re.findall(r'[\w\.-]+@[\w\.-]+', abstract)
                
                emails = []
                
                for email in emails_found:
                    email = email.lower()
                    
                    if '.' in email:
                        # Remove cases when email finishes with a period
                        if email[-1] == '.':
                            email = email[:-1]
                        
                        emails.append(email)
                
                
                key = int(f_name.replace(".abs", ""))
                
                abstracts_info[key] = {
                    "emails": list(set(emails)),
#                     "tlds": list(set(tlds)),
#                     "domains": list(set(domains)),
                    "description": paper_description
                }
                
                abstracts_info[key].update(extract_text_from_abstract(abstract))
                
                
    except ValueError:
        pass 

In [18]:
papers = pd.DataFrame.from_dict(abstracts_info, orient='index')

papers = pd.concat([papers, degrees], axis=1, sort=False)

In [19]:
papers.head()

Unnamed: 0,emails,description,date,from,title,authors,comments,subj-class,journal-ref,out_degree,in_degree
1001,[psa@math.duke.edu],These are notes based on lectures given at TAS...,"Sat, 1 Jan 2000 00:02:31 GMT (84kb)",Paul S. Aspinwall <psa@math.duke.edu>,"Compactification, Geometry and Duality: N=2",Paul S. Aspinwall,"82 pages, 8 figures, LaTeX2e, TASI99, refs add...",,,10.0,83.0
1002,[pope@absinthe.physics.tamu.edu],We point out that massive gauged supergravity ...,"Mon, 3 Jan 2000 22:38:03 GMT (64kb)",Chris Pope <pope@absinthe.physics.tamu.edu>,Domain Walls and Massive Gauged Supergravity P...,"M. Cvetic, H. Lu and C.N. Pope","latex file, 11 pages, 3 figures",,Class.Quant.Grav. 17 (2000) 4867-4876,43.0,36.0
1003,[kang@physics.inje.ac.kr],"Recently, Ivanov and Volovich (hep-th/9912242)...","Sat, 1 Jan 2000 06:14:51 GMT (2kb)",KANG Gungwon <kang@physics.inje.ac.kr>,"Comment on ""Metric Fluctuations in Brane Worlds""",Y.S. Myung and Gungwon Kang,"4 pages, revtex",,,15.0,1.0
1004,[adam@godel.math.missouri.edu],"Quantum fields responding to ""moving mirrors"" ...","Sat, 1 Jan 2000 19:57:21 GMT (13kb)",Adam D. Helfer <adam@godel.math.missouri.edu>,Moving Mirrors and Thermodynamic Paradoxes,Adam D. Helfer,"7 pages, Revtex with Latex2e",,Phys.Rev. D63 (2001) 025016,1.0,1.0
1005,[jfuchs@mail.desy.de],Various aspects of spaces of chiral blocks are...,"Sun, 2 Jan 2000 17:06:40 GMT (24kb)",Juergen Fuchs <jfuchs@mail.desy.de>,Bundles of chiral blocks and boundary conditio...,"J. Fuchs, C. Schweigert","18 pages, LaTeX2e; slightly extended version o...",,,1.0,21.0


In [20]:
more_than_one_email =  [True if len(e) == 0 else False for e in papers.emails]
papers[more_than_one_email]

Unnamed: 0,emails,description,date,from,title,authors,comments,subj-class,journal-ref,out_degree,in_degree
207008,[],Effective string field equations with zero-con...,"Mon, 1 Jul 2002 11:31:25 GMT (1kb)",Suayyip Salim Ozkurt <salim @mail.dumlupinar.e...,On the effects of the vanishing of the non-met...,Suayyip Salim Ozkurt,2 pages,,,,2.0
9201014,[],We generalize the known method for explicit co...,"Thu, 09 Jan 92 12:02:40 SET (17kb)",BERGLUN@CERNVM,A Generalized Construction of Mirror Manifolds,"P. Berglund and T. H\""ubsch",16 pages,,Nucl.Phys. B393 (1993) 377-391,26.0,
9203036,[],A method for quantizing the bidimensional N=2 ...,"Fri, 13 Mar 92 10:34 arg (10kb)",ALDAZABAL@arib51,On the quantization of the N=2 supersymmetric ...,"G.Aldazabal, J.M.Maldacena",16 pages,,Int.J.Mod.Phys. A8 (1993) 3359-3370,3.0,
9205104,[],We re-examine the geometry and algebraic struc...,"THU, 28 MAY 1992 20:40 EXP (10kb)",J.S.Park <PJESENAT@krysucc1>,"Universal Bundle, Generalized Russian Formula ...",Jae-Suk Park,"16 pages, harvmac TeX, ESENAT-92-07, (TeXnical...",,,,1.0
9210151,[],We introduce the covariant forms for the non-A...,"THU, 29 OCT 1992 17:18 EXP (9kb)",J.-S.Park <PJESENAT@KRYSUCC1>,"Zero-Modes, Covariant Anomaly Counterparts and...",Jae-Suk Park,"10 pages, ESENAT-92-08",,,1.0,
9304092,[],We consider a self-interacting scalar field th...,21 Apr 1993 11:24:39 +0000 (15kb),"""Guido Cognola, Dipartimento di Fisica, 38050 ...",Effective Lagrangian for self-interacting scal...,"Klaus Kirsten, Guido Cognola and Luciano Vanzo","14 pages, LaTex, UTF 293",,Phys.Rev. D48 (1993) 2813-2822,10.0,
9306091,[],We rewrite the action for $QCD_2$ in the light...,18 Jun 1993 17:21:52 +0200 (9kb),"""Igor Pesando ph +45-31424284-211;fax +45-3142...",The Master Field of QCD$_2$ and the 'T Hooft E...,"M. Cavicchi, P. Di Vecchia and I. Pesando","7 pages, Latex, NORDITA-93-41",,Mod.Phys.Lett. A8 (1993) 2427-2434; Erratum-ib...,10.0,1.0
9306160,[],We introduce the notion of ortho-symplectic su...,"Wed, 30 Jun 1993 10:12 EST (9kb)","""Judy Mack, University of Rochester, 716-275-4...",Super Triple Systems and Applications to Para-...,S. Okubo,"14 pages, Preprint No. UR-1312, ER-40685-762",,,,
9308106,[],Free and self-interacting scalar fields in the...,23 Aug 1993 14:38:16 +0000 (18kb),"""Guido Cognola, Dipartimento di Fisica, 38050 ...",Free and self-interacting scalar fields in the...,"Guido Cognola, Klaus Kirsten and Luciano Vanzo","20 Pages, RevTex, UTF300",,Phys.Rev. D49 (1994) 1029-1038,31.0,4.0
9310001,[],Using differential and integral calculi on the...,01 Oct 1993 11:23:22 +0300 (11kb),"""MASUD CHAICHIAN, TEL. 358-0-1918441; FAX 358-...",Q-Deformed Path Integral,M. Chaichian and A.P. Demichev,"14 pages, Latex, HU-SEFT R 1993-10",,Phys.Lett. B320 (1994) 273-280,2.0,2.0


# Enrich Paper citations

In [21]:
citations = df.join(papers[['emails']], on='FromNodeId')
citations.columns = ['FromNodeId', 'ToNodeId', 'emails_from']

citations = citations.join(papers[['emails']], on='ToNodeId')
citations.columns = ['FromNodeId', 'ToNodeId', 'emails_from', 'emails_to']

explode_columns = ['emails_from', 'emails_to']

for ec in explode_columns:
    citations = explode(citations, [ec])

citations.drop_duplicates(inplace=True)
    
display_df_with_bokeh(citations.head(20))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [22]:
citations["domain_from"], citations["tld_from"] = zip(*citations["emails_from"].map(domain_and_tld_from_email))
citations["domain_to"], citations["tld_to"] = zip(*citations["emails_to"].map(domain_and_tld_from_email))

display_df_with_bokeh(citations)

## TLD Aggregation

In [23]:
# pd.Series([item for sublist in papers.tlds for item in sublist])

# Flatten tlds
tld_series = pd.concat([citations.tld_from, citations.tld_to], axis=0) 

# Count different values
tld_df = tld_series.value_counts().sort_index().rename_axis('tld').reset_index(name='count')

# Add description column
tld_df['tlds_description'] = tld_df['tld'].map(lambda x: tlds_info[x] if x in tlds_info else None)

In [24]:
display_df_with_bokeh(tld_df.sort_values('count', ascending=False))

## Domain Aggregation _a.k.a. most influential institutions / labs_

In [25]:
#pd.Series([item for sublist in papers.domains for item in sublist])

# Flatten tlds
domain_series = pd.concat([citations.domain_from, citations.domain_to], axis=0) 

# Count different values
domain_df = domain_series.value_counts().sort_index().rename_axis('domain').reset_index(name='count')

# Sorting by count
display_df_with_bokeh(domain_df.sort_values('count', ascending=False))

## Institutions per country
![alt text](visualisation/screenshots/institutions_per_country.jpeg "Title")

## Save Dataframes

In [None]:
# Save dataframes
write(f"{DATASETS_FOLDER}/parquet/paper_citations.pq", citations)
write(f"{DATASETS_FOLDER}/parquet/papers.pq", papers)
write(f"{DATASETS_FOLDER}/parquet/tld_aggregation.pq", tld_df)
write(f"{DATASETS_FOLDER}/parquet/domain_aggregation.pq", domain_df)