In [1]:
# `display` and `HTML` belong to the public `IPython.display` API; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Inject CSS that sets the notebook's `.container` element to 80% width.
display(HTML("<style>.container { width:80% !important; }</style>"))

In [2]:
import sys
sys.path.append('../../arxiv_cs/')

%load_ext autoreload
%autoreload 2

In [3]:
import json
import pandas as pd
import gzip
from pathlib import Path
from collections import Counter
import datetime
import re

from utils import extract_date

In [4]:
# Both data folders sit under the same relative `data` directory.
_DATA_ROOT = Path("../../data")
DATA_CLEAN_PATH = _DATA_ROOT / "clean"          # cleaned outputs written by this notebook
DATA_PROCESSED_PATH = _DATA_ROOT / "processed"  # input read by this notebook

In [5]:
# Load the processed arXiv CS records (gzip-compressed JSON).
# NOTE(review): the preview renders some ids as numbers (e.g. 704.0002, 704.005),
# which looks like `0704.0002` / `0704.0050` with zeros lost -- confirm whether
# this happens upstream or during JSON type inference.
processed_file = DATA_PROCESSED_PATH / "arxiv_cs.json.gz"
df = pd.read_json(processed_file)

df.head()

Unnamed: 0,id,authors,title,abstract,categories
0,704.0002,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-pe...",math.CO cs.CG
1,704.0046,"I. Csiszar, F. Hiai and D. Petz",A limit relation for entropy and channel capac...,"In a quantum mechanical model, Diosi, Feldmann...",quant-ph cs.IT math.IT
2,704.0047,T. Kosel and I. Grabec,Intelligent location of simultaneously active ...,The intelligent acoustic emission locator is d...,cs.NE cs.AI
3,704.005,T. Kosel and I. Grabec,Intelligent location of simultaneously active ...,Part I describes an intelligent acoustic emiss...,cs.NE cs.AI
4,704.0062,"Rastislav \v{S}r\'amek, Bro\v{n}a Brejov\'a, T...",On-line Viterbi Algorithm and Its Relationship...,"In this paper, we introduce the on-line Viterb...",cs.DS


#### Add the primary category (the first entry of the `categories` field) to the data frame

In [6]:
# `categories` is a whitespace-separated string; its first token is the primary category.
df['primary_cat'] = [cat_string.split()[0] for cat_string in df['categories']]

In [7]:
# Preview the first rows, now including the `primary_cat` column.
df.head(n=5)

Unnamed: 0,id,authors,title,abstract,categories,primary_cat
0,704.0002,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-pe...",math.CO cs.CG,math.CO
1,704.0046,"I. Csiszar, F. Hiai and D. Petz",A limit relation for entropy and channel capac...,"In a quantum mechanical model, Diosi, Feldmann...",quant-ph cs.IT math.IT,quant-ph
2,704.0047,T. Kosel and I. Grabec,Intelligent location of simultaneously active ...,The intelligent acoustic emission locator is d...,cs.NE cs.AI,cs.NE
3,704.005,T. Kosel and I. Grabec,Intelligent location of simultaneously active ...,Part I describes an intelligent acoustic emiss...,cs.NE cs.AI,cs.NE
4,704.0062,"Rastislav \v{S}r\'amek, Bro\v{n}a Brejov\'a, T...",On-line Viterbi Algorithm and Its Relationship...,"In this paper, we introduce the on-line Viterb...",cs.DS,cs.DS


#### Create field with date
* Identifiers since 1 April 2007 (0704-) have the year and month encoded in the first 4 digits of the id field.
* Identifiers up to March 2007 (9107-0703) also have the category encoded in the id field.
* See https://arxiv.org/help/arxiv_identifier

In [8]:
# `extract_date` is a project helper imported from `utils`; it receives the plain
# list of ids. Judging by the rendered output it yields one date per id, set to
# the first day of the month -- confirm in `utils`.
arxiv_ids = df['id'].tolist()
df['date'] = extract_date(arxiv_ids)

df.head()

Unnamed: 0,id,authors,title,abstract,categories,primary_cat,date
0,704.0002,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-pe...",math.CO cs.CG,math.CO,2007-04-01
1,704.0046,"I. Csiszar, F. Hiai and D. Petz",A limit relation for entropy and channel capac...,"In a quantum mechanical model, Diosi, Feldmann...",quant-ph cs.IT math.IT,quant-ph,2007-04-01
2,704.0047,T. Kosel and I. Grabec,Intelligent location of simultaneously active ...,The intelligent acoustic emission locator is d...,cs.NE cs.AI,cs.NE,2007-04-01
3,704.005,T. Kosel and I. Grabec,Intelligent location of simultaneously active ...,Part I describes an intelligent acoustic emiss...,cs.NE cs.AI,cs.NE,2007-04-01
4,704.0062,"Rastislav \v{S}r\'amek, Bro\v{n}a Brejov\'a, T...",On-line Viterbi Algorithm and Its Relationship...,"In this paper, we introduce the on-line Viterb...",cs.DS,cs.DS,2007-04-01


In [9]:
# Earliest and latest date present in the data.
(df['date'].min(), df['date'].max())

(datetime.date(1993, 1, 1), datetime.date(2019, 11, 1))

In [10]:
# Look at the last rows as well (old-style identifiers such as `quant-ph/...`).
df.tail(n=5)

Unnamed: 0,id,authors,title,abstract,categories,primary_cat,date
241271,quant-ph/9909094,"E. Knill, R. Laflamme",Quantum Computation and Quadratically Signed W...,We prove that quantum computation is polynomia...,quant-ph cs.CC,quant-ph,1999-09-01
241272,quant-ph/9910033,"Edith Hemaspaandra (RIT), Lane A. Hemaspaandra...",Almost-Everywhere Superiority for Quantum Comp...,Simon as extended by Brassard and H{\o}yer sho...,quant-ph cs.CC,quant-ph,1999-10-01
241273,quant-ph/9910087,"Adrian Kent (DAMTP, University of Cambridge)",Unconditionally Secure Commitment of a Certifi...,In a secure bit commitment protocol involving ...,quant-ph cs.CR,quant-ph,1999-10-01
241274,quant-ph/9911043,Lucien Hardy (The Perimeter Institute) and Adr...,Cheat Sensitive Quantum Bit Commitment,We define cheat sensitive cryptographic protoc...,quant-ph cs.CR,quant-ph,1999-11-01
241275,quant-ph/9912100,Masanori Ohya and Igor V. Volovich,"Quantum Computing, NP-complete Problems and Ch...",An approach to the solution of NP-complete pro...,quant-ph chao-dyn cond-mat.mes-hall cs.CC nlin...,quant-ph,1999-12-01


#### Remove records from November 2019

In [11]:
# Number of records dated after 31 Oct 2019.
# Vectorized `.sum()` replaces the builtin `sum`, which walks the Series one
# element at a time in Python; `int(...)` keeps the displayed value a plain int.
int((df.date > datetime.date(2019, 10, 31)).sum())

947

In [12]:
# Index labels of every record dated after 31 Oct 2019.
nov_idxs = df.index[df['date'] > datetime.date(2019, 10, 31)]

In [13]:
# Remove the November rows by index label (explicit `index=` makes the axis unambiguous).
df.drop(index=nov_idxs, inplace=True)
# Sanity check: no record after 31 Oct 2019 should remain; also show the new row count.
# Vectorized `.sum()` instead of the builtin `sum`; `int(...)` keeps a plain int in the output.
int((df.date > datetime.date(2019, 10, 31)).sum()), len(df)

(0, 240329)

In [14]:
# Summary (count / unique / top / freq) of every column after dropping the November rows.
df.describe()

Unnamed: 0,id,authors,title,abstract,categories,primary_cat,date
count,240329.0,240329,240329,240329,240329,240329,240329
unique,240329.0,207388,240020,240100,20507,157,316
top,1811.06669,David Eppstein,Discussion: Latent variable graphical model se...,This paper has been withdrawn.,cs.CV,cs.CV,2019-10-01
freq,1.0,55,4,10,19512,27089,5450


In [15]:
# Summarise only the `abstract` column instead of describing the whole frame
# and then discarding every other column.
df['abstract'].describe()

count                             240329
unique                            240100
top       This paper has been withdrawn.
freq                                  10
Name: abstract, dtype: object

#### There seem to be some duplicates; some of them are withdrawn papers. Let's remove the duplicates.

In [16]:
# Number of abstracts mentioning "has been withdrawn", ignoring letter case.
# The phrase is matched literally (`regex=False`) and counted with the
# vectorized `.sum()`; `int(...)` keeps the displayed value a plain int.
withdrawn_mask = df.abstract.str.lower().str.contains("has been withdrawn", regex=False)
int(withdrawn_mask.sum())

142

In [17]:
# Number of rows whose abstract repeats one seen in an earlier row.
# Vectorized `.sum()` instead of the builtin `sum`; `int(...)` keeps a plain int in the output.
int(df.abstract.duplicated().sum())

229

In [18]:
# Rows whose abstract already occurred in an earlier row.
df.loc[df['abstract'].duplicated()]

Unnamed: 0,id,authors,title,abstract,categories,primary_cat,date
564,0706.2732,"Cyrille Chavet (LESTER), Philippe Coussy (LEST...",A Design Methodology for Space-Time Adapter,This paper presents a solution to efficiently ...,cs.AR,cs.AR,2007-06-01
1256,0709.4303,Yong Wang,Security Analyses of One-time System,This paper has been withdrawn,cs.CR,cs.CR,2007-09-01
1258,0709.4420,Yong Wang,Confirmation of Shannon's Mistake about Perfec...,This paper has been withdrawn,cs.CR,cs.CR,2007-09-01
1996,0712.0392,"Saeed Akhavan-Astaneh, Saeed Gazor",Collaborative Gain in Resource Sharing Communi...,This paper has been withdrawn,cs.IT math.IT,cs.IT,2007-12-01
2989,0803.1207,Hang Dinh,Serious Flaws in Korf et al.'s Analysis on Tim...,This paper has been withdrawn.,cs.AI,cs.AI,2008-03-01
...,...,...,...,...,...,...,...
239666,cs/0703086,"John Kouvakis, Fotis Georgatos",A Technical Report On Grid Benchmarking using ...,"Grids include heterogeneous resources, which a...",cs.PF,cs.PF,2007-03-01
239683,cs/0703103,"Jia Tao, Shashi Gadia, Tsz Shing Cheng",Concept of a Value in Multilevel Security Data...,This paper has been withdrawn.,cs.DB,cs.DB,2007-03-01
239826,cs/9609102,D. J. Litman,Cue Phrase Classification Using Machine Learning,Cue phrases may be used in a discourse sense t...,cs.AI,cs.AI,1996-09-01
239932,cs/9809061,"Tao Jiang (McMaster U.), Ming Li (U of Waterlo...",New Applications of the Incompressibility Meth...,The incompressibility method is an elementary ...,cs.CC cs.DM,cs.CC,1998-09-01


In [19]:
# Keep only the first record for each distinct title (`keep='first'` is the
# pandas default), then re-check the summary.
df.drop_duplicates(subset=['title'], inplace=True)
df.describe()

Unnamed: 0,id,authors,title,abstract,categories,primary_cat,date
count,240020.0,240020,240020,240020,240020,240020,240020
unique,240020.0,207229,240020,239861,20501,157,316
top,1811.06669,David Eppstein,Local Multiple Directional Pattern of Palmprin...,This paper has been withdrawn.,cs.CV,cs.CV,2019-10-01
freq,1.0,55,1,10,19500,27070,5446


In [20]:
# Same again for abstracts: the first occurrence is kept (`keep='first'` is
# the pandas default) and later repeats are dropped.
df.drop_duplicates(subset=['abstract'], inplace=True)
df.describe()

Unnamed: 0,id,authors,title,abstract,categories,primary_cat,date
count,239861.0,239861,239861,239861,239861,239861,239861
unique,239861.0,207142,239861,239861,20491,157,316
top,1811.06669,David Eppstein,Local Multiple Directional Pattern of Palmprin...,While standard cell layouts are drawn with min...,cs.CV,cs.CV,2019-10-01
freq,1.0,55,1,1,19492,27058,5446


#### Subset of October 2019 papers

In [21]:
# Select the rows dated after 30 Sep 2019 and leave the helper `date` column
# out of the subset; `drop` returns a new frame, so `df` itself is untouched.
is_october = df['date'] > datetime.date(2019, 9, 30)
oct_df = df[is_october].drop(columns='date')

oct_df.head()

Unnamed: 0,id,authors,title,abstract,categories,primary_cat
225715,1910.00004,"Carl Yang, Yichen Feng, Pan Li, Yu Shi, Jiawei...",Meta-Graph Based HIN Spectral Embedding: Metho...,"In this work, we propose to study the utility ...",cs.LG cs.SI,cs.LG
225716,1910.00005,"Carl Yang, Jieyu Zhang, Jiawei Han",Neural Embedding Propagation on Heterogeneous ...,Classification is one of the most important pr...,cs.LG cs.SI,cs.LG
225717,1910.00019,Sho Yaida,Non-Gaussian processes and neural networks at ...,Gaussian processes are ubiquitous in nature an...,stat.ML cond-mat.dis-nn cs.LG hep-th,stat.ML
225718,1910.00024,"Shuo-Hui Li, Chen-Xiao Dong, Linfeng Zhang, an...",Neural Canonical Transformation with Symplecti...,Canonical transformation plays a fundamental r...,cond-mat.stat-mech cs.LG physics.comp-ph stat.ML,cond-mat.stat-mech
225719,1910.00032,"Ye Lyu, George Vosselman, Gui-Song Xia, Michae...",LIP: Learning Instance Propagation for Video O...,"In recent years, the task of segmenting foregr...",cs.CV,cs.CV


In [22]:
# Persist the October subset as a JSON array of records.
october_file = DATA_CLEAN_PATH / "arxiv_cs_october19.json"
oct_df.to_json(october_file, orient='records')

In [23]:
# The helper `date` column is not part of the saved dataset; remove it in place.
del df['date']
df.head()

Unnamed: 0,id,authors,title,abstract,categories,primary_cat
0,704.0002,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-pe...",math.CO cs.CG,math.CO
1,704.0046,"I. Csiszar, F. Hiai and D. Petz",A limit relation for entropy and channel capac...,"In a quantum mechanical model, Diosi, Feldmann...",quant-ph cs.IT math.IT,quant-ph
2,704.0047,T. Kosel and I. Grabec,Intelligent location of simultaneously active ...,The intelligent acoustic emission locator is d...,cs.NE cs.AI,cs.NE
3,704.005,T. Kosel and I. Grabec,Intelligent location of simultaneously active ...,Part I describes an intelligent acoustic emiss...,cs.NE cs.AI,cs.NE
4,704.0062,"Rastislav \v{S}r\'amek, Bro\v{n}a Brejov\'a, T...",On-line Viterbi Algorithm and Its Relationship...,"In this paper, we introduce the on-line Viterb...",cs.DS,cs.DS


In [24]:
# Persist the cleaned dataset as gzip-compressed JSON records.
clean_file = DATA_CLEAN_PATH / "arxiv_cs.json.gz"
df.to_json(clean_file, orient='records', compression='gzip')


#### TODO: normalize authors

In [25]:
# Read the cleaned dataset back from disk to check what was written.
clean_file = DATA_CLEAN_PATH / "arxiv_cs.json.gz"
df = pd.read_json(clean_file)
df.head()

Unnamed: 0,id,authors,title,abstract,categories,primary_cat
0,704.0002,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-pe...",math.CO cs.CG,math.CO
1,704.0046,"I. Csiszar, F. Hiai and D. Petz",A limit relation for entropy and channel capac...,"In a quantum mechanical model, Diosi, Feldmann...",quant-ph cs.IT math.IT,quant-ph
2,704.0047,T. Kosel and I. Grabec,Intelligent location of simultaneously active ...,The intelligent acoustic emission locator is d...,cs.NE cs.AI,cs.NE
3,704.005,T. Kosel and I. Grabec,Intelligent location of simultaneously active ...,Part I describes an intelligent acoustic emiss...,cs.NE cs.AI,cs.NE
4,704.0062,"Rastislav \v{S}r\'amek, Bro\v{n}a Brejov\'a, T...",On-line Viterbi Algorithm and Its Relationship...,"In this paper, we introduce the on-line Viterb...",cs.DS,cs.DS


In [26]:
# Row count of the reloaded clean dataset.
len(df.index)

239861

In [28]:
# Read the October subset back from disk.
october_file = DATA_CLEAN_PATH / "arxiv_cs_october19.json"
df_oct = pd.read_json(october_file)
df_oct.head()

Unnamed: 0,id,authors,title,abstract,categories,primary_cat
0,1910.00004,"Carl Yang, Yichen Feng, Pan Li, Yu Shi, Jiawei...",Meta-Graph Based HIN Spectral Embedding: Metho...,"In this work, we propose to study the utility ...",cs.LG cs.SI,cs.LG
1,1910.00005,"Carl Yang, Jieyu Zhang, Jiawei Han",Neural Embedding Propagation on Heterogeneous ...,Classification is one of the most important pr...,cs.LG cs.SI,cs.LG
2,1910.00019,Sho Yaida,Non-Gaussian processes and neural networks at ...,Gaussian processes are ubiquitous in nature an...,stat.ML cond-mat.dis-nn cs.LG hep-th,stat.ML
3,1910.00024,"Shuo-Hui Li, Chen-Xiao Dong, Linfeng Zhang, an...",Neural Canonical Transformation with Symplecti...,Canonical transformation plays a fundamental r...,cond-mat.stat-mech cs.LG physics.comp-ph stat.ML,cond-mat.stat-mech
4,1910.00032,"Ye Lyu, George Vosselman, Gui-Song Xia, Michae...",LIP: Learning Instance Propagation for Video O...,"In recent years, the task of segmenting foregr...",cs.CV,cs.CV


In [29]:
# Row count of the October subset.
len(df_oct.index)

5446

In [36]:
# Draw 20 October papers with a fixed seed and store them as JSON records.
sample_file = DATA_CLEAN_PATH / "arxiv_cs_october19_sample_20.json"
october_sample = df_oct.sample(n=20, random_state=19)
october_sample.to_json(sample_file, orient='records')

In [37]:
# Parse the sample file with the plain `json` module (a list of dicts rather than a DataFrame).
sample_file = DATA_CLEAN_PATH / "arxiv_cs_october19_sample_20.json"
ds = json.loads(sample_file.read_text())

In [38]:
# Show the records loaded from the 20-paper sample file.
ds

[{'id': 1910.00888,
  'authors': 'Thomas Pinetz, Daniel Soukup and Thomas Pock',
  'title': 'On the estimation of the Wasserstein distance in generative models',
  'abstract': 'Generative Adversarial Networks (GANs) have been used to model the underlying probability distribution of sample based datasets. GANs are notoriuos for training difficulties and their dependence on arbitrary hyperparameters. One recent improvement in GAN literature is to use the Wasserstein distance as loss function leading to Wasserstein Generative Adversarial Networks (WGANs). Using this as a basis, we show various ways in which the Wasserstein distance is estimated for the task of generative modelling. Additionally, the secrets in training such models are shown and summarized at the end of this work. Where applicable, we extend current works to different algorithms, different cost functions, and different regularization schemes to improve generative models.',
  'categories': 'cs.LG stat.ML',
  'primary_cat': 

In [33]:
# Inspect the first record of the sample.
ds[0]

{'id': 1910.00888,
 'authors': 'Thomas Pinetz, Daniel Soukup and Thomas Pock',
 'title': 'On the estimation of the Wasserstein distance in generative models',
 'abstract': 'Generative Adversarial Networks (GANs) have been used to model the underlying probability distribution of sample based datasets. GANs are notoriuos for training difficulties and their dependence on arbitrary hyperparameters. One recent improvement in GAN literature is to use the Wasserstein distance as loss function leading to Wasserstein Generative Adversarial Networks (WGANs). Using this as a basis, we show various ways in which the Wasserstein distance is estimated for the task of generative modelling. Additionally, the secrets in training such models are shown and summarized at the end of this work. Where applicable, we extend current works to different algorithms, different cost functions, and different regularization schemes to improve generative models.',
 'categories': 'cs.LG stat.ML',
 'primary_cat': 'cs.LG

In [39]:
# Load the cleaned dataset from disk again.
clean_file = DATA_CLEAN_PATH / "arxiv_cs.json.gz"
df = pd.read_json(clean_file)
df.head()

Unnamed: 0,id,authors,title,abstract,categories,primary_cat
0,704.0002,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-pe...",math.CO cs.CG,math.CO
1,704.0046,"I. Csiszar, F. Hiai and D. Petz",A limit relation for entropy and channel capac...,"In a quantum mechanical model, Diosi, Feldmann...",quant-ph cs.IT math.IT,quant-ph
2,704.0047,T. Kosel and I. Grabec,Intelligent location of simultaneously active ...,The intelligent acoustic emission locator is d...,cs.NE cs.AI,cs.NE
3,704.005,T. Kosel and I. Grabec,Intelligent location of simultaneously active ...,Part I describes an intelligent acoustic emiss...,cs.NE cs.AI,cs.NE
4,704.0062,"Rastislav \v{S}r\'amek, Bro\v{n}a Brejov\'a, T...",On-line Viterbi Algorithm and Its Relationship...,"In this paper, we introduce the on-line Viterb...",cs.DS,cs.DS
