In [1]:
# Inject a CSS rule that sets elements with class `container` to 80% width
# (presumably the notebook page container, so wide outputs get more room).
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from the internal `IPython.core.display` path is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

In [2]:
import sys
# Add the project source directory to the import path so its modules can be
# imported from this notebook (presumably where `utils` lives — TODO confirm).
sys.path.append('../../arxiv_cs/')

# Load IPython's autoreload extension; mode 2 reloads modules before each cell
# execution so edits to imported project code are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [3]:
import json
import pandas as pd
import gzip
from pathlib import Path
from collections import Counter
import datetime
import re

from utils import extract_date

In [4]:
# Data directories, expressed relative to the notebook's working directory.
DATA_ROOT = Path("../../data")
DATA_CLEAN_PATH = DATA_ROOT / "clean"          # cleaned dataset and sample are written here
DATA_PROCESSED_PATH = DATA_ROOT / "processed"  # input dataset is read from here
DATA_TEST_PATH = DATA_ROOT / "test"            # CSV exports are written here

In [5]:
# Load the processed arXiv CS records.
# Read `id` explicitly as text: new-style arXiv identifiers such as "0704.0050"
# look numeric, and letting read_json infer the dtype could coerce them to floats
# (losing leading/trailing zeros) whenever no non-numeric id is present.
df = pd.read_json(DATA_PROCESSED_PATH / "arxiv_cs.json.gz", dtype={"id": str})

df.head()

Unnamed: 0,id,authors,title,abstract,categories
0,704.0002,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-pe...",math.CO cs.CG
1,704.0046,"I. Csiszar, F. Hiai and D. Petz",A limit relation for entropy and channel capac...,"In a quantum mechanical model, Diosi, Feldmann...",quant-ph cs.IT math.IT
2,704.0047,T. Kosel and I. Grabec,Intelligent location of simultaneously active ...,The intelligent acoustic emission locator is d...,cs.NE cs.AI
3,704.005,T. Kosel and I. Grabec,Intelligent location of simultaneously active ...,Part I describes an intelligent acoustic emiss...,cs.NE cs.AI
4,704.0062,"Rastislav \v{S}r\'amek, Bro\v{n}a Brejov\'a, T...",On-line Viterbi Algorithm and Its Relationship...,"In this paper, we introduce the on-line Viterb...",cs.DS


#### Create field with date
* Identifiers since 1 April 2007 (0704-) have year and month encoded in first 4 digits of id field.
* Identifiers up to March 2007 (9107-0703) also have the category encoded in the id field.
* See https://arxiv.org/help/arxiv_identifier

In [6]:
# Build a `date` column by passing the list of arXiv identifiers to the
# project helper `extract_date`, then preview the result.
arxiv_ids = df["id"].tolist()
df["date"] = extract_date(arxiv_ids)

df.head(n=5)

Unnamed: 0,id,authors,title,abstract,categories,date
0,704.0002,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-pe...",math.CO cs.CG,2007-04-01
1,704.0046,"I. Csiszar, F. Hiai and D. Petz",A limit relation for entropy and channel capac...,"In a quantum mechanical model, Diosi, Feldmann...",quant-ph cs.IT math.IT,2007-04-01
2,704.0047,T. Kosel and I. Grabec,Intelligent location of simultaneously active ...,The intelligent acoustic emission locator is d...,cs.NE cs.AI,2007-04-01
3,704.005,T. Kosel and I. Grabec,Intelligent location of simultaneously active ...,Part I describes an intelligent acoustic emiss...,cs.NE cs.AI,2007-04-01
4,704.0062,"Rastislav \v{S}r\'amek, Bro\v{n}a Brejov\'a, T...",On-line Viterbi Algorithm and Its Relationship...,"In this paper, we introduce the on-line Viterb...",cs.DS,2007-04-01


In [7]:
# Date range covered by the data, shown as an (earliest, latest) tuple.
dates = df["date"]
(dates.min(), dates.max())

(datetime.date(1993, 1, 1), datetime.date(2019, 11, 1))

#### Remove records from November 2019

In [8]:
# Number of records dated after 31 October 2019.
is_after_october = df["date"] > datetime.date(2019, 10, 31)
int(is_after_october.sum())

947

In [9]:
# Index labels of all records dated after 31 October 2019.
is_after_october = df["date"] > datetime.date(2019, 10, 31)
nov_idxs = df.loc[is_after_october].index

In [10]:
# Drop the rows whose index labels are listed in `nov_idxs`.
# Rebinding `df` (rather than `inplace=True`) and passing `errors="ignore"` makes
# the cell safe to re-run: labels that are already gone are skipped instead of
# raising KeyError.
df = df.drop(index=nov_idxs, errors="ignore")
# Sanity check: (records still dated after 31 October 2019, remaining row count).
sum(df.date > datetime.date(2019, 10, 31)), len(df)

(0, 240329)

In [11]:
# Preview the first five rows after the filtering step.
df.head(n=5)

Unnamed: 0,id,authors,title,abstract,categories,date
0,704.0002,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-pe...",math.CO cs.CG,2007-04-01
1,704.0046,"I. Csiszar, F. Hiai and D. Petz",A limit relation for entropy and channel capac...,"In a quantum mechanical model, Diosi, Feldmann...",quant-ph cs.IT math.IT,2007-04-01
2,704.0047,T. Kosel and I. Grabec,Intelligent location of simultaneously active ...,The intelligent acoustic emission locator is d...,cs.NE cs.AI,2007-04-01
3,704.005,T. Kosel and I. Grabec,Intelligent location of simultaneously active ...,Part I describes an intelligent acoustic emiss...,cs.NE cs.AI,2007-04-01
4,704.0062,"Rastislav \v{S}r\'amek, Bro\v{n}a Brejov\'a, T...",On-line Viterbi Algorithm and Its Relationship...,"In this paper, we introduce the on-line Viterb...",cs.DS,2007-04-01


In [12]:
# Re-check the covered date range as an (earliest, latest) tuple.
dates = df["date"]
(dates.min(), dates.max())

(datetime.date(1993, 1, 1), datetime.date(2019, 10, 1))

In [13]:
# Remove the `date` and `authors` columns.
# Rebinding `df` with `errors='ignore'` (instead of `inplace=True`) keeps the cell
# re-runnable: a second execution no longer raises KeyError for columns that
# were already dropped.
df = df.drop(columns=['date', 'authors'], errors='ignore')
df.head()

Unnamed: 0,id,title,abstract,categories
0,704.0002,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-pe...",math.CO cs.CG
1,704.0046,A limit relation for entropy and channel capac...,"In a quantum mechanical model, Diosi, Feldmann...",quant-ph cs.IT math.IT
2,704.0047,Intelligent location of simultaneously active ...,The intelligent acoustic emission locator is d...,cs.NE cs.AI
3,704.005,Intelligent location of simultaneously active ...,Part I describes an intelligent acoustic emiss...,cs.NE cs.AI
4,704.0062,On-line Viterbi Algorithm and Its Relationship...,"In this paper, we introduce the on-line Viterb...",cs.DS


#### Let's take a look at duplicates

In [14]:
# Per-column summary (count / unique / top / freq); a `unique` value lower than
# `count` means the column contains repeated values.
df.describe()

Unnamed: 0,id,title,abstract,categories
count,240329,240329,240329,240329
unique,240329,240020,240098,20507
top,cs/0407013,Discussion: Latent variable graphical model se...,This paper has been withdrawn.,cs.CV
freq,1,4,10,19512


In [15]:
# Show the summary statistics for the `abstract` column only.
column_summary = df.describe()
column_summary["abstract"]

count                             240329
unique                            240098
top       This paper has been withdrawn.
freq                                  10
Name: abstract, dtype: object

#### There seem to be some duplicates, and some of them are withdrawn papers. Let's remove them.

In [16]:
# Count abstracts that contain "has been withdrawn", ignoring letter case.
# This is a broader match than the exact text "This paper has been withdrawn.".
# The cell only counts matches; it does not remove any rows.
mentions_withdrawal = df["abstract"].str.lower().str.contains("has been withdrawn")
sum(mentions_withdrawal)

142

In [17]:
# Number of rows whose abstract repeats an earlier row's abstract.
is_repeated_abstract = df["abstract"].duplicated()
int(is_repeated_abstract.sum())

231

In [18]:
# Display the rows whose abstract repeats an earlier row's abstract.
is_repeated_abstract = df["abstract"].duplicated()
df.loc[is_repeated_abstract]

Unnamed: 0,id,title,abstract,categories
564,0706.2732,A Design Methodology for Space-Time Adapter,This paper presents a solution to efficiently ...,cs.AR
1256,0709.4303,Security Analyses of One-time System,This paper has been withdrawn,cs.CR
1258,0709.4420,Confirmation of Shannon's Mistake about Perfec...,This paper has been withdrawn,cs.CR
1996,0712.0392,Collaborative Gain in Resource Sharing Communi...,This paper has been withdrawn,cs.IT math.IT
2989,0803.1207,Serious Flaws in Korf et al.'s Analysis on Tim...,This paper has been withdrawn.,cs.AI
...,...,...,...,...
239666,cs/0703086,A Technical Report On Grid Benchmarking using ...,"Grids include heterogeneous resources, which a...",cs.PF
239683,cs/0703103,Concept of a Value in Multilevel Security Data...,This paper has been withdrawn.,cs.DB
239826,cs/9609102,Cue Phrase Classification Using Machine Learning,Cue phrases may be used in a discourse sense t...,cs.AI
239932,cs/9809061,New Applications of the Incompressibility Meth...,The incompressibility method is an elementary ...,cs.CC cs.DM


#### Removing duplicates from title and abstract

In [19]:
# Keep only the first record for each distinct title, then re-check the summary.
df = df.drop_duplicates(subset=["title"], keep="first")
df.describe()

Unnamed: 0,id,title,abstract,categories
count,240020,240020,240020,240020
unique,240020,240020,239859,20501
top,cs/0407013,Using Discriminative Methods to Learn Fashion ...,This paper has been withdrawn.,cs.CV
freq,1,1,10,19500


In [20]:
# Keep only the first record for each distinct abstract, then re-check the summary.
# NOTE(review): this drops exact repeats only — the first occurrence of each
# repeated abstract text is kept. Confirm that is the intended handling of
# withdrawal notices.
df = df.drop_duplicates(subset=["abstract"], keep="first")
df.describe()

Unnamed: 0,id,title,abstract,categories
count,239859,239859,239859,239859
unique,239859,239859,239859,20491
top,cs/0407013,Using Discriminative Methods to Learn Fashion ...,To relax power consumption requirements in mul...,cs.CV
freq,1,1,1,19492


In [21]:
# Preview the first five rows of the de-duplicated frame.
df.head(n=5)

Unnamed: 0,id,title,abstract,categories
0,704.0002,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-pe...",math.CO cs.CG
1,704.0046,A limit relation for entropy and channel capac...,"In a quantum mechanical model, Diosi, Feldmann...",quant-ph cs.IT math.IT
2,704.0047,Intelligent location of simultaneously active ...,The intelligent acoustic emission locator is d...,cs.NE cs.AI
3,704.005,Intelligent location of simultaneously active ...,Part I describes an intelligent acoustic emiss...,cs.NE cs.AI
4,704.0062,On-line Viterbi Algorithm and Its Relationship...,"In this paper, we introduce the on-line Viterb...",cs.DS


In [22]:
# Write the cleaned frame as gzip-compressed JSON in record orientation.
# Original note: 93MB (presumably the size of the resulting file).
clean_json_path = DATA_CLEAN_PATH / "arxiv_cs.json.gz"
df.to_json(clean_json_path, orient='records', compression='gzip')

In [23]:
# Reload the cleaned dataset from disk to verify the round trip.
# Read `id` explicitly as text so numeric-looking arXiv identifiers
# (e.g. "0704.0050") can never be coerced to floats by dtype inference.
df = pd.read_json(DATA_CLEAN_PATH / "arxiv_cs.json.gz", dtype={"id": str})
df.head()

Unnamed: 0,id,title,abstract,categories
0,704.0002,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-pe...",math.CO cs.CG
1,704.0046,A limit relation for entropy and channel capac...,"In a quantum mechanical model, Diosi, Feldmann...",quant-ph cs.IT math.IT
2,704.0047,Intelligent location of simultaneously active ...,The intelligent acoustic emission locator is d...,cs.NE cs.AI
3,704.005,Intelligent location of simultaneously active ...,Part I describes an intelligent acoustic emiss...,cs.NE cs.AI
4,704.0062,On-line Viterbi Algorithm and Its Relationship...,"In this paper, we introduce the on-line Viterb...",cs.DS


In [24]:
# Number of rows in the reloaded frame.
df.shape[0]

239859

In [34]:
# Draw 100 rows at random; the fixed `random_state` makes the draw repeatable.
sample = df.sample(random_state=19, n=100)
sample.head(n=5)

Unnamed: 0,id,title,abstract,categories
143955,1801.00779,Machine Learning for Building Energy and Indoo...,Machine learning is a promising technique for ...,cs.CY cs.LG
216689,1908.02322,DpgMedia2019: A Dutch News Dataset for Partisa...,We present a new Dutch news dataset with label...,cs.CL
12202,1003.3767,Multi-Agent Simulation and Management Practices,Intelligent agents offer a new and exciting wa...,cs.AI cs.CE cs.MA
180354,1811.08039,Fenchel Lifted Networks: A Lagrange Relaxation...,Despite the recent successes of deep neural ne...,cs.LG stat.ML
131082,1708.02772,On Maximum Common Subgraph Problems in Series-...,The complexity of the maximum common connected...,cs.DS cs.CC


In [35]:
# Save the sample as CSV, without writing the index column.
sample_csv_path = DATA_CLEAN_PATH / "sample.csv"
sample.to_csv(sample_csv_path, index=False)

In [36]:
# Export the full cleaned frame as an uncompressed CSV, without the index column.
full_csv_path = DATA_TEST_PATH / "arxiv-cs.csv"
df.to_csv(full_csv_path, index=False)

In [37]:
# Export the full cleaned frame as a gzip-compressed CSV, without the index column.
full_csv_gz_path = DATA_TEST_PATH / "arxiv-cs.csv.gz"
df.to_csv(full_csv_gz_path, compression='gzip', index=False)