In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json


**DATA LOADING**

**Installing essential libraries to load the ARXIV Kaggle dataset**

In [5]:
import numpy as np
import pandas as pd
import json
from collections import Counter

**Converting JSON file to CSV file**

In [6]:
#Get a list of dicts and convert into a pandas dataframe
arxiv_data = []
for line in open('/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json', 'r'):
    arxiv_data.append(json.loads(line))
df = pd.DataFrame.from_records(arxiv_data)

In [7]:
#Filter rows as required and save the resulting df for later use. Here I'm extracting records with arxiv papers having cs.CL as one of the categories.
drop_idx = []
for index, row in df.iterrows():
    if 'cs.CL' not in row['categories']:
        drop_idx.append(index)
df = df.drop(drop_idx).reset_index(drop=True)
df.to_csv('arxiv_cs-CL.csv', index=False)

In [8]:
df.sample(3)

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed
19667,2009.02554,Matthew Berger,Matthew Berger,Visually Analyzing Contextualized Embeddings,"IEEE Vis 2020, Observable notebook demo at\n ...",,,,cs.HC cs.CL,http://arxiv.org/licenses/nonexclusive-distrib...,In this paper we introduce a method for visu...,"[{'version': 'v1', 'created': 'Sat, 5 Sep 2020...",2020-09-08,"[[Berger, Matthew, ]]"
15657,2001.03712,Geondo Park,"Geondo Park, Chihye Han, Wonjun Yoon, Daeshik Kim",MHSAN: Multi-Head Self-Attention Network for V...,Accepted by the 2020 IEEE Winter Conference on...,,,,cs.CV cs.CL cs.LG,http://arxiv.org/licenses/nonexclusive-distrib...,Visual-semantic embedding enables various ta...,"[{'version': 'v1', 'created': 'Sat, 11 Jan 202...",2020-01-14,"[[Park, Geondo, ], [Han, Chihye, ], [Yoon, Won..."
8048,1806.07699,Vivian Silva,"Vivian S. Silva, Andr\'e Freitas, Siegfried Ha...",Word Tagging with Foundational Ontology Classe...,"13 pages, 1 figure, presented at EKAW 2016",Proceedings of the 20th International Conferen...,10.1007/978-3-319-49004-5_38,,cs.CL,http://arxiv.org/licenses/nonexclusive-distrib...,Semantic annotation is fundamental to deal w...,"[{'version': 'v1', 'created': 'Wed, 20 Jun 201...",2018-06-21,"[[Silva, Vivian S., ], [Freitas, André, ], [Ha..."


**DATA WRANGLING & MANIPULATION**

In [9]:
for col in df.columns:
    print(col)

id
submitter
authors
title
comments
journal-ref
doi
report-no
categories
license
abstract
versions
update_date
authors_parsed


**Dropping unnecessary columns**

In [10]:
new_df = df[['title', 'abstract', 'update_date']]
new_df.head(5)

Unnamed: 0,title,abstract,update_date
0,Introduction to Arabic Speech Recognition Usin...,In this paper Arabic was investigated from t...,2007-05-23
1,Arabic Speech Recognition System using CMU-Sph...,In this paper we present the creation of an ...,2007-05-23
2,An Automated Evaluation Metric for Chinese Tex...,"In this paper, we propose an automated evalu...",2013-10-29
3,On the Development of Text Input Method - Less...,Intelligent Input Methods (IM) are essential...,2007-05-23
4,Network statistics on early English Syntax: St...,This paper includes a reflection on the role...,2007-05-23


**Extracting years from update_date**

In [11]:
new_df['years'] = new_df['update_date'].str[:4]
new_df.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,title,abstract,update_date,years
0,Introduction to Arabic Speech Recognition Usin...,In this paper Arabic was investigated from t...,2007-05-23,2007
1,Arabic Speech Recognition System using CMU-Sph...,In this paper we present the creation of an ...,2007-05-23,2007
2,An Automated Evaluation Metric for Chinese Tex...,"In this paper, we propose an automated evalu...",2013-10-29,2013
3,On the Development of Text Input Method - Less...,Intelligent Input Methods (IM) are essential...,2007-05-23,2007
4,Network statistics on early English Syntax: St...,This paper includes a reflection on the role...,2007-05-23,2007


In [12]:
new_df['years'].value_counts()

2022    10735
2021     8144
2020     6684
2019     4911
2018     3453
2017     2207
2016     1630
2008      727
2015      726
2007      637
2014      492
2013      293
2012      200
2009      132
2011      103
2010       78
Name: years, dtype: int64

In [13]:
new_df['years'] = pd.to_numeric(new_df['years'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


**Deleting unnecessary years**

In [14]:
index_year = new_df[(new_df['years'] >= 2007) & (new_df['years'] <= 2017)].index
new_df.drop(index_year, inplace=True)
new_df.head(5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,title,abstract,update_date,years
52,Valence extraction using EM selection and co-o...,This paper discusses two new procedures for ...,2020-03-11,2020
116,On the Vocabulary of Grammar-Based Codes and t...,The article presents a new interpretation fo...,2020-03-11,2020
126,The Latent Relation Mapping Engine: Algorithm ...,Many AI researchers and cognitive scientists...,2020-08-20,2020
147,The Modular Audio Recognition Framework (MARF)...,MARF is an open-source research platform and...,2019-08-14,2019
246,Les entit\'es spatiales dans la langue : \'etu...,While previous linguistic and psycholinguist...,2020-07-17,2020


**Checking...**

In [15]:
new_df['years'].value_counts()

2022    10735
2021     8144
2020     6684
2019     4911
2018     3453
Name: years, dtype: int64

In [16]:
new_df = new_df.drop(['update_date'], axis=1)

In [17]:
new_df.head(5)

Unnamed: 0,title,abstract,years
52,Valence extraction using EM selection and co-o...,This paper discusses two new procedures for ...,2020
116,On the Vocabulary of Grammar-Based Codes and t...,The article presents a new interpretation fo...,2020
126,The Latent Relation Mapping Engine: Algorithm ...,Many AI researchers and cognitive scientists...,2020
147,The Modular Audio Recognition Framework (MARF)...,MARF is an open-source research platform and...,2019
246,Les entit\'es spatiales dans la langue : \'etu...,While previous linguistic and psycholinguist...,2020


In [18]:
new_df.dtypes

title       object
abstract    object
years        int64
dtype: object

**Converting object datatype of dataframe columns to string**

In [19]:
new_df['title'] = new_df['title'].astype("string")
new_df['abstract'] = new_df['abstract'].astype("string")

**Checking...**

In [20]:
new_df.dtypes

title       string
abstract    string
years        int64
dtype: object

**Combining title+abstract texts for corpus creation**

In [21]:
new_df['text'] = new_df['title'].astype(str) +" "+ new_df['abstract'].astype(str)

In [22]:
new_df.head(5)

Unnamed: 0,title,abstract,years,text
52,Valence extraction using EM selection and co-o...,This paper discusses two new procedures for ...,2020,Valence extraction using EM selection and co-o...
116,On the Vocabulary of Grammar-Based Codes and t...,The article presents a new interpretation fo...,2020,On the Vocabulary of Grammar-Based Codes and t...
126,The Latent Relation Mapping Engine: Algorithm ...,Many AI researchers and cognitive scientists...,2020,The Latent Relation Mapping Engine: Algorithm ...
147,The Modular Audio Recognition Framework (MARF)...,MARF is an open-source research platform and...,2019,The Modular Audio Recognition Framework (MARF)...
246,Les entit\'es spatiales dans la langue : \'etu...,While previous linguistic and psycholinguist...,2020,Les entit\'es spatiales dans la langue : \'etu...


In [23]:
new_df = new_df.drop(['title', 'abstract'], axis=1)
new_df.head(5)

Unnamed: 0,years,text
52,2020,Valence extraction using EM selection and co-o...
116,2020,On the Vocabulary of Grammar-Based Codes and t...
126,2020,The Latent Relation Mapping Engine: Algorithm ...
147,2019,The Modular Audio Recognition Framework (MARF)...
246,2020,Les entit\'es spatiales dans la langue : \'etu...


**CORPUS PRE-PROCESSING**

**STEP 1: Punctuation removal**

In [24]:
import string
string.punctuation

#function that removes punctuations
def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree

new_df['text']= new_df['text'].apply(lambda x:remove_punctuation(x))
new_df.head(5)

Unnamed: 0,years,text
52,2020,Valence extraction using EM selection and cooc...
116,2020,On the Vocabulary of GrammarBased Codes and th...
126,2020,The Latent Relation Mapping Engine Algorithm a...
147,2019,The Modular Audio Recognition Framework MARF a...
246,2020,Les entites spatiales dans la langue etude de...


**STEP 2: Converting all text into lower-case characters**

In [25]:
new_df['text'] = new_df['text'].apply(lambda x: x.lower())
new_df.head(5)

Unnamed: 0,years,text
52,2020,valence extraction using em selection and cooc...
116,2020,on the vocabulary of grammarbased codes and th...
126,2020,the latent relation mapping engine algorithm a...
147,2019,the modular audio recognition framework marf a...
246,2020,les entites spatiales dans la langue etude de...


In [23]:
new_df.to_csv('corpus.csv')