In [1]:
# In case the nltk package is not installed, execute the following:

#! pip install stopwords
#! pip install nltk
# import nltk
# nltk.download('stopwords')

In [2]:
import pandas as pd

# 0. Loading vanilla dataset as a dataframe

In [3]:
# Read the raw training set into a DataFrame (path is relative to the working directory).
train_df_raw = pd.read_json('data/train.json')

# Preview the first five rows (five is the default for head()).
train_df_raw.head()

Unnamed: 0,paperId,title,authorId,authorName,abstract,year,venue
0,0b341b6938308a6d5f47edf490f6e46eae3835fa,Detecting linguistic idiosyncratic interests i...,3188285,Masoud Rouhizadeh,Children with autism spectrum disorder often e...,2014,CLPsych@ACL
1,c682727ee058aadbe9dbf838dcb036322818f588,Bigrams and BiLSTMs Two Neural Networks for Se...,2782720,Yuri Bizzoni,We present and compare two alternative deep ne...,2018,Fig-Lang@NAACL-HLT
2,0f9b5b32229a7245e43754430c0c88f8e7f0d8af,In Factuality: Efficient Integration of Releva...,144748442,Peter Vickers,Visual Question Answering (VQA) methods aim at...,2021,ACL
3,7e8b4cfdc03b59ece2d6b33a217f0abd47f708d9,Variational Graph Autoencoding as Cheap Superv...,46331602,Irene Li,Coreference resolution over semantic graphs li...,2022,ACL
4,07588dd5d0252c7abc99b3834a81bf23741ead4b,LIMIT-BERT : Linguistics Informed Multi-Task BERT,30887404,Junru Zhou,"In this paper, we present Linguistics Informed...",2019,FINDINGS


# A. Frequency lists for abstracts and titles

### A.1 Creating an ordered list of most frequent filtered words in the abstracts

In [4]:
# Build a frequency-ordered list of the most common non-stopword words in the abstracts.

import json
import re
from collections import Counter
from nltk.corpus import stopwords

# Cutoff for the final dictionary: only words occurring MORE than this many times are kept.
# This can be altered.
ABSTRACT_MIN_COUNT = 1000

# Load the JSON file as a list of dictionaries (path is relative to the working directory).
# The `with` block closes the file handle; UTF-8 is stated explicitly so the read does not
# depend on the platform's default encoding.
with open('data/train.json', encoding='utf-8') as f:
    data = json.load(f)

# One cleaned string per record: keep only ASCII letters, digits and spaces, then lower-case.
# Change 'abstract' to 'title' to get the information about the titles.
# A missing/null field is treated as an empty string instead of making re.sub raise.
# NOTE: removed characters are dropped, not replaced by a space, so hyphenated or
# newline-separated words are glued together (e.g. "state-of-the-art" -> "stateoftheart").
X = [re.sub("[^a-zA-Z0-9 ]", "", item.get('abstract') or "").lower() for item in data]

# Flatten into a single list of whitespace-separated words
word_list = [word for line in X for word in line.split()]

# Remove common irrelevant (English stop) words; the words are already lower-case here
stop_words = set(stopwords.words('english'))
word_list = [w for w in word_list if w not in stop_words]

# Same content as word_list (a second stopword pass would remove nothing);
# the name is kept so anything referring to it still works.
filtered_sentence = list(word_list)

# Frequency dictionary; Counter keeps first-seen key order, so ties below sort as before
frequency_list = dict(Counter(filtered_sentence))

# Order by descending count, keep only words above the cutoff,
# then turn the keys into a list so that words can be referred to by index number
ordered = dict(sorted(frequency_list.items(), key=lambda item: item[1], reverse=True))
orderedDict_abstracts = {k: v for (k, v) in ordered.items() if v > ABSTRACT_MIN_COUNT}
orderedListAbstracts = list(orderedDict_abstracts)

### A.2 Creating an ordered list of most frequent filtered words in the titles

In [5]:
# Build a frequency-ordered list of the most common non-stopword words in the titles.

import json
import re
from collections import Counter
from nltk.corpus import stopwords

# Cutoff for the final dictionary: only words occurring MORE than this many times are kept.
# This can be altered.
TITLE_MIN_COUNT = 100

# Load the JSON file as a list of dictionaries (path is relative to the working directory).
# The `with` block closes the file handle; UTF-8 is stated explicitly so the read does not
# depend on the platform's default encoding.
with open('data/train.json', encoding='utf-8') as f:
    data = json.load(f)

# One cleaned string per record: keep only ASCII letters, digits and spaces, then lower-case.
# Change 'title' to 'abstract' to get the information about the abstracts.
# A missing/null field is treated as an empty string instead of making re.sub raise.
# NOTE: removed characters are dropped, not replaced by a space, so hyphenated
# words are glued together (e.g. "multi-task" -> "multitask").
X = [re.sub("[^a-zA-Z0-9 ]", "", item.get('title') or "").lower() for item in data]

# Flatten into a single list of whitespace-separated words
word_list = [word for line in X for word in line.split()]

# Remove common irrelevant (English stop) words; the words are already lower-case here
stop_words = set(stopwords.words('english'))
word_list = [w for w in word_list if w not in stop_words]

# Same content as word_list (a second stopword pass would remove nothing);
# the name is kept so anything referring to it still works.
filtered_sentence = list(word_list)

# Frequency dictionary; Counter keeps first-seen key order, so ties below sort as before
frequency_list = dict(Counter(filtered_sentence))

# Order by descending count, keep only words above the cutoff,
# then turn the keys into a list so that words can be referred to by index number
ordered = dict(sorted(frequency_list.items(), key=lambda item: item[1], reverse=True))
orderedDict_titles = {k: v for (k, v) in ordered.items() if v > TITLE_MIN_COUNT}
orderedListTitles = list(orderedDict_titles)

### A.3 Inspecting the results

In [6]:
# Vocabulary sizes of the two thresholded lists (titles first, then abstracts)
x, y = len(orderedListTitles), len(orderedListAbstracts)

# Show both sizes followed by the ten most frequent words of each list
print(x, y, orderedListTitles[:10], orderedListAbstracts[:10])

138 140 ['language', 'learning', 'neural', 'translation', 'using', 'task', 'machine', 'models', 'word', 'text'] ['model', 'models', 'language', 'task', 'data', 'paper', 'show', 'results', 'system', 'performance']


In [7]:
# Uncomment one of the lines below to display the full dictionary of word frequencies above its threshold
# orderedDict_titles
# orderedDict_abstracts