In [1]:
# In case the nltk package is not installed, execute te following:

#! pip install stopwords
#! pip install nltk
# import nltk
# nltk.download('stopwords')

In [2]:
import pandas as pd
import numpy as np
import pickle
from mlxtend.frequent_patterns import apriori, association_rules

# 0. Loading vanilla dataset as a dataframe

In [3]:
train_df_raw = pd.read_json('data/train.json')
train_df_raw.head(5)

Unnamed: 0,paperId,title,authorId,authorName,abstract,year,venue
0,0b341b6938308a6d5f47edf490f6e46eae3835fa,Detecting linguistic idiosyncratic interests i...,3188285,Masoud Rouhizadeh,Children with autism spectrum disorder often e...,2014,CLPsych@ACL
1,c682727ee058aadbe9dbf838dcb036322818f588,Bigrams and BiLSTMs Two Neural Networks for Se...,2782720,Yuri Bizzoni,We present and compare two alternative deep ne...,2018,Fig-Lang@NAACL-HLT
2,0f9b5b32229a7245e43754430c0c88f8e7f0d8af,In Factuality: Efficient Integration of Releva...,144748442,Peter Vickers,Visual Question Answering (VQA) methods aim at...,2021,ACL
3,7e8b4cfdc03b59ece2d6b33a217f0abd47f708d9,Variational Graph Autoencoding as Cheap Superv...,46331602,Irene Li,Coreference resolution over semantic graphs li...,2022,ACL
4,07588dd5d0252c7abc99b3834a81bf23741ead4b,LIMIT-BERT : Linguistics Informed Multi-Task BERT,30887404,Junru Zhou,"In this paper, we present Linguistics Informed...",2019,FINDINGS


# A. Frequency lists for abstracts and titles

### A.1 Creating an ordered list of most frequent filtered words in the abstracts

In [4]:
#Creating an ordered list of most frequent filtered words in the abstracts

import json
import re
from nltk.corpus import stopwords

# Opening JSON file, and returning the object as a list of dictionaries. Reminder: it's loading from my local path.
f = open('data/train.json',)
data = json.load(f)

# Creating a list with all the abstracts in it
# Also cleaning everything into lower case and only alphanumerical
# Change the 'abstract' to 'title' to get the information about the titles
X = []
for item in data:
    abstract = item.get('abstract')
    abstract = re.sub("[^a-zA-Z0-9 ]","",abstract)
    X.append(abstract.lower())

# Creating a list of all the words 
word_list = [word for line in X for word in line.split()]    

# Removing common irrelevant words from the word_list
stop_words = set(stopwords.words('english'))
word_list = [w for w in word_list if not w.lower() in stop_words]

filtered_sentence = []
  
for w in word_list:
    if w not in stop_words:
        filtered_sentence.append(w)

# Turning that into a frequency dictionary
frequency_list = {}
for word in filtered_sentence:
    if word not in frequency_list:
        frequency_list[word] = 0
    frequency_list[word] += 1

# And into an ordered dictionary, ordered on the frequency count
# The dictionary is currently limited to words which occur 1.000 times or more. This can be altered.
# This is then turnt into a list, so that we can refer to indexnumbers for variables
ordered = dict(sorted(frequency_list.items(), key=lambda item: item[1],reverse=True))
orderedDict_abstracts = {k:v for (k,v) in ordered.items() if v > 1000}
orderedListAbstracts = []
for item in orderedDict_abstracts:
    orderedListAbstracts.append(item)

### A.2 Creating an ordered list of most frequent filtered words in the Titles

In [5]:
#Creating an ordered list of most frequent filtered words in the Titles

import json
import re
from nltk.corpus import stopwords

# Opening JSON file, and returning the object as a list of dictionaries. Reminder: it's loading from my local path.
f = open('data/train.json',)
data = json.load(f)

# Creating a list with all the titles in it
# Also cleaning everything into lower case and only alphanumerical
# Change the 'title' to 'abstract' to get the information about the abstracts
X = []
for item in data:
    title = item.get('title')
    title = re.sub("[^a-zA-Z0-9 ]","",title)
    X.append(title.lower())

# Creating a list of all the words 
word_list = [word for line in X for word in line.split()]    

# Removing common irrelevant words from the word_list
stop_words = set(stopwords.words('english'))
word_list = [w for w in word_list if not w.lower() in stop_words]

filtered_sentence = []
  
for w in word_list:
    if w not in stop_words:
        filtered_sentence.append(w)

# Turning that into a frequency dictionary
frequency_list = {}
for word in filtered_sentence:
    if word not in frequency_list:
        frequency_list[word] = 0
    frequency_list[word] += 1

# And into an ordered dictionary, ordered on the frequency count
# The dictionary is currently limited to words which occur 100 times or more. This can be altered.
# This is then turnt into a list, so that we can refer to indexnumbers for variables
ordered = dict(sorted(frequency_list.items(), key=lambda item: item[1],reverse=True))
orderedDict_titles = {k:v for (k,v) in ordered.items() if v > 100}
orderedListTitles = []
for item in orderedDict_titles:
    orderedListTitles.append(item)

### A.3 Inspecting the results

In [6]:
x = len(orderedListTitles)
y = len(orderedListAbstracts)

print(x, y, orderedListTitles[0:10], orderedListAbstracts[0:10])

138 140 ['language', 'learning', 'neural', 'translation', 'using', 'task', 'machine', 'models', 'word', 'text'] ['model', 'models', 'language', 'task', 'data', 'paper', 'show', 'results', 'system', 'performance']


In [7]:
# Full dictionaries of word frequencies over determined thresholds
# orderedDict_titles
# orderedDict_abstracts

# B. Frequency vectors for titles and abstracts and for each paper

__Important__: Note that the list of keywords for both abstract and titles employed in this section are the same as those defined in the previous one (orderedListTitles and orderedListAbstracts) so modifying those above and then re-running the code below will produce the obtention of different outcomes.

Two dictionaries are created to store the number of appearences of a given keyword in titles and abstracts respectively. Subsequently a function is defined to "reset" this dictionary given that such action will have to be carried out for each instance of the dataset. (Each dictionary is meant to store ocurrences of abstract and titles of a single instance, the reseted and go on to the next one)

In [8]:
#zero_titles = np.zeros(len(orderedListTitles))
freq_counter_title = {}
freq_counter_abstract = {}

#frequency dictionary to store appearences of keyowrds (titles)
def reset_freq_vectors():
    '''
    Function to reset the values of the frequency dictionaries that generate the vectors for titles and abstracts
    '''
    for i in orderedListTitles:
        freq_counter_title[i] = 0
    for i in orderedListAbstracts:
        freq_counter_abstract[i] = 0
    #print("freq. vectors succesfully reseted")

reset_freq_vectors()

### B.1 Vectors of absolute frequency of keywords

Now two new dictionaries are created to store the respective vectors of each instance. Then the loop will look for and then count occurrences of keywords for both abstracts and titles.

In [9]:
# These two dictionaries will contain one vector for each instance of the set (title and abstract).
all_title_freq_vectors = {}
all_abstract_freq_vectors = {}


# Looping through the entire dataset
for i in range(12129):
    
    if i % 1000 == 0:
        print(f"Current iteration: {i}")
    
    # Comparing titles and titles keywords to check for coincidences and their frequency
    title = train_df_raw.iloc[i]['title'].lower()
    
    for a in orderedListTitles:
        for e in train_df_raw.iloc[i]['title'].lower().split():
            if a == e:
                freq_counter_title[a] += 1
                
    all_title_freq_vectors[i] = np.array(list(freq_counter_title.values()))
    
    
    # Comparing abstracts and abstracts keywords to check for coincidences and their frequency
    abstract = train_df_raw.iloc[i]['abstract'].lower()  
    
    for a in orderedListAbstracts:
        for e in train_df_raw.iloc[i]['abstract'].lower().split():
            if a == e:
                freq_counter_abstract[a] += 1
                
    all_abstract_freq_vectors[i] = np.array(list(freq_counter_abstract.values()))
    
    reset_freq_vectors()

Current iteration: 0
Current iteration: 1000
Current iteration: 2000
Current iteration: 3000
Current iteration: 4000
Current iteration: 5000
Current iteration: 6000
Current iteration: 7000
Current iteration: 8000
Current iteration: 9000
Current iteration: 10000
Current iteration: 11000
Current iteration: 12000


### B.2 Binary vectors of presence of keywords

Instead of iterating through the whole dataset again, the frequency vectors for titles and abstract are taken and all non zero values are converted ones while respecting the existing structure.

In [10]:
all_title_binary_vectors = {}
all_abstract_binary_vectors = {}

for i in range(12129):
    
    if i % 1000 == 0:
        print(f"Current iteration: {i}")
    
    transitional_title = []
    
    element_a = all_title_freq_vectors.get(i)
    for e in element_a:
        if e != 0:
            transitional_title.append(1)
        else:
            transitional_title.append(0)
    all_title_binary_vectors[i] = np.array(transitional_title)
    
    transitional_abstract = []
    
    element_a = all_abstract_freq_vectors.get(i)
    for e in element_a:
        if e != 0:
            transitional_abstract.append(1)
        else:
            transitional_abstract.append(0)
    all_abstract_binary_vectors[i] = np.array(transitional_abstract)       

Current iteration: 0
Current iteration: 1000
Current iteration: 2000
Current iteration: 3000
Current iteration: 4000
Current iteration: 5000
Current iteration: 6000
Current iteration: 7000
Current iteration: 8000
Current iteration: 9000
Current iteration: 10000
Current iteration: 11000
Current iteration: 12000


### B.3 Saving (and then loading) the relevant dictionaries as pickle files 

Given the relatively high cost of executing the code above, the produced dictionaries containing the vectors are saved as pickle files and then re-loaded (with different names) for subsequent use in the coming sections if needed.

Code from: https://stackoverflow.com/questions/11218477/how-can-i-use-pickle-to-save-a-dict-or-any-other-python-object

In [11]:
###############
#Saving data
###############

with open('data/freq_titles.pickle', 'wb') as fp:
    pickle.dump(all_title_freq_vectors, fp)
    
with open('data/freq_abstracts.pickle', 'wb') as fp:
    pickle.dump(all_abstract_freq_vectors, fp)
    
with open('data/binary_titles.pickle', 'wb') as fp:
    pickle.dump(all_title_binary_vectors, fp)
    
with open('data/binary_abstracts.pickle', 'wb') as fp:
    pickle.dump(all_abstract_binary_vectors, fp)    

    
    
###############
#Loading data
###############    
        
with open('data/freq_titles.pickle', 'rb') as handle:
    freq_vector_title = pickle.load(handle)
    
with open('data/freq_abstracts.pickle', 'rb') as handle:
    freq_vector_abstract = pickle.load(handle)
    
with open('data/binary_titles.pickle', 'rb') as handle:
    binary_vector_title = pickle.load(handle)
    
with open('data/binary_abstracts.pickle', 'rb') as handle:
    binary_vector_abstract = pickle.load(handle)

### B.4 Including frequency vectors in raw df (abtracs absolute frequency in this case)

Creating a dictionary to store the values of the abstarct frequency vectors by keyword so the vectors can be added as one hot econdoed variables to the training dataframe

In [12]:
one_hot_dict = {}
for i in range(len(freq_vector_abstract.get(0))):

    circumstantial_list = []
    for e in freq_vector_abstract.keys():
        circumstantial_list.append(freq_vector_abstract.get(e)[i])
    one_hot_dict[i] = circumstantial_list

Creating a dictionary to store the values of the abstarct binary vectors by keyword so the vectors can be added as one hot econdoed variables to the training dataframe

In [13]:
one_hot_dict_binary = {}
for i in range(len(binary_vector_abstract.get(0))):

    circumstantial_list = []
    for e in binary_vector_abstract.keys():
        circumstantial_list.append(binary_vector_abstract.get(e)[i])
    one_hot_dict_binary[i] = circumstantial_list

Adding the new features to a copy of the original dataframe and then dropping all other features (Some features deleted in the process for observational simplicity)

In [14]:
one_hot_abstract_df = train_df_raw.copy()

for i in range(len(freq_vector_abstract.get(0))):
    one_hot_abstract_df[orderedListAbstracts[i]] = one_hot_dict_binary.get(i)

  one_hot_abstract_df[orderedListAbstracts[i]] = one_hot_dict_binary.get(i)


In [15]:
one_hot_abstract_df.drop(['title', 'abstract', 'venue', 'authorName'], axis=1)

Unnamed: 0,paperId,authorId,year,model,models,language,task,data,paper,show,...,learn,various,experimental,understanding,automatically,often,discourse,prediction,significantly,standard
0,0b341b6938308a6d5f47edf490f6e46eae3835fa,3188285,2014,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,c682727ee058aadbe9dbf838dcb036322818f588,2782720,2018,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0f9b5b32229a7245e43754430c0c88f8e7f0d8af,144748442,2021,1,1,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
3,7e8b4cfdc03b59ece2d6b33a217f0abd47f708d9,46331602,2022,1,0,0,0,1,0,1,...,0,0,0,0,1,0,0,0,1,0
4,07588dd5d0252c7abc99b3834a81bf23741ead4b,30887404,2019,1,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12124,5868a7bfe6a4590d332ca66b8097dbe5490c8a73,2001128224,2020,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
12125,6fbfa7138235b99df43391bff3917b85393c3ca1,3209288,2016,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12126,7226d14e6dea73dfad521256248ec2b19ae66ad8,144254013,2021,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12127,cb0f3ee1e98faf92429d601cdcd76c69c1e484eb,46236380,2018,0,1,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0


### B.5 Employing association rules (apriori algorithm) to find the most common combinations of words

In [16]:
assoc_rule_df = one_hot_abstract_df.copy()
assoc_rule_df_lean = assoc_rule_df.drop(['paperId','title','authorId','authorName','abstract','year','venue'], axis=1)




In [17]:
assoc_show = apriori(assoc_rule_df_lean, min_support = 0.05, use_colnames = True, verbose = 1)



Processing 200 combinations | Sampling itemset size 4 3


In [18]:
pd.set_option('display.max_rows', assoc_show.shape[0]+1)


assoc_show

Unnamed: 0,support,itemsets
0,0.372001,(model)
1,0.304724,(models)
2,0.344381,(language)
3,0.259461,(task)
4,0.22912,(data)
5,0.226152,(paper)
6,0.350565,(show)
7,0.29244,(results)
8,0.182373,(system)
9,0.22541,(performance)


In [19]:
assoc_rule_df_lean

Unnamed: 0,model,models,language,task,data,paper,show,results,system,performance,...,learn,various,experimental,understanding,automatically,often,discourse,prediction,significantly,standard
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,1,0,0,0,1,0,1,0,0,1,...,0,0,0,0,1,0,0,0,1,0
4,1,0,1,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12124,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
12125,0,0,0,1,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
12126,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
12127,0,1,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


### B.extra  Obtaining the mean vector for each author

In [21]:
# Adding frequency vectors to train dataset

train_df_raw["key_freq_abs"] = freq_vector_abstract.values()
train_df_raw["key_freq_title"] = freq_vector_title.values()

In [22]:
#ABSTRACTS

#dictionary -> author : average_vector_abstract
author_vector_abs = {}

for i in train_df_raw['authorName'].unique():
    
    end_vec = np.zeros(len(freq_vector_abstract.get(0)))
    array_list = []
    
    for e in (train_df_raw[train_df_raw["authorName"] == i].index):
        
        array_list.append(freq_vector_abstract[e]) 
    author_vector_abs[i] = sum(array_list)/len(array_list)
        
#TITLES

#dictionary -> author : average_vector_title
author_vector_title = {}

for i in train_df_raw['authorName'].unique():
    
    end_vec = np.zeros(len(freq_vector_title.get(0)))
    array_list = []
    
    for e in (train_df_raw[train_df_raw["authorName"] == i].index):
        
        
        array_list.append(freq_vector_title[e]) 
    author_vector_title[i] = sum(array_list)/len(array_list)

In [23]:
# Dictionaries with the "mean keywords vectors" for both abstracts and tiles of each author

#author_vector_abs 
#author_vector_title

In [24]:
author_vector_abs.get("Alex Warstadt")

array([0.33333333, 3.66666667, 0.33333333, 0.        , 0.66666667,
       0.33333333, 0.        , 0.        , 0.        , 0.33333333,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 1.        , 0.        ,
       0.        , 0.66666667, 0.        , 0.33333333, 0.        ,
       0.33333333, 0.        , 0.66666667, 0.33333333, 0.33333333,
       0.        , 0.        , 0.        , 1.        , 1.        ,
       0.33333333, 1.66666667, 0.        , 0.        , 0.        ,
       0.        , 0.66666667, 0.        , 0.        , 0.        ,
       0.66666667, 0.        , 0.        , 0.33333333, 0.        ,
       0.        , 0.        , 0.        , 0.33333333, 0.        ,
       0.33333333, 0.33333333, 0.66666667, 0.        , 0.        ,
       0.        , 0.        , 0.33333333, 0.33333333, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.33333333, 0.33333333, 0.     

In [25]:
train_df_raw[train_df_raw["authorName"] == "Alex Warstadt"]

Unnamed: 0,paperId,title,authorId,authorName,abstract,year,venue,key_freq_abs,key_freq_title
7486,055fac05cd424e7b1bdcd359ff7980ca8d938ef3,Learning Which Features Matter: RoBERTa Acquir...,46236380,Alex Warstadt,One reason pretraining on self-supervised ling...,2020,EMNLP,"[1, 5, 1, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
10196,3cd331c997e90f737810aad6fcce4d993315189f,Investigating BERT’s Knowledge of Language: Fi...,46236380,Alex Warstadt,Though state-of-the-art sentence representatio...,2019,EMNLP,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
12127,cb0f3ee1e98faf92429d601cdcd76c69c1e484eb,Neural Network Acceptability Judgments,46236380,Alex Warstadt,Abstract This paper investigates the ability o...,2018,Transactions of the Association for Computatio...,"[0, 5, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
