In [1]:
import pickle

In [2]:
import os
data = {}
base_folder = '/scratch/datasets/mog29/unarXive'
filenames = ['paper_to_section_metadata_0_23.pkl', 'paper_to_section_metadata_23_29.pkl',
            'paper_to_section_metadata_29_32.pkl']

for filename in filenames:
    with open(os.path.join(base_folder, filename), 'rb') as f:
        curr_data = pickle.load(f)
    for key, val in curr_data.items():
        data[key] = val

VALID_DISCIPLINES = ["cs.AI", "cs.CL", "cs.CV", "cs.LG", "stat.ML"]

In [3]:
from collections import Counter

def process_section_name(input_text):
    lowercase_text = input_text.lower()
    replaced_text = lowercase_text.replace('\n', ' ')
    return replaced_text

def get_most_common_section_names(data, category_list):
    name_to_count = Counter()
    num_to_count = Counter()
    for _, metadata_dict in data.items():
        # Choose whether to filter the paper
        if category_list is not None:
            paper_cats = metadata_dict['categories']
            found_match = any([cat in category_list for cat in paper_cats])
            if not found_match:
                continue
        
        section_name_set = set()
        section_num_set = set()
        for section_name, section_num in metadata_dict['name_number_pairs']:
            if section_name is None or section_num is None:
                continue

            proc_section_name = process_section_name(section_name)
            section_name_set.add(proc_section_name)
            section_num_set.add(section_num)

        for section_name in section_name_set:
            name_to_count[section_name] += 1
        for section_num in section_num_set:
            num_to_count[section_num] += 1
            
    return name_to_count, num_to_count
            

In [20]:
" ".split('.')[:1]

[' ']

In [6]:
name_to_count, num_to_count = get_most_common_section_names(data, VALID_DISCIPLINES)
sorted_name_to_count = sorted(list(name_to_count.items()), reverse=True, key=lambda x: x[1])
print(sorted_name_to_count[:80])

[('introduction', 200458), ('conclusion', 104881), ('related work', 75358), ('experiments', 50217), ('conclusions', 31980), ('results', 31037), ('acknowledgements', 28743), ('discussion', 26693), ('acknowledgments', 26122), ('datasets', 18744), ('implementation details', 18024), ('experimental setup', 14077), ('experimental results', 13574), ('ablation study', 13368), ('method', 12761), ('methodology', 12373), ('acknowledgment', 11671), ('preliminaries', 11226), ('dataset', 11015), ('conclusion and future work', 10680), ('background', 10552), ('acknowledgement', 9782), ('related works', 9780), ('evaluation', 9343), ('problem formulation', 8181), ('evaluation metrics', 7823), ('methods', 7462), ('baselines', 6751), ('training', 6027), ('data', 5986), ('overview', 5891), ('conclusions and future work', 5808), ('proof of theorem ', 5562), ('results and discussion', 5470), ('proposed method', 5068), ('proof of theorem\xa0', 4889), ('ablation studies', 4811), ('implementation', 4560), ('mai

In [18]:
def get_most_common_parent_section_names(data, category_list):
    name_to_count = Counter()
    for _, metadata_dict in data.items():
        # Choose whether to filter the paper
        if category_list is not None:
            paper_cats = metadata_dict['categories']
            found_match = any([cat in category_list for cat in paper_cats])
            if not found_match:
                continue
        
        section_name_set = set()
        for section_name, section_num in metadata_dict['name_number_pairs']:
            if section_name is None or section_num is None:
                continue
                
            section_subsection = section_num.split('.')
            singleton = len(section_subsection) == 1
            hidden_singleton = len(section_subsection) == 2 and section_subsection[1] == ''
            if not (singleton or hidden_singleton):
                continue
                
            proc_section_name = process_section_name(section_name)
            section_name_set.add(proc_section_name)

        for section_name in section_name_set:
            name_to_count[section_name] += 1
            
    return name_to_count

In [19]:
name_to_count = get_most_common_parent_section_names(data, VALID_DISCIPLINES)
sorted_name_to_count = sorted(list(name_to_count.items()), reverse=True, key=lambda x: x[1])
print(sorted_name_to_count[:80])

[('introduction', 200375), ('conclusion', 104428), ('related work', 70740), ('experiments', 49311), ('conclusions', 31828), ('acknowledgements', 28067), ('acknowledgments', 25659), ('discussion', 24018), ('results', 21436), ('method', 12401), ('methodology', 11872), ('acknowledgment', 11604), ('experimental results', 11526), ('conclusion and future work', 10647), ('implementation details', 9816), ('datasets', 9742), ('background', 9674), ('acknowledgement', 9578), ('preliminaries', 9476), ('related works', 9040), ('experimental setup', 8583), ('evaluation', 7002), ('methods', 6950), ('dataset', 6521), ('ablation study', 6285), ('problem formulation', 6183), ('conclusions and future work', 5789), ('proposed method', 4927), ('results and discussion', 4578), ('evaluation metrics', 4446), ('appendix', 4327), ('experiments and results', 4311), ('approach', 4156), ('overview', 3572), ('data', 3428), ('experiment', 3413), ('discussion and conclusion', 3121), ('limitations', 3069), ('proof of 

In [24]:
def get_most_common_section_after(data, target_name, category_list):
    name_to_count = Counter()
    for _, metadata_dict in data.items():
        # Choose whether to filter the paper
        if category_list is not None:
            paper_cats = metadata_dict['categories']
            found_match = any([cat in category_list for cat in paper_cats])
            if not found_match:
                continue
        
        all_pairs = sorted(list(set(metadata_dict['name_number_pairs'])), key=lambda x: x[1])

        section_idx = -1
        curr_val = -1
        for curr_name, curr_num in all_pairs:
            curr_val += 1
            if curr_name is None: 
                continue

            proc_curr_name = process_section_name(curr_name)

            if target_name in proc_curr_name:
                section_idx = curr_val
                break
                
        if section_idx != -1 and section_idx != len(all_pairs) - 1:
            next_section = all_pairs[section_idx + 1][0]
            if next_section is None:
                continue
            
            proc_section_name = process_section_name(next_section)
            name_to_count[proc_section_name] += 1
            
    return name_to_count

In [28]:
name_to_count = get_most_common_section_after(data, 'background', VALID_DISCIPLINES)
sorted_name_to_count = sorted(list(name_to_count.items()), reverse=True, key=lambda x: x[1])
print(sorted_name_to_count[:40])

[('a simple example', 1), ('programming languages as analogues, not metaphors', 1), ('ambiguity resolution', 1), ('overview of compere', 1), ('abstraction and focus', 1), ('extending the clustering algorithm to {{formula:738dbf0c-1e0c-4154-be09-3fc7e02b91e0}} -grams', 1), ('a family of categorial calculi and their linguistic applications', 1), ('relating project internal and comparative assessment', 1), ('the knowledge grapher', 1), ('test data maintenance and retrieval', 1), ('an overview of the thesis', 1), ('the linguistic data', 1), ('code format', 1), ('adaptive clustering', 1), ('the grammar (set)', 1), ('the problem', 1), ('two-layered architecture', 1), ('relations between discourse segments', 1), ('optimality theory', 1), ('purpose factors', 1), ('shallow parsers with hand-written rules', 1), ('a time-local index', 1), ('linear combining of biased classifiers', 1), ('results', 1), ('the corresponding decision problems', 1), ('distributed artificial intelligence', 1), ('formal 

In [3]:
keys = list(data.keys())
num_keys = len(keys)

In [20]:
import random
random_paper = keys[random.randint(0, num_keys-1)]
print(f"Paper categories: {data[random_paper]['categories']}")
print(f"Paper section names: {set(data[random_paper]['name_number_pairs'])}")

Paper categories: ['quant-ph']
Paper section names: {('Adiabatic-transfer state', '2'), ('Late-time eigenvalues', '2'), ('The case of vanishing {{formula:ebdacfec-a5be-4f37-b322-8d5813f7687b}}  and {{formula:b0bd1f31-ebbc-448f-b495-37596914250a}}', '4'), ('Examples', '3'), ('Condition for a zero eigenvalue', '1'), ('Connectivity', '3'), ('Basic STIRAP', '1'), ('Arbitrary couplings', '3'), ('Arbitrary couplings', '2'), ('Proportional couplings', '2'), ('Proportional couplings', '1'), ('Introduction', '1'), ('Early-time eigenvalues', '1'), ('Nonzero eigenvalue', '2'), ('Acknowledgments', '-1'), ('The system', '1'), ('The case of vanishing {{formula:bacdc82a-fee2-4e90-89bb-d1603170b333}} , {{formula:1de67265-7c47-4d9f-bd12-b4d84c8638c1}} , and {{formula:24dea737-caf3-45b7-970c-33e3e5b1e5f9}}', '4'), ('The off-resonance case', '1'), ('Discussion and conclusions', '7'), ('The off-resonance case', '3'), ('Connectivity and AT condition', '3'), ('Degenerate resonant intermediate states', '5'),

In [23]:
VALID_DISCIPLINES = ["cs.AI", "cs.CL", "cs.CV", "cs.LG", "stat.ML"]
ml_papers = []
for key, metadata_dict in data.items():
    paper_cats = metadata_dict['categories']
    found_match = any([cat in VALID_DISCIPLINES for cat in paper_cats])
    if found_match:
        ml_papers.append(key)

In [32]:
num_ml_papers = len(ml_papers)
random_paper = ml_papers[random.randint(0, num_ml_papers-1)]
print(f"Paper categories: {data[random_paper]['categories']}")
print(f"Paper section names: {set(data[random_paper]['name_number_pairs'])}")

Paper categories: ['cs.CL']
Paper section names: {('A Compact Architecture for Dialogue Management Based on Scripts\nand Meta-Outputs', '4'), ('Using meta-outputs to choose between dialogue management moves', '4.3'), ('Fallible Interpretation: Outputs and Meta-outputs', '2.2'), ('Examples', '5'), ('Summary', '6'), ('A Prototype Implementation', '3'), ('Integration of plan evaluation, plan execution and dialogue management', '4.1'), ('Using meta-outputs to choose between interpretations', '4.2'), ('Scripts vs Logical Forms', '2.1'), ('How Meta-outputs Participate in the Translation', '3.2'), ('Levels of Representation', '3.1'), ('Introduction', '1')}
