# easy2edm

This notebook creates proceedings for the [International Conference on Educational Data Mining](https://educationaldatamining.org/conferences/) using reviewing data from [EasyChair](https://easychair.org).

See the [README](README.md) for how to download and structure the data.

## Utility functions

In [13]:
# easy2acl.py - Convert data from EasyChair for use with ACLPUB
#
# Original Author: Nils Blomqvist
# Forked/modified by: Asad Sayeed
# Further modifications and docs (for 2019 Anthology): Matt Post
# Index for LaTeX book proceedings: Mehdi Ghanimifard and Simon Dobnik
# Modified for EDM by Andrew Olney 
# Please see the documentation in the README file at http://github.com/acl-org/easy2acl.

import os
import re
import sys

from csv import DictReader
from glob import glob
from shutil import copy, rmtree
from unicode_tex import unicode_to_tex
from pybtex.database import BibliographyData, Entry
from PyPDF2 import PdfFileReader
from functools import cmp_to_key

# Specify conference tracks  here IN PROCEEDINGS ORDER
tracks =  ['long-papers','short-papers','posters','doctoral-consortium','industry-track','workshop-tutorials']

# specify decision types here
decisions = {
    "Accept in current track" : None,
    "Accept+move to short" :"short-papers",
    "Accept+move to posters":"posters",
    "ACCEPT":None
}

# for later sorting papers by track
# (submission_id, title, authors, track)
track_order = ['front'] + tracks 
def paper_cmp(a, b):
    if track_order.index(a[3]) > track_order.index(b[3]):
        return 1
    elif a[3] == b[3]:
        if int(a[0]) > int(b[0]):
            return 1
        else:
            return -1
    else:
        return -1
paper_cmp_key = cmp_to_key(paper_cmp)

def texify(string):
    """Return a modified version of the argument string where non-ASCII symbols have
    been converted into LaTeX escape codes.

    """
    return ' '.join(map(unicode_to_tex, string.split())).replace(r'\textquotesingle', "'")

def get_track_metadata(directory):
    #,----
    #| Metadata
    #`----
    metadata = { 'chairs': [] }
    with open(os.path.join(directory, 'meta')) as metadata_file:
        for line in metadata_file:
            key, value = line.rstrip().split(maxsplit=1)
            if key == 'chairs':
                metadata[key].append(value)
            else:
                metadata[key] = value

    for key in 'abbrev volume title shortbooktitle booktitle month year location publisher chairs'.split():
        if key not in metadata:
            print('Fatal: missing key "{}" from "meta" file'.format(key))
            print("Please see the documentation at https://acl-org.github.io/ACLPUB/anthology.html.")
            sys.exit(1)

    for key in "bib_url volume_name short_booktitle type".split():
        if key in metadata:
            print('Fatal: bad key "{}" in the "meta" file'.format(key))
            print("Please see the documentation at https://acl-org.github.io/ACLPUB/anthology.html.")
            sys.exit(1)

    venue = metadata["abbrev"]
    volume_name = metadata["volume"]
    year = metadata["year"]
    return metadata

def collect_track_metadata():
    metadata ={}
    for d in tracks :
        metadata[d] = get_track_metadata(d)
    return metadata

# Across all tracks, build a dictionary of submissions (which has author 
# information). We do this across tracks because some submissions have 
# decisions that move them to other tracks 

def collect_submissions_and_acceptances( decision_map, metadata ):
    submissions = {}
    for d in tracks :
        with open(os.path.join(d, 'submissions')) as submissions_file:
            for line in submissions_file:
                entry = line.rstrip().split("\t")
                submission_id = entry[0]
                authors = entry[1].replace(' and', ',').split(', ')
                title = entry[2]

                submissions[submission_id] = (title, authors)
            print("Found ", len(submissions), " submitted files in ", d)

    #
    # Append each accepted submission, as a tuple, to the 'accepted' list.
    # Order in this file is used to determine program order.
    #
    accepted = []
    for d in tracks :
        with open(os.path.join(d, 'accepted')) as accepted_file:
            for line in accepted_file:
                entry = line.rstrip().split("\t")
                # modified here to filter out the rejected files rather than doing
                # that by hand
                #if entry[-1] == 'ACCEPT':
                if entry[-1] in decision_map:
                    #print(d)
                    submission_id = entry[0]
                    title = entry[1]
                    authors = submissions[submission_id][1]
                    # if we defined an explicit mapping, use it
                    if decision_map[ entry[-1] ]:
                        track = decision_map[ entry[-1] ]
                    # otherwise we should place in current track
                    else:
                        track = d

                    accepted.append((submission_id, title, authors, track))
            print("Found ", len(accepted), " accepted files in ", d)

    # Read abstracts
    abstracts = {}
    for d in tracks :
        if os.path.exists(os.path.join(d, 'submission.csv')):
            with open(os.path.join(d, 'submission.csv')) as csv_file:
                d = DictReader(csv_file)
                for row in d:
                    abstracts[row['#']] = row['abstract']
            print('Found ', len(abstracts), 'abstracts in ',d)
        else:
            print('No abstracts available.')

    #
    # Find all relevant PDFs
    #
    venue = metadata['long-papers']["abbrev"]
    year = metadata['long-papers']["year"]
    booktitle = metadata['long-papers']['booktitle']
    chairs = metadata['long-papers']['chairs']
    
    # The PDF of the full proceedings
    full_pdf_file = 'pdf/{}_{}.pdf'.format(venue, year)
    if not os.path.exists(full_pdf_file):
        print("Fatal: could not find full volume PDF '{}'".format(full_pdf_file))
        sys.exit(1)

    # The PDF of the frontmatter
    frontmatter_pdf_file = 'pdf/{}_{}_frontmatter.pdf'.format(venue, year)
    if not os.path.exists(frontmatter_pdf_file):
        print("Fatal: could not find frontmatter PDF file '{}'".format(frontmatter_pdf_file))
        sys.exit(1)

    # File locations of all PDFs (seeded with PDF for frontmatter)
    pdfs = { '0': frontmatter_pdf_file }
    for d in tracks :
        for pdf_file in glob(os.path.join(d,'pdf/{}_{}_paper_*.pdf'.format(venue, year))):
            submission_id = pdf_file.split('_')[-1].replace('.pdf', '')
            pdfs[submission_id] = pdf_file

    # List of accepted papers (seeded with frontmatter)
    accepted.insert(0, ('0', booktitle, chairs, 'front'))
    return (submissions, accepted, abstracts, pdfs)

#
# Create Anthology tarball
#

def render_bibtex_and_track_assigned_pdf(metadata, submissions, accepted, abstracts, pdfs):
    

    # All this information is shared across tracks, so we can use long-papers
    venue = metadata['long-papers']["abbrev"]
    year = metadata['long-papers']["year"]
    booktitle = metadata['long-papers']['booktitle']
    chairs = metadata['long-papers']['chairs']
    # volume name is track name
    #volume_name = metadata['long-papers']["volume"]
    location = metadata['long-papers']["location"]
    publisher = metadata['long-papers']["publisher"]
    month= metadata['long-papers']["month"]
    
    # Create destination directories
    for dir in ['bib', 'pdf']:
        dest_dir = os.path.join('proceedings/cdrom', dir)
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)

    # Copy over "meta" file
    print('COPYING long papers meta -> proceedings/meta', file=sys.stderr)
    copy('long-papers/meta', 'proceedings/meta')

    final_bibs = []
    start_page = 1
    acepted = accepted.sort(key=paper_cmp_key)
    for paper_id, entry in enumerate(accepted):
        #print( entry)
        submission_id, paper_title, authors, track = entry
        authors = ' and '.join(authors)
        if not submission_id in pdfs:
            print('Fatal: no PDF found for paper', paper_id, file=sys.stderr)
            sys.exit(1)

        pdf_path = pdfs[submission_id]
        dest_path = 'proceedings/cdrom/pdf/{}.{}-{}.{}.pdf'.format(year, venue,track, paper_id)

        copy(pdf_path, dest_path)
        print('COPYING', pdf_path, '->', dest_path, file=sys.stderr)

        bib_path = dest_path.replace('pdf', 'bib')
        if not os.path.exists(os.path.dirname(bib_path)):
            os.makedirs(os.path.dirname(bib_path))

        anthology_id = os.path.basename(dest_path).replace('.pdf', '')

        bib_type = 'inproceedings' if submission_id != '0' else 'proceedings'
        bib_entry = Entry(bib_type, [
            ('author', authors),
            ('title', paper_title),
            ('year', year ),
            ('month',month),
            ('address', location),
            ('publisher', publisher),
        ])

        # Add page range if not frontmatter
        if paper_id > 0:
            with open(pdf_path, 'rb') as in_:
                file = PdfFileReader(in_)
                last_page = start_page + file.getNumPages() - 1
                bib_entry.fields['pages'] = '{}--{}'.format(start_page, last_page)
                start_page = last_page + 1

        # Add the abstract if present
        if submission_id in abstracts:
            bib_entry.fields['abstract'] = abstracts.get(submission_id)

        # Add booktitle for non-proceedings entries
        if bib_type == 'inproceedings':
            bib_entry.fields['booktitle'] = booktitle

        try:
            bib_string = BibliographyData({ anthology_id: bib_entry }).to_string('bibtex')
        except TypeError as e:
            print('Fatal: Error in BibTeX-encoding paper', submission_id, file=sys.stderr)
            sys.exit(1)
        final_bibs.append(bib_string)
        with open(bib_path, 'w') as out_bib:
            print(bib_string, file=out_bib)
            print('CREATED', bib_path)
    return final_bibs
            
def make_book(metadata, accepted, pdfs, final_bibs):
    # All this information is shared across tracks, so we can use long-papers
    venue = metadata['long-papers']["abbrev"]
    year = metadata['long-papers']["year"]
    booktitle = metadata['long-papers']['booktitle']
    chairs = metadata['long-papers']['chairs']
    # volume name is track name
    #volume_name = metadata['long-papers']["volume"]
    location = metadata['long-papers']["location"]
    publisher = metadata['long-papers']["publisher"]
    month= metadata['long-papers']["month"]
    
    # Create an index for LaTeX book proceedings
    if not os.path.exists('book-proceedings'):
        os.makedirs('book-proceedings')

    with open('book-proceedings/all_papers.tex', 'w') as book_file:
        for entry in accepted:
            submission_id, paper_title, authors, track = entry
            if submission_id == '0':
                continue
            if len(authors) > 1:
                authors = ', '.join(authors[:-1]) + ' and ' + authors[-1]
            else:
                authors = authors[0]

            print("""\goodpaper{{../{pdf_file}}}{{{title}}}%
    {{{authors}}}\n""".format(authors=texify(authors), pdf_file=pdfs[submission_id], title=texify(paper_title)), file=book_file)


    # Write the volume-level bib with all the entries
    dest_bib = 'proceedings/cdrom/{}-{}.bib'.format(venue, year)
    with open(dest_bib, 'w') as whole_bib:
        print('\n'.join(final_bibs), file=whole_bib)
        print('CREATED', dest_bib)

    # Copy over the volume-level PDF
    full_pdf_file = 'pdf/{}_{}.pdf'.format(venue, year)
    dest_pdf = dest_bib.replace('bib', 'pdf')
    print('COPYING', full_pdf_file, '->', dest_pdf, file=sys.stderr)
    copy(full_pdf_file, dest_pdf)

## Make everything

- Does not make front matter and combined proceedings
- Run again after making them in latex to generate their bibtex

In [14]:
metadata =collect_track_metadata()

submissions, accepted, abstracts, pdfs  = collect_submissions_and_acceptances( decisions, metadata )

final_bibs = render_bibtex_and_track_assigned_pdf(metadata, submissions, accepted, abstracts, pdfs)

make_book(metadata, accepted, pdfs, final_bibs)

Found  90  submitted files in  long-papers
Found  146  submitted files in  short-papers
Found  166  submitted files in  posters
Found  179  submitted files in  doctoral-consortium
Found  185  submitted files in  industry-track
Found  192  submitted files in  workshop-tutorials
Found  52  accepted files in  long-papers
Found  89  accepted files in  short-papers
Found  99  accepted files in  posters
Found  108  accepted files in  doctoral-consortium
Found  112  accepted files in  industry-track
Found  118  accepted files in  workshop-tutorials
Found  90 abstracts in  <csv.DictReader object at 0x7fca0e52af50>
Found  146 abstracts in  <csv.DictReader object at 0x7fca0e52add0>
Found  166 abstracts in  <csv.DictReader object at 0x7fca0e574850>
Found  179 abstracts in  <csv.DictReader object at 0x7fca0e5747d0>
Found  185 abstracts in  <csv.DictReader object at 0x7fca0e574850>
Found  192 abstracts in  <csv.DictReader object at 0x7fca0e5747d0>
CREATED proceedings/cdrom/bib/2022.EDM-front.0.bib


COPYING long papers meta -> proceedings/meta
COPYING pdf/EDM_2022_frontmatter.pdf -> proceedings/cdrom/pdf/2022.EDM-front.0.pdf
COPYING long-papers/pdf/EDM_2022_paper_7.pdf -> proceedings/cdrom/pdf/2022.EDM-long-papers.1.pdf
COPYING long-papers/pdf/EDM_2022_paper_10.pdf -> proceedings/cdrom/pdf/2022.EDM-long-papers.2.pdf
COPYING long-papers/pdf/EDM_2022_paper_12.pdf -> proceedings/cdrom/pdf/2022.EDM-long-papers.3.pdf
COPYING long-papers/pdf/EDM_2022_paper_18.pdf -> proceedings/cdrom/pdf/2022.EDM-long-papers.4.pdf
COPYING long-papers/pdf/EDM_2022_paper_21.pdf -> proceedings/cdrom/pdf/2022.EDM-long-papers.5.pdf
COPYING long-papers/pdf/EDM_2022_paper_22.pdf -> proceedings/cdrom/pdf/2022.EDM-long-papers.6.pdf
COPYING long-papers/pdf/EDM_2022_paper_35.pdf -> proceedings/cdrom/pdf/2022.EDM-long-papers.7.pdf
COPYING long-papers/pdf/EDM_2022_paper_37.pdf -> proceedings/cdrom/pdf/2022.EDM-long-papers.8.pdf
COPYING long-papers/pdf/EDM_2022_paper_51.pdf -> proceedings/cdrom/pdf/2022.EDM-long-pape

CREATED proceedings/cdrom/bib/2022.EDM-long-papers.15.bib
CREATED proceedings/cdrom/bib/2022.EDM-long-papers.16.bib
CREATED proceedings/cdrom/bib/2022.EDM-long-papers.17.bib
CREATED proceedings/cdrom/bib/2022.EDM-long-papers.18.bib
CREATED proceedings/cdrom/bib/2022.EDM-long-papers.19.bib
CREATED proceedings/cdrom/bib/2022.EDM-long-papers.20.bib
CREATED proceedings/cdrom/bib/2022.EDM-long-papers.21.bib
CREATED proceedings/cdrom/bib/2022.EDM-long-papers.22.bib
CREATED proceedings/cdrom/bib/2022.EDM-long-papers.23.bib
CREATED proceedings/cdrom/bib/2022.EDM-long-papers.24.bib
CREATED proceedings/cdrom/bib/2022.EDM-long-papers.25.bib
CREATED proceedings/cdrom/bib/2022.EDM-long-papers.26.bib
CREATED proceedings/cdrom/bib/2022.EDM-short-papers.27.bib
CREATED proceedings/cdrom/bib/2022.EDM-short-papers.28.bib
CREATED proceedings/cdrom/bib/2022.EDM-short-papers.29.bib
CREATED proceedings/cdrom/bib/2022.EDM-short-papers.30.bib
CREATED proceedings/cdrom/bib/2022.EDM-short-papers.31.bib


COPYING long-papers/pdf/EDM_2022_paper_80.pdf -> proceedings/cdrom/pdf/2022.EDM-long-papers.17.pdf
COPYING long-papers/pdf/EDM_2022_paper_84.pdf -> proceedings/cdrom/pdf/2022.EDM-long-papers.18.pdf
COPYING long-papers/pdf/EDM_2022_paper_94.pdf -> proceedings/cdrom/pdf/2022.EDM-long-papers.19.pdf
COPYING long-papers/pdf/EDM_2022_paper_108.pdf -> proceedings/cdrom/pdf/2022.EDM-long-papers.20.pdf
COPYING long-papers/pdf/EDM_2022_paper_109.pdf -> proceedings/cdrom/pdf/2022.EDM-long-papers.21.pdf
COPYING long-papers/pdf/EDM_2022_paper_125.pdf -> proceedings/cdrom/pdf/2022.EDM-long-papers.22.pdf
COPYING long-papers/pdf/EDM_2022_paper_133.pdf -> proceedings/cdrom/pdf/2022.EDM-long-papers.23.pdf
COPYING long-papers/pdf/EDM_2022_paper_141.pdf -> proceedings/cdrom/pdf/2022.EDM-long-papers.24.pdf
COPYING long-papers/pdf/EDM_2022_paper_148.pdf -> proceedings/cdrom/pdf/2022.EDM-long-papers.25.pdf
COPYING long-papers/pdf/EDM_2022_paper_155.pdf -> proceedings/cdrom/pdf/2022.EDM-long-papers.26.pdf
COP

CREATED proceedings/cdrom/bib/2022.EDM-short-papers.32.bib
CREATED proceedings/cdrom/bib/2022.EDM-short-papers.33.bib
CREATED proceedings/cdrom/bib/2022.EDM-short-papers.34.bib
CREATED proceedings/cdrom/bib/2022.EDM-short-papers.35.bib
CREATED proceedings/cdrom/bib/2022.EDM-short-papers.36.bib
CREATED proceedings/cdrom/bib/2022.EDM-short-papers.37.bib
CREATED proceedings/cdrom/bib/2022.EDM-short-papers.38.bib
CREATED proceedings/cdrom/bib/2022.EDM-short-papers.39.bib
CREATED proceedings/cdrom/bib/2022.EDM-short-papers.40.bib
CREATED proceedings/cdrom/bib/2022.EDM-short-papers.41.bib
CREATED proceedings/cdrom/bib/2022.EDM-short-papers.42.bib
CREATED proceedings/cdrom/bib/2022.EDM-short-papers.43.bib
CREATED proceedings/cdrom/bib/2022.EDM-short-papers.44.bib
CREATED proceedings/cdrom/bib/2022.EDM-short-papers.45.bib
CREATED proceedings/cdrom/bib/2022.EDM-short-papers.46.bib
CREATED proceedings/cdrom/bib/2022.EDM-short-papers.47.bib
CREATED proceedings/cdrom/bib/2022.EDM-short-papers.48.b

COPYING long-papers/pdf/EDM_2022_paper_43.pdf -> proceedings/cdrom/pdf/2022.EDM-short-papers.35.pdf
COPYING short-papers/pdf/EDM_2022_paper_45.pdf -> proceedings/cdrom/pdf/2022.EDM-short-papers.36.pdf
COPYING short-papers/pdf/EDM_2022_paper_52.pdf -> proceedings/cdrom/pdf/2022.EDM-short-papers.37.pdf
COPYING long-papers/pdf/EDM_2022_paper_59.pdf -> proceedings/cdrom/pdf/2022.EDM-short-papers.38.pdf
COPYING long-papers/pdf/EDM_2022_paper_69.pdf -> proceedings/cdrom/pdf/2022.EDM-short-papers.39.pdf
COPYING long-papers/pdf/EDM_2022_paper_72.pdf -> proceedings/cdrom/pdf/2022.EDM-short-papers.40.pdf
COPYING long-papers/pdf/EDM_2022_paper_74.pdf -> proceedings/cdrom/pdf/2022.EDM-short-papers.41.pdf
COPYING short-papers/pdf/EDM_2022_paper_82.pdf -> proceedings/cdrom/pdf/2022.EDM-short-papers.42.pdf
COPYING long-papers/pdf/EDM_2022_paper_93.pdf -> proceedings/cdrom/pdf/2022.EDM-short-papers.43.pdf
COPYING long-papers/pdf/EDM_2022_paper_97.pdf -> proceedings/cdrom/pdf/2022.EDM-short-papers.44.p

CREATED proceedings/cdrom/bib/2022.EDM-short-papers.54.bib
CREATED proceedings/cdrom/bib/2022.EDM-short-papers.55.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.56.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.57.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.58.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.59.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.60.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.61.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.62.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.63.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.64.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.65.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.66.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.67.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.68.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.69.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.70.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.71.bib
CREATED proceeding

COPYING long-papers/pdf/EDM_2022_paper_6.pdf -> proceedings/cdrom/pdf/2022.EDM-posters.56.pdf
COPYING long-papers/pdf/EDM_2022_paper_15.pdf -> proceedings/cdrom/pdf/2022.EDM-posters.57.pdf
COPYING short-papers/pdf/EDM_2022_paper_17.pdf -> proceedings/cdrom/pdf/2022.EDM-posters.58.pdf
COPYING short-papers/pdf/EDM_2022_paper_23.pdf -> proceedings/cdrom/pdf/2022.EDM-posters.59.pdf
COPYING long-papers/pdf/EDM_2022_paper_26.pdf -> proceedings/cdrom/pdf/2022.EDM-posters.60.pdf
COPYING short-papers/pdf/EDM_2022_paper_33.pdf -> proceedings/cdrom/pdf/2022.EDM-posters.61.pdf
COPYING short-papers/pdf/EDM_2022_paper_39.pdf -> proceedings/cdrom/pdf/2022.EDM-posters.62.pdf
COPYING short-papers/pdf/EDM_2022_paper_48.pdf -> proceedings/cdrom/pdf/2022.EDM-posters.63.pdf
COPYING short-papers/pdf/EDM_2022_paper_57.pdf -> proceedings/cdrom/pdf/2022.EDM-posters.64.pdf
COPYING long-papers/pdf/EDM_2022_paper_61.pdf -> proceedings/cdrom/pdf/2022.EDM-posters.65.pdf
COPYING short-papers/pdf/EDM_2022_paper_67.pd

CREATED proceedings/cdrom/bib/2022.EDM-posters.73.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.74.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.75.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.76.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.77.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.78.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.79.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.80.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.81.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.82.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.83.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.84.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.85.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.86.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.87.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.88.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.89.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.90.bib
CREATED proceedings/cdrom/bi

COPYING short-papers/pdf/EDM_2022_paper_103.pdf -> proceedings/cdrom/pdf/2022.EDM-posters.75.pdf
COPYING short-papers/pdf/EDM_2022_paper_110.pdf -> proceedings/cdrom/pdf/2022.EDM-posters.76.pdf
COPYING long-papers/pdf/EDM_2022_paper_115.pdf -> proceedings/cdrom/pdf/2022.EDM-posters.77.pdf
COPYING short-papers/pdf/EDM_2022_paper_119.pdf -> proceedings/cdrom/pdf/2022.EDM-posters.78.pdf
COPYING short-papers/pdf/EDM_2022_paper_124.pdf -> proceedings/cdrom/pdf/2022.EDM-posters.79.pdf
COPYING long-papers/pdf/EDM_2022_paper_132.pdf -> proceedings/cdrom/pdf/2022.EDM-posters.80.pdf
COPYING long-papers/pdf/EDM_2022_paper_136.pdf -> proceedings/cdrom/pdf/2022.EDM-posters.81.pdf
COPYING short-papers/pdf/EDM_2022_paper_138.pdf -> proceedings/cdrom/pdf/2022.EDM-posters.82.pdf
COPYING long-papers/pdf/EDM_2022_paper_157.pdf -> proceedings/cdrom/pdf/2022.EDM-posters.83.pdf
COPYING short-papers/pdf/EDM_2022_paper_162.pdf -> proceedings/cdrom/pdf/2022.EDM-posters.84.pdf
COPYING short-papers/pdf/EDM_2022_

CREATED proceedings/cdrom/bib/2022.EDM-posters.92.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.93.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.94.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.95.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.96.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.97.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.98.bib
CREATED proceedings/cdrom/bib/2022.EDM-posters.99.bib
CREATED proceedings/cdrom/bib/2022.EDM-doctoral-consortium.100.bib
CREATED proceedings/cdrom/bib/2022.EDM-doctoral-consortium.101.bib
CREATED proceedings/cdrom/bib/2022.EDM-doctoral-consortium.102.bib
CREATED proceedings/cdrom/bib/2022.EDM-doctoral-consortium.103.bib
CREATED proceedings/cdrom/bib/2022.EDM-doctoral-consortium.104.bib
CREATED proceedings/cdrom/bib/2022.EDM-doctoral-consortium.105.bib
CREATED proceedings/cdrom/bib/2022.EDM-doctoral-consortium.106.bib
CREATED proceedings/cdrom/bib/2022.EDM-doctoral-consortium.107.bib
CREATED proceedings/cdrom/bib/20

COPYING posters/pdf/EDM_2022_paper_214.pdf -> proceedings/cdrom/pdf/2022.EDM-posters.94.pdf
COPYING posters/pdf/EDM_2022_paper_215.pdf -> proceedings/cdrom/pdf/2022.EDM-posters.95.pdf
COPYING posters/pdf/EDM_2022_paper_220.pdf -> proceedings/cdrom/pdf/2022.EDM-posters.96.pdf
COPYING posters/pdf/EDM_2022_paper_221.pdf -> proceedings/cdrom/pdf/2022.EDM-posters.97.pdf
COPYING posters/pdf/EDM_2022_paper_222.pdf -> proceedings/cdrom/pdf/2022.EDM-posters.98.pdf
COPYING posters/pdf/EDM_2022_paper_223.pdf -> proceedings/cdrom/pdf/2022.EDM-posters.99.pdf
COPYING doctoral-consortium/pdf/EDM_2022_paper_34.pdf -> proceedings/cdrom/pdf/2022.EDM-doctoral-consortium.100.pdf
COPYING doctoral-consortium/pdf/EDM_2022_paper_201.pdf -> proceedings/cdrom/pdf/2022.EDM-doctoral-consortium.101.pdf
COPYING doctoral-consortium/pdf/EDM_2022_paper_209.pdf -> proceedings/cdrom/pdf/2022.EDM-doctoral-consortium.102.pdf
COPYING doctoral-consortium/pdf/EDM_2022_paper_212.pdf -> proceedings/cdrom/pdf/2022.EDM-doctoral-

## Stamp DOI

- Doi have been reserved elsewhere and written to bibtex
- We create citation block for each paper and add it to left corner of first page

In [34]:
# our pdf does not have plaintext
import os
import argparse
from PyPDF2 import PdfFileReader, PdfFileWriter
from PyPDF2.generic import DecodedStreamObject, EncodedStreamObject
from PyPDF2.generic import NameObject


def replace_text(content, replacements = dict()):
    lines = content.splitlines()
    #print(lines)

    result = ""
    in_text = False

    for line in lines:
        if line == "BT":
            in_text = True

        elif line == "ET":
            in_text = False

        elif in_text:
            cmd = line[-2:]
            if cmd.lower() == 'tj':
                replaced_line = line
                for k, v in replacements.items():
                    replaced_line = replaced_line.replace(k, v)
                    print(replaced_line)
                result += replaced_line + "\n"
            else:
                result += line + "\n"
            continue

        result += line + "\n"

    return result


def process_data(object, replacements):
    data = object.getData()
    #print(data )
    decoded_data = data.decode("utf-8", "ignore")#'utf-8')

    replaced_data = replace_text(decoded_data, replacements)

    encoded_data = replaced_data.encode("utf-8", "ignore")#'utf-8')
    if object.decodedSelf is not None:
        object.decodedSelf.setData(encoded_data)
    else:
        object.setData(encoded_data)

def replace_warning(path ):

    in_file = path
    filename_base = in_file.replace(os.path.splitext(in_file)[1], "")

    # Provide replacements list that you need here
    replacements = { 'Do not delete, move, or resize this block.' : 'fart'}
#need to be filled in with reference information.' If the paper is accepted, this block will
    pdf = PdfFileReader(in_file)
    writer = PdfFileWriter()

    for page_number in range(0, pdf.getNumPages()):

        page = pdf.getPage(page_number)
        contents = page.getContents()

        if isinstance(contents, DecodedStreamObject) or isinstance(contents, EncodedStreamObject):
            process_data(contents, replacements)
        elif len(contents) > 0:
            for obj in contents:
                if isinstance(obj, DecodedStreamObject) or isinstance(obj, EncodedStreamObject):
                    streamObj = obj.getObject()
                    process_data(streamObj, replacements)
                    
        page[NameObject("/Contents")] = contents.decodedSelf
        writer.addPage(page)

    with open(filename_base + ".result.pdf", 'wb') as out_file:
        writer.write(out_file)
        
replace_warning('/z/aolney/reviews/conferences/edm/2022/publications-chair/easy2edm/workshop-tutorials/pdf/uncompressed.pdf' )
# accepted.sort(key=paper_cmp_key)
# accepted

ValueError: value must be PdfObject