# easy2edm

This notebook creates proceedings for the [International Conference on Educational Data Mining](https://educationaldatamining.org/conferences/) using reviewing data from [EasyChair](https://easychair.org).

See the [README](README.md) for how to download and structure the data.

In [32]:
# easy2acl.py - Convert data from EasyChair for use with ACLPUB
#
# Original Author: Nils Blomqvist
# Forked/modified by: Asad Sayeed
# Further modifications and docs (for 2019 Anthology): Matt Post
# Index for LaTeX book proceedings: Mehdi Ghanimifard and Simon Dobnik
# Modified for EDM by Andrew Olney 
# Please see the documentation in the README file at http://github.com/acl-org/easy2acl.

import os
import re
import sys

from csv import DictReader
from glob import glob
from shutil import copy, rmtree
from unicode_tex import unicode_to_tex
from pybtex.database import BibliographyData, Entry
from PyPDF2 import PdfFileReader

# Specify conference tracks  here
tracks =  ['doctoral-consortium','industry-track','long-papers','posters','short-papers']

# specify decision types here
decisions = {
    "Accept in current track" : None,
    "Accept+move to short" :"short-papers",
    "Accept+move to posters":"posters",
    "ACCEPT":None
}

def texify(string):
    """Return a modified version of the argument string where non-ASCII symbols have
    been converted into LaTeX escape codes.

    """
    return ' '.join(map(unicode_to_tex, string.split())).replace(r'\textquotesingle', "'")

def get_track_metadata(directory):
    #,----
    #| Metadata
    #`----
    metadata = { 'chairs': [] }
    with open(os.path.join(directory, 'meta')) as metadata_file:
        for line in metadata_file:
            key, value = line.rstrip().split(maxsplit=1)
            if key == 'chairs':
                metadata[key].append(value)
            else:
                metadata[key] = value

    for key in 'abbrev volume title shortbooktitle booktitle month year location publisher chairs'.split():
        if key not in metadata:
            print('Fatal: missing key "{}" from "meta" file'.format(key))
            print("Please see the documentation at https://acl-org.github.io/ACLPUB/anthology.html.")
            sys.exit(1)

    for key in "bib_url volume_name short_booktitle type".split():
        if key in metadata:
            print('Fatal: bad key "{}" in the "meta" file'.format(key))
            print("Please see the documentation at https://acl-org.github.io/ACLPUB/anthology.html.")
            sys.exit(1)

    venue = metadata["abbrev"]
    volume_name = metadata["volume"]
    year = metadata["year"]
    return metadata

def collect_track_metadata():
    metadata ={}
    for d in tracks :
        metadata[d] = get_track_metadata(d)
    return metadata

# Across all tracks, build a dictionary of submissions (which has author 
# information). We do this across tracks because some submissions have 
# decisions that move them to other tracks 

def collect_submissions_and_acceptances( decision_map, metadata ):
    submissions = {}
    for d in tracks :
        with open(os.path.join(d, 'submissions')) as submissions_file:
            for line in submissions_file:
                entry = line.rstrip().split("\t")
                submission_id = entry[0]
                authors = entry[1].replace(' and', ',').split(', ')
                title = entry[2]

                submissions[submission_id] = (title, authors)
            print("Found ", len(submissions), " submitted files in ", d)

    #
    # Append each accepted submission, as a tuple, to the 'accepted' list.
    # Order in this file is used to determine program order.
    #
    accepted = []
    for d in tracks :
        with open(os.path.join(d, 'accepted')) as accepted_file:
            for line in accepted_file:
                entry = line.rstrip().split("\t")
                # modified here to filter out the rejected files rather than doing
                # that by hand
                #if entry[-1] == 'ACCEPT':
                if entry[-1] in decision_map:
                    #print(d)
                    submission_id = entry[0]
                    title = entry[1]
                    authors = submissions[submission_id][1]
                    # if we defined an explicit mapping, use it
                    if decision_map[ entry[-1] ]:
                        track = decision_map[ entry[-1] ]
                    # otherwise we should place in current track
                    else:
                        track = d

                    accepted.append((submission_id, title, authors, track))
            print("Found ", len(accepted), " accepted files in ", d)

    # Read abstracts
    abstracts = {}
    for d in tracks :
        if os.path.exists(os.path.join(d, 'submission.csv')):
            with open(os.path.join(d, 'submission.csv')) as csv_file:
                d = DictReader(csv_file)
                for row in d:
                    abstracts[row['#']] = row['abstract']
            print('Found ', len(abstracts), 'abstracts in ',d)
        else:
            print('No abstracts available.')

    #
    # Find all relevant PDFs
    #
    venue = metadata['long-papers']["abbrev"]
    year = metadata['long-papers']["year"]
    booktitle = metadata['long-papers']['booktitle']
    chairs = metadata['long-papers']['chairs']
    
    # The PDF of the full proceedings
    full_pdf_file = 'pdf/{}_{}.pdf'.format(venue, year)
    if not os.path.exists(full_pdf_file):
        print("Fatal: could not find full volume PDF '{}'".format(full_pdf_file))
        sys.exit(1)

    # The PDF of the frontmatter
    frontmatter_pdf_file = 'pdf/{}_{}_frontmatter.pdf'.format(venue, year)
    if not os.path.exists(frontmatter_pdf_file):
        print("Fatal: could not find frontmatter PDF file '{}'".format(frontmatter_pdf_file))
        sys.exit(1)

    # File locations of all PDFs (seeded with PDF for frontmatter)
    pdfs = { '0': frontmatter_pdf_file }
    for d in tracks :
        for pdf_file in glob(os.path.join(d,'pdf/{}_{}_paper_*.pdf'.format(venue, year))):
            submission_id = pdf_file.split('_')[-1].replace('.pdf', '')
            pdfs[submission_id] = pdf_file

    # List of accepted papers (seeded with frontmatter)
    accepted.insert(0, ('0', booktitle, chairs))
    return (submissions, accepted, abstracts, pdfs)

In [35]:
metadata =collect_track_metadata()

submissions, accepted, abstracts, pdfs = collect_submissions_and_acceptances( decisions, metadata )



Found  13  submitted files in  doctoral-consortium
Found  19  submitted files in  industry-track
Found  109  submitted files in  long-papers
Found  129  submitted files in  posters
Found  185  submitted files in  short-papers
Found  9  accepted files in  doctoral-consortium
Found  13  accepted files in  industry-track
Found  65  accepted files in  long-papers
Found  75  accepted files in  posters
Found  112  accepted files in  short-papers
Found  13 abstracts in  <csv.DictReader object at 0x7f016611f790>
Found  19 abstracts in  <csv.DictReader object at 0x7f016611f750>
Found  109 abstracts in  <csv.DictReader object at 0x7f016611fed0>
Found  129 abstracts in  <csv.DictReader object at 0x7f016611f890>
Found  185 abstracts in  <csv.DictReader object at 0x7f016611fed0>
