In [215]:
import sys # In order to print to stderr
from collections import namedtuple
from functools import reduce
from itertools import zip_longest

all_files = ["2014-05-01.data", "2014-08-01.data", "2014-08-02.data", "2014-08-03.data"]
# Bunch of files on S3

query_dataset = ["2014-05-01.data", "2014-08-01.data", "2014-08-02.data", "2014-08-03.data", "test"]
# We want to query these files

# We can only query so many files concurrently
# They have a relative cost per slot
class Slot:
  pass

slot1 = Slot()
slot1.name = 'slot1'
slot1.cost_map = {"2014-05-01.data" : 1,  # this one is cached locally
                  "2014-08-01.data" : 17, # this one is not cached
                  "2014-08-02.data" : 20,
                  "2014-08-03.data" : 2}

# Why the hell didn't I think to create another slot live?!?!
slot2 = Slot()
slot2.name = 'slot2'
slot2.cost_map = {"2014-05-01.data" : 2,
                  "2014-08-01.data" : 17,
                  "2014-08-02.data" : 20,
                  "2014-08-03.data" : 1}

slot3 = Slot()
slot3.name = 'slot3'
slot3.cost_map = {"2014-05-01.data" : 1,
                  "2014-08-01.data" : 14,
                  "2014-08-02.data" : 1,
                  "2014-08-03.data" : 20}

# This takes a list of slots and files...
# This approach assumes that the cost for figuring out how to access the 
# files is much higher than the cost for accessing them stupidly.
# This is almost always true, right?
def schedule(slots, query_dataset):
    # produce a map from slot name -> [files...] to process on this slot
    results    = dict((slot.name, []) for slot in slots)
    # Named tuples are just much more readable, so candidates are represented this way.
    Candidate  = namedtuple('Candidate', ['slot', 'file', 'cost'])
    # We have a big array of candidates.
    candidates = []

    for f in query_dataset: # this_file is used to avoid confusion in the print call
        # Just continue to the next file in query_dataset if this_file doesn't exist
        # I totally airballed this check live.
        if not f in all_files:
            print("File not found: %s" % f, file=sys.stderr)
            continue # Just move onto the next file
        
        for slot in slots:
            for cost in slot.cost_map:
                if f in slot.cost_map: # A possible edge case that I also airballed live.
                    c = Candidate(slot.name, f, slot.cost_map[f])
                    if c not in candidates: # avoid duplicate entries.
                        candidates.append(c)
    
    candidates = sorted(candidates, key=lambda c: (c.slot, c.file, c.cost))

    # I don't like this.
    for f, c in zip_longest(query_dataset, candidates):
        if c.file == f and c.cost == 1:
            if f in results:
                candidates.remove(c)
            if f not in results:
                results[slot.name].append(f)
    
    for c in candidates:
        print(c)
    
    return results
    
sched = schedule([slot1, slot2, slot3], query_dataset)

print("Results")
for key, value in sched.items():
    print("%s:" % key)
    for item in value:
        print("\t%s" % item)

Candidate(slot='slot1', file='2014-05-01.data', cost=1)
Candidate(slot='slot1', file='2014-08-01.data', cost=17)
Candidate(slot='slot1', file='2014-08-02.data', cost=20)
Candidate(slot='slot1', file='2014-08-03.data', cost=2)
Candidate(slot='slot2', file='2014-05-01.data', cost=2)
Candidate(slot='slot2', file='2014-08-01.data', cost=17)
Candidate(slot='slot2', file='2014-08-02.data', cost=20)
Candidate(slot='slot2', file='2014-08-03.data', cost=1)
Candidate(slot='slot3', file='2014-05-01.data', cost=1)
Candidate(slot='slot3', file='2014-08-01.data', cost=14)
Candidate(slot='slot3', file='2014-08-02.data', cost=1)
Candidate(slot='slot3', file='2014-08-03.data', cost=20)
Results
slot1:
slot2:
slot3:
	2014-05-01.data


File not found: test
