In [1]:
import random

import numpy as np
import pandas as pd

In [2]:
extractors = pd.read_csv("NMSC.extractors.csv", encoding='utf-8')
extractors.head()

Unnamed: 0,Extractor,PMID,Title,Year,Author,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31
0,gpadam (Gaelen Adam),18717680,Fractionated 5-aminolaevulinic acid-photodynam...,2008,"Mosterd K., Thissen MR., Nelemans P., Kellener...",,,,,,...,,,,,,,,,,
1,vlangber (Valerie Langberg),21242584,Comparison of imiquimod 5% cream versus radiot...,2011,"Garcia-Martin E., Gil-Arribas LM., Idoipe M., ...",,,,,,...,,,,,,,,,,
2,vlangber (Valerie Langberg),17610993,"A phase III, randomized, open label study to e...",2007,"Eigentler TK., Kamin A., Weide BM., Breuninger...",,,,,,...,,,,,,,,,,
3,amdrucker (Aaron Drucker),1430394,The effect of intralesional 5-Fluorouracil The...,1992,Orenberg EK et al,,,,,,...,,,,,,,,,,
4,gpadam (Gaelen Adam),22385074,Imiquimod 5% cream as pretreatment of Mohs mic...,2012,"van der Geer S., Martens J., van Roij J., Bran...",,,,,,...,,,,,,,,,,


In [3]:
people = set([p.split(" ")[0] for p in extractors["Extractor"].unique()])
# add Tom
people.add("ttrikalin")
print(people) 

{'famousta', 'agazula', 'vlangber', 'ttrikalin', 'gpadam', 'bryantadmin', 'amdrucker'}


In [4]:
extractors.loc[25, "PMID"]

'12224977-12 week'

In [5]:
extractors.loc[25, "PMID"] = '12224977' # rename the weird 12224977-12 week to just the PMID
papers = extractors["PMID"].unique()
print(papers)

['18717680' '21242584' '17610993' '1430394' '22385074' '9218740' '8996264'
 '298425' '2107219' '2229497' '3514075' '2383027' '8977678' '10570388'
 '10940063' '11298545' '11312429' '17451581' nan '24102369' '19010733'
 '12653747' '23683751' '12452875' '14732655' '12224977' '7999616'
 '12224977 - 6 week' '12196749' '15097956' '15606733' '16785375' '25899562'
 '25047438' '26353121' '20546215' '23566745' '19018814' '25981810'
 '22511036' '27067393' '15888150' '16393600' '16713457' '17573890'
 '18624836' '20064185' '26551044' '16865869' '15377354' '26157307'
 '24903544' '8708151' '24332516']


## Assigning papers to individuals
Sampling desiderata: 
* Each paper should be assessed once with RR and once without (so, obviously, twice in all).
* Each reviewer should get about the same number of papers using and not using RR.

In [6]:
def who_already_reviewed(paper_id):
    try:
        already_reviewed = extractors[extractors["PMID"]==paper_id]["Extractor"].values[0].split(" ")[0]
        return set([already_reviewed])
    except:
        print("warning! could not find existing record for paper: {0}".format(paper_id))
        return set([])
    
class Assignment:
    def __init__(self, paper_id, person_id, use_rr):
        self.paper_id  = paper_id
        self.person_id = person_id
        self.rr        = use_rr
    
    def __str__(self):
        return "{0}, {1}, {2}".format(self.paper_id, self.person_id, self.rr)
        
def assign_reviewers_to_papers(paper_ids, person_ids, max_attempts=5000):
    ''' 
    assign all papers in paper_ids to folks in person_ids, 
       while respecting constraints stated above.
    '''
    expected_work = (2*len(paper_ids))/(len(person_ids)-1) 
    print("{0} workers; {1} papers".format(len(person_ids), len(paper_ids)))
    print("expected workload per person: {0}".format(expected_work))

    def check_assignments(assignments, plus_minus_workload=3, plus_minus_rr_frac=.15):
        workloads, with_rr = {}, {}
        for p in person_ids: 
            workloads[p] = 0
            with_rr[p] = 0
        
        for assignment in assignments:
            workloads[assignment.person_id] += 1
            if assignment.rr:
                with_rr[assignment.person_id] += 1
        
        ####################################################################
        # no one should receive > +/-plus_minus_workload the expected load #
        ####################################################################
        if (expected_work - min(workloads.values())) > plus_minus_workload or (
                    max(workloads.values()) - expected_work) > plus_minus_workload:
            print("failure due to workload imbalance!")
            print(workloads)
            
            return False 

        ###########################################################
        # frac of rr/non rr should be roughly balanced per person #
        ###########################################################
        for person in with_rr:
            cur_frac = with_rr[person]/workloads[person]
            if not ((0.5 - plus_minus_rr_frac) < cur_frac < (0.5 + plus_minus_rr_frac)):
                print("failure due to w/rr fraction!")
                print("{0} had {1}% assignments using RR".format(person, cur_frac))
                return False 
        
        #import pdb; pdb.set_trace()
        print("success!")
        print(workloads)
  
        return True
        
    # use a really stupid generate-and-check approach
    count = 1
    while True:
        print("on attempt {0}".format(count))
        assignments = []
        for paper_id in paper_ids:
            # generate two assignments
            #print(paper_id)
            cur_people_ids = random.sample(people - who_already_reviewed(paper_id), 2)
            # the first assignment will be with RR. 
            # so just in case...
            random.shuffle(cur_people_ids)
            for i, cur_person_id in enumerate(cur_people_ids):
                cur_assignment = Assignment(paper_id, cur_person_id, i<1)
                assignments.append(cur_assignment)
            
        if check_assignments(assignments):
            return assignments
        
        count += 1
        
        if count > max_attempts:
            print("failed to find assignment in {0} tries".format(max_attempts))
            break 
            

In [8]:
# This is a secondary publication for a study that has two. 
# The primary study PMID is 14732655. We only assign the 
# latter below (I have removed this one from the list.)
who_already_reviewed("17875873")



set()

In [9]:
papers_for_assignment = ["11298545", "11312429", "12224977", "12452875", "12653747", "14732655", "15097956", 
                         "12196749", "15377354", "15606733", "15888150", "16393600", "16713457", "16785375"]
assignments = assign_reviewers_to_papers(papers_for_assignment, people)

7 workers; 14 papers
expected workload per person: 4.666666666666667
on attempt 1
failure due to w/rr fraction!
ttrikalin had 1.0% assignments using RR
on attempt 2
failure due to w/rr fraction!
ttrikalin had 0.8333333333333334% assignments using RR
on attempt 3
failure due to w/rr fraction!
gpadam had 0.25% assignments using RR
on attempt 4
failure due to w/rr fraction!
ttrikalin had 1.0% assignments using RR
on attempt 5
failure due to workload imbalance!
{'ttrikalin': 3, 'famousta': 6, 'gpadam': 5, 'agazula': 4, 'bryantadmin': 4, 'amdrucker': 1, 'vlangber': 5}
on attempt 6
failure due to w/rr fraction!
famousta had 0.3333333333333333% assignments using RR
on attempt 7
failure due to w/rr fraction!
gpadam had 0.16666666666666666% assignments using RR
on attempt 8
failure due to workload imbalance!
{'ttrikalin': 6, 'famousta': 6, 'gpadam': 1, 'agazula': 2, 'bryantadmin': 3, 'amdrucker': 4, 'vlangber': 6}
on attempt 9
failure due to workload imbalance!
{'ttrikalin': 4, 'famousta': 7, '

In [None]:
assignments = assign_reviewers_to_papers(papers, people)
all_people = people
workloads, with_rr = {}, {}
for p in all_people: 
    workloads[p] = 0
    with_rr[p] = 0

for assignment in assignments:
    workloads[assignment.person_id] += 1
    if assignment.rr:
        with_rr[assignment.person_id] += 1


In [None]:
for a in assignments:
    print(a)

In [10]:
out_str = ["PMID, reviewer, use RR?"]
for a in assignments:
    out_str.append(str(a))
    
with open("assignments.csv", 'w') as outf:
    outf.write("\n".join(out_str))
