From c4f2a4e353ef776723d36a9bc59886308593b02a Mon Sep 17 00:00:00 2001
From: Matt Post <post@cs.jhu.edu>
Date: Wed, 5 Mar 2014 16:24:47 -0500
Subject: [PATCH 1/2] Added scripts for generating MTurk batches

---
 scripts/make-mturk-batch.pl       |  31 ++++++
 scripts/ranking_task.py           |  85 +++++++++++++++
 scripts/visualize_ranking_task.py | 138 +++++++++++++++++++++++
 scripts/wmt_ranking_task.py       | 175 ++++++++++++++++++++++++++++++
 4 files changed, 429 insertions(+)
 create mode 100644 scripts/make-mturk-batch.pl
 create mode 100644 scripts/ranking_task.py
 create mode 100755 scripts/visualize_ranking_task.py
 create mode 100755 scripts/wmt_ranking_task.py
diff --git a/scripts/make-mturk-batch.pl b/scripts/make-mturk-batch.pl
new file mode 100644
index 0000000..6a8f284
--- /dev/null
+++ b/scripts/make-mturk-batch.pl
@@ -0,0 +1,31 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+# Generate one MTurk batch file, SOURCE-TARGET/SOURCE-TARGET-batchBATCHNO.txt, via wmt_ranking_task.py.
+if (@ARGV != 3) {
+  print "Usage: make-mturk-batch.pl BATCHNO SOURCE TARGET\n";
+  exit;
+}
+my ($batchno,$source,$target) = @ARGV;
+# Two-letter codes accepted on the command line, mapped to the three-letter codes given to the task script.
+my %langs = (
+	en => 'eng',
+	ru => 'rus',
+	cs => 'cze',
+	fr => 'fre',
+	de => 'deu',
+	es => 'spa' );
+foreach my $code ($source, $target) {
+  die "Unknown language code '$code' (expected one of: " . join(' ', sort keys %langs) . ")\n" unless exists $langs{$code};
+}
+
+my $pair="$source-$target";
+mkdir($pair) unless -d $pair;
+my $outfile = "$pair/$pair-batch$batchno.txt";
+die "Cowardly refusing to create batch $outfile (already exists)" if -e $outfile;
+
+my $plaindir = "$ENV{HOME}/expts/wmt13/data/wmt13-data/plain";
+my $cmd = "python ~/code/Appraise/scripts/wmt_ranking_task.py $plaindir/sources/newstest2013-src.$source $plaindir/references/newstest2013-ref.$target $plaindir/system-outputs/newstest2013/$pair/newstest2013.$pair.* -source $langs{$source} -target $langs{$target} -no-sequential -controls controls/$pair/controls.txt -control_prob 0.5 -redundancy 0 > $outfile";
+system($cmd) == 0 or die "wmt_ranking_task.py failed (wait status $?); $outfile may be incomplete\n";
diff --git a/scripts/ranking_task.py b/scripts/ranking_task.py
new file mode 100644
index 0000000..6b443c3
--- /dev/null
+++ b/scripts/ranking_task.py
@@ -0,0 +1,85 @@
+# -*- coding: utf-8 -*-
+
+class RankingTask:
+    """One sentence to be ranked: a source, its reference, and system outputs."""
+
+    def __init__(self, id=None, source=None, ref=None, names=None, outputs=None):
+        """Store the task fields; every argument is optional and defaults to None.
+
+        This class used to define __init__ twice, so the zero-argument form was
+        silently replaced by the five-argument one.  A single constructor with
+        defaults supports both call styles.
+        """
+        self.id = id
+        self.source = source
+        self.reference = ref
+        self.system_names = names
+        self.system_outputs = outputs
+
+    def attr(self):
+        """Return extra attribute text for the <seg> tag (none for a plain task)."""
+        return ''
+
+    def xml(self, indent=4):
+        """Return the <seg> XML fragment for this task (`indent` is currently unused)."""
+        parts = ['\n    <seg%s>' % (self.attr()),
+                 '\n      <source id="%d">%s</source>' % (self.id, self.source),
+                 '\n      <reference>%s</reference>' % (self.reference)]
+        for name, output in zip(self.system_names, self.system_outputs):
+            parts.append('\n      <translation system="%s">%s</translation>' % (name, output))
+        return ''.join(parts + ['\n    </seg>'])
+
+class Control(RankingTask):
+    """A Control is a RankingTask that happens to have been filled out."""
+
+    @staticmethod
+    def load(filename):
+        """Parse the controls file `filename` and return a list of Control objects.
+
+        Each record is a SENTENCE line, SOURCE and REFERENCE lines, and a SYSTEMS
+        line followed by one output line and then one rank row per listed system.
+        """
+        controls = []
+        control = None
+        # `with` closes the handle even if parsing fails (it was never closed before).
+        with open(filename) as fh:
+            for line in fh:
+                line = line.rstrip()
+                if line.startswith('SENTENCE '):
+                    control = Control()
+                    control.id = int(line.split()[-1])
+                elif line.startswith('SOURCE '):
+                    control.source = ' '.join(line.split()[1:])
+                elif line.startswith('REFERENCE '):
+                    control.reference = ' '.join(line.split()[1:])
+                elif line.startswith('SYSTEMS '):
+                    control.system_names = line.split()[1:]
+                    control.system_outputs = [next(fh).rstrip() for x in control.system_names]
+                    control.ranks = [next(fh).rstrip().split() for x in control.system_names]
+                    controls.append(control)
+        return controls
+
+    def __init__(self):
+        """Create an empty control; load() fills the fields in afterwards."""
+        # Initialise the inherited fields too, so an unfilled control has them.
+        RankingTask.__init__(self, None, None, None, None, None)
+        self.ranks = None
+
+    def __str__(self):
+        """Serialise the control in the controls-file layout, plus a SCORE line."""
+        lines = ['SENTENCE %d' % (self.id),
+                 'SCORE: %d' % (self.score()),
+                 'SOURCE %s' % (self.source),
+                 'REFERENCE %s' % (self.reference),
+                 'SYSTEMS %s' % (' '.join(self.system_names))]
+        lines.extend(self.system_outputs)
+        lines.extend(' '.join(ranks) for ranks in self.ranks)
+        return '\n'.join(lines) + '\n'
+
+    def attr(self):
+        """Return the <seg> attribute text that marks this task as a control."""
+        return " control='true'"
+
+    def score(self):
+        """Returns the score of a control, which is the sum of the absolute values of the differences between opposite ranks."""
+        return sum(abs(int(self.ranks[i][j]) - int(self.ranks[j][i])) for i, row in enumerate(self.ranks) for j in range(i + 1, len(row)))
diff --git a/scripts/visualize_ranking_task.py b/scripts/visualize_ranking_task.py
new file mode 100755
index 0000000..32dfbdd
--- /dev/null
+++ b/scripts/visualize_ranking_task.py
@@ -0,0 +1,138 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""Project: Appraise evaluation system Author: Matt Post <post@cs.jhu.edu>
+
+This script allows you to visualize an individual ranking task against the researcher consensus.
+
+"""
+
+import os
+import sys
+import math
+import random
+import hashlib
+import argparse
+from collections import defaultdict
+from csv import DictReader
+from itertools import combinations
+from ranking_task import RankingTask,Control
+
+PARSER = argparse.ArgumentParser(description="Visualize a ranking task.")
+PARSER.add_argument('-consensus', type=str, default=None, help='file containing results you trust')
+PARSER.add_argument('-judge', type=str, default='researcher', help='prefix that judge IDs must match')
+
+def read_file(filename, list):
+    """Append every line of `filename`, right-stripped, to `list` (the file is closed afterwards)."""
+    with open(filename) as infile:
+        list.extend(line.rstrip() for line in infile)
+
+def get_rankings(row):
+    """Map each strict pairwise preference in a DictReader row to 1.
+
+    Keys have the form 'A < B', meaning system A's rank value is lower than
+    system B's; pairs with equal ranks produce no key.
+    """
+    rankings = {}
+    for first, second in combinations(range(1, 6), 2):
+        rank_a = int(row.get('system%drank' % first))
+        rank_b = int(row.get('system%drank' % second))
+        id_a = row.get('system%dId' % first)
+        id_b = row.get('system%dId' % second)
+        if rank_a != rank_b:
+            better, worse = (id_a, id_b) if rank_a < rank_b else (id_b, id_a)
+            rankings['%s < %s' % (better, worse)] = 1
+    return rankings
+
+if __name__ == "__main__":
+    args = PARSER.parse_args()
+
+    LANGS = { 'Czech': 'cs',
+              'Russian': 'ru',
+              'German': 'de',
+              'Spanish': 'es',
+              'English': 'en',
+              'French': 'fr' }
+
+    # Read source, reference, and system sentences
+    sources = defaultdict(dict)
+    refs = {}
+    systems = {}
+    for pair in 'cs-en es-en fr-en de-en ru-en en-cs en-es en-fr en-de en-ru'.split(' '):
+        source,target = pair.split('-')
+        sources[pair] = []
+        refs[pair] = []
+        systems[pair] = defaultdict(list)
+        dir = '/Users/post/expts/wmt13/data/maxlen30/%s' % (pair)
+        read_file('%s/newstest2013-src.%s' % (dir, source), sources[pair])
+        read_file('%s/newstest2013-ref.%s' % (dir, target), refs[pair])
+        for system in os.listdir(dir):
+            if system.startswith('newstest2013.%s' % (pair)):
+                read_file('%s/%s' % (dir, system), systems[pair][system])
+
+    # Read in the consensus rankings
+    RANKINGS = {}
+    if args.consensus is not None:
+        # print >> sys.stderr, 'will read from', args.consensus
+        for row in DictReader(open(args.consensus)):
+            if row.get('srcIndex') is None:
+                print >> sys.stderr, 'bad line', row
+                continue
+            if not row.get('judgeId').startswith(args.judge):
+                continue
+            sentno = int(row.get('srcIndex'))
+            langpair = '%s-%s' % (LANGS[row.get('srclang')], LANGS[row.get('trglang')])
+            if not RANKINGS.has_key(langpair):
+                RANKINGS[langpair] = {}
+            if not RANKINGS[langpair].has_key(sentno):
+                RANKINGS[langpair][sentno] = {}
+            this_rankings = get_rankings(row)
+            for key in this_rankings.keys():
+                RANKINGS[langpair][sentno][key] = RANKINGS[langpair][sentno].get(key,0) + 1
+
+    # Read in input
+    for line in sys.stdin:
+        # Skip the header if seen
+        if line.startswith('srclang'):
+            continue
+
+        # Hard-code this, so the header isn't required on STDIN
+        srclang,trglang,srcIndex,documentId,segmentId,judgeId,system1Number,system1Id,system2Number,system2Id,system3Number,system3Id,system4Number,system4Id,system5Number,system5Id,system1rank,system2rank,system3rank,system4rank,system5rank = line.rstrip().split(',')
+
+        srcIndex = int(srcIndex)
+                  
+        pair = '%s-%s' % (LANGS[srclang], LANGS[trglang])
+
+        print 'SENTENCE', srcIndex
+        print 'SOURCE', sources[pair][srcIndex-1]
+        print 'REFERENCE', refs[pair][srcIndex-1]
+        print 'USER', judgeId
+
+        system_list = [(system1rank, system1Id, systems[pair][system1Id][srcIndex-1]),
+                       (system2rank, system2Id, systems[pair][system2Id][srcIndex-1]),
+                       (system3rank, system3Id, systems[pair][system3Id][srcIndex-1]),
+                       (system4rank, system4Id, systems[pair][system4Id][srcIndex-1]),
+                       (system5rank, system5Id, systems[pair][system5Id][srcIndex-1])]
+
+        system_list.sort(key=lambda x: x[0])
+
+        def score(langpair,sentno,system1,system2):
+            """Return consensus votes for `system1 < system2` minus votes for the reverse.
+
+            Yields 0 when RANKINGS has no entry for this language pair or sentence.
+            """
+            try:
+                votes = RANKINGS[langpair][sentno]
+            except KeyError:
+                return 0
+            forward = votes.get('%s < %s' % (system1, system2), 0)
+            backward = votes.get('%s < %s' % (system2, system1), 0)
+            return forward - backward
+
+        s = [[score(pair,srcIndex,system_list[y][1],system_list[x][1]) for x in range(5)] for y in range(5)]
+        # print s
+
+        print '%s | %2d %2d %2d %2d | %s [%s]' % (system_list[0][0], s[0][1], s[0][2], s[0][3], s[0][4], system_list[0][2], system_list[0][1])
+        print '%s |    %2d %2d %2d | %s [%s]' % (system_list[1][0],           s[1][2], s[1][3], s[1][4], system_list[1][2], system_list[1][1])
+        print '%s |       %2d %2d | %s [%s]' % (system_list[2][0],                     s[2][3], s[2][4], system_list[2][2], system_list[2][1])
+        print '%s |          %2d | %s [%s]' % (system_list[3][0],                       s[3][4], system_list[3][2], system_list[3][1])
+        print '%s |             | %s [%s]' % (system_list[4][0],                                     system_list[4][2], system_list[4][1])
diff --git a/scripts/wmt_ranking_task.py b/scripts/wmt_ranking_task.py
new file mode 100755
index 0000000..556ecb0
--- /dev/null
+++ b/scripts/wmt_ranking_task.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""Project: Appraise evaluation system
+ Author: Matt Post <post@cs.jhu.edu>
+
+This script takes a set of parallel files (source, reference, and system translations) and writes
+out the XML file used to setup the corresponding Appraise tasks for WMT reranking. It supports many
+options, such as limiting the maximum length of a source sentence (-maxlen, default 30), inserting
+controls (-controls file) with a certain probability (-control_prob, default 1.0, meaning every HIT
+will have a control), and so on.
+
+"""
+
+import os
+import sys
+import math
+import random
+import hashlib
+import argparse
+from ranking_task import RankingTask,Control
+
+PARSER = argparse.ArgumentParser(description="Build evaluation task input file.")
+PARSER.add_argument("source", type=file, help="source language file")
+PARSER.add_argument("reference", type=file, nargs="?", help="reference language file")
+PARSER.add_argument("system", metavar="system", nargs="*", type=file, help="parallel files to compare")
+PARSER.add_argument("-id", type=str, default="none", help="ID name to use for the system name")
+PARSER.add_argument("-source", type=str, default="spa", dest="sourceLang", help="the source language")
+PARSER.add_argument("-target", type=str, default="eng", dest="targetLang", help="the target language")
+PARSER.add_argument("-numhits", type=int, default=100, help="number of HITs in the batch")
+PARSER.add_argument("-tasksperhit", type=int, default=3, help="number of sentences (ranking tasks) in each HIT")
+PARSER.add_argument("-systemspertask", type=int, default=5, help="number of systems to rerank")
+PARSER.add_argument("-redundancy", type=int, default=10, help="number of redundant HITs in the batch")
+PARSER.add_argument('-maxlen', type=int, default=30, help='maximum source sentence length')
+PARSER.add_argument('-seed', type=int, default=None, help='random seed')
+PARSER.add_argument('-no-sequential', dest='sequential', default=True, action='store_false', help='whether sentences within a HIT should be sequential')
+PARSER.add_argument('-controls', type=str, default=None, dest="controlFile", help='file containing controls to use (implies -no-sequential)')
+PARSER.add_argument('-control_prob', type=float, default=1.0, dest="control_prob", help='probability of inserting a control into a HIT')
+PARSER.add_argument('-save', type=str, default=None, dest="saveDir", help='directory to save reduced corpora to')
+
+def random_from_range(range_max, num_draws, tuple_size = 3, sequential = True):
+    """Return a list of tuples of sentence indexes, one tuple per HIT.
+
+    `range_max' is the number of sentences, `num_draws' is the number of HITs to create, `tuple_size' is the number of sentences in each HIT, and `sequential' indicates that we should draw sentences in block groups.
+    """
+    if sequential is True:
+        # NOTE(review): this branch ignores `num_draws' and returns every block, as the original did.
+        num_blocks = int(math.ceil(1.0 * range_max / tuple_size))
+        starts = [block * tuple_size for block in range(num_blocks)]
+        random.shuffle(starts)
+        # Block b covers sentences [b * tuple_size, (b + 1) * tuple_size), clipped to range_max;
+        # the old code started block b at sentence b, giving overlapping tuples over a prefix only.
+        return [tuple(range(start, min(start + tuple_size, range_max))) for start in starts]
+
+    sentences = list(range(range_max))
+    random.shuffle(sentences)
+    # Draw without replacement so no sentence is used in two HITs.
+    return [tuple([sentences.pop(random.randint(0, len(sentences) - 1)) for x in range(tuple_size)]) for x in range(num_draws)]
+
+if __name__ == "__main__":
+    args = PARSER.parse_args()
+
+    # SANITY CHECKING AND DEPENDENT VARIABLE SETTING
+
+    if args.seed is not None:
+        random.seed(args.seed)
+
+    num_unique_hits = args.numhits - args.redundancy
+
+    controls = []
+    if args.controlFile is not None:
+        args.sequential = False
+
+        controls = Control.load(args.controlFile)
+#        print 'Read %d controls, keeping %d best' % (len(controls), args.numhits - args.redundancy)
+        controls = controls[:args.numhits-args.redundancy]
+
+        if len(controls) < num_unique_hits:
+            sys.stderr.write('* WARNING: not enough controls (%d < %d)\n' % (len(controls), num_unique_hits))
+
+    # BEGIN 
+
+    # Read the source sentences, one per line, stripped of surrounding whitespace.
+    source = [line.strip() for line in args.source]
+
+    # The reference is an optional positional argument, so it may be absent.
+    reference = []
+    if args.reference:
+        reference = [line.strip() for line in args.reference]
+
+    if len(reference) != len(source):
+        # Report the lengths in the order the message names them (they were swapped before).
+        sys.stderr.write('* FATAL: reference length (%d) != source length (%d)\n' % (len(reference), len(source)))
+        sys.exit(1)
+
+    # Read every system output file; each must be parallel to the source.
+    systems = []
+    system_names = []
+    for system in args.system:
+        system_name = os.path.basename(system.name)
+        system_names.append(system_name)
+        system_lines = [line.strip() for line in system]
+        systems.append(system_lines)
+
+        if len(system_lines) != len(source):
+            # The first count is this system's own length (it used to print the source length instead).
+            sys.stderr.write('* FATAL: system %s length (%d) != source length (%d)\n' % (system_name, len(system_lines), len(source)))
+            sys.exit(1)
+
+    system_hashes = [hashlib.sha1(x).hexdigest() for x in system_names]
+
+    # Remove sentences that are too long.
+    i = 0
+    while i < len(source):
+        if len(source[i].split()) > args.maxlen:
+            for system in [source,reference] + systems:
+                system.pop(i)
+        else:
+            i += 1
+
+    def dump_system(system_file, lines):
+        """Write `lines` to args.saveDir under system_file's base name, unless that file exists."""
+        outfile = os.path.join(args.saveDir, os.path.basename(system_file.name))
+        if os.path.exists(outfile):
+            return
+        sys.stderr.write('DUMPING TO %s\n' % (outfile))
+        with open(outfile, 'w') as out:
+            out.writelines('%s\n' % (line) for line in lines)
+
+    # Save corpora if requested and not already existing
+    if args.saveDir is not None:
+        if not os.path.exists(args.saveDir):
+            os.makedirs(args.saveDir)
+        dump_system(args.source, source)
+        dump_system(args.reference, reference)
+        for i,system in enumerate(args.system):
+            dump_system(system, systems[i])
+
+    random_blocks = random_from_range(len(source), args.numhits - args.redundancy, tuple_size = args.tasksperhit, sequential = args.sequential)
+    hits = []
+    for sentnos_tuple in random_blocks:
+
+        # Randomize the selection of systems
+        system_indexes = range(len(systems))
+        random.shuffle(system_indexes)
+        system_indexes = system_indexes[:args.systemspertask]
+
+        tasks = [RankingTask(id, source[id], reference[id], [system_names[sysid] for sysid in system_indexes], [systems[sysid][id] for sysid in system_indexes]) for id in sentnos_tuple]
+
+        # Randomly decide whether to replace one of the tasks with a random control.  That
+        # is, we roll a die to see whether to insert a control (determined by
+        # args.control_prob). If so, we randomly choose which task to replace, and then randomly
+        # choose one of the remaining controls to put there.
+        if len(controls):
+            if random.random() < args.control_prob:
+                tasks[random.randint(0, len(tasks)-1)] = controls.pop(random.randint(0,len(controls)-1))
+
+        # sentnos_str = ",".join([`x.id` for x in tasks])
+        sentnos_str = "-1"
+        hit = '  <hit block-id="%s" source-language="%s" target-language="%s">' % (sentnos_str, args.sourceLang, args.targetLang)
+        for task in tasks:
+            hit += task.xml()
+        hit += '\n  </hit>'
+
+        hits.append(hit)
+
+    # Now create redundant HITs
+    if args.redundancy > 0:
+        numbers = random_from_range(len(hits), args.redundancy, tuple_size = 1, sequential = False)
+
+        hits += [hits[x[0]] for x in numbers]
+
+    print '<hits>'
+    for hit in hits:
+        print hit
+    print '</hits>'

From 81ffe5edcff2c6082cdb5a18b8d97117b5aaeb38 Mon Sep 17 00:00:00 2001
From: Matt Post <post@cs.jhu.edu>
Date: Wed, 5 Mar 2014 16:40:07 -0500
Subject: [PATCH 2/2] Added code used to build controls

---
 scripts/build_controls.pl       |  27 +++++
 scripts/find-agreed-rankings.pl | 202 ++++++++++++++++++++++++++++++++
 2 files changed, 229 insertions(+)
 create mode 100755 scripts/build_controls.pl
 create mode 100755 scripts/find-agreed-rankings.pl

diff --git a/scripts/build_controls.pl b/scripts/build_controls.pl
new file mode 100755
index 0000000..99afe17
--- /dev/null
+++ b/scripts/build_controls.pl
@@ -0,0 +1,27 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+my %langs = (
+  German => 'de',
+  Russian => 'ru',
+  Spanish => 'es',
+  Czech => 'cs',
+  French => 'fr',
+);
+
+# Researcher dump file
+#my $dump = "~/Dropbox/research/WMT13/wmt13-export-20130604a.txt";
+my $dump = "/Users/post/Dropbox/research/WMT13/wmt13-export-20130630.txt";
+
+foreach my $lang (keys(%langs)) {
+  my $shortlang = $langs{$lang};
+
+  system("mkdir","-p","controls/$shortlang-en") unless -d "controls/$shortlang-en";
+  system("mkdir","-p","controls/en-$shortlang") unless -d "controls/en-$shortlang";
+
+  die "problem with $shortlang-en" if system("perl $ENV{APPRAISE}/scripts/find-agreed-rankings.pl $lang,English $dump ~/expts/wmt13/data/maxlen30/$shortlang-en/newstest2013-src.$shortlang ~/expts/wmt13/data/maxlen30/$shortlang-en/newstest2013-ref.en ~/expts/wmt13/data/maxlen30/$shortlang-en/newstest2013.$shortlang-en.> controls/$shortlang-en/controls.txt");
+
+  die "problem with en-$shortlang" if system("perl $ENV{APPRAISE}/scripts/find-agreed-rankings.pl English,$lang $dump ~/expts/wmt13/data/maxlen30/en-$shortlang/newstest2013-src.en ~/expts/wmt13/data/maxlen30/en-$shortlang/newstest2013-ref.$shortlang ~/expts/wmt13/data/maxlen30/en-$shortlang/newstest2013.en-$shortlang.> controls/en-$shortlang/controls.txt");
+}
diff --git a/scripts/find-agreed-rankings.pl b/scripts/find-agreed-rankings.pl
new file mode 100755
index 0000000..83242ff
--- /dev/null
+++ b/scripts/find-agreed-rankings.pl
@@ -0,0 +1,202 @@
+#!/usr/bin/perl
+
+# This script takes the manual judgment data and finds sentences with a high degree of agreement on
+# the rankings.  The input data is a summary of the ranking tasks, output by Omar's ranking analysis
+# tool in Maise, and having the following format:
+# 
+#   srclang,trglang,srcIndex,documentId,segmentId,judgeId,system1Number,system1Id,system2Number,system2Id,system3Number,system3Id,system4Number,system4Id,system5Number,system5Id,system1rank,system2rank,system3rank,system4rank,system5rank
+#
+# The first row is this header row.  From this, we compute a variety of statistics useful for
+# embedding controls within Maise.
+
+
+use strict;
+use warnings;
+
+if (@ARGV != 5) {
+  print "Usage: find-agreed-rankings.pl <LANG_PAIR> <RANK_FILE> <SOURCE_FILE> <REFERENCE_FILE> <SYSTEMS_PREFIX>\n";
+  exit;
+}
+my ($langpair,$ranking_file,$source_file,$ref_file,$systems_prefix) = @ARGV;
+
+#
+# Read in all the sentences
+#
+my %sentences = (
+	source => read_lines($source_file),
+	reference => read_lines($ref_file),
+);
+
+print STDERR "Found " . scalar(keys %{$sentences{source}}) . " source sentences in '$source_file'.\n";
+print STDERR "Found " . scalar(keys %{$sentences{reference}}) . " references in '$ref_file'.\n";
+
+if (scalar(keys %{$sentences{source}}) != scalar(keys %{$sentences{reference}})) {
+  print STDERR "* FATAL: source and reference sentence counts don't match\n";
+  exit 1;  # non-zero, so a caller that checks the exit status sees the failure
+}
+
+my @system_files = glob("$systems_prefix*");
+foreach my $file (@system_files) {
+	my $system = (split(/\//,$file))[-1];
+	$sentences{$system} = read_lines($file);
+	print STDERR "Found " . scalar(keys %{$sentences{$system}}) . " sentences for $system in '$file'.\n";
+}
+$sentences{_ref} = $sentences{reference};
+
+# 
+# Read in all the rankings.
+#
+open(RANK, '<', $ranking_file) or die "ranking_file?";  # parenthesised so that `or die` tests open() itself
+my $header = <RANK>;
+$header =~ s/\s+$//;  # strips "\n" and "\r\n" alike; chomp followed by s/\r\n//g left a stray "\r"
+my @columns = split(',', $header);
+
+# raw_ranks records, for each sentence, the number of times that system A was recorded as being
+# better than (= having a lower score than) system B.  The rankings are recorded as paired keys.
+my %raw_ranks;
+
+# this counts the number of lines matching the requested language pair.
+my $matching_lines = 0;
+
+# this stores the matching HITs as they are read in.  actually, they're not HITs, but ranking tasks.
+my %HITS;
+while (my $line = <RANK>) {
+  last if $line eq "";
+
+	# filter to just the language pair we care about
+	next unless $line =~ /^$langpair/;
+
+	# skip references
+  # next if $line =~ /_ref/;
+
+	$matching_lines++;
+
+  my %hit = build_hit($line);
+
+  # We only need one instance of each HIT, so enter one as the archetype
+  my $hitstr = "$hit{segmentId} $hit{system1Id} $hit{system2Id} $hit{system3Id} $hit{system4Id} $hit{system5Id}";
+  $HITS{$hitstr} = \%hit;
+
+  my @systems = ($hit{system1Id},$hit{system2Id},$hit{system3Id},$hit{system4Id},$hit{system5Id});
+  my @ranks   = ($hit{system1rank},$hit{system2rank},$hit{system3rank},$hit{system4rank},$hit{system5rank});
+	my $sentno = $hit{srcIndex};
+#  print "$sentkey " . join("  ", @systems) . "  " . join("-", @ranks) . $/;
+
+  # consider all pairs, mark a vote for each outranking
+  for (my $i = 0; $i < @systems; $i++) {
+		for (my $j = 0; $j < @systems; $j++) {
+			# a lower rank corresponds to a higher rating
+			if ($ranks[$i] < $ranks[$j]) {
+				$raw_ranks{$sentno}{$systems[$i],$systems[$j]}++;
+			}
+		}
+  }
+}
+close(RANK);
+
+if ($matching_lines == 0) {
+	print STDERR "* FATAL: Found no lines matching language pair '$langpair'\n";  # STDERR: STDOUT carries the controls output
+	exit 1;  # non-zero, so a caller that checks the exit status sees the failure
+}
+
+# score the entries so they can be sorted
+foreach my $hit (values(%HITS)) {
+  my @systems = ($hit->{system1Id},$hit->{system2Id},$hit->{system3Id},$hit->{system4Id},$hit->{system5Id});
+  my @ranks   = ($hit->{system1rank},$hit->{system2rank},$hit->{system3rank},$hit->{system4rank},$hit->{system5rank});
+	my $sentno = $hit->{srcIndex};
+
+  # Score the HIT using the summed counts over all HIT tokens of this type stored in raw_ranks
+	$hit->{score} = 0;
+	for my $i (0..4) {
+		for my $j (($i+1)..4) {
+			my $count1 = $raw_ranks{$sentno}{$systems[$i],$systems[$j]} || 0;
+			my $count2 = $raw_ranks{$sentno}{$systems[$j],$systems[$i]} || 0;
+			$hit->{score} += abs($count1 - $count2);
+		}
+	}
+}
+
+# now print everything out
+HIT: foreach my $hit (sort { $b->{score} <=> $a->{score} } values(%HITS)) {
+  my @systems = ($hit->{system1Id},$hit->{system2Id},$hit->{system3Id},$hit->{system4Id},$hit->{system5Id});
+  my @ranks   = ($hit->{system1rank},$hit->{system2rank},$hit->{system3rank},$hit->{system4rank},$hit->{system5rank});
+
+  my $langpair = "$hit->{srclang},$hit->{trglang}";
+  my $sentno   = $hit->{srcIndex};
+  my $sentkey  = "$langpair,$sentno";
+  # print "SENTKEY $sentkey " . join("  ", @systems) . "  " . join("-", @ranks) . $/;
+
+  # Skip this HIT if we don't have files for all the systems
+  foreach my $system (@systems) {
+    if (! defined $sentences{$system}) {
+      print STDERR "* SKIPPING HIT with missing system '$system'\n";
+      next HIT;
+    }
+  }
+
+	print "SENTENCE $sentno\n";
+	print "SOURCE $sentences{source}{$sentno}\n";
+	print "REFERENCE $sentences{reference}{$sentno}\n";
+	print "SYSTEMS " . join(" ", @systems) . "\n";
+	for my $i (0..4) {
+		my $sent = $sentences{$systems[$i]}{$sentno};
+		if (! defined $sent) {
+			print STDERR "* FATAL: no sentence $sentno for system $systems[$i]\n";
+			exit 1;  # non-zero, so a caller that checks the exit status sees the failure
+		}
+		print "$sent\n";
+	}
+
+	my $score = 0;
+   # consider all pairs, mark a vote for each outranking
+  for (my $i = 0; $i < @systems; $i++) {
+		for (my $j = 0; $j < @systems; $j++) {
+			if ($i == $j) {
+				print "- ";
+			} else {
+				my $count = $raw_ranks{$sentno}{$systems[$i],$systems[$j]} || 0;
+				print "$count ";
+			}
+		}
+		print "\n";
+	}
+}
+close(RANK);
+
+print STDERR "Printed statistics for $matching_lines systems\n";
+
+## SUBROUTINES #######################################################
+
+# Turn one CSV line of the ranking file into a hash keyed by the header's column names.
+sub build_hit {
+  my ($line) = @_;
+
+  $line =~ s/\s+$//;
+  my @tokens = split(',', $line);
+
+  if (scalar @tokens != scalar @columns) {
+    # Report on STDERR with a non-zero status: callers such as build_controls.pl redirect
+    # STDOUT into the controls file and detect failure only through the exit status.
+    print STDERR "* FATAL: wrong number of columns at line $.\n";
+    exit 1;
+  }
+
+  my %hit;
+  @hit{@columns} = @tokens;  # hash slice; both lists have the same length here
+  return %hit;
+}
+
+
+# Read $file and return a reference to a hash mapping 1-based line number to the chomped line.
+sub read_lines {
+	my ($file) = @_;
+	my %hash;
+	# Parenthesised open: in the bareword form, `or die` bound to $file and never fired on failure.
+	open(my $fh, '<', $file) or die "$file?";
+	while (my $line = <$fh>) {
+		chomp $line;
+		$hash{$.} = $line;
+	}
+	close $fh;
+	return \%hash;
+}