In [1]:
import pandas as pd
from collections import Counter

import json

In [2]:
# Patches for "bad source": convert the tab-separated spreadsheet export
# into a parquet file so it can be loaded alongside the v1 data.
_df = pd.read_table('MQM X - Sheet1.tsv')
_df.to_parquet('mqm-v11.parquet')

In [3]:
# df10: the v1 annotations; df11: the patch sheet written out as parquet.
df10, df11 = (
    pd.read_parquet(parquet_path)
    for parquet_path in ('mqm-v1.parquet', 'mqm-v11.parquet')
)

In [4]:
import xxhash

def hashstr(s):
    """Return the xxh64 digest of ``s`` as an integer.

    The seed is fixed at 42, so the same input always yields the same
    digest and hashes can be compared across data frames.
    """
    hasher = xxhash.xxh64(s, seed=42)
    return hasher.intdigest()

In [5]:
# Per-column hashes of the source and target text of df10.
for hash_col, text_col in (
    ('Source xxhash', 'German Source'),
    ('Target xxhash', 'English Translation'),
):
    df10[hash_col] = df10[text_col].apply(hashstr)

# Composite key: hash of "<source>\n<target>", identifying a sentence pair.
pair_text = df10['German Source'].astype(str) + "\n" + df10['English Translation'].astype(str)
df10['Source\nTarget xxhash'] = pair_text.apply(hashstr)

In [6]:
# Per-column hashes of the source and target text of df11 (the patch sheet).
for hash_col, text_col in (
    ('Source xxhash', 'German Source'),
    ('Target xxhash', 'English Translation'),
):
    df11[hash_col] = df11[text_col].apply(hashstr)

# Composite key: hash of "<source>\n<target>", identifying a sentence pair.
pair_text = df11['German Source'].astype(str) + "\n" + df11['English Translation'].astype(str)
df11['Source\nTarget xxhash'] = pair_text.apply(hashstr)

# Every row of the patch sheet is assigned annotator id 1.
df11['Annotator'] = 1

In [7]:
# Composite (source + target) hashes of every sentence pair in the patch sheet.
bad_source_patch = set(df11['Source\nTarget xxhash'])

# Raw annotator label -> canonical label (fixes typos, trailing spaces, casing).
# Every canonical label is also a key, so applying the map to an
# already-normalised column is a no-op.  Without the identity entry for
# 'Bad Source - Others', a second application (or re-running this cell)
# would turn that label into NaN and the rows would be lost downstream.
error_label_map = {'Adequacy':'Adequacy',
 'Adequacy ':'Adequacy',
 'Bad Source':'Bad Source',
 'Bad Source ':'Bad Source',
 'Bad source':'Bad Source',
 'Bad Source - Bad Encoding': 'Bad Source - Bad Encoding',
 'Bad Source - No meaning':'Bad Source - No meaning',
 'Bad Source - Not German':'Bad Source - Not German',
 'Bad Source - Ungrammatical German':  'Bad Source - Ungrammatical German',
 'Bad Source - others': 'Bad Source - Others',
 'Bad Source - Others': 'Bad Source - Others',
 'Fleuncy':'Fluency',
 'Fluency':'Fluency',
 'Fluency ':'Fluency',
 'None':'None',
 'Number':'Number',
 'Numbers':'Number',
 'Other':'Other'
}

# Labels that are not keys of the map become NaN here.
df10['Error'] = df10['Error'].map(error_label_map)

# Composite hash -> 'Error' of the FIRST patch row carrying that hash.
# Built once so the loop below does a dict lookup instead of re-filtering
# df11 for every df10 row.
patch_error_by_hash = {}
for pair_hash, patch_error in zip(df11['Source\nTarget xxhash'], df11['Error']):
    patch_error_by_hash.setdefault(pair_hash, patch_error)

lol = []       # one plain-dict record per df10 row, patched where applicable
num_rows = []  # df10 index labels, in iteration order

for idx, row in df10.iterrows():
    num_rows.append(idx)
    # The JSON round-trip turns the row into plain Python values
    # (missing values come back as None).
    new_row = json.loads(row.to_json())
    pair_hash = row['Source\nTarget xxhash']
    if pair_hash in patch_error_by_hash:
        # Patched rows carry the raw (un-normalised) patch label as a string;
        # error_label_map is applied to the assembled frame afterwards.
        # NOTE(review): a missing patch label becomes the string 'nan',
        # which is not a key of error_label_map — confirm this is intended.
        new_row['Error'] = str(patch_error_by_hash[pair_hash])
    lol.append(new_row)


In [8]:
# Sanity check: the loop appends exactly one record per df10 row.
len(lol)

7929

In [9]:
# Assemble the patched records (a list of dicts) into a new data frame.
df20 = pd.DataFrame(lol)

In [10]:
# Normalise the labels on the assembled frame: patched rows still carry the
# raw label text taken from df11. Values that are not keys of
# error_label_map become NaN.
df20['Error'] = df20['Error'].map(error_label_map)

In [11]:
# Label distribution before filtering.
Counter(df20['Error'])

Counter({'Bad Source - No meaning': 1517,
         'Adequacy': 1438,
         'None': 4171,
         'Bad Source - Ungrammatical German': 318,
         'Fluency': 208,
         'Bad Source - Bad Encoding': 68,
         'Bad Source - Not German': 160,
         'Bad Source - Others': 3,
         'Bad Source': 24,
         'Number': 18,
         'Other': 4})

In [12]:
# Labels to keep. Rows whose 'Error' is anything else (for example the
# generic 'Bad Source' and 'Other' labels, or a missing value) are dropped.
want_errors = {
    'Adequacy',
    'Bad Source - Bad Encoding',
    'Bad Source - No meaning',
    'Bad Source - Not German',
    'Bad Source - Others',
    'Bad Source - Ungrammatical German',
    'Fluency',
    'None',
    'Number',
}

keep_mask = df20['Error'].isin(want_errors)
df20 = df20.loc[keep_mask]

In [13]:
# Label distribution after filtering to want_errors.
Counter(df20['Error'])

Counter({'Bad Source - No meaning': 1517,
         'Adequacy': 1438,
         'None': 4171,
         'Bad Source - Ungrammatical German': 318,
         'Fluency': 208,
         'Bad Source - Bad Encoding': 68,
         'Bad Source - Not German': 160,
         'Bad Source - Others': 3,
         'Number': 18})

In [14]:
# Persist the patched and filtered annotations.
df20.to_parquet('mqm-v20.parquet')

In [15]:
# Read the file back, replacing the in-memory frame with what was written.
df20 = pd.read_parquet('mqm-v20.parquet')

In [16]:
# Display the reloaded frame.
df20

Unnamed: 0,Annotator,German Source,English Translation,Error,Degree,has_error,Source xxhash,Target xxhash,Source\nTarget xxhash
0,3,Dove Duschgel von Schönheit Seide wertvolle 70...,Dove shower gel from beauty silk valuable 700 ...,Bad Source - No meaning,Major,True,3696885317974569719,15650179344116872231,14199460433498029932
1,3,Dr Neons sind super bright Color Coated Saiten...,Dr Neons are super bright color coated strings...,Adequacy,Minor,True,10344410422070655780,2498529617461949639,7168514186384691906
2,3,EK Water Blocks 38300469984538.5W/m · K Hörer ...,EK Water Blocks 38300469984538.5W/m · K earpho...,Bad Source - No meaning,Major,True,2783690366837936422,2771310005695235641,3068944554127270805
3,3,Edelstahl Harz,stainless steel resin,,,False,16711829727213546928,12853883588113187030,16689769054036714384
4,3,"Eine lustige Glocke mit Aufgabenkarten, die Ih...",A funny bell with task cards that brings your ...,Bad Source - No meaning,Major,True,12113623277648660465,6563401889861335005,12291077103302061713
...,...,...,...,...,...,...,...,...,...
7924,2,Dein Lieblingsfigur aus dem Videospiel Maincra...,Your favorite character from the Maincraft vid...,,,False,16521102793897854509,14933572471653706979,10560896453293272534
7925,2,Der Boden ist nicht glasiert und hat einen gew...,The floor is not glazed and has a certain anti...,Bad Source - Bad Encoding,Minor,True,9757103257916715282,11362972136126883557,8853681567271631259
7926,2,Der Funky Zoo von Marino Degano hat jetzt sein...,Marino Degano's Funky Zoo now has its own Jura...,,,False,7592128598500749974,10488522431404227725,9124445207112090276
7927,2,Der Swordfish Scribbler Batteriespitzer ist fü...,The Swordfish Scribbler battery sharpener is s...,Bad Source - No meaning,Major,True,8662106083923634055,9183167253751000107,16899085224263822792
