In [1]:
import pandas as pd
from collections import Counter
import numpy as np

In [2]:
# Load the tab-separated input table that every later cell works from.
df = pd.read_csv("emma.tsv", sep='\t')

In [3]:
# Tally how many rows share each (German source, English translation) pair.
# Zipping the two columns avoids iterrows(), which materialises a whole
# Series per row just to read two cells from it.
de_en = Counter(zip(df['German Source'], df['English Translation']))

In [4]:
# (source, translation) pairs that occur on more than one row.
duplicated = {pair for pair, n_rows in de_en.items() if n_rows > 1}

In [5]:
# Partition the rows by whether their (source, translation) pair repeats:
# rows of repeated pairs are collected in df_iaa, all other rows in df_unique.
# Both stay plain lists of row Series.
df_iaa, df_unique = [], []

for _, record in df.iterrows():
    pair = (record['German Source'], record['English Translation'])
    bucket = df_iaa if pair in duplicated else df_unique
    bucket.append(record)

In [6]:
# Rebuild a frame from the rows whose (source, translation) pair is unique.
_df = pd.DataFrame(df_unique)
# Missing cells become the literal string 'None' so the checks below can use
# plain string equality.
_df = _df.fillna('None')

# A row counts as erroneous exactly when its Error cell is not 'None'.
# Computed column-wise instead of with a per-row iterrows() loop.
has_error = (_df['Error'] != 'None').tolist()
_df['has_error'] = has_error

# Remove erroneous lines: an Error label is present but Degree is 'None'.
is_inconsistent = (_df['Error'] != 'None') & (_df['Degree'] == 'None')
df_clean = _df.loc[~is_inconsistent]

# NOTE: this snapshot is written before the Error/Degree spellings are
# normalised; a later cell writes the normalised frame to the same path.
df_clean.to_parquet('mqm-v1.parquet')

In [7]:
# Recompute df_clean from _df with the same rule as the previous cell, but
# without writing anything to disk.
# Remove erroneous lines: an Error label is present but Degree is 'None'.
# A boolean mask replaces the per-row iterrows() loop.
is_inconsistent = (_df['Error'] != 'None') & (_df['Degree'] == 'None')
df_clean = _df.loc[~is_inconsistent]

In [8]:
# Display the filtered frame (labels are not yet normalised at this point).
df_clean

Unnamed: 0,Annotator,German Source,English Translation,Error,Degree,has_error
2456,3,Dove Duschgel von Schönheit Seide wertvolle 70...,Dove shower gel from beauty silk valuable 700 ...,Bad Source,Major,True
2457,3,Dr Neons sind super bright Color Coated Saiten...,Dr Neons are super bright color coated strings...,Adequacy,Minor,True
2458,3,EK Water Blocks 38300469984538.5W/m · K Hörer ...,EK Water Blocks 38300469984538.5W/m · K earpho...,Bad Source,Major,True
2459,3,Edelstahl Harz,stainless steel resin,,,False
2460,3,"Eine lustige Glocke mit Aufgabenkarten, die Ih...",A funny bell with task cards that brings your ...,Bad Source,Major,True
...,...,...,...,...,...,...
10788,2,Dein Lieblingsfigur aus dem Videospiel Maincra...,Your favorite character from the Maincraft vid...,,,False
10789,2,Der Boden ist nicht glasiert und hat einen gew...,The floor is not glazed and has a certain anti...,Bad Source,Minor,True
10790,2,Der Funky Zoo von Marino Degano hat jetzt sein...,Marino Degano's Funky Zoo now has its own Jura...,,,False
10791,2,Der Swordfish Scribbler Batteriespitzer ist fü...,The Swordfish Scribbler battery sharpener is s...,Bad Source,Major,True


In [9]:
# Whitespace-delimited token count summed over every German source segment.
sum(
    len(segment.split())
    for segment in df_clean['German Source']
)

167443

In [10]:
# Fraction of the remaining rows whose has_error flag is True.
sum(df_clean['has_error']) / len(df_clean)

0.48051456678017407

In [11]:
# Frequency of each raw Error label; spelling variants show up as separate keys.
Counter(df_clean['Error'])

Counter({'Bad Source': 2177,
         'Adequacy': 1345,
         'None': 4119,
         'Fluency': 176,
         'Bad source': 12,
         'Number': 11,
         'Numbers': 7,
         'Adequacy ': 50,
         'Bad Source ': 21,
         'Fluency ': 5,
         'Fleuncy': 2,
         'Other': 4})

In [12]:
# Frequency of each raw Degree label; spelling variants show up as separate keys.
Counter(df_clean['Degree'])

Counter({'Major': 1933,
         'Minor': 1581,
         'None': 4119,
         'Critical': 287,
         ' Minor': 1,
         'Cricital': 3,
         'Circal': 1,
         'Critical ': 1,
         'MInor': 3})

In [13]:
# Map each non-canonical Error label (case slip, trailing space, typo,
# plural form) to its canonical spelling.
label_correction = {
    'Bad source': 'Bad Source',
    'Bad Source ': 'Bad Source',
    'Adequacy ': 'Adequacy',
    'Fluency ': 'Fluency',
    'Fleuncy': 'Fluency',
    'Numbers': 'Number',
}

# Same idea for the Degree column. Every variant is listed exactly once:
# a repeated key in a dict literal is silently collapsed to the last entry,
# so a duplicate line would only be dead weight.
sev_correction = {
    ' Minor': 'Minor',
    'Cricital': 'Critical',
    'Circal': 'Critical',
    'Critical ': 'Critical',
    'MInor': 'Minor',
}

# Nested-dict form of replace(): each inner mapping is applied only to the
# column named by its outer key, and only to exactly matching cell values.
df_clean = df_clean.replace({"Error": label_correction, 'Degree': sev_correction})

In [14]:
# Re-count the Error labels after the spelling corrections were applied.
Counter(df_clean['Error'])

Counter({'Bad Source': 2210,
         'Adequacy': 1395,
         'None': 4119,
         'Fluency': 183,
         'Number': 18,
         'Other': 4})

In [15]:
# Re-count the Degree labels after the spelling corrections were applied.
Counter(df_clean['Degree'])

Counter({'Major': 1933, 'Minor': 1585, 'None': 4119, 'Critical': 292})

In [16]:
# Export only the rows labelled 'Bad Source' as a tab-separated file,
# leaving the index out of the output.
is_bad_source = df_clean['Error'] == 'Bad Source'
df_clean[is_bad_source].to_csv('emma-bad-source.tsv', sep='\t', index=False)

In [17]:
# Same Error tally as above; the export in between only read df_clean.
Counter(df_clean['Error'])

Counter({'Bad Source': 2210,
         'Adequacy': 1395,
         'None': 4119,
         'Fluency': 183,
         'Number': 18,
         'Other': 4})

In [18]:
# Same Degree tally as above; the export in between only read df_clean.
Counter(df_clean['Degree'])

Counter({'Major': 1933, 'Minor': 1585, 'None': 4119, 'Critical': 292})

In [19]:
# Persist the label-normalised frame. An earlier cell already wrote to this
# same path, so this call overwrites that file.
df_clean.to_parquet('mqm-v1.parquet')

In [20]:
# Display the final frame that was just written to disk.
df_clean

Unnamed: 0,Annotator,German Source,English Translation,Error,Degree,has_error
2456,3,Dove Duschgel von Schönheit Seide wertvolle 70...,Dove shower gel from beauty silk valuable 700 ...,Bad Source,Major,True
2457,3,Dr Neons sind super bright Color Coated Saiten...,Dr Neons are super bright color coated strings...,Adequacy,Minor,True
2458,3,EK Water Blocks 38300469984538.5W/m · K Hörer ...,EK Water Blocks 38300469984538.5W/m · K earpho...,Bad Source,Major,True
2459,3,Edelstahl Harz,stainless steel resin,,,False
2460,3,"Eine lustige Glocke mit Aufgabenkarten, die Ih...",A funny bell with task cards that brings your ...,Bad Source,Major,True
...,...,...,...,...,...,...
10788,2,Dein Lieblingsfigur aus dem Videospiel Maincra...,Your favorite character from the Maincraft vid...,,,False
10789,2,Der Boden ist nicht glasiert und hat einen gew...,The floor is not glazed and has a certain anti...,Bad Source,Minor,True
10790,2,Der Funky Zoo von Marino Degano hat jetzt sein...,Marino Degano's Funky Zoo now has its own Jura...,,,False
10791,2,Der Swordfish Scribbler Batteriespitzer ist fü...,The Swordfish Scribbler battery sharpener is s...,Bad Source,Major,True
