In [1]:
import csv
import json
import re
import uuid
import math
import numpy as np
import random
from datetime import date
from pathlib import Path

from collections import defaultdict as ddict


# Step 1: Download the original MASSIVE data

In [2]:
# MASSIVE data: https://github.com/alexa/massive

# !wget https://amazon-massive-nlu-dataset.s3.amazonaws.com/amazon-massive-dataset-1.1.tar.gz
# !tar -xvzf amazon-massive-dataset-1.1.tar.gz

# Update path to where you stored the MASSIVE dataset

data_path = Path('../data/1.1/data')

In [3]:
data = dict()

for path in data_path.iterdir():
    if '._' not in str(path):
    	with open(path, 'r') as fin:
    		for line in fin:
    			l = json.loads(line)

                # We work with the 1685 utterances in the QA split (ignoring other 18K examples in MASSIVE)
    			if l['scenario']=='qa':
    				if l['id'] not in data:
    					data[l['id']] = []
    				data[l['id']].append(l)

write_data = False

if write_data:

    with open('qa_all_langs_dict.json', 'w') as fout:
       json.dump(data, fout)

In [4]:
len(data)

1685

In [5]:
data['681']

[{'id': '681',
  'locale': 'sv-SE',
  'partition': 'train',
  'scenario': 'qa',
  'intent': 'qa_stock',
  'utt': 'håll mig uppdaterad om börskurser',
  'annot_utt': 'håll mig uppdaterad om [news_topic : börskurser]',
  'worker_id': '3',
  'slot_method': [{'slot': 'news_topic', 'method': 'translation'}],
  'judgments': [{'worker_id': '2',
    'intent_score': 1,
    'slots_score': 1,
    'grammar_score': 4,
    'spelling_score': 2,
    'language_identification': 'target'},
   {'worker_id': '0',
    'intent_score': 1,
    'slots_score': 1,
    'grammar_score': 4,
    'spelling_score': 2,
    'language_identification': 'target'},
   {'worker_id': '20',
    'intent_score': 1,
    'slots_score': 1,
    'grammar_score': 4,
    'spelling_score': 2,
    'language_identification': 'target'}]},
 {'id': '681',
  'locale': 'sw-KE',
  'partition': 'train',
  'scenario': 'qa',
  'intent': 'qa_stock',
  'utt': 'nisasishe kuhusu bei za soko la hisa',
  'annot_utt': 'nisasishe kuhusu [news_topic : bei z

In [6]:
# Get a look at the different types of questions in the QA split

intent_count = ddict(int)

for k, v in data.items():
    eng_item = v[0]
    intent_count[eng_item['intent']]+=1
    
print(intent_count)

defaultdict(<class 'int'>, {'qa_stock': 202, 'qa_factoid': 775, 'qa_definition': 379, 'qa_maths': 116, 'qa_currency': 213})


In [7]:
## getting sample from qa_factoid

qa_factoid_sample = list()

for k, v in data.items():
    for item in v:
        if item['locale']=='en-US':
            if item['intent']=='qa_factoid':
                if random.random() < 0.1:
                    qa_factoid_sample.append((item['utt'], item['id']))
        

In [8]:
qa_factoid_sample[0]

('how tall is a giraffe', '12952')

In [9]:
write_data = False

if write_data:

    with open('qa_factoid_sample.csv', 'w') as fout:
        csv_out=csv.writer(fout)
        csv_out.writerow(['utt', 'id'])
        for row in qa_factoid_sample:
            csv_out.writerow(row)

# Step 2: Get the MASSIVE-AMR data

In [10]:
# !git clone https://github.com/amazon-science/MASSIVE-AMR.git


In [11]:
# This file is plain AMRs: data/massive_amr.txt
# This file contains MASSIVE annotations for EN-only: data/massive_amr.jsonl

# import jsonlines

# Set path to the massive_amr.txt

amr_path = '../data/massive_amr.txt'

# Step 3: Process and validate AMRs

In [12]:
'''
 AMR Sentence Object
 From: <https://github.com/panx27/amr-reader>
'''

class Sentence(object):
    def __init__(self, sentid='', sent='', raw_amr='', comments='',
                 amr_nodes=dict(), graph=list()):
        self.sentid = sentid         # Sentence id
        self.sent = sent             # Sentence
        self.raw_amr = raw_amr       # Raw AMR
        self.comments = comments     # Comments
        self.amr_nodes = amr_nodes   # AMR ndoes table
        self.graph = graph           # Path of the whole graph
        self.amr_paths = dict()      # AMR paths
        self.named_entities = dict() # Named entities

    def __str__(self):
        return '%s%s\n' % (self.comments, self.raw_amr)


def amr_validator(raw_amr): # TODO: add more test cases
    '''
    AMR validator

    :param str raw_amr:
    :return bool:
    '''
    if raw_amr.count('(') == 0:
        return False
    if raw_amr.count(')') == 0:
        return False
    if raw_amr.count('(') != raw_amr.count(')'):
        return False
    return True

def main_process_amrs(raw_amrs):
    '''
    :param str raw_amrs: input raw amrs, separated by '\n'
    :return list res: Sentence objects
    '''
    res = []
    for i in re.split('\n\s*\n', raw_amrs):
        sent = re.search('::snt (.*?)\n', i)
        sent = sent.group(1) if sent else ''
        sentid = re.search('::id (.*?)\n', i)
        if sentid:
            sentid = sentid.group(1)
        else:
            sentid = uuid.uuid4()

        raw_amr = ''
        comments = ''
        for line in i.splitlines(True):
            if line.startswith('# '):
                comments += line
                continue

            # convert '( )' to '%28 %29' in :wiki
            m = re.search(':wiki\s\"(.+?)\"', line)
            if m:
                line = line.replace(m.group(1),
                                    urllib.parse.quote_plus(m.group(1)))

            # convert '( )' to '%28 %29' in :name
            m = re.findall('\"(\S+)\"', line)
            for i in m:
                if '(' in i or ')' in i:
                    line = line.replace(i, urllib.parse.quote_plus(i))
            raw_amr += line

        if not raw_amr:
            continue
        if not amr_validator(raw_amr):
            raise Exception('Invalid raw AMR: %s' % sentid)

        sent_obj = Sentence(sentid, sent, raw_amr, comments)
        res.append(sent_obj)

    return res

In [13]:
list_amrs = list()

with open(amr_path, 'r') as f:
    raw_amrs = f.read()
    res = main_process_amrs(raw_amrs)
    list_amrs += res

In [14]:
len(list_amrs)

1685

In [15]:
list_amrs[0].sent

'what are some updates about the stock market'

In [16]:
list_amrs[2].sentid

'15090'

In [17]:
# Getting the multilingual entities from the original MASSIVE data
# TODO: Confirm it's extracting all entities and entity annotations

def create_named_entity(name):

    # Preprocessing the entity annotations
    
    named_entity = '/ name '
    
    name = name.split()
    thisIdx = 1
    for idx, n in enumerate(name):
        if n not in ['the', 'this']:
            n = n.replace("'s", "")
            token = f':op{thisIdx} "{n.strip()}" '
            named_entity += token
            thisIdx += 1
      
    named_entity = named_entity.strip()
    named_entity += ')'
     
    return named_entity

def get_name_from_annotation(utt):
    
    entities = []
    
    entity_found = re.findall('\[(.*?)\]', utt)
    
    if entity_found:

        #entity = entity.group(1).split(':')[1]
        #entity = create_named_entity(entity.strip()) 
        
        for ef in entity_found:
            ef_split = ef.split(':')[1]
            ef_split = create_named_entity(ef_split.strip())
            ef_type = ef.split(':')[0].strip()
            entities.append((ef_type, ef_split))
         
    else:
        entities = [(0,'NONE')]
        
    return entities
        

In [18]:
# Making text-to-AMR mappings for all languages
# Collecting some statistics
# Making training, validation, and test splits
# Output: Standard txt files for AMR data, fields are: ::id, ::en_utt, ::annot_utt, ::snt (in L2) followed by AMR

snt_lens_tokens, snt_lens_chars = list(), list()
snt_lens_tokens_en, snt_lens_chars_en = list(), list()

train_path = Path('../data/amrs-massive-train.txt')
val_path = Path('../data/amrs-massive-val.txt')
test_path = Path('../data/amrs-massive-test.txt')

train, val, test = [], [], []

for idx, item in enumerate(list_amrs):
    sentid = item.sentid
    raw_amr = item.raw_amr
    
    all_elements = data[sentid]
    
    for element in all_elements:
        en_utt = ''
        if element['locale'] == 'en-US':
            en_utt = element['annot_utt']  
            
            snt_lens_tokens_en.append(len(en_utt.split()))
            snt_lens_chars_en.append(len(en_utt))
            break      
            
    en_entities = get_name_from_annotation(en_utt)
    
    for element in all_elements:
        
        utt = element['utt']
        
        snt_lens_tokens.append(len(utt.split()))
        snt_lens_chars.append(len(utt))
        
        annot_utt = element['annot_utt']
        sentid = element['id']
        locale = element['locale']
        
        other_entities = get_name_from_annotation(annot_utt)
        
        temp_raw_amr = raw_amr
        
        if 'definition_word' in annot_utt:
            # Questions about definitions never have annotations?
            continue
            
        else:
            
            en_entities = sorted(en_entities)
            other_entities = sorted(other_entities)
            for en_entity, ot_entity in zip(en_entities, other_entities):
                
                if en_entity[0]==ot_entity[0] and en_entity[1] != 'NONE' and ot_entity[1] != 'NONE':
                    #print(en_entity[1], ot_entity[1])
                    temp_raw_amr = temp_raw_amr.replace(en_entity[1], ot_entity[1])
                    #temp_raw_amr = temp_temp_raw_amr


        # Arbitrary choices! Decide on your own splits! 
        # Standard format for AMR txt files
        if idx < 60:
                val.append(f'# ::id {sentid}-{locale}\n')
                val.append('# ::en_utt '+ en_utt + '\n')
                val.append('# ::annot_utt '+ annot_utt + '\n')
                val.append('# ::snt '+ utt + '\n')
                val.append(temp_raw_amr)
                val.append('\n')
                val.append('\n')

        elif idx < 120:
                test.append(f'# ::id {sentid}-{locale}\n')
                test.append('# ::en_utt '+ en_utt + '\n')
                test.append('# ::annot_utt '+ annot_utt + '\n')
                test.append('# ::snt '+ utt + '\n')
                test.append(temp_raw_amr)
                test.append('\n')
                test.append('\n')

        else:
                train.append(f'# ::id {sentid}-{locale}\n')
                train.append('# ::en_utt '+ en_utt + '\n')
                train.append('# ::annot_utt '+ annot_utt + '\n')
                train.append('# ::snt '+ utt + '\n')
                train.append(temp_raw_amr)
                train.append('\n')
                train.append('\n')

write_data = False

if write_data:
    
    with open(train_path, 'w') as outfile:
            outfile.writelines(train)

    with open(val_path, 'w') as outfile:
            outfile.writelines(val)

    with open(test_path, 'w') as outfile:
            outfile.writelines(test)

In [19]:
train[950:980]

['\n',
 '\n',
 '# ::id 13965-ar-SA\n',
 '# ::en_utt what is the exchange rate between [currency_name : u. s. d.] and [currency_name : euro]\n',
 '# ::annot_utt كم سعر صرف بين [currency_name : الريال السعودي] و [currency_name : الدينار الأردني]\n',
 '# ::snt كم سعر صرف بين الريال السعودي و الدينار الأردني\n',
 '(r / rate-01\n      :ARG1 (e / exchange-01\n            :ARG1 (c / currency :name (n / name :op1 "الريال" :op2 "السعودي"))\n            :ARG3 (c2 / currency :name (n2 / name :op1 "الدينار" :op2 "الأردني")))\n      :ARG2 (a / amr-unknown))',
 '\n',
 '\n',
 '# ::id 13965-ka-GE\n',
 '# ::en_utt what is the exchange rate between [currency_name : u. s. d.] and [currency_name : euro]\n',
 '# ::annot_utt რა არის გაცვლითი კურსი [currency_name : ამერიკულ დოლარსა] და [currency_name : ევროს] შორის\n',
 '# ::snt რა არის გაცვლითი კურსი ამერიკულ დოლარსა და ევროს შორის\n',
 '(r / rate-01\n      :ARG1 (e / exchange-01\n            :ARG1 (c / currency :name (n / name :op1 "ევროს"))\n            :

# Step 4: A quick look at the data

In [20]:
# All languages (tokenization by splitting on white space)
print(f'Mean len sent by tokens: {np.mean(snt_lens_tokens):.2f}')
print(f'Mean len sent by chars: {np.mean(snt_lens_chars):.2f}')
print()
print(f'Std len sent by tokens: {np.std(snt_lens_tokens):.2f}')
print(f'Std len sent by chars: {np.std(snt_lens_chars):.2f}')

Mean len sent by tokens: 5.37
Mean len sent by chars: 32.01

Std len sent by tokens: 2.98
Std len sent by chars: 16.62


In [21]:
# English only data
print(f'Mean len EN sent by tokens: {np.mean(snt_lens_tokens_en):.2f}')
print(f'Mean len EN sent by chars: {np.mean(snt_lens_chars_en):.2f}')

Mean len EN sent by tokens: 8.16
Mean len EN sent by chars: 47.23


In [22]:
def levenshteinDistance(s1, s2):
    if len(s1) > len(s2):
        s1, s2 = s2, s1

    distances = range(len(s1) + 1)
    for i2, c2 in enumerate(s2):
        distances_ = [i2+1]
        for i1, c1 in enumerate(s1):
            if c1 == c2:
                distances_.append(distances[i1])
            else:
                distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
        distances = distances_
    return distances[-1]

In [23]:
s1 = '[person : martin luther king junior]'
s2 = '[person : အောင်ဆန်းစုကြည်]'

levenshteinDistance(s1, s2)

match = re.search(r'\[(.*)\]', s2)
math.ceil(len(match.group(0))/2)

13

In [24]:
# Sentences were translated and annotated by professional annotators, see original Massive paper (FitzGerald et al, 2023)
# Annotation in some languages is incomplete.
# How many entity annotations actually exist in the original MASSIVE data? 
# Need to estimate, looking for strings that are very different from English tokens 

cnt_entities, cnt_localized_entities = 0, 0

for i in range(len(list_amrs)):
    
    example_utt_1, example_utt_2 = [], []

    #rand_example = random.sample(list_amrs, 1)[0]
    
    rand_example = list_amrs[i]

    sentid = rand_example.sentid

    all_elements = data[sentid]

    for element in all_elements:
        en_utt = ''
        if element['locale'] == 'en-US':
            en_utt = element['annot_utt']  

            example_utt_1.append(('en-US', en_utt))
            
            break 

    for element in all_elements:

        annot_utt = element['annot_utt']

        utt = element['utt']

        locale = element['locale']

        example_utt_1.append((locale, annot_utt))
        
        if '[' in annot_utt:
            cnt_entities+=1
            if '[' in annot_utt and '[' in en_utt:
                
                annot_ent = re.search(r'\[(.*)\]', annot_utt).group(0)
                en_ent = re.search(r'\[(.*)\]', en_utt).group(0)
                
                if len(en_utt)<len(annot_utt):
                    short_utt = en_utt
                else:
                    short_utt = annot_utt
                    
                threshold_len = math.ceil(len(short_utt) * 0.2)
                
                if levenshteinDistance(annot_ent, en_ent)>threshold_len:
                    
                    cnt_localized_entities += 1
            
                  
print(cnt_entities)           
print(cnt_localized_entities)       
          

60031
28076


In [25]:
example_utt_1

[('en-US', 'please define word'),
 ('sv-SE', 'definiera ord'),
 ('sw-KE', 'tafadhali fafanua jina'),
 ('pt-PT', 'por favor definir palavra'),
 ('pl-PL', 'podaj proszę definicję słowa'),
 ('he-IL', 'בבקשה תגדירי מילה'),
 ('jv-ID', 'tulung nemtokake tembung'),
 ('th-TH', 'ขอคำนิยามของคำ'),
 ('ta-IN', 'தயவுசெய்து வார்த்தையை வரையறுக்கவும்'),
 ('hu-HU', 'kérlek definiáld a szót'),
 ('km-KH', 'សូមកំណត់ពាក្យ'),
 ('fa-IR', 'لطفا کلمه را تعریف کنید'),
 ('cy-GB', 'os gwelwch yn dda diffiniwch y gair'),
 ('af-ZA', 'definieer asseblief woord'),
 ('ro-RO', 'te rog defineste cuvantul'),
 ('fi-FI', 'määrittele sana kiitos'),
 ('hi-IN', 'कृपया शब्द को परिभाषित करें'),
 ('bn-BD', 'শব্দ সংজ্ঞায়িত করুন'),
 ('da-DK', 'definer ord tak'),
 ('my-MM', 'ကျေးဇူးပြု၍စကားလုံးကို အဓိပ္ပါယ်ဖွင့်ပါ'),
 ('zh-CN', '请定义这个词'),
 ('ml-IN', 'ദയവായി പദം നിർവ്വചിക്കുക'),
 ('en-US', 'please define word'),
 ('de-DE', 'bitte definiere wort'),
 ('am-ET', 'እባክዎ ቃል ተርጉሙ'),
 ('nl-NL', 'definieer woord a. u. b.'),
 ('lv-LV', 'lūdzu

In [26]:
example_utt_1, example_utt_2 = [], []

rand_example = random.sample(list_amrs, 1)[0]

sentid = rand_example.sentid

all_elements = data[sentid]

for element in all_elements:
    en_utt = ''
    if element['locale'] == 'en-US':
        en_utt = element['annot_utt']  

        example_utt_1.append(('en-US', en_utt))

        break 
        

In [27]:
example_utt_1

[('en-US', '[artist_name : michael jackson] family background')]

In [28]:
for i in range(len(list_amrs)):
    print(list_amrs[i].sent)

what are some updates about the stock market
definition of velocity
please look up exchange between us and mexico
can you describe to me what a pineapple looks like
what is the dollar against the pound
what does potato mean
how much is the british pound
what is the exchange rate of u. s. d. to cdn
how many people live in san francisco
please search for this word
which is the best smartphone in two thousand and seventeen
who are the greatest guitarists of all time
tell me all about hurricane
describe death
what's the exchange rate between the us dollar and the euro
what is a fjord
olly what does snafu mean
does spain share its border with morrocco
what's the rate between the u. s. d. and ukraine's currency
what's a parsec
please define forensic
how old was elvis when he died
currency to currency rate
what is the us dollar equivalent of one euro
what is a chair
how much is the average house
what causes in burmuda triangle
how old is barack obaba
can you tell me what an chaise is
what is 