# Read XML files

**TODO** - Read and convert the remaining SemEval 2016 files (only subtask SB1 is handled in the 2016 section below)

# 2014

In [1]:
import xml.etree.ElementTree as et
import pandas as pd

def read_semeval2014_restaurants(xml_file):
    """Parse a SemEval 2014 restaurants XML file into one row per aspect term.

    Every ``<aspectTerm>`` found under a ``<sentence>`` becomes one row made of
    the term's own XML attributes plus the sentence id, the sentence text and
    the sentence-level aspect categories. Sentences without any
    ``<aspectTerm>`` contribute no rows.

    Parameters
    ----------
    xml_file : str or file object
        Anything accepted by ``xml.etree.ElementTree.parse``.

    Returns
    -------
    pandas.DataFrame
        Columns ``id, text, term, from, to, polarity, category,
        category_polarity`` in that order. ``category`` and
        ``category_polarity`` hold the sentence's aspect categories and their
        polarities joined with ``', '``. A column whose attribute is absent
        from every row is filled with NaN; a file without aspect terms yields
        an empty frame that still has all eight columns.

    Raises
    ------
    KeyError
        If a sentence has no ``id`` attribute, or an ``<aspectCategory>``
        lacks ``category`` or ``polarity``.
    """
    columns = ['id', 'text', 'term', 'from', 'to',
               'polarity', 'category', 'category_polarity']

    xroot = et.parse(xml_file).getroot()

    rows = []
    for sentence in xroot.findall('.//sentence'):
        text = sentence.find('.//text').text
        sent_id = sentence.attrib['id']

        # Categories are annotated per sentence, so they are flattened to a
        # single string and repeated on every aspect-term row of the sentence.
        asp_cats = sentence.findall('.//aspectCategory')
        cats = ', '.join(asp_cat.attrib['category'] for asp_cat in asp_cats)
        pols = ', '.join(asp_cat.attrib['polarity'] for asp_cat in asp_cats)

        for asp_term in sentence.findall('.//aspectTerm'):
            # Copy the attributes so the parsed tree is not modified.
            row = dict(asp_term.attrib)
            row['text'] = text
            row['id'] = sent_id
            row['category'] = cats
            row['category_polarity'] = pols
            rows.append(row)

    # reindex (unlike .loc with a label list) tolerates missing columns, so an
    # empty or partially annotated file does not raise KeyError.
    return pd.DataFrame(rows).reindex(columns=columns)


def read_semeval2014_laptops(xml_file):
    """Parse a SemEval 2014 laptops XML file into one row per aspect term.

    Every ``<aspectTerm>`` found under a ``<sentence>`` becomes one row made of
    the term's own XML attributes plus the sentence id and sentence text.
    Sentences without any ``<aspectTerm>`` contribute no rows.

    Parameters
    ----------
    xml_file : str or file object
        Anything accepted by ``xml.etree.ElementTree.parse``.

    Returns
    -------
    pandas.DataFrame
        Columns ``id, text, term, from, to, polarity`` in that order. A column
        whose attribute is absent from every row is filled with NaN; a file
        without aspect terms yields an empty frame that still has all six
        columns.

    Raises
    ------
    KeyError
        If a sentence has no ``id`` attribute.
    """
    columns = ['id', 'text', 'term', 'from', 'to', 'polarity']

    xroot = et.parse(xml_file).getroot()

    rows = []
    for sentence in xroot.findall('.//sentence'):
        text = sentence.find('.//text').text
        sent_id = sentence.attrib['id']

        for asp_term in sentence.findall('.//aspectTerm'):
            # Copy the attributes so the parsed tree is not modified.
            row = dict(asp_term.attrib)
            row['text'] = text
            row['id'] = sent_id
            rows.append(row)

    # reindex (unlike .loc with a label list) tolerates missing columns, so an
    # empty or partially annotated file does not raise KeyError.
    return pd.DataFrame(rows).reindex(columns=columns)

In [3]:
# Sanity check: parse the laptops training file and display the resulting frame.
read_semeval2014_laptops('../data/raw/SemEval2014/laptops_train.xml')

Unnamed: 0,id,text,term,from,to,polarity
0,2339,I charge it at night and skip taking the cord ...,cord,41,45,neutral
1,2339,I charge it at night and skip taking the cord ...,battery life,74,86,positive
2,1316,The tech guy then said the service center does...,service center,27,41,negative
3,1316,The tech guy then said the service center does...,"""sales"" team",109,121,negative
4,1316,The tech guy then said the service center does...,tech guy,4,12,neutral
...,...,...,...,...,...,...
2368,2272,We also use Paralles so we can run virtual mac...,Windows Server Enterprise 2003,104,134,neutral
2369,2272,We also use Paralles so we can run virtual mac...,Windows Server 2008 Enterprise,140,170,neutral
2370,848,"How Toshiba handles the repair seems to vary, ...",repair,24,30,conflict
2371,848,"How Toshiba handles the repair seems to vary, ...",repair,130,136,positive


# Convert the XML files to CSV

In [3]:
# Path templates for the SemEval 2014 files. The {domain}, {subset} and
# {extension} placeholders are filled in with str.format by the conversion
# loop; the "_v2" variant is only used there for the train subset.
DATA_FILE = "../data/raw/SemEval2014/{domain}_{subset}.{extension}"
DATA_FILE_v2 = "../data/raw/SemEval2014/{domain}_{subset}_v2.{extension}"

In [4]:
# Each domain has its own reader because only the restaurants files carry
# aspect categories.
readers_2014 = {
    'restaurants': read_semeval2014_restaurants,
    'laptops': read_semeval2014_laptops,
}

for domain in ['restaurants', 'laptops']:
    for subset in ['train', 'test']:
        # The train subset is converted from both the original and the "_v2" file.
        DATA_FILES = [DATA_FILE, DATA_FILE_v2] if subset == 'train' else [DATA_FILE]

        for data_file in DATA_FILES:
            # The CSV is written next to its XML source: same template, other extension.
            xml_file = data_file.format(domain=domain, subset=subset, extension='xml')
            out_file = data_file.format(domain=domain, subset=subset, extension='csv')

            readers_2014[domain](xml_file).to_csv(out_file, index=False)


    
    

# 2016

In [29]:
# Path templates for the SemEval 2016 train and gold test files. The {domain},
# {subtask} and {extension} placeholders are filled in with str.format.
DATA_FILE_TRAIN = "../data/raw/SemEval2016/ABSA16_{domain}_train_{subtask}.{extension}"
DATA_FILE_TEST = "../data/raw/SemEval2016/EN_{domain}_{subtask}_test_gold.{extension}"

In [30]:
def read_semeval2016_SB1(xml_file, domain):
    """Parse a SemEval 2016 SB1 XML file into one row per ``<Opinion>``.

    Every ``<Opinion>`` found under a ``<sentence>`` becomes one row made of
    the opinion's own XML attributes plus the sentence id, the sentence text
    and a review id ``rid`` (the part of the sentence id before the first
    ``':'``). Sentences without opinions contribute no rows.

    Parameters
    ----------
    xml_file : str or file object
        Anything accepted by ``xml.etree.ElementTree.parse``.
    domain : {'restaurants', 'laptops'}
        Selects the output columns. For ``'restaurants'`` the ``target``
        attribute is exposed as column ``term``.

    Returns
    -------
    pandas.DataFrame
        ``rid, id, text, term, from, to, category, polarity`` for restaurants;
        ``rid, id, text, category, polarity`` for laptops. A column whose
        attribute is absent from every row is filled with NaN; a file without
        opinions yields an empty frame that still has all expected columns.

    Raises
    ------
    ValueError
        If ``domain`` is not one of the supported values.
    KeyError
        If a sentence has no ``id`` attribute.
    """
    # Validate up front: previously an unknown domain surfaced only after
    # parsing, as an UnboundLocalError on the column list.
    if domain == 'restaurants':
        cols = ['rid', 'id', 'text', 'term', 'from', 'to', 'category', 'polarity']
    elif domain == 'laptops':
        cols = ['rid', 'id', 'text', 'category', 'polarity']
    else:
        raise ValueError(
            "domain must be 'restaurants' or 'laptops', got {!r}".format(domain)
        )

    xroot = et.parse(xml_file).getroot()

    rows = []
    for sentence in xroot.findall('.//sentence'):
        text = sentence.find('.//text').text
        sent_id = sentence.attrib['id']
        review_id = sent_id.split(':')[0]

        for opinion in sentence.findall('.//Opinion'):
            # Copy the attributes so the parsed tree is not modified.
            row = dict(opinion.attrib)
            row['text'] = text
            row['id'] = sent_id
            row['rid'] = review_id
            rows.append(row)

    df = pd.DataFrame(rows)
    if domain == 'restaurants':
        df = df.rename(columns={'target': 'term'})

    # reindex (unlike .loc with a label list) tolerates missing columns, so an
    # empty or partially annotated file does not raise KeyError.
    return df.reindex(columns=cols)

In [31]:
domains = ['restaurants', 'laptops']
subsets = ['train', 'test']

# Look the template up under its own name instead of rebinding DATA_FILE:
# DATA_FILE is the SemEval 2014 template, and overwriting it here made the
# 2014 conversion cell fail (KeyError on the {subtask} placeholder) when
# re-run after this one.
templates_2016 = {'train': DATA_FILE_TRAIN, 'test': DATA_FILE_TEST}

for domain in domains:
    for subset in subsets:
        template_2016 = templates_2016[subset]

        # The CSV is written next to its XML source: same template, other extension.
        xml_file = template_2016.format(domain=domain, subtask='SB1', extension='xml')
        out_file = template_2016.format(domain=domain, subtask='SB1', extension='csv')

        read_semeval2016_SB1(xml_file, domain).to_csv(out_file, index=False)
            
            

In [32]:
# Sanity check: parse the restaurants SB1 training file and display the resulting frame.
read_semeval2016_SB1('../data/raw/SemEval2016/ABSA16_restaurants_train_SB1.xml', 'restaurants')

Unnamed: 0,rid,id,text,term,from,to,category,polarity
0,1004293,1004293:0,Judging from previous posts this used to be a ...,place,51,56,RESTAURANT#GENERAL,negative
1,1004293,1004293:1,"We, there were four of us, arrived at noon - t...",staff,75,80,SERVICE#GENERAL,negative
2,1004293,1004293:2,"They never brought us complimentary noodles, i...",,0,0,SERVICE#GENERAL,negative
3,1004293,1004293:3,The food was lousy - too sweet or too salty an...,food,4,8,FOOD#QUALITY,negative
4,1004293,1004293:3,The food was lousy - too sweet or too salty an...,portions,52,60,FOOD#STYLE_OPTIONS,negative
...,...,...,...,...,...,...,...,...
2502,FF#10,FF#10:8,The waitress came to check in on us every few ...,waitress,4,12,SERVICE#GENERAL,negative
2503,FF#10,FF#10:9,I couldn't ignore the fact that she reach over...,,0,0,SERVICE#GENERAL,negative
2504,FF#10,FF#10:10,She then put the check down without asking if ...,,0,0,SERVICE#GENERAL,negative
2505,FF#10,FF#10:11,"I wish I could like this place more, and I wis...",place,25,30,RESTAURANT#GENERAL,negative
