In [11]:
import xml.etree.cElementTree as ET
import pandas as pd

In [12]:
# XML input files, listed one per line; each entry is handed to the parser in turn.
paths = [
    'data/Laptop_Train_v2.xml',
    'data/Restaurants_Train_v2.xml',
]

In [13]:
def parse_one_file(path):
    """Parse one XML file and collect its aspect-term annotations.

    Every <sentence> element directly under the document root is visited.
    For each <aspectTerm> found inside the sentence's <aspectTerms> element,
    one row is recorded that pairs the sentence text with the term's
    attributes.

    Args:
        path: Location of the XML file, passed straight to ``ET.parse``.

    Returns:
        A ``(polarity_set, data)`` tuple where ``polarity_set`` is the set of
        distinct 'polarity' attribute values seen in the file and ``data`` is
        a list of ``(text, term, from, to, polarity)`` tuples. Attribute
        values are returned exactly as read from the XML (strings, or None
        when an attribute is absent).
    """
    root = ET.parse(path).getroot()

    rows = []
    seen_polarities = set()
    for sentence_el in root.findall('sentence'):
        # The sentence text is read for every sentence, including those
        # that turn out to carry no aspect terms.
        sentence_text = sentence_el.find('text').text
        terms_el = sentence_el.find('aspectTerms')
        if terms_el is None:
            # No <aspectTerms> element: this sentence contributes no rows.
            continue

        for term_el in terms_el.findall('aspectTerm'):
            polarity = term_el.get('polarity')
            seen_polarities.add(polarity)
            rows.append((
                sentence_text,
                term_el.get('term'),
                term_el.get('from'),
                term_el.get('to'),
                polarity,
            ))

    # The polarity set lets the caller check which label values occur.
    return seen_polarities, rows

In [14]:
# Parse every input file, pooling the rows of all files into `final_data`
# and reporting which polarity labels each file contains.
final_data = []
for path in paths:
    polarity_set, data = parse_one_file(path)
    final_data += data
    print('The polarities in {} are {}'.format(path, polarity_set))

The polarities in data/Laptop_Train_v2.xml are {'conflict', 'positive', 'negative', 'neutral'}
The polarities in data/Restaurants_Train_v2.xml are {'conflict', 'positive', 'negative', 'neutral'}


In [15]:
# Build the dataframe from the rows of ALL parsed files.
# `final_data` is the pooled list; `data` is only the loop's leftover and
# holds just the rows of the last file parsed, so using it here would
# silently drop every other file from the export.
df = pd.DataFrame(final_data, columns=['text', 'aspect_term', 'from', 'to', 'polarity'])

# Filter out polarity == 'conflict' (we don't predict conflict).
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignment below does not trigger SettingWithCopyWarning.
df = df[df['polarity'].isin(['positive', 'neutral', 'negative'])].copy()

# Map polarity from string to numeric.
df['polarity'] = df['polarity'].map({'positive':1,'neutral':0, 'negative':-1})

# Save the dataframe to csv (without the index column).
df.to_csv('data/restaurants_laptop_train.csv', index=False)