In [1]:
import xml.etree.cElementTree as ET
import pandas as pd

In [2]:
# XML files to parse, relative to the notebook's working directory
paths = [
    'data/Laptop_Train_v2.xml',
    'data/Restaurants_Train_v2.xml',
]

In [3]:
def parse_one_file(path):
    """Extract every aspect term from one XML file of annotated sentences.

    The root element is searched for direct <sentence> children. Each
    sentence must contain a <text> child (a missing one raises
    AttributeError) and may contain an <aspectTerms> child holding
    <aspectTerm> elements.

    Three summary lines (sentences processed, sentences that have an
    <aspectTerms> element, total aspect terms) are printed to stdout.

    Parameters
    ----------
    path : anything accepted by ``ET.parse`` (e.g. a file path)

    Returns
    -------
    tuple
        ``(polarity_set, data, polarity_dict_count)`` where
        - polarity_set is the set of distinct 'polarity' attribute values,
        - data is a list of ``(text, term, from, to, polarity)`` tuples, one
          per <aspectTerm>; 'from'/'to' are the raw attribute strings,
        - polarity_dict_count maps each polarity value to its number of
          occurrences.
    """
    sentence_nodes = ET.parse(path).getroot().findall('sentence')

    rows = []
    seen_polarities = set()
    polarity_counts = {}
    n_with_aspects = 0
    n_aspect_terms = 0

    for sentence_node in sentence_nodes:
        sentence_text = sentence_node.find('text').text
        container = sentence_node.find('aspectTerms')
        if container is None:
            # sentence carries no aspect annotation: nothing to record
            continue

        n_with_aspects += 1
        for term_node in container.findall('aspectTerm'):
            n_aspect_terms += 1
            label = term_node.get('polarity')
            seen_polarities.add(label)
            # one output row per aspect term; the sentence text is repeated
            rows.append((
                sentence_text,
                term_node.get('term'),
                term_node.get('from'),
                term_node.get('to'),
                label,
            ))
            polarity_counts[label] = polarity_counts.get(label, 0) + 1

    print("Processed: {} sentences".format(len(sentence_nodes)))
    print("Number of Aspect sentences: {}".format(n_with_aspects))
    print("Total number of Aspect found: {}".format(n_aspect_terms))

    return seen_polarities, rows, polarity_counts

In [4]:
# Parse each file in turn and pool all aspect-term rows into a single list
final_data = []
for path in paths:
    polarity_set, data, polarity_dict_count = parse_one_file(path)
    print('The polarities in {} are {}'.format(path, polarity_set))
    final_data += data

Processed: 3045 sentences
Number of Aspect sentences: 1488
Total number of Aspect found: 2358
The polarities in data/Laptop_Train_v2.xml are {'neutral', 'conflict', 'positive', 'negative'}
Processed: 3041 sentences
Number of Aspect sentences: 2021
Total number of Aspect found: 3693
The polarities in data/Restaurants_Train_v2.xml are {'neutral', 'conflict', 'positive', 'negative'}


In [8]:
# Numeric encoding of the polarity labels we keep. Any label not listed here
# (e.g. 'conflict') is filtered out below because we don't predict it.
POLARITY_TO_INT = {'positive': 1, 'neutral': 0, 'negative': -1}

# Now create the dataframe: one row per aspect-term tuple in final_data
df = pd.DataFrame(final_data, columns=['text', 'aspect_term', 'from', 'to', 'polarity'])

# Filter, then map polarity from string to numeric, in one chain.
# Assigning a column directly onto a boolean-mask slice
# (df = df[mask]; df['polarity'] = ...) can trigger pandas'
# SettingWithCopyWarning; .assign() returns a new frame instead.
# Using the mapping's keys for the filter keeps filter and encoding in sync.
df = (
    df[df['polarity'].isin(list(POLARITY_TO_INT))]
    .assign(polarity=lambda kept: kept['polarity'].map(POLARITY_TO_INT))
)

# Save the dataframe to csv
df.to_csv('data/restaurants_laptop_train.csv', index=False)

In [None]:
# Parse the laptop file on its own so that polarity_set, data and
# polarity_dict_count describe this file only (also prints its summary).
polarity_set, data, polarity_dict_count = parse_one_file('data/Laptop_Train_v2.xml')

In [None]:
# Display the per-polarity counts from the most recent parse_one_file call
polarity_dict_count

In [None]:
# Parse the restaurants file on its own so that polarity_set, data and
# polarity_dict_count describe this file only (also prints its summary).
polarity_set, data, polarity_dict_count = parse_one_file('data/Restaurants_Train_v2.xml')

In [None]:
# Display the per-polarity counts from the most recent parse_one_file call
polarity_dict_count