In [6]:
import pandas as pd
import numpy as np
import spacy

In [7]:
# Location of the training CSV, given relative to the notebook's working directory.
path = 'data/restaurants_laptop_train.csv'

In [8]:
# Read the annotated training set; the preview below shows one row per
# (sentence, aspect term) pair with character offsets and a polarity label.
df = pd.read_csv(path)
# Distinct sentence strings (a sentence repeats once per annotated aspect term).
all_texts = df['text'].unique()
# Last expression of the cell: display the first rows as a sanity check.
df.head()

Unnamed: 0,text,aspect_term,from,to,polarity
0,I charge it at night and skip taking the cord ...,cord,41,45,0
1,I charge it at night and skip taking the cord ...,battery life,74,86,1
2,The tech guy then said the service center does...,service center,27,41,-1
3,The tech guy then said the service center does...,"""sales"" team",109,121,-1
4,The tech guy then said the service center does...,tech guy,4,12,0


In [9]:
# Group the per-row annotations by sentence:
#   sentence text -> list of {'aspect_term', 'from', 'to', 'polarity'} dicts,
# kept in the order the rows appear in the frame.
sentence_aspect_term_dict = {}

annotation_columns = (df['text'], df['aspect_term'], df['from'], df['to'], df['polarity'])
for sentence, term, span_start, span_end, label in zip(*annotation_columns):
    annotation = {
        'aspect_term': term,
        'from': span_start,
        'to': span_end,
        'polarity': label,
    }
    # setdefault creates the list on first sight of a sentence, then appends.
    sentence_aspect_term_dict.setdefault(sentence, []).append(annotation)

In [10]:
# Number of distinct sentences, followed by the annotations of the first one.
print(len(sentence_aspect_term_dict))
first_sentence = list(sentence_aspect_term_dict)[0]
sentence_aspect_term_dict[first_sentence]

3501


[{'aspect_term': 'cord', 'from': 41, 'to': 45, 'polarity': 0},
 {'aspect_term': 'battery life', 'from': 74, 'to': 86, 'polarity': 1}]

In [11]:
# Load the "en_core_web_sm" spaCy pipeline once at module level;
# tokenize_sentences calls it on each sentence to get tokens and POS tags.
nlp = spacy.load("en_core_web_sm")

def tokenize_sentences(sentence_aspect_term_dict, task1=True, nlp_model=None):
    """Tokenize every sentence and tag each token as aspect term ("AT") or not ("NAT").

    Parameters
    ----------
    sentence_aspect_term_dict : dict
        Maps a sentence string to a list of annotation dicts. Each annotation
        must provide ``'from'`` / ``'to'`` (character offsets into the sentence,
        start inclusive, end exclusive) and ``'polarity'``.
    task1 : bool, default True
        When true, every token that starts inside an annotated span is tagged
        ``"AT"``. When false, tokens whose span carries the conflict polarity
        (``3``) are tagged ``"NAT"`` instead.
    nlp_model : callable, optional
        Pipeline used to tokenize a sentence. It is called with the sentence
        string and must return an iterable of tokens exposing ``text``,
        ``pos_`` and ``idx`` (character offset of the token in the sentence),
        as spaCy tokens do. Defaults to the module-level ``nlp``.

    Returns
    -------
    list of tuple
        One ``(sentence_id, token_text, pos, tag, polarity)`` tuple per token,
        where ``sentence_id`` is ``'s_<n>'`` with ``n`` counting sentences from
        1 in dict iteration order. Tokens outside every span get polarity 0.
    """
    conflict_polarity = 3
    pipeline = nlp if nlp_model is None else nlp_model

    final_data = []
    for num, (sentence, aspect_terms) in enumerate(sentence_aspect_term_dict.items(), start=1):
        sentence_id = 's_' + str(num)

        for w in pipeline(sentence):
            # Trust the tokenizer's own character offset instead of re-searching
            # the sentence with str.find: a search can land on the preceding
            # whitespace for whitespace tokens and silently yields -1 on a miss.
            token_start = w.idx

            AT_tag = "NAT"
            polarity = 0
            # The first annotation whose span contains the token start wins.
            for t in aspect_terms:
                if t['from'] <= token_start < t['to']:
                    polarity = t['polarity']
                    # NOTE(review): for task 2 a conflict span is tagged "NAT"
                    # but keeps polarity 3 (original behaviour, preserved here);
                    # confirm downstream code expects that combination.
                    if not task1 and polarity == conflict_polarity:
                        AT_tag = "NAT"
                    else:
                        AT_tag = "AT"
                    break

            final_data.append((sentence_id, w.text, w.pos_, AT_tag, polarity))

    return final_data
        

In [12]:
# Task 1: every token inside an annotated aspect span is tagged "AT".
task1_final_data = tokenize_sentences(sentence_aspect_term_dict, task1=True)
# Task 2: same tagging, except spans with the conflict polarity are tagged "NAT".
# NOTE(review): each call runs the spaCy pipeline over every sentence, so the
# corpus is parsed twice here.
task2_final_data = tokenize_sentences(sentence_aspect_term_dict, task1=False)


In [13]:
# Inspect the first 40 token records: (sentence id, token, POS, aspect tag, polarity).
task1_final_data[:40]

[('s_1', 'I', 'PRON', 'NAT', 0),
 ('s_1', 'charge', 'VERB', 'NAT', 0),
 ('s_1', 'it', 'PRON', 'NAT', 0),
 ('s_1', 'at', 'ADP', 'NAT', 0),
 ('s_1', 'night', 'NOUN', 'NAT', 0),
 ('s_1', 'and', 'CCONJ', 'NAT', 0),
 ('s_1', 'skip', 'VERB', 'NAT', 0),
 ('s_1', 'taking', 'VERB', 'NAT', 0),
 ('s_1', 'the', 'DET', 'NAT', 0),
 ('s_1', 'cord', 'NOUN', 'AT', 0),
 ('s_1', 'with', 'ADP', 'NAT', 0),
 ('s_1', 'me', 'PRON', 'NAT', 0),
 ('s_1', 'because', 'SCONJ', 'NAT', 0),
 ('s_1', 'of', 'ADP', 'NAT', 0),
 ('s_1', 'the', 'DET', 'NAT', 0),
 ('s_1', 'good', 'ADJ', 'NAT', 0),
 ('s_1', 'battery', 'NOUN', 'AT', 1),
 ('s_1', 'life', 'NOUN', 'AT', 1),
 ('s_1', '.', 'PUNCT', 'NAT', 0),
 ('s_2', 'The', 'DET', 'NAT', 0),
 ('s_2', 'tech', 'NOUN', 'AT', 0),
 ('s_2', 'guy', 'NOUN', 'AT', 0),
 ('s_2', 'then', 'ADV', 'NAT', 0),
 ('s_2', 'said', 'VERB', 'NAT', 0),
 ('s_2', 'the', 'DET', 'NAT', 0),
 ('s_2', 'service', 'NOUN', 'AT', -1),
 ('s_2', 'center', 'NOUN', 'AT', -1),
 ('s_2', 'does', 'AUX', 'NAT', 0),
 ('s_2',

In [14]:
# Column layout shared by both task tables; it follows the tuple order
# produced by tokenize_sentences.
token_columns = ['num', 'text', 'pos', 'aspect_tag', 'polarity']
df1 = pd.DataFrame(data=task1_final_data, columns=token_columns)
df2 = pd.DataFrame(data=task2_final_data, columns=token_columns)

# Persist one CSV per task, leaving out the pandas row index.
df1.to_csv('data/restaurants_laptop_train_with_pos_task1.csv', index=False)
df2.to_csv('data/restaurants_laptop_train_with_pos_task2.csv', index=False)