In [3]:
import pandas as pd
import spacy
import re

The CSV datasets stored in `preprocessed_datasets` are processed further in this notebook: their sentences are padded and tokenized, and the resulting text is then converted into embeddings.

### Tokenization of the English-language datasets

Defining file locations

In [4]:
# Locations of the `sentences/` and `opinions/` subdirectories of the preprocessed data
dircs, dirco = "./preprocessed_datasets/sentences/", "./preprocessed_datasets/opinions/"

# Base names (without extension) of the dataset files to process
files = ["english-laptops", "english-restaurants"]

# Extension appended to a base name to obtain the file name
csv = ".csv"

Functions to perform padding and tokenization

In [5]:
# Matches every character that is neither an ASCII letter nor whitespace
_NON_ALPHA_PATTERN = re.compile(r'[^a-zA-Z\s]')


def remove_non_alpha(sentence):
    '''Returns a copy of sentence in which every character other than
    the letters a-z / A-Z and whitespace has been deleted'''

    return _NON_ALPHA_PATTERN.sub('', sentence)

def pad_sentences(sentence, l):
    '''Returns sentence obtained after padding given sentence to length l, 
    truncates sentences longer than l

    The sentence is split on whitespace; the first l words are kept and
    "#" is appended as a filler word until there are exactly l words.
    The result is always the words joined by single spaces, so runs of
    spaces or leading/trailing blanks in the input never leak into the
    output (an empty input padded to 2 gives "# #", not " # #").
    A non-positive l gives the empty string.'''

    # max() guards against a negative l turning the slice into "drop the last words"
    words = sentence.split()[:max(l, 0)]

    # multiplying by a non-positive count adds nothing, so full sentences are untouched
    words.extend(["#"] * (l - len(words)))
    return " ".join(words)

# spaCy pipelines that have already been loaded, keyed by model name
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(name="en_core_web_sm"):
    '''Returns the spaCy pipeline called name, loading it only on first use'''

    if name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[name] = spacy.load(name)
    return _SPACY_PIPELINES[name]


def tokenize_review(review, sentence_length, review_length):
    '''Takes all sentences in a review as a list,
    returns a padded and tokenized review,
    truncates sentences longer than sentence_length and reviews longer than review_length

    Each kept sentence is stripped of non-alpha characters, padded or
    truncated to sentence_length words and passed through the spaCy
    pipeline. The returned list always has review_length entries (none
    if review_length is not positive); missing sentences are filled with
    one shared, tokenized all-"#" placeholder sentence.'''

    # loading the model is expensive, so it is fetched from a cache
    # instead of being re-loaded for every review
    nlp = _get_spacy_pipeline()
    empty_sentence = nlp(pad_sentences('#', sentence_length))

    tokenized = []
    for sentence in review:
        if len(tokenized) >= review_length:
            break
        padded = pad_sentences(remove_non_alpha(sentence), sentence_length)
        tokenized.append(nlp(padded))

    # every missing slot refers to the same placeholder object
    tokenized.extend([empty_sentence] * (review_length - len(tokenized)))
    return tokenized

Finding review blocks in files and creating tokenized review elements

In [6]:
def create_review_blocks(path, sentence_length, review_length):
    '''Reads the CSV at path and returns one tokenized review per run of
    consecutive rows that share the same "rid" value

    The "text" values of each run are collected, in file order, and
    handed to tokenize_review together with sentence_length and
    review_length. Rows with the same "rid" that are not adjacent end
    up in separate reviews. A CSV without data rows gives an empty list.'''

    data = pd.read_csv(path)

    reviews = []
    current_rid = None
    current_sentences = []

    # iterate by position so the result does not depend on the frame's index labels
    for position, (rid, text) in enumerate(zip(data["rid"], data["text"])):
        if position > 0 and rid != current_rid:
            # the review id changed: the collected sentences form a finished review
            reviews.append(tokenize_review(current_sentences, sentence_length, review_length))
            current_sentences = []
        current_sentences.append(text)
        current_rid = rid

    # flush the last review; skipped when the file had no rows at all
    if current_sentences:
        reviews.append(tokenize_review(current_sentences, sentence_length, review_length))

    return reviews

# Words kept per sentence and sentences kept per review
sentence_length, review_length = 20, 10

# Tokenized reviews for the laptops file (files[0]) and the restaurants file (files[1])
english_laptops_reviews = create_review_blocks(
    f"{dircs}{files[0]}{csv}", sentence_length, review_length
)
english_restaurants_reviews = create_review_blocks(
    f"{dircs}{files[1]}{csv}", sentence_length, review_length
)

### Creating embeddings

In [7]:
import fasttext

In [14]:
# Train unsupervised fastText word vectors on the restaurants sentences CSV.
# NOTE(review): the CSV file itself is the training corpus, and punctuation stays
# attached to words (the printed vocabulary contains entries such as 'food,' and
# 'it.'); presumably the header row and any non-text columns are ingested as
# well -- confirm whether a cleaned text file should be used instead.
model = fasttext.train_unsupervised("preprocessed_datasets/sentences/english-restaurants.csv")
# Print the vocabulary of the trained model
print(model.words)

Read 0M words
Number of words:  612
Number of labels: 0


['</s>', 'the', 'and', 'a', 'is', 'to', 'was', 'of', 'for', 'in', 'I', 'it', 'food', 'but', 'you', 'this', 'not', 'that', 'with', 'on', 'place', 'we', 'are', 'have', 'were', 'my', 'had', 'at', 'great', 'good', 'very', 'so', 'be', 'service', 'they', 'go', 'as', 'restaurant', 'all', 'like', 'there', 'or', 'would', '-', 'which', 'by', 'our', 'your', 'just', 'one', 'here', 'from', 'been', 'if', 'an', 'get', 'about', 'their', 'more', 'out', 'never', 'can', 'than', 'will', 'back', 'no', 'best', 'always', 'us', "don't", 'has', 'only', 'when', 'really', 'time', 'what', 'me', 'staff', 'some', "it's", 'pizza', 'recommend', 'make', 'other', 'worth', 'even', 'food,', 'nice', 'i', 'try', 'because', 'who', 'wine', 'delicious', 'too', 'little', 'up', 'sushi', 'got', 'eat', 'going', 'went', 'wait', '"', 'well', 'many', 'made', 'over', 'much', 'do', 'restaurants', 'last', "can't", 'most', 'ordered', 'it.', 'dinner', 'dishes', 'times', 'am', 'say', 'New', 'menu', 'ever', 'any', 'good,', "didn't", 'order

Progress: 100.0% words/sec/thread:  185981 lr:  0.000000 avg.loss:  2.972233 ETA:   0h 0m 0s


In [13]:
# Persist the trained model.
# NOTE(review): the file name starts with "renglish" -- looks like a typo for
# "english"; left unchanged because other code may read this exact path.
# NOTE(review): presumably the `models/` directory has to exist beforehand -- confirm.
model.save_model("preprocessed_datasets/models/renglish-restaurants.bin")

# word -> row index into word_vectors
word_to_embedding = {}
# row index -> word (inverse of word_to_embedding)
embedding_to_word = {}
# one vector per vocabulary word, in the order of model.words
word_vectors = []

for i, word in enumerate(model.words):
    word_vectors.append(model.get_word_vector(word))
    word_to_embedding[word] = i
    # keyed by the integer index itself; the previous literal key "{i}" was a
    # plain string (no f-prefix), so every word overwrote the same single entry
    embedding_to_word[i] = word

## Final preprocessing and embedding

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

In [None]:
from torch.utils.data import Dataset
import xml
import xml.etree.ElementTree as ET
from torchtext.data import get_tokenizer
from functools import reduce
import torchtext

def parse_review(review: xml.etree.ElementTree.Element) -> list:
    '''Returns the opinionated sentences of a review element

    Looks at every "sentence" element under the review's "sentences"
    child and keeps those that have an "Opinions" child with at least
    one child element. Each kept sentence becomes a tuple
    (text, opinions), where text is the content of its "text" element
    and opinions is a tuple of (category, polarity) attribute pairs,
    one per "Opinion" element, in document order.'''

    parsed = []
    for sen in review.find("sentences").findall("sentence"):
        opinions = sen.find("Opinions")
        # Explicit None / length checks: truth-testing an Element is deprecated,
        # and an existing element without children already counted as false
        if opinions is None or len(opinions) == 0:
            continue
        parsed.append((
            sen.find('text').text,
            tuple((op.attrib["category"], op.attrib["polarity"])
                  for op in opinions.findall('Opinion')),
        ))
    return parsed

class ReviewDataset(Dataset):
    '''Dataset whose items are the reviews of an XML file

    self.reviews holds, per "Review" element, the list returned by
    parse_review: (text, opinions) tuples with opinions being a tuple
    of (category, polarity) pairs. Sentences are tokenized with the
    "basic_english" tokenizer and mapped to indices through a vocabulary
    built from the same file.'''

    # Every sentence's index list is right-padded with the "<pad>" index up to
    # this length; longer sentences are left as they are (no truncation)
    PAD_LENGTH = 78

    def __init__(self, path: str):
        '''Parses the XML file at path and builds the vocabulary'''
        tree = ET.parse(path)
        root = tree.getroot()
        self.reviews = list(map(parse_review, root.findall("Review")))
        self.tokenizer = get_tokenizer("basic_english")
        # each review contributes the set of its distinct tokens
        self.vocab = torchtext.vocab.build_vocab_from_iterator(
            [self._review_tokens(review) for review in self.reviews],
            min_freq=1,
            specials=["<unk>", "<pad>"])
        # route tokens missing from the vocabulary to "<unk>" instead of
        # letting the lookup fail
        self.vocab.set_default_index(self.vocab["<unk>"])

    def _review_tokens(self, review) -> set:
        '''Returns the set of distinct tokens over all sentences of review'''
        return set().union(*(self.tokenizer(text) for text, _ in review))

    def __len__(self):
        '''Returns the number of reviews'''
        return len(self.reviews)

    def __getitem__(self, idx):
        '''Returns the review at position idx as a dict of per-sentence lists

        'label'         : 1 if the sentence has more 'positive' opinions than
                          other ones, else 0 (any polarity other than
                          'positive' counts against the sentence)
        'ids'           : vocabulary indices of the sentence tokens, padded
                          with the "<pad>" index up to PAD_LENGTH
        'aspects'       : one-element list with the category of the sentence's
                          last opinion, or an empty list if it has none
        'aspect_labels' : always empty
                          NOTE(review): never filled in -- confirm whether
                          per-aspect labels were meant to go here'''
        review = {'label': [], 'ids': [], 'aspects': [], 'aspect_labels': []}
        pad_idx = self.vocab["<pad>"]

        for text, opinions in self.reviews[idx]:
            sentiment_count = 0
            # reset for every sentence so that a sentence without opinions neither
            # fails on an unbound name nor inherits the previous sentence's aspect
            last_aspect = []
            for aspect, sentiment in opinions:
                sentiment_count += 1 if sentiment == 'positive' else -1
                last_aspect = [aspect]
            review['label'].append(1 if sentiment_count > 0 else 0)

            token_ids = [self.vocab[token] for token in self.tokenizer(text)]
            # a non-positive repeat count adds nothing, so long sentences stay intact
            token_ids.extend([pad_idx] * (self.PAD_LENGTH - len(token_ids)))
            review['ids'].append(token_ids)
            review['aspects'].append(last_aspect)
        return review

# Parse the English restaurants XML into a ReviewDataset (its constructor also builds the vocabulary)
english = ReviewDataset("./datasets/english-restaurants.xml")


# NOTE(review): torchtext is already imported earlier in the notebook; this adds a second alias (`text`) for it
import torchtext as text
# GloVe vectors requested with name='6B' and dim=50
pretrained_vectors = text.vocab.GloVe(name='6B', dim=50)
# Look up a pretrained vector for every token of the dataset vocabulary
# (presumably one row per token, in get_itos() order -- confirm)
pretrained_embedding = pretrained_vectors.get_vecs_by_tokens(english.vocab.get_itos())