# Preprocessing data for deep learning models

## Convert file to a suitable format

Each review text is split across three lines:

- 1st line has aspect term replaced by a placeholder token 
- 2nd line has the actual aspect term
- 3rd line has the polarity (1, -1, 0)

Finally, each review text is separated by an additional newline.

In [1]:
def prepare_dl_input_file(raw_file, out_file):
    """Placeholder for the raw-file conversion step; currently does nothing.

    NOTE(review): no implementation is present. Presumably this is meant to
    convert ``raw_file`` into the three-lines-per-review layout (sentence
    with placeholder token, aspect term, polarity) and write it to
    ``out_file`` -- confirm before relying on it.

    Args:
        raw_file: Unused.
        out_file: Unused.

    Returns:
        None.
    """
    return None

In [2]:
def read_dl_input_file(domain, subset, year='2014'):
    """Load every line of a pre-formatted SemEval input file.

    The file is looked up at
    ``../data/processed/SemEval{year}/{domain}_{subset}_dl.txt`` relative to
    the current working directory.

    Args:
        domain: Domain part of the file name.
        subset: Subset part of the file name.
        year: SemEval edition used in the directory name.

    Returns:
        List of the file's lines, each still carrying its line terminator.
    """
    path = f"../data/processed/SemEval{year}/{domain}_{subset}_dl.txt"

    # newline='\n' splits on "\n" only and leaves line endings untranslated;
    # errors='ignore' silently drops bytes that are not valid UTF-8.
    with open(path, 'r', encoding='utf-8', newline='\n', errors='ignore') as handle:
        return list(handle)

## Create an embedding matrix 

In [3]:
def create_word_vec(word2idx, embed_dim):
    """Collect the pre-trained vector of every vocabulary word found in GloVe.

    Reads ``../data/embeddings/glove.6B.{embed_dim}d.txt`` (relative to the
    current working directory) line by line. The first whitespace-separated
    field of a line is taken as the word and the remaining fields as its
    vector components.

    Args:
        word2idx: Vocabulary container; only membership (``word in
            word2idx``) is used here.
        embed_dim: Embedding size; selects which GloVe file is read.

    Returns:
        Dict mapping each vocabulary word present in the file to a
        ``float32`` NumPy array. Words missing from the file are absent.
    """
    emb_file = f'../data/embeddings/glove.6B.{embed_dim}d.txt'
    w2v = {}

    # Decode explicitly as UTF-8: without it the platform default codec is
    # used, which can fail on or garble non-ASCII tokens (e.g. on Windows).
    with open(emb_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line: nothing to parse, and values[0] would raise.
                continue
            word = values[0]
            if word in word2idx:
                w2v[word] = np.asarray(values[1:], dtype='float32')

    return w2v

In [4]:
def create_embedding_matrix(word2idx, embed_dim):
    """Build an embedding matrix whose rows are GloVe vectors of the vocabulary.

    Reads ``../data/embeddings/glove.6B.{embed_dim}d.txt`` (relative to the
    current working directory). For every word of the file that is a key of
    ``word2idx``, the vector is stored in row ``word2idx[word]``.

    Args:
        word2idx: Mapping from word to integer row index. The matrix has
            ``len(word2idx) + 1`` rows, so indices are expected to lie in
            ``0..len(word2idx)`` (presumably 1-based tokenizer indices with
            row 0 left for padding -- confirm against the tokenizer used).
        embed_dim: Embedding size; selects the GloVe file and the number of
            matrix columns.

    Returns:
        NumPy array of shape ``(len(word2idx) + 1, embed_dim)``. Rows of
        words not found in the file stay all-zero.
    """
    emb_file = f'../data/embeddings/glove.6B.{embed_dim}d.txt'

    n = len(word2idx)
    embedding_matrix = np.zeros((n + 1, embed_dim))

    # Decode explicitly as UTF-8: without it the platform default codec is
    # used, which can fail on or garble non-ASCII tokens (e.g. on Windows).
    with open(emb_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line: nothing to parse, and values[0] would raise.
                continue
            word = values[0]
            if word in word2idx:
                idx = word2idx[word]
                embedding_matrix[idx] = np.asarray(values[1:], dtype='float32')

    return embedding_matrix

## Preprocessing functions

In [5]:
import os
import pickle
import numpy as np
import tensorflow as tf
import keras.backend as K
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using Theano backend.


In [35]:
import re

def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    The substitutions are applied in the listed order, each on the result of
    the previous one. Irregular negations ("won't", "can't", "shan't") are
    rewritten first, because the generic ``n't`` rule would otherwise leave
    the non-words "wo", "ca" and "sha" behind.

    Matching is case-sensitive apart from the first letter of the irregular
    forms, so all-caps contractions such as "WON'T" are left untouched.
    ``'s`` is always expanded to " is", possessives included.

    Args:
        phrase: Text to expand.

    Returns:
        The text with the contractions replaced.
    """
    rules = (
        # Irregular forms; the captured first letter keeps its case.
        (r"\b([Ww])on\'t", r"\1ill not"),
        (r"\b([Cc])an\'t", r"\1an not"),
        (r"\b([Ss])han\'t", r"\1hall not"),
        # Generic suffixes; "n't" must run before the bare "'t" rule.
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )

    for pattern, replacement in rules:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase


In [36]:
def split_reviews_by_aspects(dl_input_lines: list):
    """Split each review into the text left and right of its aspect term.

    The input is consumed in groups of three non-blank lines: the review
    with the aspect replaced by the ``$T$`` placeholder, the aspect term,
    and the polarity. Blank lines (separators between reviews, or trailing
    newlines at the end of the file) are ignored.

    The review line is passed through ``decontracted`` and split at the
    first ``$T$``. Both sides and the aspect are lower-cased and stripped.
    The aspect line itself is not decontracted.

    Args:
        dl_input_lines: Lines of a pre-formatted input file.

    Returns:
        Dict of equally long lists, one entry per review:
        ``reviews_raw`` (left + aspect + right),
        ``reviews_raw_without_aspects``, ``reviews_left``,
        ``reviews_left_with_aspects``, ``reviews_right``,
        ``reviews_right_with_aspects``, ``aspects`` and ``polarities``
        (as ``int``).

    Raises:
        ValueError: If the non-blank lines do not form complete groups of
            three, or a polarity line is not an integer.
    """
    # Skip blank lines so that separator/trailing newlines do not shift the
    # three-line grouping.
    lines = [line for line in dl_input_lines if line.strip()]
    if len(lines) % 3:
        raise ValueError(
            f'Expected groups of 3 lines (review, aspect, polarity), '
            f'got {len(lines)} non-blank lines.'
        )

    res = {
        'reviews_raw': [],
        'reviews_raw_without_aspects': [],
        'reviews_left': [],
        'reviews_left_with_aspects': [],
        'reviews_right': [],
        'reviews_right_with_aspects': [],
        'aspects': [],
        'polarities': [],
    }

    for review_line, aspect_line, polarity_line in zip(lines[0::3], lines[1::3], lines[2::3]):
        review = decontracted(review_line)
        # partition() splits at the first "$T$" only.
        review_left, _, review_right = [s.lower().strip() for s in review.partition("$T$")]
        aspect = aspect_line.lower().strip()
        polarity = polarity_line.strip()

        res['reviews_raw'].append(' '.join([review_left, aspect, review_right]))
        res['reviews_raw_without_aspects'].append(review_left + " " + review_right)
        res['reviews_left'].append(review_left)
        res['reviews_left_with_aspects'].append(review_left + " " + aspect)
        res['reviews_right'].append(review_right)
        res['reviews_right_with_aspects'].append(aspect + " " + review_right)
        res['aspects'].append(aspect)
        res['polarities'].append(int(polarity))

    return res

In [37]:
def create_sequence_data(texts, maxlen, tokenizer):
    """Turn texts into index sequences padded/truncated to ``maxlen``.

    Args:
        texts: Texts to convert.
        maxlen: Target sequence length passed to ``pad_sequences``.
        tokenizer: Object providing ``texts_to_sequences``.

    Returns:
        The result of ``pad_sequences``, with padding and truncation both
        applied at the end of each sequence (``'post'``).
    """
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=maxlen, padding='post', truncating='post')

In [38]:
def prepare_data_for_dl(domain='restaurants', subset='train', embed_dim=50,
                        max_input_len=70, max_aspect_len=5, num_classes=3, tokenizer=None):
    """Read one dataset split and turn it into padded index arrays.

    Args:
        domain: Domain part of the input file name.
        subset: Split to load. For ``'train'`` a new tokenizer is fitted on
            the full review texts (any ``tokenizer`` argument is ignored).
            Every other subset requires a ``tokenizer``.
        embed_dim: Embedding size used for the embedding matrix.
        max_input_len: Padded length of all review-text sequences.
        max_aspect_len: Padded length of the aspect sequences.
        num_classes: Number of classes for the one-hot polarity encoding.
        tokenizer: Tokenizer fitted on the train data; required unless
            ``subset == 'train'``.

    Returns:
        Dict with one ``'<name>_idx'`` entry per text list produced by
        ``split_reviews_by_aspects`` and ``'polarity_ohe'``. Unless
        ``subset == 'test'``, it also contains ``'embedding_matrix'`` and
        ``'tokenizer'``.

    Raises:
        ValueError: If ``subset`` is not ``'train'`` and no tokenizer is
            given, or if ``subset == 'test'`` and one of the maximum
            lengths is ``None``.
    """
    # Read the lines from the pre-formatted dl input file
    lines = read_dl_input_file(domain, subset)

    # Obtain the review parts on the left and right side of the aspect
    spltd = split_reviews_by_aspects(lines)
    polarities = spltd.pop('polarities')

    # Tokenize
    if subset == 'train':
        # The texts are already lower-cased by the splitting step.
        tokenizer = Tokenizer(lower=False)
        tokenizer.fit_on_texts(spltd['reviews_raw'])
    else:
        # Any non-train subset needs an existing tokenizer; fail early with a
        # clear message instead of an AttributeError on None further down.
        if tokenizer is None:
            raise ValueError('Provide a tokenizer fitted on the train data!')
        if subset == 'test':
            if max_input_len is None:
                raise ValueError('Provide a maximum input length for padding the input sequence!')
            if max_aspect_len is None:
                raise ValueError('Provide a maximum aspect length for padding the aspect terms!')

    word2idx = tokenizer.word_index

    # Create sequence padded data of indices; aspects get their own length.
    res = {
        f'{k}_idx': create_sequence_data(
            v, max_aspect_len if k == 'aspects' else max_input_len, tokenizer)
        for k, v in spltd.items()
    }

    # One-hot encode polarities.
    # NOTE(review): the file format uses 1 / -1 / 0; which column -1 ends up
    # in depends on how to_categorical treats negative labels -- verify.
    res['polarity_ohe'] = to_categorical(polarities, num_classes)

    if subset == 'test':
        return res

    res['embedding_matrix'] = create_embedding_matrix(word2idx, embed_dim)
    res['tokenizer'] = tokenizer

    return res