# Feature engineering

### Libraries

In [1]:
# import libraries

import re

import pandas as pd
import numpy as np

from gensim.models import KeyedVectors
from scipy import stats
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

### Data

In [2]:
# read the labelled reviews (review_id, text, sentiment) into a DataFrame

reviews_path = 'data/review_sentiment.csv'
review_df = pd.read_csv(reviews_path)

review_df

Unnamed: 0,review_id,text,sentiment
0,NvusujU9_5pIUbn9SZ6hMA,Stopped by to munch a burger during today's Se...,1
1,vHOeBa7aMA_na4rfS2Db5A,"Yelp doesn't allow to leave 0 star review, so ...",-1
2,hG9RTxxivb0ZXzEk4JXTXA,I find it hard to believe there are so many pe...,-1
3,zIVkwgahZjOneChZFUYY4g,Love this place! Almost all of their menu item...,1
4,DLczAuvMAlAnY5EeDGhTVg,Excellent customer service. I wish I could ren...,1
...,...,...,...
63446,OgoBp7fbXnLSKvsQb4O_tw,"I really loved the food and service. I mean, t...",1
63447,Q7e8EtZMmdknDrQE7huMoQ,Their Grove location was the bomb. Delicious f...,1
63448,zzMW6zbsFaQMjoGu2bGVdA,A nice ean BBQ joint right across from some ne...,1
63449,scgoa60EvhW2Mz7JMqLYGw,The perfect Hookah bar. I'm not sure what they...,1


In [3]:
# load corpus data (one review per line)

# Use a context manager so the file handle is closed as soon as it is read.
with open('data/corpus.txt', 'r') as corpus_file:
    corpus = corpus_file.read().split('\n')

# A trailing newline leaves one empty string at the end of the split. Drop it
# only when it is really there, so a file without a final newline keeps its
# last review instead of silently losing it.
if corpus and corpus[-1] == '':
    corpus = corpus[:-1]

In [4]:
# preview the first five corpus entries, each preceded by a divider line

divider = '-' * 50
for sample in corpus[:5]:
    print(divider)
    print(sample)

--------------------------------------------------
stop munch burger today seahawk saint game place unsurprisingli pack good reason burger order fantast sat right next door get chanc get six feet place soon got meal place start get busier busier work way world shortest peopl maze get guess mean first person hear place go back might go say lunch tuesday less busi
--------------------------------------------------
yelp allow leav star review see one star wife move ny south california contact differ move compani sent initi email unit van line soon got email back virtual survey confirm ladi virtual survey meticul profession screen whole apart minut no one contact sent second email almost month ask everyth ok get quot need mention compani sent quot less hour sinc no one repli til today call direct phone left messag answer machin no one call back hope everyth ok no one hurt see reason explan avoid unprofession
--------------------------------------------------
find hard believ mani peopl low

### Dense embeddings

In [5]:
# restore the saved word2vec keyed vectors from disk

wv_path = "embeddings/reviews_wv"
wv = KeyedVectors.load(wv_path)

In [6]:
def text_to_vector(embeddings, text, sequence_len, strategy=None):
    '''
    Function to convert text to word embeddings

    The text is split on whitespace and tokens are looked up in order.
    Tokens missing from the embeddings (lookup raises KeyError) are skipped
    and do not count towards ``sequence_len``.

    Parameters
    ----------
    embeddings : object exposing ``get_vector(token)`` and ``vector_size``
    text : str
        Whitespace-separated tokens.
    sequence_len : int
        Maximum number of embedded tokens to use.
    strategy : {None, 'mean', 'max'}, optional
        * ``'mean'`` / ``'max'`` -- element-wise pooling over the embedded
          tokens, giving one vector of length ``vector_size``. Padding is not
          pooled; if no token is known a zero vector is returned.
        * anything else (default ``None``) -- the token vectors concatenated
          into one flat list, zero-padded to ``sequence_len * vector_size``.

    Returns
    -------
    list or numpy.ndarray
        Flat list of values for the default strategy, 1-D array of length
        ``vector_size`` for the pooling strategies.
    '''
    # Keep one row per embedded token so pooling can run across tokens.
    rows = []
    for token in text.split():
        if len(rows) >= sequence_len:
            break
        try:
            rows.append(embeddings.get_vector(token))
        except KeyError:
            # Out-of-vocabulary token: skip it without consuming a slot.
            continue

    if strategy in ('mean', 'max'):
        if not rows:
            # Nothing to pool over; fall back to a neutral zero vector.
            return np.zeros(embeddings.vector_size)
        matrix = np.vstack(rows)
        # axis=0 pools across tokens and keeps one value per dimension.
        if strategy == 'mean':
            return matrix.mean(axis=0)
        return matrix.max(axis=0)

    # Default: flatten the sequence and zero-pad it to a fixed length.
    vec = []
    for row in rows:
        vec.extend(row)
    for _ in range(sequence_len - len(rows)):
        vec.extend(np.zeros(embeddings.vector_size))
    return vec

In [7]:
# summary statistics of review length (in whitespace-separated words)

lens = [len(review.split()) for review in corpus]

corpus_stats = (
    ('Number of reviews:', len(corpus)),
    ('Minimum number of words:', np.min(lens)),
    ('Maximum number of words:', np.max(lens)),
    ('Average number of words:', np.mean(lens)),
    ('Standard deviation of words:', np.std(lens)),
    ('Mode of words:', stats.mode(lens)),
)
for label, value in corpus_stats:
    print(label, value)

Number of reviews: 63451
Minimum number of words: 1
Maximum number of words: 495
Average number of words: 54.13619958708295
Standard deviation of words: 49.504305404044175
Mode of words: ModeResult(mode=14, count=1238)


In [8]:
# build the embeddings dataset from the raw review text:
# 1) keep letters only (everything else becomes a space) and lowercase,
# 2) embed each cleaned review as a fixed-length sequence of word vectors

word_limit = 50

simple_corpus = [
    re.sub('[^a-zA-Z]', ' ', raw_text).lower()
    for raw_text in review_df['text']
]

embeddings_corpus = [
    text_to_vector(wv, cleaned_text, word_limit)
    for cleaned_text in simple_corpus
]

### Transformations

In [9]:
# Bag-of-words: token counts per review, saved as a sparse matrix

bow_path = 'features/bag_of_words.npz'

bag_of_words = CountVectorizer()
features = bag_of_words.fit_transform(corpus)
sparse.save_npz(bow_path, features)

features.shape

(63451, 31988)

In [10]:
# 1-hot encoding: binary=True records token presence instead of counts

one_hot_path = 'features/one_hot.npz'

one_hot = CountVectorizer(binary=True)
features = one_hot.fit_transform(corpus)
sparse.save_npz(one_hot_path, features)

features.shape

(63451, 31988)

In [11]:
# N-grams: unigram + bigram counts, vocabulary capped at 15000 entries

n_gram_options = dict(ngram_range=(1, 2), max_features=15000)
n_grams_path = 'features/n_grams.npz'

n_grams = CountVectorizer(**n_gram_options)
features = n_grams.fit_transform(corpus)
sparse.save_npz(n_grams_path, features)

features.shape

(63451, 15000)

In [12]:
# TF-IDF: TF-IDF-weighted term features, saved as a sparse matrix

tf_idf_path = 'features/tf_idf.npz'

tf_idf = TfidfVectorizer()
features = tf_idf.fit_transform(corpus)
sparse.save_npz(tf_idf_path, features)

features.shape

(63451, 31988)

In [9]:
# Word2Vec: stack the per-review embedding sequences into one dense matrix

word2vec_path = 'features/word2vec.npy'

features = np.array(embeddings_corpus)
np.save(word2vec_path, features)

features.shape

(63451, 7500)