# Create Vector Representations for Review Text

In [99]:
import os
import re
import urllib
import tarfile
import sqlite3
import spacy
import logging
import sys
import json

import pandas as pd

import gensim
from gensim.models.doc2vec import TaggedDocument, Doc2Vec

import spacy

## Set Up Logging

In [90]:
# Send log records at INFO level and above to stdout.
# The `_LOG_SETUP` global marks that this cell has already run, so executing
# it again does not attach a second handler to the root logger.
if "_LOG_SETUP" not in globals():
    log_formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    # Handler that writes formatted records to standard output.
    log_handler = logging.StreamHandler(sys.stdout)
    log_handler.setFormatter(log_formatter)
    log_handler.setLevel(logging.INFO)

    # Attach the handler to the root logger so every named logger is covered.
    root_logger = logging.getLogger()
    root_logger.addHandler(log_handler)
    root_logger.setLevel(logging.INFO)

    _LOG_SETUP = True

## Load Data

In [2]:
# Open the SQLite database file that the reviews are read from.
db = sqlite3.connect("var/reviews.db")

In [4]:
# Load every row and column of the `reviews` table into a DataFrame.
reviews = pd.read_sql("select * from reviews", db)

In [5]:
# Show five randomly chosen rows as a quick sanity check of the loaded data.
reviews.sample(5)

Unnamed: 0,id,titleId,dataset,class,datasetclassId,rating,review
96746,96747,tt0110273,train,unsup,46821,0,It's time out from your life you will not get ...
97811,97812,tt0188527,train,unsup,47804,0,I believe that this was supposed to be shockin...
6880,6881,tt0062804,test,neg,6815,1,"Like another reviewer, I really wanted to like..."
96267,96268,tt0290459,train,unsup,46276,0,"Over the top drama. A very boring movie, where..."
27817,27818,tt0114658,train,neg,2942,1,This is hands down the worst movie of all time...


## Get Words

In [None]:
# Matches an embedded HTML line-break tag in any common spelling:
# <br>, <br/>, <br />, and upper/mixed-case variants such as <BR>.
# HTML tag names are case-insensitive, hence re.IGNORECASE.
emb_tag_re = re.compile(r'<(br)\s*/?\s*>', re.IGNORECASE)

def replace_tags(text):
    """Return *text* with each embedded line-break tag replaced by a newline.

    Only ``<br>``-style tags are handled; any other markup is left untouched.

    Args:
        text: The raw review text (a ``str``).

    Returns:
        A new string where every match of ``emb_tag_re`` is ``'\\n'``.
    """
    return emb_tag_re.sub('\n', text)

In [81]:
# Characters that terminate a token: whitespace plus common punctuation,
# quotes, parentheses and square brackets. Apostrophes and hyphens are not
# listed, so they stay inside tokens.
word_split_re = re.compile(r'[,\.\s\?\!":;\(\)\[\]]')

def get_words_simple(text):
    """Split review text into a list of lowercase word tokens.

    Line-break tags are first turned into newlines with ``replace_tags``. The
    text is then split on every character matched by ``word_split_re``; each
    piece is lowercased and stripped, and empty pieces are discarded.
    """
    pieces = word_split_re.split(replace_tags(text))
    normalized = (piece.lower().strip() for piece in pieces)
    return [token for token in normalized if len(token) > 0]

In [82]:
# Tokenize every review and keep the token lists in a new `words` column.
reviews['words'] = reviews['review'].apply(get_words_simple)

In [84]:
# Show five random rows to check the new `words` column.
reviews.sample(5)

Unnamed: 0,id,titleId,dataset,class,datasetclassId,rating,review,words
79613,79614,tt0372237,train,unsup,29650,0,I cried when I thought about all the money tha...,"[i, cried, when, i, thought, about, all, the, ..."
38940,38941,tt0065854,train,pos,1503,9,(SPOILERS included) This film surely is the be...,"[spoilers, included, this, film, surely, is, t..."
74580,74581,tt0201338,train,unsup,24699,0,This movie has only just come out on DVD in Br...,"[this, movie, has, only, just, come, out, on, ..."
38732,38733,tt0049432,train,pos,1199,10,This is the most compelling and excellent perf...,"[this, is, the, most, compelling, and, excelle..."
236,237,tt0018294,test,neg,147,4,"I usually enjoy watching Laurel and Hardy, but...","[i, usually, enjoy, watching, laurel, and, har..."


## Train doc2vec

In [85]:
# Build one TaggedDocument per review, tagged with the review id as a string.
# Zipping the two needed columns avoids DataFrame.iterrows(), which builds a
# full Series object for every row only to read two of its fields.
docs = [
    TaggedDocument(words, [f"{review_id}"])
    for review_id, words in zip(reviews['id'], reviews['words'])
]

%%time
# Fit a Doc2Vec model on the tagged reviews: 50-dimensional vectors, 10 epochs.
# NOTE(review): no `seed` is passed, so results may differ between runs —
# confirm whether reproducibility matters here.
doc_model = Doc2Vec(docs, vector_size=50, epochs=10)

In [92]:
# Make sure the output directory exists (no error if it is already there),
# then persist the trained model to disk.
os.makedirs("var/models", exist_ok=True)
doc_model.save("var/models/review_model.d2v")

2022-03-27 12:45:09,498 - gensim.utils - INFO - Doc2Vec lifecycle event {'fname_or_handle': 'var/models/review_model.d2v', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-03-27T12:45:09.498104', 'gensim': '4.1.2', 'python': '3.7.6 (tags/v3.7.6:43364a7ae0, Dec 19 2019, 00:42:30) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'saving'}
2022-03-27 12:45:09,499 - gensim.utils - INFO - not storing attribute cum_table
2022-03-27 12:45:09,640 - gensim.utils - INFO - saved var/models/review_model.d2v


## Get Embeddings

In [94]:
# Infer a vector from each review's token list using the trained model.
reviews['review_embedding'] = reviews['words'].apply(doc_model.infer_vector)

## Save Embeddings by ID to SQL

In [107]:
# Copy just the id and embedding columns, so that changes made to
# `review_saves` do not touch the `reviews` DataFrame.
review_saves = reviews[['id', 'review_embedding']].copy()

In [100]:
#review_saves['words'] = review_saves.words.apply(json.dumps)

In [108]:
# Serialize each embedding as a JSON array of plain Python floats, giving a
# string value per row.
review_saves['review_embedding'] = review_saves['review_embedding'].apply(
    lambda emb: json.dumps(list(map(float, emb)))
)

In [109]:
# Show five random rows to check the serialized embeddings.
review_saves.sample(5)

Unnamed: 0,id,review_embedding
86272,86273,"[-0.7933663725852966, 0.7390387654304504, -0.1..."
56820,56821,"[-0.5106093287467957, 0.3753964900970459, 0.06..."
43783,43784,"[-1.0685176849365234, 0.1314537078142166, 0.05..."
74359,74360,"[-1.2760909795761108, 1.0327000617980957, -1.4..."
73144,73145,"[-0.26133278012275696, 0.05307111516594887, -0..."


In [110]:
# Open a new connection to the same database file the reviews were loaded
# from; `db` is rebound to it.
db = sqlite3.connect("var/reviews.db")

In [111]:
# Write the embeddings to the `review_embeddings` table in batches of 5000
# rows, replacing the table if it already exists. The DataFrame index is not
# written.
review_saves.to_sql(
    'review_embeddings',
    con=db,
    if_exists='replace',
    index=False,
    chunksize=5000,
)

In [112]:
# Commit the current transaction on the connection.
db.commit()

In [113]:
# Add a unique index on `id` to the newly written table, then commit.
db.execute("create unique index i_review_embeddings_pk on review_embeddings (id)")
db.commit()

In [114]:
# Close the database connection now that all writes are done.
db.close()