In [None]:
"""Simple classification using sentence embedding models. Following are the steps:
1. Load a model (e.g. doc2vec)
2. Load keywords for each class
3. measure distance between each class and a sentence. Select argmin. """ 
#imports
from absl import logging

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

import tensorflow_hub as hub
import sentencepiece as spm
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import ssl
ssl._create_default_https_context = ssl._create_unverified_context



In [None]:
module = hub.Module("https://tfhub.dev/google/universal-sentence-encoder-lite/2")

In [None]:
input_placeholder = tf.sparse_placeholder(tf.int64, shape=[None, None])
encodings = module(
    inputs=dict(
        values=input_placeholder.values,
        indices=input_placeholder.indices,
        dense_shape=input_placeholder.dense_shape))

In [None]:
with tf.Session() as sess:
  spm_path = sess.run(module(signature="spm_path"))

sp = spm.SentencePieceProcessor()
with tf.io.gfile.GFile(spm_path, mode="rb") as f:
  sp.LoadFromSerializedProto(f.read())
print("SentencePiece model loaded at {}.".format(spm_path))

In [None]:
def process_to_IDs_in_sparse_format(sp, sentences):
  # An utility method that processes sentences with the sentence piece processor
  # 'sp' and returns the results in tf.SparseTensor-similar format:
  # (values, indices, dense_shape)
  ids = [sp.EncodeAsIds(x) for x in sentences]
  max_len = max(len(x) for x in ids)
  dense_shape=(len(ids), max_len)
  values=[item for sublist in ids for item in sublist]
  indices=[[row,col] for row in range(len(ids)) for col in range(len(ids[row]))]
  return (values, indices, dense_shape)

In [None]:
# Compute a representation for each message, showing various lengths supported.
word = "Elephant"
sentence = "I am a sentence for which I would like to get its embedding."
paragraph = (
    "Universal Sentence Encoder embeddings also support short paragraphs. "
    "There is no hard limit on how long the paragraph is. Roughly, the longer "
    "the more 'diluted' the embedding will be.")
messages = [word, sentence, paragraph]

values, indices, dense_shape = process_to_IDs_in_sparse_format(sp, messages)

# Reduce logging output.
logging.set_verbosity(logging.ERROR)

with tf.Session() as session:
  session.run([tf.global_variables_initializer(), tf.tables_initializer()])
  message_embeddings = session.run(
      encodings,
      feed_dict={input_placeholder.values: values,
                input_placeholder.indices: indices,
                input_placeholder.dense_shape: dense_shape})

  for i, message_embedding in enumerate(np.array(message_embeddings).tolist()):
    print("Message: {}".format(messages[i]))
    print("Embedding size: {}".format(len(message_embedding)))
    message_embedding_snippet = ", ".join(
        (str(x) for x in message_embedding[:3]))
    print("Embedding: [{}, ...]\n".format(message_embedding_snippet))

In [None]:
def get_embeddings(messages):
    """uses USE to get embeddings of messages.
    message is array of strings. String could be a word or a sentence."""
    values, indices, dense_shape = process_to_IDs_in_sparse_format(sp, messages)
    message_embeddings = None
    with tf.Session() as session:
        session.run([tf.global_variables_initializer(), tf.tables_initializer()])
        message_embeddings = session.run(
        encodings,
        feed_dict=
        {input_placeholder.values: values,
                    input_placeholder.indices: indices,
                    input_placeholder.dense_shape: dense_shape})
    return np.array(message_embeddings)

In [None]:
embeddings = get_embeddings(['clean', 'dirty'])
print(len(embeddings))

In [None]:
def create_class_keywords():
    """generates class keywords and class embeddings"""
    #keywords could be generated automatically using word_embedding models
    keywords = {'staff':['staff', 'reception', 'waiter', 'housekeeping'], 
                'location':['location', 'area', 'neighborhood', 'neighbourhood', 'near', 'far'], 
                'service':['service', ' helpful', ' facility'], #redundant with staff
                'room':['room', 'bedroom', 'bed', 'floor'], 
                'sleep_quality':['sleep quality', 'sleep', 'insomnia', 'noisy', 'noise', 'bed',                                         'pillows'], 
                'swimming_pool':['swimming', 'pool', 'jacuzzi', 'pools'], 
                'value_for_money':['expensive', 'cheap', 'cost', 'price', 'value for money'], 
                'cleanliness':['cleanliness', 'clean', 'bathroom', 'toilet', 'dirty', 'spotless',                                       'sanitary', 'unclean', 'tidy']}

    embeddings = {}
    for key in keywords:
        embeddings[key] = get_embeddings(keywords[key])
    
    return keywords, embeddings
 



In [42]:
class_keywords, class_embeddings = create_class_keywords()

In [43]:
def cos_similarity(vector1, vector2):
  cos_sim = np.dot(vector1, vector2)/(np.linalg.norm(vector1)*np.linalg.norm(vector2))
  return cos_sim

In [74]:
import nltk
def review_aspects(review, class_embeddings):
    #sentence tokenization
    sentences = nltk.tokenize.sent_tokenize(review)
    
    sentence_embeddings = get_embeddings(sentences)
    sentence_class = {}
    for i in range(len(sentences)):
        class_distances = {}
        for key in class_embeddings:
            min_dist = np.Infinity
            for keyword_embed in class_embeddings[key]:
                dist=abs(cos_similarity(keyword_embed, sentence_embeddings[i]))
                if min_dist > dist:
                    min_dist = dist
            class_distances[key] = min_dist

        #the right approach is actually outlier detection, use 3 sigma rule
        #mu  = mean of the data
        #std = standard deviation of the data
        #IF abs(x-mu) > 3*std  THEN  x is outlier
        mu = np.mean(list(class_distances.values()))
        print(list(class_distances.values()))
        # print('mu', mu)
        std = np.std(list(class_distances.values()))
        sentence_class[i] = None
        for key in class_distances:
            x = class_distances[key]
            if x-mu < 0:
                if abs(x-mu) > 2*std:
                    sentence_class[i] = key
                    break #wrong logic, should be minimum of whichever options pass the outlier test
    return sentences, sentence_class

In [75]:
review= "Stumbled across this restaurant while walking near Antibes. Was initially quiet but steady stream of customers created a friendly atmosphere. I ordered steak, which was perfect and the same dessert twice as it was to die for. The staff were fantastic and made the evening great and it didn’t feel like I was dining alone. Thanks Matthieu." 
review_aspects(review, class_embeddings)


[0.16856785, 0.18015818, 0.14343667, 0.15319811, 0.061022647, 0.12810887, 0.10749574, 0.1624117]
[0.21744406, 0.18552805, 0.123958535, 0.07767787, 0.032654315, 0.03220264, 0.17714652, 0.067323595]
[0.09785338, 0.062012054, 0.048544865, 0.06243664, 0.04928684, 0.07428492, 0.08454848, 0.094223544]
[0.13947722, 0.0406598, 0.043107465, 0.066168755, 0.016844431, 0.0701942, 0.018802041, 0.020055488]
[0.10922802, 0.05728433, 0.13644898, 0.10538236, 0.06376191, 0.029890656, 0.13340549, 0.049658675]


(['Stumbled across this restaurant while walking near Antibes.',
  'Was initially quiet but steady stream of customers created a friendly atmosphere.',
  'I ordered steak, which was perfect and the same dessert twice as it was to die for.',
  'The staff were fantastic and made the evening great and it didn’t feel like I was dining alone.',
  'Thanks Matthieu.'],
 {0: 'sleep_quality', 1: None, 2: None, 3: None, 4: None})

In [None]:
#A suggestion:
#1. Train fasttext embeddings on my data.
#2. Preprocess the data so that only noun phrases are remaining. 
#3. Compare the distance of class keywords to the noun phrases in the sentence
#4. Use the same logic as above to select the class 