In [1]:
import numpy as np
from scipy.spatial import distance
from collections import defaultdict
from typing import List, Tuple
from fast_sentence_transformers import FastSentenceTransformer as SentenceTransformer
import numpy as np
import spacy
import pandas as pd


#model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
#model = SentenceTransformer('sentence-transformers/multi-qa-mpnet-base-dot-v1')

model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu", quantize=True)


nlp = spacy.load("en_core_web_lg")




  from .autonotebook import tqdm as notebook_tqdm


Model found at: /Users/mateograciano/.cache/torch/sentence_transformers/sentence-transformers_all-MiniLM-L6-v2/quantized_true.onnx


In [2]:
class VectorDatabase:
    def __init__(self,nlp,model):
        self.vectors = {}
        self.nlp = nlp
        self.model = model

        

    def split_sentences(self, text):
        text = text.replace(',','.').replace('and','.').replace('but','.')
        doc = self.nlp(text, disable=["ner"])
        roots = [token  for token in doc if token.dep_ == "ROOT" ]
    
        texts = []
        for root in roots:
            token_list = [e.i for e in root.subtree]
            token_list = list(dict.fromkeys(token_list))
            token_list.sort()
            text = ' '.join([doc[i].text for i in token_list ])
            texts.append(text.lower().strip())
            
        return texts#text.replace(',','.').replace('but','.').split('.')


    def insert(self, sentence: str, type: str) -> None:
        model = self.model
        embeddings = model.encode(sentence)
        #embeddings = list(model.encode([sentence])[0])
        key = len(self.vectors) + 1
        self.vectors[key] = {'text': sentence,
                             'type': type,
                             'vector': embeddings}

    def search(self, query: str):
        model = self.model
        #query_vector = list(model.encode([query])[0])
        query_vector = model.encode(query)
        
        similarities = [(key, value['text'],distance.cosine(query_vector, value['vector']),value['type']) for key, value in self.vectors.items()]
        

        aux = pd.DataFrame(similarities)
        
        aux.columns = ['index_db','text','similarity','topic']
                
        return  aux

    def long_search(self, query: str):
        topics = []
        for str in self.split_sentences(query):
            topics_this = self.search(str)
            topics.append(topics_this)
            

        topics = pd.concat(topics)[['similarity','topic']].groupby(['topic']).min().reset_index()

        
        aux = pd.DataFrame(list(topics.similarity)).transpose()
        aux.columns = list(topics.topic)
        

        return  aux



In [19]:

import spacy
import pandas as pd



questions = {
'Fit' : 'Does it fit well?',
'Comfortable' : 'Is it comfortable?',
'Material Quality' : '''How is the material's quality?''',
'Price and Value' : 'How is the price',
'Fiability':'Does it look like the pictures?',
'Ease of use':'Is it easy to use?',
'Durability':'How is the durability?'
}

vector_db = VectorDatabase(nlp, model)
print('uploading vectors to DB')

for i in questions:
    vector_db.insert(questions[i],i)

uploading vectors to DB


# Load Real Reviews

In [20]:

from google.cloud import bigquery
import os

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "../sa.json"


client = bigquery.Client()

sql = '''
SELECT DISTINCT reviewText
FROM `plenary-stacker-393921.factored.raw_reviews` 
WHERE asin = 'B00M4NF9H0' AND reviewText IS NOT NULL
'''

df = client.query(sql).result().to_dataframe()

In [21]:
from tqdm import tqdm
all = []
for index, row in tqdm(df.head(1000).iterrows(), total=1000):
    ans = vector_db.long_search(row['reviewText'])
    ans['text'] = row['reviewText']
    all.append(ans)

all = pd.concat(all)

100%|█████████████████████████████████████████████████████████████████████████| 1000/1000 [01:38<00:00, 10.14it/s]


# Check results

In [23]:
for index, row in all.sort_values(['Durability']).head(7).iterrows():
    print(row['text'])
    print('--------')

Super comfortable shoes that are easy to take on and off and also support a good cause through the one for one program. Very durable and long lasting. Definitely one of my favorite pairs of casual shoes.
--------
I love the comfort of this shoes,  I would definitely recommend it to my friends and family. I chose this rating because of the New look and the durability of the shoes.
--------
My husband is very hard to please in terms of slippers! He loves these. We will certainly purchase more of these in the future. Pricey, but durable.
--------
Good looking shoe, looks very durable and comfortable, but NOT for wide feet. Wide width is not offered and even choosing a bigger size will be too small.
--------
I needed another throw around type of shoe that I could just slide on. I grabbed these and am happy with them so far. The sizing does run big, I normally wear a 9.5 and I went down to an 8.5 for these (which is still a tad big, considering they do stretch). However, they are still very

In [24]:
for index, row in all.sort_values(['Material Quality']).head(7).iterrows():
    print(row['text'])
    print('--------')

These shoes did not fit like the other size 8 shoes TOMS I have purchased. I returned these shoes because they were too big. The quality of the material all seemed to be the same though.
--------
--------
Quality is not very good. Color quickly fades and they absolutely stink if they get wet.
--------
As usual I loved my Toms. They fit perfectly and are very comfortable. The material is as expected and the shipping was pretty quick too.
--------
I love these shoes....I would wear them all the time, BUT I have had them for only 2 months and one has a big hole in the heel and the other one is starting to develop a hole as well!  I have a second pair because I was so excited about them, and the heels on those are wearing out as well.  Obviously, the quality is lacking... I am so sad, but don't feel like I should waste any more money on the many other colors I would love to have.
--------
ARE VERY HIGH QUALITY, THE COLOR IS PERFECT AND THEY FEEL SO GOOD!!! I CANT WAIT TO ANOTHER PAIR. LOVE

In [13]:
for index, row in all.sort_values(['Fiability']).head(7).iterrows():
    print(row['text'])
    print('--------')

Comfortable but feel a little tight across the top of my foot (I do have a high arch though). They look just like the photos and they are super cute!  I am hoping they will loosen up as I continue to wear them.
--------
My mom loved the Toms and it arrived quickly. It is exactly as pictured. Great buy no complaints and is true to size
--------
This is about my 6th Toms Shoe...they've been perfect. I always order a size 6 and they always fit perfectly; the most comfortable shoe I own. This pair, unfortunately, are snug and hurt my feet after only an hour or so in. I'm going to try to stretch them, as I have to wear them weekly for a gig. But it was disappointing that they didn't fit properly after years of ordering the same size. I just ordered a different pair a few months ago in size 6 that fit perfectly, so I don't think it's my feet :) Otherwise, they're beautiful, great price and look better than the picture.
--------
She's fit perfectly. Color and picture were the same. I am looki

In [14]:
for index, row in all.sort_values(['Price and Value']).head(7).iterrows():
    print(row['text'])
    print('--------')

These were promised as good shoes. They are. My purchase bought a pair in the third world which seems nice. They are good quality and are comfortable. Price is right
--------
I am doing my review on the shoes called Toms. These have both pros and cons for the merchandise.  I am not a huge fan of Toms, but that is just my opinion. I will give you reasons why I chose to dislike the item.

  The good part first: Toms create cute shoes. They make them likable and appealing to the eye of a customer. Also the fabric is light weight, causing the shoe not to be heavy. Now let's say you buy a pair of Toms and want to do some charity work as well; if you buy a pair of the shoes, then the company will send a pair for free to a kid in need.

The Toms cons outweigh the pros. For example, the item is really expensive, ranging from $30.00 to $100.00. If you step in a puddle or go to a water park or in the rain with the shoes on, they will shrink against your feet. Let's say that you're wearing Toms a

In [15]:
for index, row in all.sort_values(['Fit']).head(7).iterrows():
    print(row['text'])
    print('--------')

The TOMs run big. I am usually a 10 so I ordered a 9 and it fits perfectly. Nice shoe. Color is as expected. Very feminine.
--------
I ordered this for.my husband and it fits perfect
--------
Fits well. Very comfy. Would recommend it to everyone. I would love it in another color when I can afford another.
--------
i like the color and it fits ok. however in my previous purchases from toms store, there was always a toms bag in the bix, this time there is none, i wonder why.. are they not genuine toms??
--------
so far so good. fits great. will know how well they hold up in many months from now. too soon to tell.
--------
Love these shoes. Very comfortable. I'm normally an 8.5 but bought an 8. Fits great.
--------
I got my first pair in the ash gray color, which goes with mostly everything except brown themed outfits. So I got the navy to fill in the gaps ;) The salesperson at Nordstrom told me that they run big, which is right - I'm usually a 9 and got the 8.5, which was a little tight 

In [18]:
for index, row in all.sort_values(['Ease of use']).head(10).iterrows():
    print(row['text'])
    print('--------')

Easy for travel.  Comfortable.
--------
I ordered the size 11.5, having previously ordered a size 11.5. Yes, this is my second pair in just three months, but not because they've worn out, but because they're comfortable and easy to spend a day in. However, the size is just a touch larger than my previous pair of the same size - so, there is a slight discrepancy in the canvas fitment.
--------
This are my very first pair of Toms and I simply love them, they are easy to slip on in the morning, easy to keep clean and go well with every outfit! I've been wearing them everyday for nearly a month and knock on wood but they have held up great, since I walk everywhere I go. I am already saving up to buy a new pair!
--------
Received the shoes as ordered.  The shoes were extremely narrow and difficult to put on.  I also didn't think that they were very good looking for a man to wear, my opinion obviously.  I returned them easily and was promptly reimbursed.
--------
Love these, comfy and versat

# On a Brand

In [33]:
brand = 'Casio'
sql = f"""
SELECT asin, reviewText, overall,summary,reviewerID
FROM `factored.raw_reviews`
inner join `factored.metadata` using(asin)
where brand = '{brand}'
"""

df = client.query(sql).result().to_dataframe()

## WHERE THE MAGIC HAPPENS
# We iterate through the reviews
# On each review we try to identify the topic i.e (quality, longevity, luxury): the set of topics depends on the sample_reviews file (it depends on the brand)
# Every Review Topic (+ overall) creates a row in the database

all_reviews = []
for index, row in df.head(10).iterrows():
    print(index)
    if (row["reviewText"] is not None) & (row["reviewText"] != ""):
        topics_score = vector_db.long_search(row["reviewText"])
        
        

        reviews = {'asin':row["asin"],
                   'reviewText': row["reviewText"],
                   'overall': row["overall"],
                   "summary": row["summary"], 
                   "reviewerID":row["reviewerID"]}
        
        reviews = pd.DataFrame(reviews, index = [0])
        reviews = pd.concat([reviews,topics_score],axis=1)
        all_reviews.append(reviews)

reviews = pd.concat(all_reviews)

0
1
2
3
4
5
6
7
8
9


In [34]:
reviews

Unnamed: 0,asin,reviewText,overall,summary,reviewerID,Comfortable,Durability,Ease of use,Fiability,Fit,Material Quality,Price and Value
0,B000GB1R5U,It started off as a good watch and I only had ...,2.0,Good watch until it stopped working,A2P287GSQUR1KT,0.807008,0.892598,0.891724,0.885511,0.832114,0.924287,0.903611
0,B00KX163DY,Hard to tell time in the dark even with the LE...,2.0,Not Sure?,A3D95ED8I38N7T,0.787186,0.801621,0.861538,0.48259,0.835817,0.719913,0.785228
0,B00065FWR0,"watch stop working 21 days after receiving It,...",2.0,bad watch,A6OAI6WIIVSA7,0.847448,0.890379,0.807743,0.904287,0.857882,0.931049,0.827769
0,B00J5QR062,"Well, I'll have to divide my review in two, sp...",2.0,Bed delivery.,AMVP26IYYSKLQ,0.750805,0.769509,0.779876,0.616634,0.740376,0.780229,0.642906
0,B0017U1MJU,I saw the watch in an Outside Magazine and was...,3.0,Casio Pathfinder PAW-1500-1V Review,AMJESI07WEYQH,0.701744,0.704499,0.797868,0.786906,0.715634,0.693966,0.671016
0,B00FBO04V0,Hoped it would've been bigger but it's a lady'...,3.0,It is a good product but wouldn't recommend to...,A1KPGVWCWRB1C3,0.75243,0.745872,0.608912,0.812103,0.723596,0.626774,0.626429
0,B00134JNJG,Great watch very light weight and thin very co...,4.0,Casio Men's GW5600J-1,A3DOJPJWLCKVAH,0.518479,0.761293,0.844521,0.875044,0.843796,0.682804,0.75246
0,B00154GSQA,Overall this is an exceptional calculator for ...,4.0,Algebra-Precalculus,A1UC0YEW5P5R9Z,0.769512,0.745003,0.562648,0.756528,0.770538,0.791885,0.590462
0,B00I9I3OFS,I have been wearing this watch to the beach fo...,4.0,... the beach for a few months now and I like ...,A3KUJ3LF54L40W,0.689057,0.615953,0.760997,0.777987,0.72981,0.777321,0.853957
0,B000GAYQJK,"I got mine, and its just as described. No surp...",4.0,and that's good enough.,A1LF9QAK0VO0LS,0.811632,0.836999,0.883158,0.775925,0.852737,0.892539,0.827477


Unnamed: 0,asin,reviewText,overall,summary,reviewerID,Comfortable,Durability,Ease of use,Fiability,Fit,Material Quality,Price and Value
0,B000GB1R5U,It started off as a good watch and I only had ...,2.0,Good watch until it stopped working,A2P287GSQUR1KT,0.811632,0.836999,0.883158,0.775925,0.852737,0.892539,0.827477
0,B00KX163DY,Hard to tell time in the dark even with the LE...,2.0,Not Sure?,A3D95ED8I38N7T,0.811632,0.836999,0.883158,0.775925,0.852737,0.892539,0.827477
0,B00065FWR0,"watch stop working 21 days after receiving It,...",2.0,bad watch,A6OAI6WIIVSA7,0.811632,0.836999,0.883158,0.775925,0.852737,0.892539,0.827477
0,B00J5QR062,"Well, I'll have to divide my review in two, sp...",2.0,Bed delivery.,AMVP26IYYSKLQ,0.811632,0.836999,0.883158,0.775925,0.852737,0.892539,0.827477
0,B0017U1MJU,I saw the watch in an Outside Magazine and was...,3.0,Casio Pathfinder PAW-1500-1V Review,AMJESI07WEYQH,0.811632,0.836999,0.883158,0.775925,0.852737,0.892539,0.827477
0,B00FBO04V0,Hoped it would've been bigger but it's a lady'...,3.0,It is a good product but wouldn't recommend to...,A1KPGVWCWRB1C3,0.811632,0.836999,0.883158,0.775925,0.852737,0.892539,0.827477
0,B00134JNJG,Great watch very light weight and thin very co...,4.0,Casio Men's GW5600J-1,A3DOJPJWLCKVAH,0.811632,0.836999,0.883158,0.775925,0.852737,0.892539,0.827477
0,B00154GSQA,Overall this is an exceptional calculator for ...,4.0,Algebra-Precalculus,A1UC0YEW5P5R9Z,0.811632,0.836999,0.883158,0.775925,0.852737,0.892539,0.827477
0,B00I9I3OFS,I have been wearing this watch to the beach fo...,4.0,... the beach for a few months now and I like ...,A3KUJ3LF54L40W,0.811632,0.836999,0.883158,0.775925,0.852737,0.892539,0.827477
0,B000GAYQJK,"I got mine, and its just as described. No surp...",4.0,and that's good enough.,A1LF9QAK0VO0LS,0.811632,0.836999,0.883158,0.775925,0.852737,0.892539,0.827477
