# Read Syntetic reviews of fashions 

In [18]:
import pandas as pd

reviews = pd.read_csv('../Syntetic_reviews/reviews_all.csv')

# Useful classes and functions

In [19]:
import numpy as np

def cosine_similarity(v1: np.ndarray, v2: np.ndarray) -> float:
    return distance.cosine(v1, v2)

class VectorDatabase:
    def __init__(self,nlp,model):
        self.vectors = {}
        self.nlp = nlp
        self.model = model
        

    def split_sentences(self, text):
        doc = self.nlp(text, disable=["ner"])
        roots = [token  for token in doc if token.dep_ == "ROOT" ]
    
        texts = []
        for root in roots:
            token_list = [e.i for e in root.subtree]
            token_list = list(dict.fromkeys(token_list))
            token_list.sort()
            text = ' '.join([doc[i].text for i in token_list ])
            texts.append(text.lower().strip())
            
        return texts


    def insert(self, sentence: str, polarity: int, type: str) -> None:
        model = self.model
        embeddings = model.encode(sentence)
        key = len(self.vectors) + 1
        self.vectors[key] = {'text': sentence,
                             'polarity': polarity,
                             'type': type,
                             'vector': embeddings}

    def search(self, query: str):
        model = self.model
        query_vector = model.encode(query)
        
        similarities = [(key, value['text'],distance.cosine(query_vector, value['vector']),value['polarity'],value['type']) for key, value in self.vectors.items()]
        

        aux = pd.DataFrame(similarities)
        aux.columns = ['index_db','text','similarity','polarity','topic']

        aux = aux.reset_index().query('index<10 or similarity<0.6').query('similarity<0.7')[['index','topic']].groupby(['topic']).count()
        
        aux['index2'] = aux['index']/aux['index'].sum()
        
        
        
        return  list(aux.query('index2>0.4 and index>=4').index.values)

    def long_search(self, query: str):
        topics = []
        for str in self.split_sentences(query):
            topics_this = self.search(str)
            if len(topics_this)>0:
                mini_df = pd.DataFrame(topics_this)
                mini_df.columns = ['topic']
                mini_df['review'] = query
                mini_df['sub_review'] = str
                topics.append(mini_df)
        if len(topics)>0:
            
            aux = pd.concat(topics)
            #aux ['stars'] = [int(self.sentiment_pipe(str)[0]['label'][0]) for str in aux.sub_review]
        else:
            aux = None
            
        return  aux



# Create V Database

In [20]:
%%time
import numpy as np
from scipy.spatial import distance
from collections import defaultdict
from typing import List, Tuple
import spacy

from fast_sentence_transformers import FastSentenceTransformer as SentenceTransformer

# use any sentence-transformer
model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu", quantize=True)


nlp = spacy.load("en_core_web_lg")
vector_db = VectorDatabase(nlp, model)

for index, row in reviews.iterrows():
    vector_db.insert(row['Review'],row['Polarity'],row['Topic'])



Model found at: /Users/mateograciano/.cache/torch/sentence_transformers/sentence-transformers_all-MiniLM-L6-v2/quantized_true.onnx
CPU times: user 36.1 s, sys: 499 ms, total: 36.6 s
Wall time: 6.29 s


# Test VDB

In [29]:
str = '''

I hung dry them, but the edges have already started deteriorating. 
Amazing Design.
The fit is perfect.
Awful Customer Service


'''

aux = vector_db.long_search(str)

aux


Unnamed: 0,topic,review,sub_review
0,Longevity,"\n\nI hung dry them, but the edges have alread...","i hung dry them , but the edges have already s..."
0,Design,"\n\nI hung dry them, but the edges have alread...",amazing design .
0,Fit and Comfort,"\n\nI hung dry them, but the edges have alread...",the fit is perfect .
0,Customer Support,"\n\nI hung dry them, but the edges have alread...",awful customer service


# Test VDB with some actual reviews from our dataset

In [9]:

from google.cloud import bigquery
import os

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "../sa.json"


client = bigquery.Client()


sql = '''

SELECT reviewText,overall,asin
from `factored.raw_reviews`
WHERE asin = 'B00HDZIT0S'
'''

df = client.query(sql).result().to_dataframe()



In [17]:
%%time 
ans= []
for index, row in df.iterrows():
    #print(index)
    review = row['reviewText']
    if review is None:
        review = ''
    if (len(review)>0):
        aux = vector_db.long_search(review)
        if aux is not None:
            aux['stars'] = row['overall']
            ans.append(aux)

ans = pd.concat(ans)


df['topic']= 'overall'
overall = df[['topic','overall']].groupby(['topic']).mean()
overall.columns = ['stars']


topics = ans[['topic','stars']].groupby(['topic']).mean()


pd.concat([overall,topics])

CPU times: user 59 s, sys: 245 ms, total: 59.3 s
Wall time: 10.5 s


Unnamed: 0_level_0,stars
topic,Unnamed: 1_level_1
overall,3.809129
Fit and Comfort,3.980198
Material and Quality,3.125
Versatility,4.266667
longevity,2.555556
