# Syntetic reviews of fashions 

In [1]:
import sys
from typing import List, Tuple
from collections import defaultdict

import spacy
import numpy as np
import pandas as pd
from scipy.spatial import distance
from fast_sentence_transformers import FastSentenceTransformer as SentenceTransformer

sys.path.insert(0, '../')
from upload_data.utils import execute_query


class VectorDatabase:
    def __init__(self,nlp,model):
        self.vectors = {}
        self.nlp = nlp
        self.model = model
        self.very_similar = 0.5
        self.similar = 0.5
        

    def split_sentences(self, text):
        doc = self.nlp(text, disable=["ner"])
        roots = [token  for token in doc if token.dep_ == "ROOT" ]
    
        texts = []
        for root in roots:
            token_list = [e.i for e in root.subtree]
            token_list = list(dict.fromkeys(token_list))
            token_list.sort()
            text = ' '.join([doc[i].text for i in token_list ])
            texts.append(text.lower().strip())
            
        return texts


    def insert(self, sentence: str, polarity: int, type: str) -> None:
        model = self.model
        embeddings = model.encode(sentence)
        key = len(self.vectors) + 1
        self.vectors[key] = {'text': sentence,
                             'polarity': polarity,
                             'type': type,
                             'vector': embeddings}

    def search(self, query: str):
        model = self.model
        query_vector = model.encode(query)
        
        similarities = [(key, value['text'],distance.cosine(query_vector, value['vector']),value['polarity'],value['type']) for key, value in self.vectors.items()]
        

        aux = pd.DataFrame(similarities)
        aux.columns = ['index_db','text','similarity','polarity','topic']
        aux = aux.sort_values(by=['similarity']).reset_index(drop=True).reset_index()

        #aux = aux.reset_index().query('index<20 or similarity<0.7').query('similarity<1')[['index','topic']].groupby(['topic']).count()
        
        aux = aux.query('index<=10')
        #aux = aux.query('similarity <={}'.format(self.very_similar))

        aux = aux.query('similarity <={}'.format(self.similar))
        
        aux = aux[['index','topic']].groupby(['topic']).count()
        
        
        #aux['index2'] = aux['index']/aux['index'].sum()

        
        

        aux = aux.sort_values(by='index', ascending=False).head(1)
                
        return  list(aux.index.values)

    def long_search(self, query: str):
        topics = []
        for str in self.split_sentences(query):
            topics_this = self.search(str)
            if len(topics_this)>0:
                mini_df = pd.DataFrame(topics_this)
                mini_df.columns = ['topic']
                mini_df['review'] = query
                mini_df['sub_review'] = str
                topics.append(mini_df)
        if len(topics)>0:
            
            aux = pd.concat(topics)
            #aux ['stars'] = [int(self.sentiment_pipe(str)[0]['label'][0]) for str in aux.sub_review]
        else:
            aux = None
            
        return  aux

    def set_th(self):
        data = pd.DataFrame(self.vectors).transpose()

        same_type_similarity = []
        
        same_type_top_similarity = []
        
        for i in range(len(data.vector)):
        
            vectors = data.vector
            vector = vectors.values[i]
            aux = pd.DataFrame(
                [distance.cosine(vector, vectors[i]) for i in vectors.keys()]
            )
            
            aux.columns = ['similarity']
            
            aux['topic'] = data.type.values
            
            topic_review = data.type.values[i]
             
            same_type_similarity.append(np.percentile(aux.query(f'topic=="{topic_review}"').similarity,75))
        
            same_type_top_similarity.append(np.percentile(aux.query(f'topic=="{topic_review}"').similarity,25))
        
        self.very_similar = np.percentile(same_type_top_similarity,95)
        self.similar = np.mean(same_type_similarity)


#### Load models: 

Syntethic reviews, model and nlp preprocessing pipeline

In [2]:
reviews = pd.read_csv('../Syntetic_reviews/reviews_all.csv')
model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu", quantize=True)
nlp = spacy.load("en_core_web_lg")

Model found at: C:\Users\david/.cache\torch\sentence_transformers\sentence-transformers_all-MiniLM-L6-v2\quantized_true.onnx


#### Train vector base

In [3]:
vector_db = VectorDatabase(nlp, model)
print('uploading vectors to DB')
for _, row in reviews.iterrows():
    vector_db.insert(row['Review'],row['Polarity'],row['Topic'])

uploading vectors to DB


#### Extract real data and predict on these reviews

In [31]:
query = '''
SELECT DISTINCT reviewText
FROM `plenary-stacker-393921.factored.raw_reviews` TABLESAMPLE SYSTEM (5 PERCENT)
WHERE asin = 'B00M4NF9H0' AND reviewText IS NOT NULL
LIMIT 220
'''

val_reviews = execute_query(query)

In [38]:
guesses = []

for _, row in val_reviews.iterrows():
    review = row['reviewText']
    aux = vector_db.long_search(review)
    guess = []
    if aux is not None:
        guess = aux.topic.values
    guesses.append(guess)
    
val_reviews['guesses'] = guesses

In [55]:
val_reviews.loc[val_reviews['guesses'].apply(lambda x: str(x)) == '[]', 'reviewText'].to_excel('prueba.xlsx')

In [33]:
val_reviews.guesses.value_counts()

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[]                                                          147
[Fit and Comfort]                                            46
[Longevity]                                                   4
[Material and Quality]                                        3
[Price and Value]                                             2
[Fit and Comfort, Fit and Comfort]                            1
[Fit and Comfort, Fit and Comfort, Fit and Comfort]           1
[Fit and Comfort, Fit and Comfort]                            1
[Fit and Comfort, Fit and Comfort]                            1
[Fit and Comfort, Fit and Comfort]                            1
[Price and Value, Fit and Comfort]                            1
[Fit and Comfort, Fit and Comfort]                            1
[Fit and Comfort, Price and Value]                            1
[Fit and Comfort, Fit and Comfort]                            1
[Fit and Comfort, Fit and Comfort]                            1
[Fit and Comfort, Fit and Comfort]      