# Synthetic reviews of fashion products

In [28]:
import sys
import json
from typing import List, Tuple
from collections import defaultdict

import spacy
import numpy as np
import pandas as pd
from scipy.spatial import distance
from fast_sentence_transformers import FastSentenceTransformer as SentenceTransformer
from transformers import pipeline

sys.path.insert(0, '../')
from upload_data.utils import execute_query


class VectorDatabase:
    """In-memory store of sentence embeddings, each labelled with a topic.

    Stored sentences are compared with queries using cosine *distance*
    (``scipy.spatial.distance.cosine``): 0 means same direction, larger means
    further apart.  The column historically named ``similarity`` therefore
    holds distances, and both thresholds below are distance thresholds.

    Attributes:
        vectors: dict mapping an integer key (1, 2, ...) to a dict with the
            keys ``'text'``, ``'type'`` and ``'vector'``.
        nlp: callable used by ``split_sentences``; it is called as
            ``nlp(text, disable=["ner"])`` and its result is iterated for
            tokens exposing ``dep_``, ``i``, ``text`` and ``subtree``.
        model: object whose ``encode`` method turns a string into a vector.
        very_similar: distance threshold recomputed by ``set_th``; no method
            of this class reads it.
        similar: maximum distance for a stored sentence to vote in ``search``.
    """

    # ``search`` lets at most this many nearest stored sentences vote
    # (the original filter was ``index <= 10``, i.e. positions 0..10).
    _TOP_K = 11

    def __init__(self, nlp, model):
        self.vectors = {}
        self.nlp = nlp
        self.model = model
        self.very_similar = 0.5
        self.similar = 0.8

    def split_sentences(self, text) -> List[str]:
        """Split ``text`` into one lower-cased string per dependency ROOT.

        For every token whose ``dep_`` is ``"ROOT"``, the texts of the tokens
        in its subtree are joined with single spaces in document order.

        Args:
            text: the text handed to ``self.nlp``.

        Returns:
            A list of lower-cased, stripped strings, one per ROOT token.
        """
        doc = self.nlp(text, disable=["ner"])

        sentences = []
        for token in doc:
            if token.dep_ != "ROOT":
                continue
            # Unique token positions of the subtree, in document order.
            positions = sorted({member.i for member in token.subtree})
            joined = ' '.join(doc[i].text for i in positions)
            sentences.append(joined.lower().strip())

        return sentences

    def insert(self, sentence: str, type: str) -> None:
        """Encode ``sentence`` and store it under the next integer key.

        Args:
            sentence: text to embed; stored verbatim under ``'text'``.
            type: topic label stored under ``'type'``.  The name shadows the
                builtin but is kept so existing keyword callers still work.
        """
        embeddings = self.model.encode(sentence, normalize_embeddings=True)
        key = len(self.vectors) + 1
        self.vectors[key] = {'text': sentence,
                             'type': type,
                             'vector': embeddings}

    def search(self, query: str) -> Tuple[List[str], float]:
        """Return the dominant topic near ``query`` and a spread measure.

        Args:
            query: text to encode and compare against every stored vector.

        Returns:
            ``(topics, entropy)``.  ``topics`` holds at most one label: the
            most frequent topic among the ``_TOP_K`` closest stored sentences
            whose cosine distance is ``<= self.similar`` (ties go to the
            alphabetically first label).  ``entropy`` is the Shannon entropy,
            in bits, of ``1 / (1 + distance)`` normalised over *all* stored
            vectors.  For an empty database the result is ``([], nan)``.
        """
        if not self.vectors:
            # Nothing to compare against; the original code raised here when
            # naming the columns of an empty DataFrame.
            return [], np.nan

        # Cosine distance is scale-invariant, so the query does not need the
        # normalisation that ``insert`` requests.
        query_vector = self.model.encode(query)

        rows = [
            (key, entry['text'],
             distance.cosine(query_vector, entry['vector']), entry['type'])
            for key, entry in self.vectors.items()
        ]
        ranked = pd.DataFrame(
            rows, columns=['index_db', 'text', 'similarity', 'topic'])
        ranked = ranked.sort_values(by='similarity').reset_index(drop=True)

        closeness = 1 / (1 + ranked['similarity'].values)
        weights = closeness / np.sum(closeness)
        entropy = np.sum(weights * -np.log2(weights))

        nearest = ranked.iloc[:self._TOP_K]
        nearest = nearest[nearest['similarity'] <= self.similar]
        votes = nearest.groupby('topic').size()
        # A stable sort makes the winner deterministic when counts tie.
        winner = votes.sort_values(ascending=False, kind='stable').head(1)

        return list(winner.index.values), entropy

    def long_search(self, query: str):
        """Run ``search`` on every sentence of ``query`` and collect the hits.

        Args:
            query: a possibly multi-sentence text.

        Returns:
            ``(frame, avg_entropy)``.  ``frame`` is a DataFrame with the
            columns ``topic``, ``review`` (the full query) and ``sub_review``
            (the sentence that produced the topic), or ``None`` when no
            sentence produced a topic.  ``avg_entropy`` is the mean entropy of
            the sentences that produced a topic, or ``nan`` when none did.
        """
        frames = []
        entropies = []
        for sentence in self.split_sentences(query):
            topics_this, entropy = self.search(sentence)
            if not topics_this:
                continue
            frames.append(pd.DataFrame({'topic': topics_this,
                                        'review': query,
                                        'sub_review': sentence}))
            entropies.append(entropy)

        aux = pd.concat(frames) if frames else None
        # np.mean([]) would also give nan, but with a RuntimeWarning.
        avg_entropy = np.mean(entropies) if entropies else np.nan
        return aux, avg_entropy

    def set_th(self):
        """Recompute both distance thresholds from the stored vectors.

        For every stored vector, the cosine distances to all vectors of the
        same topic (itself included, at distance 0) are collected and their
        75th and 25th percentiles taken.  ``self.similar`` becomes the mean of
        the 75th percentiles; ``self.very_similar`` becomes the 95th
        percentile of the 25th percentiles.  With an empty database both
        thresholds are left unchanged.
        """
        if not self.vectors:
            return

        entries = list(self.vectors.values())
        topics = [entry['type'] for entry in entries]
        matrix = np.asarray([entry['vector'] for entry in entries], dtype=float)
        # All pairwise cosine distances in a single call.
        pairwise = distance.cdist(matrix, matrix, metric='cosine')

        same_type_similarity = []
        same_type_top_similarity = []
        for row, topic in zip(pairwise, topics):
            # Plain equality instead of a DataFrame.query string, so labels
            # containing quotes cannot break the filter.
            same_topic = [d for d, other in zip(row, topics) if other == topic]
            same_type_similarity.append(np.percentile(same_topic, 75))
            same_type_top_similarity.append(np.percentile(same_topic, 25))

        self.very_similar = np.percentile(same_type_top_similarity, 95)
        self.similar = np.mean(same_type_similarity)


#### Load models: 

Synthetic reviews, the sentence-embedding model, and the NLP preprocessing pipeline

In [29]:
# Load the synthetic reviews; the cell that fills the vector database iterates
# this object with .items() as topic -> reviews.  The encoding is explicit so
# the read does not depend on the platform default, and the `with` block
# closes the file by itself.
with open('syn_reviews.json', 'r', encoding='utf-8') as file:
    syn_reviews = json.load(file)

# Quantized sentence-embedding model, run on CPU.
model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu", quantize=True)
# spaCy pipeline handed to VectorDatabase for sentence splitting.
nlp = spacy.load("en_core_web_lg")

Model found at: C:\Users\david/.cache\torch\sentence_transformers\sentence-transformers_all-MiniLM-L6-v2\quantized_true.onnx


#### Train vector base

In [30]:
# Fill the vector database: every synthetic review is stored under the topic
# it is listed under in syn_reviews.
vector_db = VectorDatabase(nlp, model)
print('uploading vectors to DB')
labelled_reviews = (
    (text, label)
    for label, texts in syn_reviews.items()
    for text in texts
)
for review, topic in labelled_reviews:
    vector_db.insert(review, topic)

uploading vectors to DB


#### Extract real data and predict on these reviews

In [6]:
# Validation data: up to 220 distinct, non-null review texts for a single
# product (asin 'B00M4NF9H0'), read from a 5 percent system sample of the
# raw_reviews table.  TABLESAMPLE is presumably non-deterministic, so reruns
# may return different rows -- TODO confirm.
query = '''
SELECT DISTINCT reviewText
FROM `plenary-stacker-393921.factored.raw_reviews` TABLESAMPLE SYSTEM (5 PERCENT)
WHERE asin = 'B00M4NF9H0' AND reviewText IS NOT NULL
LIMIT 220
'''

# execute_query comes from upload_data.utils; its result is used further down
# as a pandas DataFrame with a 'reviewText' column.
val_reviews = execute_query(query)

In [31]:
# Predict topics for every real review with the vector database.
# Each guess is the array of topics returned by long_search, or an empty list
# when no sentence of the review matched; entropy is the review's average.
guesses = []
entropy = []
for review in val_reviews['reviewText']:
    aux, avg_entropy = vector_db.long_search(review)
    guesses.append(aux.topic.values if aux is not None else [])
    entropy.append(avg_entropy)

val_reviews['guesses'] = guesses
val_reviews['entropy'] = entropy

In [27]:
# Write the current val_reviews DataFrame to 'prueba.xlsx' for manual inspection.
val_reviews.to_excel('prueba.xlsx')

#### Zero-shot baseline model: BART fine-tuned on MNLI by Meta (`facebook/bart-large-mnli`)

In [22]:
# Zero-shot baseline: score every review against a fixed list of topic labels.
classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")

candidate_labels = ['Design','Brand','Packaging and Presentation','Price and Value','Customer Support','User Experience','Material and Quality','Fit and Comfort','Versatility','Longevity']

# `multi_label=True` is the documented keyword for scoring each label
# independently.  The previous `multiclass=True` is not a parameter of the
# zero-shot pipeline, so multi-label scoring was never actually requested.
result_val = val_reviews['reviewText'].apply(
    lambda x: classifier(x, candidate_labels, multi_label=True))

# Keep every label whose score exceeds 0.15.
val_reviews['model'] = [
    [label for label, score in zip(result['labels'], result['scores']) if score > 0.15]
    for result in result_val
]


KeyboardInterrupt



In [None]:
# Write val_reviews to 'two_models.xlsx'; the next cell reads this file back.
val_reviews.to_excel('two_models.xlsx')

In [None]:
import re


def _parse_label_list(cell):
    """Recover a list of labels from its string form in the Excel file.

    Handles both the Python list form (``"['a', 'b']"``) and the NumPy array
    form (``"['a' 'b']"``, possibly wrapped over several lines) by extracting
    every quoted substring.  An empty list (``"[]"``) gives ``[]`` instead of
    a list holding one junk string, and non-string cells (e.g. NaN) give
    ``[]`` instead of raising.
    """
    if not isinstance(cell, str):
        return []
    quoted = re.findall(r"'([^']*)'|\"([^\"]*)\"", cell)
    return [single or double for single, double in quoted]


# Reload the saved predictions and turn both label columns back into lists.
val_reviews = pd.read_excel('two_models.xlsx')
val_reviews['guesses'] = val_reviews['guesses'].apply(_parse_label_list)
val_reviews['model'] = val_reviews['model'].apply(_parse_label_list)

In [28]:
# Write the current val_reviews DataFrame to 'comparison.xlsx'.
val_reviews.to_excel('comparison.xlsx')