# Read Syntetic reviews of fashions 

In [3]:
import pandas as pd

reviews = pd.read_csv('../Syntetic_reviews/reviews_all.csv')

# Create V Database

In [219]:
%%time
import numpy as np
from scipy.spatial import distance
from collections import defaultdict
from typing import List, Tuple
import spacy
from fast_sentence_transformers import FastSentenceTransformer as SentenceTransformer


import numpy as np


class VectorDatabase:
    def __init__(self,nlp,model):
        self.vectors = {}
        self.nlp = nlp
        self.model = model
        self.very_similar = 0.5
        self.similar = 0.5
        

    def split_sentences(self, text):
        doc = self.nlp(text, disable=["ner"])
        roots = [token  for token in doc if token.dep_ == "ROOT" ]
    
        texts = []
        for root in roots:
            token_list = [e.i for e in root.subtree]
            token_list = list(dict.fromkeys(token_list))
            token_list.sort()
            text = ' '.join([doc[i].text for i in token_list ])
            texts.append(text.lower().strip())
            
        return texts


    def insert(self, sentence: str, polarity: int, type: str) -> None:
        model = self.model
        embeddings = model.encode(sentence)
        key = len(self.vectors) + 1
        self.vectors[key] = {'text': sentence,
                             'polarity': polarity,
                             'type': type,
                             'vector': embeddings}

    def search(self, query: str):
        model = self.model
        query_vector = model.encode(query)
        
        similarities = [(key, value['text'],distance.cosine(query_vector, value['vector']),value['polarity'],value['type']) for key, value in self.vectors.items()]
        

        aux = pd.DataFrame(similarities)
        aux.columns = ['index_db','text','similarity','polarity','topic']
        aux = aux.sort_values(by=['similarity']).reset_index(drop=True).reset_index()

        #aux = aux.reset_index().query('index<20 or similarity<0.7').query('similarity<1')[['index','topic']].groupby(['topic']).count()
        
        aux = aux.query('index<=10')
        #aux = aux.query('similarity <={}'.format(self.very_similar))

        aux = aux.query('similarity <={}'.format(self.similar))
        
        aux = aux[['index','topic']].groupby(['topic']).count()
        
        
        #aux['index2'] = aux['index']/aux['index'].sum()

        
        

        aux = aux.sort_values(by='index', ascending=False).head(1)
                
        return  list(aux.index.values)

    def long_search(self, query: str):
        topics = []
        for str in self.split_sentences(query):
            topics_this = self.search(str)
            if len(topics_this)>0:
                mini_df = pd.DataFrame(topics_this)
                mini_df.columns = ['topic']
                mini_df['review'] = query
                mini_df['sub_review'] = str
                topics.append(mini_df)
        if len(topics)>0:
            
            aux = pd.concat(topics)
            #aux ['stars'] = [int(self.sentiment_pipe(str)[0]['label'][0]) for str in aux.sub_review]
        else:
            aux = None
            
        return  aux

    def set_th(self):
        data = pd.DataFrame(self.vectors).transpose()

        same_type_similarity = []
        
        same_type_top_similarity = []
        
        for i in range(len(data.vector)):
        
            vectors = data.vector
            vector = vectors.values[i]
            aux = pd.DataFrame(
                [distance.cosine(vector, vectors[i]) for i in vectors.keys()]
            )
            
            aux.columns = ['similarity']
            
            aux['topic'] = data.type.values
            
            topic_review = data.type.values[i]
             
            same_type_similarity.append(np.percentile(aux.query(f'topic=="{topic_review}"').similarity,50))
        
            same_type_top_similarity.append(np.percentile(aux.query(f'topic=="{topic_review}"').similarity,25))
        
        self.very_similar = np.mean(same_type_top_similarity)
        self.similar = np.mean(same_type_similarity)

        print(self.very_similar,self.similar)








# use any sentence-transformer
model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu", quantize=True)


nlp = spacy.load("en_core_web_lg")




for _ in range(5):
    train_reviews = reviews.sample(n=700)
    val_reviews = reviews[~reviews.index.isin(train_reviews.index)]
    
    vector_db = VectorDatabase(nlp, model)
    
    for index, row in train_reviews.iterrows():
        vector_db.insert(row['Review'],row['Polarity'],row['Topic'])
    
    
    vector_db.set_th()
    
    
    guesses = []
    
    
    for index, row in val_reviews.iterrows():
        #print(index)
        review = row['Review']
        aux = vector_db.long_search(review)
        guess = []
        if aux is not None:
            guess = aux.topic.values
        guesses.append(guess)
        
    val_reviews['guesses'] = guesses
    
    
    recalls = []
    precisions= []
    for index, row in val_reviews.iterrows():
        recall = row['Topic'] in row['guesses']
        precision = np.nan
        if len(row['guesses'])>0:
            precision = recall
            
        recalls.append(recall)
        precisions.append(precision)
    
    precision = np.nanmean(np.array(precisions))
    recall = np.nanmean(np.array(recalls))
    
    print('recall: {} precision: {}'.format(recall,precision))



Model found at: /Users/mateograciano/.cache/torch/sentence_transformers/sentence-transformers_all-MiniLM-L6-v2/quantized_true.onnx
0.35591753724430286 0.41907971093697205
CPU times: user 1min 52s, sys: 891 ms, total: 1min 53s
Wall time: 39.3 s


recall: 0.9035087719298246 precision: 0.911504424778761


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_reviews['guesses'] = guesses
