# Read Syntetic reviews of fashions 

In [1]:
import pandas as pd

reviews = pd.read_csv('../Syntetic_reviews/reviews_all_aumented_iter1.csv').dropna()

In [2]:
#reviews = reviews.sample(1000)

# High Precision 

In [2]:
%%time
import numpy as np
from scipy.spatial import distance
from collections import defaultdict
from typing import List, Tuple
import spacy
from fast_sentence_transformers import FastSentenceTransformer as SentenceTransformer


import numpy as np


class VectorDatabase:
    def __init__(self,nlp,model):
        self.vectors = {}
        self.nlp = nlp
        self.model = model
        self.very_similar = 0.5
        self.similar = 0.5
        

    def split_sentences(self, text):
        doc = self.nlp(text, disable=["ner"])
        roots = [token  for token in doc if token.dep_ == "ROOT" ]
    
        texts = []
        for root in roots:
            token_list = [e.i for e in root.subtree]
            token_list = list(dict.fromkeys(token_list))
            token_list.sort()
            text = ' '.join([doc[i].text for i in token_list ])
            texts.append(text.lower().strip())
            
        return texts


    def insert(self, sentence: str, polarity: int, type: str) -> None:
        model = self.model
        embeddings = model.encode(sentence)
        key = len(self.vectors) + 1
        self.vectors[key] = {'text': sentence,
                             'polarity': polarity,
                             'type': type,
                             'vector': embeddings}

    def search(self, query: str):
        model = self.model
        query_vector = model.encode(query)
        
        similarities = [(key, value['text'],distance.cosine(query_vector, value['vector']),value['polarity'],value['type']) for key, value in self.vectors.items()]
        

        aux = pd.DataFrame(similarities)
        aux.columns = ['index_db','text','similarity','polarity','topic']
        aux = aux.sort_values(by=['similarity']).reset_index(drop=True).reset_index()

        #aux = aux.reset_index().query('index<20 or similarity<0.7').query('similarity<1')[['index','topic']].groupby(['topic']).count()
        
        aux = aux.query('index<=10')
        #aux = aux.query('similarity <={}'.format(self.very_similar))

        aux = aux.query('similarity <={}'.format(self.similar))
        
        aux = aux[['index','topic']].groupby(['topic']).count()
        
        
        aux['index2'] = aux['index']/aux['index'].sum()
     
        aux = aux.query('index2>0.6')
        

        aux = aux.sort_values(by='index', ascending=False).head(1)
                
        return  list(aux.index.values)

    def long_search(self, query: str):
        topics = []
        for str in self.split_sentences(query):
            topics_this = self.search(str)
            if len(topics_this)>0:
                mini_df = pd.DataFrame(topics_this)
                mini_df.columns = ['topic']
                mini_df['review'] = query
                mini_df['sub_review'] = str
                topics.append(mini_df)
        if len(topics)>0:
            
            aux = pd.concat(topics)
            #aux ['stars'] = [int(self.sentiment_pipe(str)[0]['label'][0]) for str in aux.sub_review]
        else:
            aux = None
            
        return  aux

    def set_th(self):
        data = pd.DataFrame(self.vectors).transpose()


        n = data.shape[0]
        if(n>1000):
            n = min(max(int(n*0.1),1000),n)
            data = data.sample(n)
        print('Total size to compute the is {}'.format(data.shape[0]))

        same_type_similarity = []
        
        same_type_top_similarity = []
        
        for i in range(len(data.vector)):
            
        
            vectors = data.vector
            vector = vectors.values[i]
            aux = pd.DataFrame(
                [distance.cosine(vector, vectors[i]) for i in vectors.keys()]
            )
            
            aux.columns = ['similarity']
            
            aux['topic'] = data.type.values
            
            topic_review = data.type.values[i]
             
            same_type_similarity.append(np.percentile(aux.query(f'topic=="{topic_review}"').similarity,75))
        
            same_type_top_similarity.append(np.percentile(aux.query(f'topic=="{topic_review}"').similarity,25))
        
        self.very_similar = np.percentile(same_type_top_similarity,95)
        self.similar = np.percentile(same_type_similarity,95)




# use any sentence-transformer
model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu", quantize=True)


nlp = spacy.load("en_core_web_lg")



results = []
for db in range(5):

    print(f'Cross validation #{db+1} of 5')
    sample_size = int(reviews.shape[0]*0.9)
    train_reviews = reviews.sample(n=sample_size)
    val_reviews = reviews[~reviews.index.isin(train_reviews.index)]
    
    vector_db = VectorDatabase(nlp, model)
    print('uploading vectors to DB')
    for index, row in train_reviews.iterrows():
        vector_db.insert(row['Review'],None,row['Topic'])
    
    print('setting thresholds')
    vector_db.set_th()
    print('seted')
    
    
    guess = [] 
    recalls = []
    precisions = []
    for index, row in val_reviews.iterrows():
        aux = vector_db.long_search(row['Review'])
        my_guess = []
        if aux is not None:
            my_guess = list(aux.topic.unique())
        real = [row.Topic]
        if len(real)>0:
            #real = list(row.dropna().index)[1:] 
            recall = pd.DataFrame(real)
            recall.columns = ['topic']
            recall['value'] = [ t in my_guess for t in real]
            recalls.append(recall)
        
        if len(my_guess)>0:
            precision = pd.DataFrame(my_guess)
            precision.columns = ['topic']
            precision['value'] = [ t in real for t in my_guess]
            precisions.append(precision)
    
    recall = pd.concat(recalls).groupby(['topic']).mean().reset_index()
    precision = pd.concat(precisions).groupby(['topic']).mean().reset_index()
    
    metrics = recall.merge(precision, on = ['topic'])
    
    metrics.columns = ['topic','recall','precision']
    
    print(metrics)
    print(precision.value.mean(),recall.value.mean())
    




  from .autonotebook import tqdm as notebook_tqdm


Model found at: /Users/mateograciano/.cache/torch/sentence_transformers/sentence-transformers_all-MiniLM-L6-v2/quantized_true.onnx
Cross validation #1 of 5
uploading vectors to DB
setting thresholds
Total size to compute the is 1000
seted
                        topic    recall  precision
0                       Brand  0.990654   0.990654
1            Customer Support  0.731959   1.000000
2                      Design  0.614865   0.866667
3             Fit and Comfort  0.507692   0.846154
4                   Longevity  0.709677   0.956522
5        Material and Quality  0.440678   0.787879
6  Packaging and Presentation  0.687500   0.942857
7             Price and Value  0.795918   0.975000
8             User Experience  0.520000   0.981132
9                 Versatility  0.539683   0.971429
0.9318293035194625 0.6538626433537258
Cross validation #2 of 5
uploading vectors to DB
setting thresholds
Total size to compute the is 1000
seted
                        topic    recall  precision
0  

# High Precision

In [3]:
%%time
import numpy as np
from scipy.spatial import distance
from collections import defaultdict
from typing import List, Tuple
import spacy
from fast_sentence_transformers import FastSentenceTransformer as SentenceTransformer


import numpy as np


class VectorDatabase:
    def __init__(self,nlp,model):
        self.vectors = {}
        self.nlp = nlp
        self.model = model
        self.very_similar = 0.5
        self.similar = 0.5
        

    def split_sentences(self, text):
        doc = self.nlp(text, disable=["ner"])
        roots = [token  for token in doc if token.dep_ == "ROOT" ]
    
        texts = []
        for root in roots:
            token_list = [e.i for e in root.subtree]
            token_list = list(dict.fromkeys(token_list))
            token_list.sort()
            text = ' '.join([doc[i].text for i in token_list ])
            texts.append(text.lower().strip())
            
        return texts


    def insert(self, sentence: str, polarity: int, type: str) -> None:
        model = self.model
        embeddings = model.encode(sentence)
        key = len(self.vectors) + 1
        self.vectors[key] = {'text': sentence,
                             'polarity': polarity,
                             'type': type,
                             'vector': embeddings}

    def search(self, query: str):
        model = self.model
        query_vector = model.encode(query)
        
        similarities = [(key, value['text'],distance.cosine(query_vector, value['vector']),value['polarity'],value['type']) for key, value in self.vectors.items()]
        

        aux = pd.DataFrame(similarities)
        aux.columns = ['index_db','text','similarity','polarity','topic']
        aux = aux.sort_values(by=['similarity']).reset_index(drop=True).reset_index()

        #aux = aux.reset_index().query('index<20 or similarity<0.7').query('similarity<1')[['index','topic']].groupby(['topic']).count()
        
        aux = aux.query('index<=10')
        #aux = aux.query('similarity <={}'.format(self.very_similar))

        aux = aux.query('similarity <={}'.format(self.similar))
        
        aux = aux[['index','topic']].groupby(['topic']).count()
        
        
        aux['index2'] = aux['index']/aux['index'].sum()
     
        aux = aux.query('index2>0.6')
        

        aux = aux.sort_values(by='index', ascending=False).head(1)
                
        return  list(aux.index.values)

    def long_search(self, query: str):
        topics = []
        for str in self.split_sentences(query):
            topics_this = self.search(str)
            if len(topics_this)>0:
                mini_df = pd.DataFrame(topics_this)
                mini_df.columns = ['topic']
                mini_df['review'] = query
                mini_df['sub_review'] = str
                topics.append(mini_df)
        if len(topics)>0:
            
            aux = pd.concat(topics)
            #aux ['stars'] = [int(self.sentiment_pipe(str)[0]['label'][0]) for str in aux.sub_review]
        else:
            aux = None
            
        return  aux

    def set_th(self):
        data = pd.DataFrame(self.vectors).transpose()


        n = data.shape[0]
        if(n>1000):
            n = min(max(int(n*0.1),1000),n)
            data = data.sample(n)
        print('Total size to compute the is {}'.format(data.shape[0]))

        same_type_similarity = []
        
        same_type_top_similarity = []
        
        for i in range(len(data.vector)):
            
        
            vectors = data.vector
            vector = vectors.values[i]
            aux = pd.DataFrame(
                [distance.cosine(vector, vectors[i]) for i in vectors.keys()]
            )
            
            aux.columns = ['similarity']
            
            aux['topic'] = data.type.values
            
            topic_review = data.type.values[i]
             
            same_type_similarity.append(np.percentile(aux.query(f'topic=="{topic_review}"').similarity,75))
        
            same_type_top_similarity.append(np.percentile(aux.query(f'topic=="{topic_review}"').similarity,25))
        
        self.very_similar = np.percentile(same_type_top_similarity,50)
        self.similar = np.percentile((same_type_similarity,50)




# use any sentence-transformer
model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu", quantize=True)


nlp = spacy.load("en_core_web_lg")



results = []
for db in range(5):

    print(f'Cross validation #{db+1} of 5')
    sample_size = int(reviews.shape[0]*0.9)
    train_reviews = reviews.sample(n=sample_size)
    val_reviews = reviews[~reviews.index.isin(train_reviews.index)]
    
    vector_db = VectorDatabase(nlp, model)
    print('uploading vectors to DB')
    for index, row in train_reviews.iterrows():
        vector_db.insert(row['Review'],None,row['Topic'])
    
    print('setting thresholds')
    vector_db.set_th()
    print('seted')
    
    
    guess = [] 
    recalls = []
    precisions = []
    for index, row in val_reviews.iterrows():
        aux = vector_db.long_search(row['Review'])
        my_guess = []
        if aux is not None:
            my_guess = list(aux.topic.unique())
        real = [row.Topic]
        if len(real)>0:
            #real = list(row.dropna().index)[1:] 
            recall = pd.DataFrame(real)
            recall.columns = ['topic']
            recall['value'] = [ t in my_guess for t in real]
            recalls.append(recall)
        
        if len(my_guess)>0:
            precision = pd.DataFrame(my_guess)
            precision.columns = ['topic']
            precision['value'] = [ t in real for t in my_guess]
            precisions.append(precision)
    
    recall = pd.concat(recalls).groupby(['topic']).mean().reset_index()
    precision = pd.concat(precisions).groupby(['topic']).mean().reset_index()
    
    metrics = recall.merge(precision, on = ['topic'])
    
    metrics.columns = ['topic','recall','precision']
    
    print(metrics)
    print(precision.value.mean(),recall.value.mean())

pd.DataFrame(results, columns = ['recall','precision'])

SyntaxError: '(' was never closed (<unknown>, line 128)

# High recall

In [6]:
%%time
import numpy as np
from scipy.spatial import distance
from collections import defaultdict
from typing import List, Tuple
import spacy
from fast_sentence_transformers import FastSentenceTransformer as SentenceTransformer


import numpy as np


class VectorDatabase:
    def __init__(self,nlp,model):
        self.vectors = {}
        self.nlp = nlp
        self.model = model
        self.very_similar = 0.5
        self.similar = 0.5
        

    def split_sentences(self, text):
        doc = self.nlp(text, disable=["ner"])
        roots = [token  for token in doc if token.dep_ == "ROOT" ]
    
        texts = []
        for root in roots:
            token_list = [e.i for e in root.subtree]
            token_list = list(dict.fromkeys(token_list))
            token_list.sort()
            text = ' '.join([doc[i].text for i in token_list ])
            texts.append(text.lower().strip())
            
        return texts


    def insert(self, sentence: str, polarity: int, type: str) -> None:
        model = self.model
        embeddings = model.encode(sentence)
        key = len(self.vectors) + 1
        self.vectors[key] = {'text': sentence,
                             'polarity': polarity,
                             'type': type,
                             'vector': embeddings}

    def search(self, query: str):
        model = self.model
        query_vector = model.encode(query)
        
        similarities = [(key, value['text'],distance.cosine(query_vector, value['vector']),value['polarity'],value['type']) for key, value in self.vectors.items()]
        

        aux = pd.DataFrame(similarities)
        aux.columns = ['index_db','text','similarity','polarity','topic']
        aux = aux.sort_values(by=['similarity']).reset_index(drop=True).reset_index()

        #aux = aux.reset_index().query('index<20 or similarity<0.7').query('similarity<1')[['index','topic']].groupby(['topic']).count()
        
        aux = aux.query('index<=10')
        #aux = aux.query('similarity <={}'.format(self.very_similar))

        aux = aux.query('similarity <={}'.format(self.similar))
        
        aux = aux[['index','topic']].groupby(['topic']).count()
        
        
        aux['index2'] = aux['index']/aux['index'].sum()
     
        #aux = aux.query('index2>0.6')
        

        aux = aux.sort_values(by='index', ascending=False).head(1)
                
        return  list(aux.index.values)

    def long_search(self, query: str):
        topics = []
        for str in self.split_sentences(query):
            topics_this = self.search(str)
            if len(topics_this)>0:
                mini_df = pd.DataFrame(topics_this)
                mini_df.columns = ['topic']
                mini_df['review'] = query
                mini_df['sub_review'] = str
                topics.append(mini_df)
        if len(topics)>0:
            
            aux = pd.concat(topics)
            #aux ['stars'] = [int(self.sentiment_pipe(str)[0]['label'][0]) for str in aux.sub_review]
        else:
            aux = None
            
        return  aux

    def set_th(self):
        data = pd.DataFrame(self.vectors).transpose()


        n = data.shape[0]
        if(n>1000):
            n = min(max(int(n*0.1),1000),n)
            data = data.sample(n)
        print('Total size to compute the is {}'.format(data.shape[0]))

        same_type_similarity = []
        
        same_type_top_similarity = []
        
        for i in range(len(data.vector)):
            
        
            vectors = data.vector
            vector = vectors.values[i]
            aux = pd.DataFrame(
                [distance.cosine(vector, vectors[i]) for i in vectors.keys()]
            )
            
            aux.columns = ['similarity']
            
            aux['topic'] = data.type.values
            
            topic_review = data.type.values[i]
             
            same_type_similarity.append(np.percentile(aux.query(f'topic=="{topic_review}"').similarity,75))
        
            same_type_top_similarity.append(np.percentile(aux.query(f'topic=="{topic_review}"').similarity,25))
        
        self.very_similar = np.percentile(same_type_top_similarity,75)
        self.similar = np.mean(same_type_similarity)




# use any sentence-transformer
model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu", quantize=True)


nlp = spacy.load("en_core_web_lg")



results = []
for db in range(5):

    print(f'Cross validation #{db+1} of 5')
    sample_size = int(reviews.shape[0]*0.9)
    train_reviews = reviews.sample(n=sample_size)
    val_reviews = reviews[~reviews.index.isin(train_reviews.index)]
    
    vector_db = VectorDatabase(nlp, model)
    print('uploading vectors to DB')
    for index, row in train_reviews.iterrows():
        vector_db.insert(row['Review'],None,row['Topic'])
    
    print('setting thresholds')
    vector_db.set_th()
    print('seted')
    
    
    guess = [] 
    recalls = []
    precisions = []
    for index, row in val_reviews.iterrows():
        aux = vector_db.long_search(row['Review'])
        my_guess = []
        if aux is not None:
            my_guess = list(aux.topic.unique())
        real = [row.Topic]
        if len(real)>0:
            #real = list(row.dropna().index)[1:] 
            recall = pd.DataFrame(real)
            recall.columns = ['topic']
            recall['value'] = [ t in my_guess for t in real]
            recalls.append(recall)
        
        if len(my_guess)>0:
            precision = pd.DataFrame(my_guess)
            precision.columns = ['topic']
            precision['value'] = [ t in real for t in my_guess]
            precisions.append(precision)
    
    recall = pd.concat(recalls).groupby(['topic']).mean().reset_index()
    precision = pd.concat(precisions).groupby(['topic']).mean().reset_index()
    
    metrics = recall.merge(precision, on = ['topic'])
    
    metrics.columns = ['topic','recall','precision']
    
    print(metrics)
    print(precision.value.mean(),recall.value.mean())

pd.DataFrame(results, columns = ['recall','precision'])

Model found at: /Users/mateograciano/.cache/torch/sentence_transformers/sentence-transformers_all-MiniLM-L6-v2/quantized_true.onnx
Cross validation #1 of 5
uploading vectors to DB
setting thresholds
Total size to compute the is 1000
seted
                        topic    recall  precision
0                       Brand  1.000000   0.971154
1            Customer Support  0.797753   0.816092
2                      Design  0.912752   0.576271
3             Fit and Comfort  0.761905   0.813559
4                   Longevity  0.857143   0.960000
5        Material and Quality  0.696970   0.707692
6  Packaging and Presentation  0.702970   0.771739
7             Price and Value  0.923913   0.850000
8             User Experience  0.645455   0.739583
9                 Versatility  0.720588   0.890909
0.8097000171020925 0.8019447924115056
Cross validation #2 of 5
uploading vectors to DB
setting thresholds
Total size to compute the is 1000
seted
                        topic    recall  precision
0  

Unnamed: 0,recall,precision
