# Read Syntetic reviews of fashions 

In [1]:
import pandas as pd

reviews = pd.read_csv('../Syntetic_reviews/reviews_all_aumented_iter1.csv').dropna()

In [2]:
#reviews = reviews.sample(2000)

# High Precision 

In [3]:
%%time
import numpy as np
from scipy.spatial import distance
from collections import defaultdict
from typing import List, Tuple
import spacy
from fast_sentence_transformers import FastSentenceTransformer as SentenceTransformer


import numpy as np


class VectorDatabase:
    def __init__(self,nlp,model):
        self.vectors = {}
        self.nlp = nlp
        self.model = model
        self.very_similar = 0.5
        self.similar = 0.5
        

    def split_sentences(self, text):
        doc = self.nlp(text, disable=["ner"])
        roots = [token  for token in doc if token.dep_ == "ROOT" ]
    
        texts = []
        for root in roots:
            token_list = [e.i for e in root.subtree]
            token_list = list(dict.fromkeys(token_list))
            token_list.sort()
            text = ' '.join([doc[i].text for i in token_list ])
            texts.append(text.lower().strip())
            
        return texts


    def insert(self, sentence: str, polarity: int, type: str) -> None:
        model = self.model
        embeddings = model.encode(sentence)
        key = len(self.vectors) + 1
        self.vectors[key] = {'text': sentence,
                             'polarity': polarity,
                             'type': type,
                             'vector': embeddings}

    def search(self, query: str):
        model = self.model
        query_vector = model.encode(query)
        
        similarities = [(key, value['text'],distance.cosine(query_vector, value['vector']),value['polarity'],value['type']) for key, value in self.vectors.items()]
        

        aux = pd.DataFrame(similarities)
        aux.columns = ['index_db','text','similarity','polarity','topic']
        aux = aux.sort_values(by=['similarity']).reset_index(drop=True).reset_index()

        #aux = aux.reset_index().query('index<20 or similarity<0.7').query('similarity<1')[['index','topic']].groupby(['topic']).count()
        
        aux = aux.query('index<=10')
        #aux = aux.query('similarity <={}'.format(self.very_similar))

        aux = aux.query('similarity <={}'.format(self.similar))
        
        aux = aux[['index','topic']].groupby(['topic']).count()
        
        
        aux['index2'] = aux['index']/aux['index'].sum()
     
        aux = aux.query('index2>0.6')
        

        aux = aux.sort_values(by='index', ascending=False).head(1)
                
        return  list(aux.index.values)

    def long_search(self, query: str):
        topics = []
        for str in self.split_sentences(query):
            topics_this = self.search(str)
            if len(topics_this)>0:
                mini_df = pd.DataFrame(topics_this)
                mini_df.columns = ['topic']
                mini_df['review'] = query
                mini_df['sub_review'] = str
                topics.append(mini_df)
        if len(topics)>0:
            
            aux = pd.concat(topics)
            #aux ['stars'] = [int(self.sentiment_pipe(str)[0]['label'][0]) for str in aux.sub_review]
        else:
            aux = None
            
        return  aux

    def set_th(self):
        data = pd.DataFrame(self.vectors).transpose()


        n = data.shape[0]
        if(n>1000):
            n = min(max(int(n*0.1),1000),n)
            data = data.sample(n)
        print('Total size to compute the is {}'.format(data.shape[0]))

        same_type_similarity = []
        
        same_type_top_similarity = []
        
        for i in range(len(data.vector)):
            
        
            vectors = data.vector
            vector = vectors.values[i]
            aux = pd.DataFrame(
                [distance.cosine(vector, vectors[i]) for i in vectors.keys()]
            )
            
            aux.columns = ['similarity']
            
            aux['topic'] = data.type.values
            
            topic_review = data.type.values[i]
             
            same_type_similarity.append(np.percentile(aux.query(f'topic=="{topic_review}"').similarity,75))
        
            same_type_top_similarity.append(np.percentile(aux.query(f'topic=="{topic_review}"').similarity,25))
        
        self.very_similar = np.percentile(same_type_top_similarity,95)
        self.similar = np.mean(same_type_similarity)




# use any sentence-transformer
model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu", quantize=True)


nlp = spacy.load("en_core_web_lg")



results = []
for db in range(5):

    print(f'Cross validation #{db+1} of 5')
    sample_size = int(reviews.shape[0]*0.9)
    train_reviews = reviews.sample(n=sample_size)
    val_reviews = reviews[~reviews.index.isin(train_reviews.index)]
    
    vector_db = VectorDatabase(nlp, model)
    print('uploading vectors to DB')
    for index, row in train_reviews.iterrows():
        vector_db.insert(row['Review'],None,row['Topic'])
    
    print('setting thresholds')
    vector_db.set_th()
    print('seted')
    
    
    guesses = []
     
    print('Making Classifications')
    print(val_reviews.shape[0])
    for index, row in val_reviews.iterrows():
        review = row['Review']
        aux = vector_db.long_search(review)
        guess = []
        if aux is not None:
            guess = aux.topic.values
        guesses.append(guess)
        
    val_reviews['guesses'] = guesses
    
    
    recalls = []
    precisions= []
    for index, row in val_reviews.iterrows():
        recall = row['Topic'] in row['guesses']
        precision = np.nan
        if len(row['guesses'])>0:
            precision = recall
            
        recalls.append(recall)
        precisions.append(precision)
    
    precision = np.nanmean(np.array(precisions))
    recall = np.nanmean(np.array(recalls))
    
    print('recall: {} precision: {}'.format(recall,precision))
    results.append((recall,precision))

print(pd.DataFrame(results, columns = ['recall','precision']))

  from .autonotebook import tqdm as notebook_tqdm


Model found at: /Users/mateograciano/.cache/torch/sentence_transformers/sentence-transformers_all-MiniLM-L6-v2/quantized_true.onnx
Cross validation #1 of 5
uploading vectors to DB
setting thresholds
Total size to compute the is 1000
seted
Making Classifications
895


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


recall: 0.7027932960893855 precision: 0.9515885022692889
Cross validation #2 of 5
uploading vectors to DB
setting thresholds
Total size to compute the is 1000
seted
Making Classifications
895


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


recall: 0.6737430167597765 precision: 0.9466248037676609
Cross validation #3 of 5
uploading vectors to DB
setting thresholds
Total size to compute the is 1000
seted
Making Classifications
895


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


recall: 0.6804469273743017 precision: 0.9471228615863142
Cross validation #4 of 5
uploading vectors to DB
setting thresholds
Total size to compute the is 1000
seted
Making Classifications
895


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


recall: 0.6547486033519553 precision: 0.9406099518459069
Cross validation #5 of 5
uploading vectors to DB
setting thresholds
Total size to compute the is 1000
seted
Making Classifications
895
recall: 0.693854748603352 precision: 0.9324324324324325
     recall  precision
0  0.702793   0.951589
1  0.673743   0.946625
2  0.680447   0.947123
3  0.654749   0.940610
4  0.693855   0.932432
CPU times: user 54min 17s, sys: 12.4 s, total: 54min 30s
Wall time: 22min 12s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


# High Precision

In [4]:
%%time
import numpy as np
from scipy.spatial import distance
from collections import defaultdict
from typing import List, Tuple
import spacy
from fast_sentence_transformers import FastSentenceTransformer as SentenceTransformer


import numpy as np


class VectorDatabase:
    def __init__(self,nlp,model):
        self.vectors = {}
        self.nlp = nlp
        self.model = model
        self.very_similar = 0.5
        self.similar = 0.5
        

    def split_sentences(self, text):
        doc = self.nlp(text, disable=["ner"])
        roots = [token  for token in doc if token.dep_ == "ROOT" ]
    
        texts = []
        for root in roots:
            token_list = [e.i for e in root.subtree]
            token_list = list(dict.fromkeys(token_list))
            token_list.sort()
            text = ' '.join([doc[i].text for i in token_list ])
            texts.append(text.lower().strip())
            
        return texts


    def insert(self, sentence: str, polarity: int, type: str) -> None:
        model = self.model
        embeddings = model.encode(sentence)
        key = len(self.vectors) + 1
        self.vectors[key] = {'text': sentence,
                             'polarity': polarity,
                             'type': type,
                             'vector': embeddings}

    def search(self, query: str):
        model = self.model
        query_vector = model.encode(query)
        
        similarities = [(key, value['text'],distance.cosine(query_vector, value['vector']),value['polarity'],value['type']) for key, value in self.vectors.items()]
        

        aux = pd.DataFrame(similarities)
        aux.columns = ['index_db','text','similarity','polarity','topic']
        aux = aux.sort_values(by=['similarity']).reset_index(drop=True).reset_index()

        #aux = aux.reset_index().query('index<20 or similarity<0.7').query('similarity<1')[['index','topic']].groupby(['topic']).count()
        
        aux = aux.query('index<=10')
        #aux = aux.query('similarity <={}'.format(self.very_similar))

        aux = aux.query('similarity <={}'.format(self.similar))
        
        aux = aux[['index','topic']].groupby(['topic']).count()
        
        
        aux['index2'] = aux['index']/aux['index'].sum()
     
        aux = aux.query('index2>0.6')
        

        aux = aux.sort_values(by='index', ascending=False).head(1)
                
        return  list(aux.index.values)

    def long_search(self, query: str):
        topics = []
        for str in self.split_sentences(query):
            topics_this = self.search(str)
            if len(topics_this)>0:
                mini_df = pd.DataFrame(topics_this)
                mini_df.columns = ['topic']
                mini_df['review'] = query
                mini_df['sub_review'] = str
                topics.append(mini_df)
        if len(topics)>0:
            
            aux = pd.concat(topics)
            #aux ['stars'] = [int(self.sentiment_pipe(str)[0]['label'][0]) for str in aux.sub_review]
        else:
            aux = None
            
        return  aux

    def set_th(self):
        data = pd.DataFrame(self.vectors).transpose()


        n = data.shape[0]
        if(n>1000):
            n = min(max(int(n*0.1),1000),n)
            data = data.sample(n)
        print('Total size to compute the is {}'.format(data.shape[0]))

        same_type_similarity = []
        
        same_type_top_similarity = []
        
        for i in range(len(data.vector)):
            
        
            vectors = data.vector
            vector = vectors.values[i]
            aux = pd.DataFrame(
                [distance.cosine(vector, vectors[i]) for i in vectors.keys()]
            )
            
            aux.columns = ['similarity']
            
            aux['topic'] = data.type.values
            
            topic_review = data.type.values[i]
             
            same_type_similarity.append(np.percentile(aux.query(f'topic=="{topic_review}"').similarity,75))
        
            same_type_top_similarity.append(np.percentile(aux.query(f'topic=="{topic_review}"').similarity,25))
        
        self.very_similar = np.percentile(same_type_top_similarity,50)
        self.similar = np.mean(same_type_similarity)




# use any sentence-transformer
model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu", quantize=True)


nlp = spacy.load("en_core_web_lg")



results = []
for db in range(5):

    print(f'Cross validation #{db+1} of 5')
    sample_size = int(reviews.shape[0]*0.9)
    train_reviews = reviews.sample(n=sample_size)
    val_reviews = reviews[~reviews.index.isin(train_reviews.index)]
    
    vector_db = VectorDatabase(nlp, model)
    print('uploading vectors to DB')
    for index, row in train_reviews.iterrows():
        vector_db.insert(row['Review'],None,row['Topic'])
    
    print('setting thresholds')
    vector_db.set_th()
    print('seted')
    
    
    guesses = []
     
    print('Making Classifications')
    print(val_reviews.shape[0])
    for index, row in val_reviews.iterrows():
        review = row['Review']
        aux = vector_db.long_search(review)
        guess = []
        if aux is not None:
            guess = aux.topic.values
        guesses.append(guess)
        
    val_reviews['guesses'] = guesses
    
    
    recalls = []
    precisions= []
    for index, row in val_reviews.iterrows():
        recall = row['Topic'] in row['guesses']
        precision = np.nan
        if len(row['guesses'])>0:
            precision = recall
            
        recalls.append(recall)
        precisions.append(precision)
    
    precision = np.nanmean(np.array(precisions))
    recall = np.nanmean(np.array(recalls))
    
    print('recall: {} precision: {}'.format(recall,precision))
    results.append((recall,precision))

pd.DataFrame(results, columns = ['recall','precision'])

Model found at: /Users/mateograciano/.cache/torch/sentence_transformers/sentence-transformers_all-MiniLM-L6-v2/quantized_true.onnx
Cross validation #1 of 5
uploading vectors to DB
setting thresholds
Total size to compute the is 1000
seted
Making Classifications
895


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


recall: 0.6916201117318436 precision: 0.9493865030674846
Cross validation #2 of 5
uploading vectors to DB
setting thresholds
Total size to compute the is 1000
seted
Making Classifications
895


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


recall: 0.6569832402234637 precision: 0.9333333333333333
Cross validation #3 of 5
uploading vectors to DB
setting thresholds
Total size to compute the is 1000
seted
Making Classifications
895


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


recall: 0.6849162011173184 precision: 0.9533437013996889
Cross validation #4 of 5
uploading vectors to DB
setting thresholds
Total size to compute the is 1000
seted
Making Classifications
895


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


recall: 0.6770949720670391 precision: 0.9528301886792453
Cross validation #5 of 5
uploading vectors to DB
setting thresholds
Total size to compute the is 1000
seted
Making Classifications
895
recall: 0.6636871508379888 precision: 0.9324960753532182
CPU times: user 1h 43min 31s, sys: 27.5 s, total: 1h 43min 58s
Wall time: 40min 50s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,recall,precision
0,0.69162,0.949387
1,0.656983,0.933333
2,0.684916,0.953344
3,0.677095,0.95283
4,0.663687,0.932496


# High recall

In [5]:
%%time
import numpy as np
from scipy.spatial import distance
from collections import defaultdict
from typing import List, Tuple
import spacy
from fast_sentence_transformers import FastSentenceTransformer as SentenceTransformer


import numpy as np


class VectorDatabase:
    def __init__(self,nlp,model):
        self.vectors = {}
        self.nlp = nlp
        self.model = model
        self.very_similar = 0.5
        self.similar = 0.5
        

    def split_sentences(self, text):
        doc = self.nlp(text, disable=["ner"])
        roots = [token  for token in doc if token.dep_ == "ROOT" ]
    
        texts = []
        for root in roots:
            token_list = [e.i for e in root.subtree]
            token_list = list(dict.fromkeys(token_list))
            token_list.sort()
            text = ' '.join([doc[i].text for i in token_list ])
            texts.append(text.lower().strip())
            
        return texts


    def insert(self, sentence: str, polarity: int, type: str) -> None:
        model = self.model
        embeddings = model.encode(sentence)
        key = len(self.vectors) + 1
        self.vectors[key] = {'text': sentence,
                             'polarity': polarity,
                             'type': type,
                             'vector': embeddings}

    def search(self, query: str):
        model = self.model
        query_vector = model.encode(query)
        
        similarities = [(key, value['text'],distance.cosine(query_vector, value['vector']),value['polarity'],value['type']) for key, value in self.vectors.items()]
        

        aux = pd.DataFrame(similarities)
        aux.columns = ['index_db','text','similarity','polarity','topic']
        aux = aux.sort_values(by=['similarity']).reset_index(drop=True).reset_index()

        #aux = aux.reset_index().query('index<20 or similarity<0.7').query('similarity<1')[['index','topic']].groupby(['topic']).count()
        
        aux = aux.query('index<=10')
        #aux = aux.query('similarity <={}'.format(self.very_similar))

        aux = aux.query('similarity <={}'.format(self.similar))
        
        aux = aux[['index','topic']].groupby(['topic']).count()
        
        
        aux['index2'] = aux['index']/aux['index'].sum()
     
        #aux = aux.query('index2>0.6')
        

        aux = aux.sort_values(by='index', ascending=False).head(1)
                
        return  list(aux.index.values)

    def long_search(self, query: str):
        topics = []
        for str in self.split_sentences(query):
            topics_this = self.search(str)
            if len(topics_this)>0:
                mini_df = pd.DataFrame(topics_this)
                mini_df.columns = ['topic']
                mini_df['review'] = query
                mini_df['sub_review'] = str
                topics.append(mini_df)
        if len(topics)>0:
            
            aux = pd.concat(topics)
            #aux ['stars'] = [int(self.sentiment_pipe(str)[0]['label'][0]) for str in aux.sub_review]
        else:
            aux = None
            
        return  aux

    def set_th(self):
        data = pd.DataFrame(self.vectors).transpose()


        n = data.shape[0]
        if(n>1000):
            n = min(max(int(n*0.1),1000),n)
            data = data.sample(n)
        print('Total size to compute the is {}'.format(data.shape[0]))

        same_type_similarity = []
        
        same_type_top_similarity = []
        
        for i in range(len(data.vector)):
            
        
            vectors = data.vector
            vector = vectors.values[i]
            aux = pd.DataFrame(
                [distance.cosine(vector, vectors[i]) for i in vectors.keys()]
            )
            
            aux.columns = ['similarity']
            
            aux['topic'] = data.type.values
            
            topic_review = data.type.values[i]
             
            same_type_similarity.append(np.percentile(aux.query(f'topic=="{topic_review}"').similarity,75))
        
            same_type_top_similarity.append(np.percentile(aux.query(f'topic=="{topic_review}"').similarity,25))
        
        self.very_similar = np.percentile(same_type_top_similarity,75)
        self.similar = np.mean(same_type_similarity)




# use any sentence-transformer
model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu", quantize=True)


nlp = spacy.load("en_core_web_lg")



results = []
for db in range(5):

    print(f'Cross validation #{db+1} of 5')
    sample_size = int(reviews.shape[0]*0.9)
    train_reviews = reviews.sample(n=sample_size)
    val_reviews = reviews[~reviews.index.isin(train_reviews.index)]
    
    vector_db = VectorDatabase(nlp, model)
    print('uploading vectors to DB')
    for index, row in train_reviews.iterrows():
        vector_db.insert(row['Review'],None,row['Topic'])
    
    print('setting thresholds')
    vector_db.set_th()
    print('seted')
    
    
    guesses = []
     
    print('Making Classifications')
    print(val_reviews.shape[0])
    for index, row in val_reviews.iterrows():
        review = row['Review']
        aux = vector_db.long_search(review)
        guess = []
        if aux is not None:
            guess = aux.topic.values
        guesses.append(guess)
        
    val_reviews['guesses'] = guesses
    
    
    recalls = []
    precisions= []
    for index, row in val_reviews.iterrows():
        recall = row['Topic'] in row['guesses']
        precision = np.nan
        if len(row['guesses'])>0:
            precision = recall
            
        recalls.append(recall)
        precisions.append(precision)
    
    precision = np.nanmean(np.array(precisions))
    recall = np.nanmean(np.array(recalls))
    
    print('recall: {} precision: {}'.format(recall,precision))
    results.append((recall,precision))

pd.DataFrame(results, columns = ['recall','precision'])

Model found at: /Users/mateograciano/.cache/torch/sentence_transformers/sentence-transformers_all-MiniLM-L6-v2/quantized_true.onnx
Cross validation #1 of 5
uploading vectors to DB
setting thresholds
Total size to compute the is 1000
seted
Making Classifications
895


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


recall: 0.8201117318435754 precision: 0.8201117318435754
Cross validation #2 of 5
uploading vectors to DB
setting thresholds
Total size to compute the is 1000
seted
Making Classifications
895


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


recall: 0.829050279329609 precision: 0.829050279329609
Cross validation #3 of 5
uploading vectors to DB
setting thresholds
Total size to compute the is 1000
seted
Making Classifications
895


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


recall: 0.8189944134078212 precision: 0.8189944134078212
Cross validation #4 of 5
uploading vectors to DB
setting thresholds
Total size to compute the is 1000
seted
Making Classifications
895


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


recall: 0.8055865921787709 precision: 0.8055865921787709
Cross validation #5 of 5
uploading vectors to DB
setting thresholds
Total size to compute the is 1000
seted
Making Classifications
895
recall: 0.8122905027932961 precision: 0.8122905027932961
CPU times: user 2h 5min 19s, sys: 34.5 s, total: 2h 5min 54s
Wall time: 46min 1s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,recall,precision
0,0.820112,0.820112
1,0.82905,0.82905
2,0.818994,0.818994
3,0.805587,0.805587
4,0.812291,0.812291
