# Read Syntetic reviews of fashions 

In [3]:
import pandas as pd

reviews = pd.read_csv('../Syntetic_reviews/reviews_all.csv')

# Create V Database

In [280]:
%%time
import numpy as np
from scipy.spatial import distance
from collections import defaultdict
from typing import List, Tuple
import spacy
from fast_sentence_transformers import FastSentenceTransformer as SentenceTransformer


import numpy as np


class VectorDatabase:
    def __init__(self,nlp,model):
        self.vectors = {}
        self.nlp = nlp
        self.model = model
        self.very_similar = 0.5
        self.similar = 0.5
        

    def split_sentences(self, text):
        doc = self.nlp(text, disable=["ner"])
        roots = [token  for token in doc if token.dep_ == "ROOT" ]
    
        texts = []
        for root in roots:
            token_list = [e.i for e in root.subtree]
            token_list = list(dict.fromkeys(token_list))
            token_list.sort()
            text = ' '.join([doc[i].text for i in token_list ])
            texts.append(text.lower().strip())
            
        return texts


    def insert(self, sentence: str, polarity: int, type: str) -> None:
        model = self.model
        embeddings = model.encode(sentence)
        key = len(self.vectors) + 1
        self.vectors[key] = {'text': sentence,
                             'polarity': polarity,
                             'type': type,
                             'vector': embeddings}

    def search(self, query: str):
        model = self.model
        query_vector = model.encode(query)
        
        similarities = [(key, value['text'],distance.cosine(query_vector, value['vector']),value['polarity'],value['type']) for key, value in self.vectors.items()]
        

        aux = pd.DataFrame(similarities)
        aux.columns = ['index_db','text','similarity','polarity','topic']
        aux = aux.sort_values(by=['similarity']).reset_index(drop=True).reset_index()

        #aux = aux.reset_index().query('index<20 or similarity<0.7').query('similarity<1')[['index','topic']].groupby(['topic']).count()
        
        aux = aux.query('index<=10')
        #aux = aux.query('similarity <={}'.format(self.very_similar))

        aux = aux.query('similarity <={}'.format(self.similar))
        
        aux = aux[['index','topic']].groupby(['topic']).count()
        
        
        #aux['index2'] = aux['index']/aux['index'].sum()

        
        

        aux = aux.sort_values(by='index', ascending=False).head(1)
                
        return  list(aux.index.values)

    def long_search(self, query: str):
        topics = []
        for str in self.split_sentences(query):
            topics_this = self.search(str)
            if len(topics_this)>0:
                mini_df = pd.DataFrame(topics_this)
                mini_df.columns = ['topic']
                mini_df['review'] = query
                mini_df['sub_review'] = str
                topics.append(mini_df)
        if len(topics)>0:
            
            aux = pd.concat(topics)
            #aux ['stars'] = [int(self.sentiment_pipe(str)[0]['label'][0]) for str in aux.sub_review]
        else:
            aux = None
            
        return  aux

    def set_th(self):
        data = pd.DataFrame(self.vectors).transpose()

        same_type_similarity = []
        
        same_type_top_similarity = []
        
        for i in range(len(data.vector)):
        
            vectors = data.vector
            vector = vectors.values[i]
            aux = pd.DataFrame(
                [distance.cosine(vector, vectors[i]) for i in vectors.keys()]
            )
            
            aux.columns = ['similarity']
            
            aux['topic'] = data.type.values
            
            topic_review = data.type.values[i]
             
            same_type_similarity.append(np.percentile(aux.query(f'topic=="{topic_review}"').similarity,75))
        
            same_type_top_similarity.append(np.percentile(aux.query(f'topic=="{topic_review}"').similarity,25))
        
        self.very_similar = np.percentile(same_type_top_similarity,95)
        self.similar = np.mean(same_type_similarity)




# use any sentence-transformer
model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu", quantize=True)


nlp = spacy.load("en_core_web_lg")




for db in range(5):

    print(f'Cross validation #{db+1} of 5')
    
    train_reviews = reviews.sample(n=700)
    val_reviews = reviews[~reviews.index.isin(train_reviews.index)]
    
    vector_db = VectorDatabase(nlp, model)
    print('uploading vectors to DB')
    for index, row in train_reviews.iterrows():
        vector_db.insert(row['Review'],row['Polarity'],row['Topic'])
    
    print('setting thresholds')
    vector_db.set_th()
    
    
    guesses = []
    
    
    for index, row in val_reviews.iterrows():
        #print(index)
        review = row['Review']
        aux = vector_db.long_search(review)
        guess = []
        if aux is not None:
            guess = aux.topic.values
        guesses.append(guess)
        
    val_reviews['guesses'] = guesses
    
    print('Making Classifications')
    recalls = []
    precisions= []
    for index, row in val_reviews.iterrows():
        recall = row['Topic'] in row['guesses']
        precision = np.nan
        if len(row['guesses'])>0:
            precision = recall
            
        recalls.append(recall)
        precisions.append(precision)
    
    precision = np.nanmean(np.array(precisions))
    recall = np.nanmean(np.array(recalls))
    
    print('recall: {} precision: {}'.format(recall,precision))



Model found at: /Users/mateograciano/.cache/torch/sentence_transformers/sentence-transformers_all-MiniLM-L6-v2/quantized_true.onnx
Cross validation #1 of 5
uploading vectors to DB
setting thresholds


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Making Classifications
recall: 0.9122807017543859 precision: 0.9122807017543859
Cross validation #2 of 5
uploading vectors to DB
setting thresholds


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Making Classifications
recall: 0.9122807017543859 precision: 0.9203539823008849
Cross validation #3 of 5
uploading vectors to DB
setting thresholds


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Making Classifications
recall: 0.9649122807017544 precision: 0.9649122807017544
Cross validation #4 of 5
uploading vectors to DB
setting thresholds


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Making Classifications
recall: 0.9035087719298246 precision: 0.9035087719298246
Cross validation #5 of 5
uploading vectors to DB
setting thresholds
Making Classifications
recall: 0.9298245614035088 precision: 0.9380530973451328
CPU times: user 11min 5s, sys: 4.58 s, total: 11min 9s
Wall time: 4min


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [244]:




import numpy as np
from scipy.spatial import distance
from collections import defaultdict
from typing import List, Tuple
import spacy
from fast_sentence_transformers import FastSentenceTransformer as SentenceTransformer


import numpy as np


class VectorDatabase:
    def __init__(self,nlp,model):
        self.vectors = {}
        self.nlp = nlp
        self.model = model
        self.very_similar = 0.5
        self.similar = 0.5
        

    def split_sentences(self, text):
        doc = self.nlp(text, disable=["ner"])
        roots = [token  for token in doc if token.dep_ == "ROOT" ]
    
        texts = []
        for root in roots:
            token_list = [e.i for e in root.subtree]
            token_list = list(dict.fromkeys(token_list))
            token_list.sort()
            text = ' '.join([doc[i].text for i in token_list ])
            texts.append(text.lower().strip())
            
        return texts


    def insert(self, sentence: str, polarity: int, type: str) -> None:
        model = self.model
        embeddings = model.encode(sentence)
        key = len(self.vectors) + 1
        self.vectors[key] = {'text': sentence,
                             'polarity': polarity,
                             'type': type,
                             'vector': embeddings}

    def search(self, query: str):
        model = self.model
        query_vector = model.encode(query)
        
        similarities = [(key, value['text'],distance.cosine(query_vector, value['vector']),value['polarity'],value['type']) for key, value in self.vectors.items()]
        

        aux = pd.DataFrame(similarities)
        aux.columns = ['index_db','text','similarity','polarity','topic']
        aux = aux.sort_values(by=['similarity']).reset_index(drop=True).reset_index()

        #aux = aux.reset_index().query('index<20 or similarity<0.7').query('similarity<1')[['index','topic']].groupby(['topic']).count()
        
        aux = aux.query('index<=10')
        #aux = aux.query('similarity <={}'.format(self.very_similar))

        aux = aux.query('similarity <={}'.format(self.similar))
        
        aux = aux[['index','topic']].groupby(['topic']).count()
        
        
        #aux['index2'] = aux['index']/aux['index'].sum()

        
        

        aux = aux.sort_values(by='index', ascending=False).head(1)
                
        return  list(aux.index.unique())

    def long_search(self, query: str):
        topics = []
        for str in self.split_sentences(query):
            topics_this = self.search(str)
            if len(topics_this)>0:
                mini_df = pd.DataFrame(topics_this)
                mini_df.columns = ['topic']
                mini_df['review'] = query
                mini_df['sub_review'] = str
                topics.append(mini_df)
        if len(topics)>0:
            
            aux = pd.concat(topics)
            #aux ['stars'] = [int(self.sentiment_pipe(str)[0]['label'][0]) for str in aux.sub_review]
        else:
            aux = None
            
        return  aux

    def set_th(self):
        data = pd.DataFrame(self.vectors).transpose()

        same_type_similarity = []
        
        same_type_top_similarity = []
        
        for i in range(len(data.vector)):
        
            vectors = data.vector
            vector = vectors.values[i]
            aux = pd.DataFrame(
                [distance.cosine(vector, vectors[i]) for i in vectors.keys()]
            )
            
            aux.columns = ['similarity']
            
            aux['topic'] = data.type.values
            
            topic_review = data.type.values[i]
             
            same_type_similarity.append(np.percentile(aux.query(f'topic=="{topic_review}"').similarity,75))
        
            same_type_top_similarity.append(np.percentile(aux.query(f'topic=="{topic_review}"').similarity,25))
        
        self.very_similar = np.mean(same_type_top_similarity)
        self.similar = np.mean(same_type_similarity)




# use any sentence-transformer
model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu", quantize=True)


nlp = spacy.load("en_core_web_lg")





vector_db = VectorDatabase(nlp, model)
print('uploading vectors to DB')
for index, row in reviews.iterrows():
    vector_db.insert(row['Review'],row['Polarity'],row['Topic'])

print('setting thresholds')
vector_db.set_th()
    



example_review = '''
Stylish Timekeeping Elegance with Metallic Flair!

I recently acquired this exquisite clock, and I must say, it's a mesmerizing blend of functionality and aesthetics. The metallic details and choice of materials elevate its charm to a whole new level.

Let's start with the metallic details - they are nothing short of captivating. The clock's face features intricate metallic accents that catch the light beautifully, adding a touch of elegance and sophistication to any space. It's like having a piece of art adorning your wall, constantly drawing your gaze with its shimmering allure.

The metallic elements aren't just for show; they also serve a functional purpose. The minute and hour hands, crafted with a sleek metallic finish, provide a clear contrast against the clock's background, ensuring effortless readability even from a distance. There's no squinting or straining your eyes to tell the time; it's like having a timekeeper tailor-made for convenience.

Now, let's talk about the material. The clock's body is built from high-quality materials that exude a sense of sturdiness and durability. The frame, carefully crafted with a combination of premium metals and other robust elements, feels solid to the touch, instilling confidence in its long-lasting performance.

Not only is the clock durable, but it also boasts a luxurious feel. The metallic accents extend to the edges of the frame, creating a seamless and refined finish that adds a touch of opulence to any room. It's a statement piece that effortlessly complements both modern and classic décor.

The attention to detail in this clock is truly commendable. From the precise cut of the metallic elements to the careful assembly, it's evident that the manufacturers poured their passion into creating a timepiece that stands out in both form and function.

In conclusion, this clock is a stunning blend of metallic details and high-quality materials, marrying elegance with functionality flawlessly. Whether you're looking to add a touch of sophistication to your home or seeking a reliable timekeeping companion, this clock exceeds expectations on every front. Its impeccable craftsmanship and striking appearance make it a valuable addition to any space, destined to garner admiration from anyone who sets their eyes on it.

'''


vector_db.long_search(example_review)

Model found at: /Users/mateograciano/.cache/torch/sentence_transformers/sentence-transformers_all-MiniLM-L6-v2/quantized_true.onnx
uploading vectors to DB
setting thresholds


Unnamed: 0,topic,review,sub_review
0,Design,\nStylish Timekeeping Elegance with Metallic F...,stylish timekeeping elegance with metallic fla...
0,Design,\nStylish Timekeeping Elegance with Metallic F...,"i recently acquired this exquisite clock , and..."
0,Material and Quality,\nStylish Timekeeping Elegance with Metallic F...,the metallic details and choice of materials e...
0,Material and Quality,\nStylish Timekeeping Elegance with Metallic F...,"now , let 's talk about the material ."
0,Longevity,\nStylish Timekeeping Elegance with Metallic F...,the clock 's body is built from high - quality...
0,Longevity,\nStylish Timekeeping Elegance with Metallic F...,"not only is the clock durable , but it also bo..."
0,Design,\nStylish Timekeeping Elegance with Metallic F...,it 's a statement piece that effortlessly comp...
0,Design,\nStylish Timekeeping Elegance with Metallic F...,its impeccable craftsmanship and striking appe...


In [276]:

example_review = '''
bought these shoes for my girlfriend .
'''


topics_details = vector_db.long_search(example_review)

topics_details

#topics_details.to_csv('example_details.csv')

Unnamed: 0,topic,review,sub_review
0,Fit and Comfort,\nbought these shoes for my girlfriend .\n,bought these shoes for my girlfriend .


In [266]:
from google.cloud import bigquery
import os

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "../sa.json"


client = bigquery.Client()


sql = '''

SELECT reviewText,overall,asin
from `factored.raw_reviews`
WHERE asin = 'B00HDZIT0S'
'''

df = client.query(sql).result().to_dataframe()


In [273]:
vector_db.long_search(review)

Unnamed: 0,topic,review,sub_review
0,Fit and Comfort,Bought these shoes for my girlfriend. She said...,bought these shoes for my girlfriend .


In [278]:
ans= []
for index, row in df.iterrows():
    print(index)
    review = row['reviewText']
    if review is None:
        review = ''
    if (len(review)>0):
        aux = vector_db.long_search(review)
        if aux is not None:
            aux['stars'] = row['overall']
            ans.append(aux)
            #print(review,ans)

ans = pd.concat(ans)


df['topic']= 'overall'
overall = df[['topic','overall']].groupby(['topic']).mean()
overall.columns = ['stars']


topics = ans[['topic','stars']].groupby(['topic']).mean()


pd.concat([overall,topics])

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240


Unnamed: 0_level_0,stars
topic,Unnamed: 1_level_1
overall,3.809129
Customer Support,3.0
Design,4.666667
Fit and Comfort,3.964286
Longevity,3.2
Material and Quality,3.6
Packaging and Presentation,4.0
Price and Value,3.5
User Experience,4.166667
Versatility,3.5
