# Read synthetic fashion reviews

In [3]:
import pandas as pd

reviews = pd.read_csv('../Syntetic_reviews/sample_reviews_fashion.csv')

# Useful classes and functions

In [4]:
import numpy as np

def cosine_similarity(v1: np.ndarray, v2: np.ndarray) -> float:
    """Return the cosine similarity of two vectors.

    The result is ``dot(v1, v2) / (|v1| * |v2|)``: 1.0 for vectors pointing in
    the same direction, 0.0 for orthogonal vectors and -1.0 for opposite ones.

    The previous implementation returned ``scipy.spatial.distance.cosine``,
    which is the cosine *distance* (``1 - similarity``), and depended on a
    ``distance`` name that is only imported in a later cell.

    Args:
        v1: First vector (any array-like; flattened to 1-D).
        v2: Second vector, with the same number of elements as ``v1``.

    Returns:
        The similarity as a Python float. If either vector has zero length the
        similarity is undefined and 0.0 is returned instead of NaN.
    """
    a = np.asarray(v1, dtype=np.float64).ravel()
    b = np.asarray(v2, dtype=np.float64).ravel()
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0.0:
        # A zero vector has no direction; avoid a division by zero.
        return 0.0
    return float(np.dot(a, b) / denominator)

class VectorDatabase:
    """In-memory store of sentence embeddings labelled with a polarity and a topic.

    Args:
        nlp: Callable used as ``nlp(text, disable=["ner"])``. It must return a
            spaCy-style document: iterable and indexable, with tokens exposing
            ``dep_``, ``i``, ``text`` and ``subtree``.
        model: Object exposing ``encode(text)`` that returns a 1-D vector.

    Attributes:
        vectors: Dict mapping a 1-based insertion key to a record with the keys
            ``'text'``, ``'polarity'``, ``'type'`` and ``'vector'``.
    """

    def __init__(self,nlp,model):
        self.vectors = {}
        self.nlp = nlp
        self.model = model

    def split_sentences(self, text):
        """Split ``text`` into one lower-cased string per dependency ROOT.

        For every token whose ``dep_`` is ``"ROOT"`` the texts of the tokens in
        its subtree are joined with single spaces, in document order.

        Returns:
            A list of lower-cased, stripped strings (empty if there is no ROOT).
        """
        doc = self.nlp(text, disable=["ner"])

        sentences = []
        for token in doc:
            if token.dep_ != "ROOT":
                continue
            # De-duplicate and restore document order before joining.
            positions = sorted({member.i for member in token.subtree})
            joined = ' '.join(doc[i].text for i in positions)
            sentences.append(joined.lower().strip())

        return sentences

    def insert(self, sentence: str, polarity: int, type: str) -> None:
        """Embed ``sentence`` and store it with its ``polarity`` and topic ``type``.

        Keys are assigned as ``len(self.vectors) + 1``, so they are only unique
        as long as entries are never removed from ``self.vectors``.
        """
        embeddings = self.model.encode(sentence)
        key = len(self.vectors) + 1
        self.vectors[key] = {'text': sentence,
                             'polarity': polarity,
                             'type': type,
                             'vector': embeddings}

    def search(self, query: str, *, top_k: int = 10, near: float = 0.6,
               far: float = 0.7, min_share: float = 0.4, min_count: int = 4):
        """Return the topics that dominate the stored neighbours of ``query``.

        Stored entries are ranked by cosine distance to the embedded query. An
        entry is a neighbour when its distance is below ``far`` and it is either
        one of the ``top_k`` nearest entries or closer than ``near``. A topic is
        returned when it accounts for more than ``min_share`` of the neighbours
        and has at least ``min_count`` of them.

        Changes from the previous implementation:
          * entries are sorted by distance before the ``top_k`` cut; before,
            "top 10" meant the first ten *inserted* rows regardless of distance;
          * an empty database returns ``[]`` instead of raising ``ValueError``;
          * distances are computed in one vectorised numpy pass.

        Args:
            query: Text to embed and look up.
            top_k: Number of nearest entries always considered (if within ``far``).
            near: Distance below which an entry counts regardless of its rank.
            far: Hard upper bound on the distance of any neighbour.
            min_share: Minimum fraction of neighbours a topic must exceed.
            min_count: Minimum number of neighbours a topic must have.

        Returns:
            A list of topic labels in sorted order (possibly empty).
        """
        if not self.vectors:
            return []

        query_vector = np.asarray(self.model.encode(query), dtype=np.float64).ravel()
        entries = list(self.vectors.values())
        matrix = np.asarray(
            [np.asarray(entry['vector'], dtype=np.float64).ravel() for entry in entries]
        )

        # Zero-length vectors give NaN distances; NaN fails every comparison
        # below, so such entries are simply never selected.
        with np.errstate(divide='ignore', invalid='ignore'):
            norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query_vector)
            distances = 1.0 - (matrix @ query_vector) / norms
            order = np.argsort(distances, kind='stable')  # nearest first, NaN last
            ranked = distances[order]
            keep = ((np.arange(ranked.size) < top_k) | (ranked < near)) & (ranked < far)

        kept_topics = [entries[i]['type'] for i in order[keep]]
        if not kept_topics:
            return []

        # groupby sorts the labels and ignores missing topics.
        counts = pd.DataFrame({'topic': kept_topics}).groupby('topic').size()
        if counts.empty:
            return []
        share = counts / counts.sum()
        selected = counts[(share > min_share) & (counts >= min_count)]
        return list(selected.index.values)

    def long_search(self, query: str):
        """Run :meth:`search` on every sentence of ``query``.

        Returns:
            ``None`` when no sentence matched any topic. Otherwise a DataFrame
            with one row per (sentence, topic) match and the columns ``topic``,
            ``review`` (the full ``query``) and ``sub_review`` (the sentence),
            indexed 0..n-1.
        """
        rows = []
        for sentence in self.split_sentences(query):
            for topic in self.search(sentence):
                rows.append({'topic': topic, 'review': query, 'sub_review': sentence})

        if not rows:
            return None
        return pd.DataFrame(rows, columns=['topic', 'review', 'sub_review'])



# Create the vector database

In [5]:
%%time
import numpy as np
from scipy.spatial import distance
from collections import defaultdict
from typing import List, Tuple
import spacy

from fast_sentence_transformers import FastSentenceTransformer as SentenceTransformer

# Sentence encoder; any sentence-transformer checkpoint can be substituted here.
model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu", quantize=True)

# spaCy pipeline that the database uses to split reviews into sentences.
nlp = spacy.load("en_core_web_lg")
vector_db = VectorDatabase(nlp, model)

# Index every synthetic review together with its polarity and topic labels.
for review_text, polarity, topic in zip(
    reviews['Review'], reviews['Polarity'], reviews['Topic']
):
    vector_db.insert(review_text, polarity, topic)



  from .autonotebook import tqdm as notebook_tqdm


Model found at: /Users/mateograciano/.cache/torch/sentence_transformers/sentence-transformers_all-MiniLM-L6-v2/quantized_true.onnx
CPU times: user 12.9 s, sys: 1.82 s, total: 14.7 s
Wall time: 5.8 s


# Download Casio reviews from BigQuery

In [1]:

from google.cloud import bigquery
import os

# Tell the Google client library where to find its credentials file.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "../sa.json"

client = bigquery.Client()

# Review text and overall score of every product whose metadata brand is Casio.
sql = '''
SELECT asin, reviewText, overall
FROM `factored.raw_reviews`
inner join `factored.metadata` using(asin)
where brand = 'Casio'
'''

# Run the query, wait for it to finish and materialise the rows as a DataFrame.
query_job = client.query(sql)
df = query_job.result().to_dataframe()



In [38]:
%%time
# Tag every downloaded review with the topics detected by the vector database.
# This is the slow step of the notebook: the recorded run took about two hours
# of wall time.
topic_rows = []
for asin, review_text, score in zip(df['asin'], df['reviewText'], df['overall']):
    # Skip missing (None/NaN) and blank reviews: there is nothing to embed.
    if not isinstance(review_text, str) or not review_text.strip():
        continue

    matches = vector_db.long_search(review_text)
    if matches is None:
        continue

    # One row per detected topic, plus an 'Overall' row for the review itself.
    for topic in [*matches.topic.unique(), 'Overall']:
        topic_rows.append(
            {'topic': topic, 'score': score, 'asin': asin, 'review': review_text}
        )

if not topic_rows:
    # Fail loudly rather than hand an empty frame to the upload cell, which
    # overwrites the destination table.
    raise ValueError('No review matched any topic; nothing to upload.')

# NOTE: this rebinds `reviews`, which earlier held the synthetic reviews read
# from CSV. The upload cell below expects this name.
reviews = pd.DataFrame(topic_rows, columns=['topic', 'score', 'asin', 'review'])



CPU times: user 10h 47min 37s, sys: 2min 42s, total: 10h 50min 20s
Wall time: 2h 7min 59s


In [39]:
%%time
# WRITE_TRUNCATE: the destination table's existing rows are replaced.
job_config = bigquery.LoadJobConfig(write_disposition="WRITE_TRUNCATE")

# Start the load job that uploads the per-topic review rows.
job = client.load_table_from_dataframe(
    reviews,
    'factored.casio_reviews_by_topic',
    job_config=job_config,
)

job.result()  # Block until the load job has finished.

CPU times: user 204 ms, sys: 41.4 ms, total: 245 ms
Wall time: 5.99 s


LoadJob<project=plenary-stacker-393921, location=US, id=63eb5adf-8dc6-43bd-a687-886ebe0e1adb>