# Extracting car review data and making it ChromaDB-friendly

In [2]:
from data_etl import prepare_car_reviews_data
from text_wrap import wrap

# Note how we use a wildcard here: the trailing "*" is meant to match every
# file under data/archive/ (presumably expanded inside
# prepare_car_reviews_data — TODO confirm).
DATA_PATH = "data/archive/*"

# Load the reviews into a dict; the displayed keys are 'ids', 'documents'
# and 'metadatas'.
chroma_car_reviews_dict = prepare_car_reviews_data(DATA_PATH)
chroma_car_reviews_dict.keys()

dict_keys(['ids', 'documents', 'metadatas'])

In [3]:
# Peek at a single review (index -10 is the tenth entry from the end):
# first its id, then its text passed through wrap() with a width of 100.
print(chroma_car_reviews_dict["ids"][-10])
print(
    wrap(
        chroma_car_reviews_dict["documents"][-10],
        100,
    )
)


review5860
 I've never had a perfect car for me but this is quite close. My husband refused to ever drive an
SUV but he loves this car. He's always looking for an excuse to drive. It has power! There are lots
of extras in the inscription that are worth the extra money. I like the larger tires and leather
quality especially. My kids love the car too and prefer to ride in the "way back". Some of the
technology is tricky but you do get the hang of it and there are so many features that it's worth
the effort to learn it.


In [7]:
# Inspect the metadata dicts attached to the first five reviews
chroma_car_reviews_dict.get("metadatas")[:5]

[{'Review_Title': 'Never again!',
  'Rating': 1.0,
  'Vehicle_Year': 2017,
  'Vehicle_Model': 'Acura'},
 {'Review_Title': 'Brake System Issue',
  'Rating': 1.0,
  'Vehicle_Year': 2017,
  'Vehicle_Model': 'Acura'},
 {'Review_Title': 'A great car with a major flaw',
  'Rating': 1.0,
  'Vehicle_Year': 2017,
  'Vehicle_Model': 'Acura'},
 {'Review_Title': 'Take your shutter and stick it!',
  'Rating': 1.0,
  'Vehicle_Year': 2017,
  'Vehicle_Model': 'Acura'},
 {'Review_Title': '2017 Acura RDX lemon',
  'Rating': 1.0,
  'Vehicle_Year': 2017,
  'Vehicle_Model': 'Acura'}]

This is effectively what is going to happen: we assign an id to each specific review and then attach its metadata. We are now going to put all of this into ChromaDB.

# Building the collection

In [4]:
from pathlib import Path

import chromadb
from chromadb.utils import embedding_functions

from chroma_utils import build_chroma_collection
from data_etl import prepare_car_reviews_data

DATA_PATH = "data/archive/*"
CHROMA_PATH = "car_review_embeddings"
EMBEDDING_FUNC_NAME = "multi-qa-MiniLM-L6-cos-v1"
COLLECTION_NAME = "car_reviews"

# The collection should only be built once. Rather than commenting the call
# in and out by hand (which leaves a fresh checkout with no collection to
# query), skip the build when the persistent store already exists on disk.
# NOTE(review): a directory left behind by an interrupted build, or created
# by opening a PersistentClient first, will also skip the build — delete
# CHROMA_PATH to force a rebuild.
if not Path(CHROMA_PATH).exists():
    build_chroma_collection(
        CHROMA_PATH,
        COLLECTION_NAME,
        EMBEDDING_FUNC_NAME,
        chroma_car_reviews_dict["ids"],
        chroma_car_reviews_dict["documents"],
        chroma_car_reviews_dict["metadatas"],
    )

Downloading .gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading 1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading README.md:   0%|          | 0.00/11.5k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading data_config.json:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

Downloading train_script.py:   0%|          | 0.00/13.8k [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

# Running queries on the collection

In [12]:
# Open the database stored on disk at CHROMA_PATH (PersistentClient reads from disk)
client = chromadb.PersistentClient(path=CHROMA_PATH)

# Embedding function used for reading from / writing to the database;
# it is created from the same model name the collection is configured with.
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=EMBEDDING_FUNC_NAME)

# Fetch the existing collection by name
collection = client.get_collection(
    name=COLLECTION_NAME,
    embedding_function=embedding_func,
)

# Query the database: the question is asked in natural language and the
# matching reviews come back as natural-language text.
great_reviews = collection.query(
    query_texts=["Find me some positive reviews that discuss a BMW's performance"],
    include=["documents", "distances", "metadatas"],
    n_results=5,
)

# Results are grouped per query text: the first [0] picks our only query,
# the second [0] picks the first document returned for it.
print(wrap(great_reviews["documents"][0][0], 100))

 I have had two other BMWs in the past and loved them.  This is my first foray into BMWs SUV world.
I have to say that I love this vehicle just as much if not more than my others.  It is a medium size
SUV but is engineered inside to provide maximum storage and comfort.  With the upgrade to the 6
cylinder (which is the engine I am use to having), the fun to drive factor and performance is
definitely still there.  My family had to run from hurricane Irma and this was our vehicle of choice
to do so because we could count on its reliability, mpg, and roominess for us to take our two dogs
and other essentials.
