In [1]:
import os
import sys

import numpy as np

run_env = os.getenv('RUN_ENV', 'COLLAB')
if run_env == 'COLLAB':
  from google.colab import drive
  ROOT_DIR = '/content/drive'
  drive.mount(ROOT_DIR)
  print('Google drive connected')
  root_data_dir = os.path.join(ROOT_DIR, 'MyDrive', 'ml_course_data')
  lib_path = os.path.join(ROOT_DIR, 'MyDrive', 'src')
  if not os.path.exists(lib_path):
    raise RuntimeError('Upload and `src` dir with code')
  sys.path.append(lib_path)
else:
  root_data_dir = os.getenv('DATA_DIR', '/srv/data')

print(os.listdir(root_data_dir))

if not os.path.exists(root_data_dir):
  raise RuntimeError('Data dir not exists')
else:
  print('Data dir content %s: %s' % (root_data_dir, ', '.join(os.listdir(root_data_dir))))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google drive connected
['nyt-ingredients-snapshot-2015.csv', 'insurance (1).csv', 'non_linear.csv', 'client_segmentation.csv', 'eigen.pkl', 'clustering.pkl', 'boosting_toy_dataset.csv', 'politic_meme.jpg', 'gray_goose.jpg', 'test_dataset.pkl', 'memes', 'optimal_push_time', 'sklearn_data', 'my_little_recsys', 'corpora', 'logs', 'nltk_data', 'recsys_data', 'MNIST', 'hymenoptera_data', 'pet_projects', 'ocr_dataset_sample.csv', 'geo_points.csv.gzip', 'scored_corpus.csv', 'labeled_data_corpus.csv', 'memes_stat_dataset.zip', 'als_model.pkl', 'raw_data.zip', 'json_views.tar.gz', 'test_data.csv', 'sales_timeseries_dataset.csv.gz', 'brand_tweets_valid.csv', 'brand_tweets.csv', 'Health_and_Personal_Care.jsonl.gz', 'models', 'final_dataset.zip', 'ocr_dataset.zip', 'bidmachine_logs.zip', 'meta_Health_and_Personal_Care.jsonl.gz', 'messages.db', 'user_item_views.zip', 'con

It is critical to get access for GPU for fast embeddings evaluation

In [2]:
import torch

device = torch.device(
    [f'cuda:{i}' for i in range(torch.cuda.device_count())][0]
    if torch.cuda.is_available() else 'cpu'
)
if not 'cuda' in str(device):
    raise RuntimeError('GPU is not available')
print('torch device: %s' % device)

torch device: cuda:0


In [3]:
models_dir = os.path.join(root_data_dir, 'models')
if not os.path.exists(models_dir):
    os.mkdir(models_dir)
print(f'Models dir: {models_dir}')

Models dir: /content/drive/MyDrive/ml_course_data/models


In [4]:
import os
with open(os.path.join(ROOT_DIR, 'MyDrive', 'secrets', 'secrets.env'), 'r') as f:
  envs = {j[0]: j[1] for j in [i.strip().split('=') for i in f.readlines()]}
print(len(envs))

3


In [5]:
from IPython.display import clear_output

!pip install langchain_community python-dotenv langchain_huggingface backoff
# !pip install faiss-gpu-cu11
clear_output()
print('Packages installed')

Packages installed


# RAG dev

[amazon reviews](https://amazon-reviews-2023.github.io/)

Plan
* rag over one dataset (healthcare)
* rag over two datasets (healthcare + electronics)

In [6]:
import gzip
import json

def read_raw_data(file_name, limit: int, fields = None):

  file_path = os.path.join(root_data_dir, file_name)
  res = []
  with gzip.open(file_path, 'rt') as gz_file:
      for line in gz_file:
          data = json.loads(line.strip())
          if fields is not None:
            res.append({i: j for i, j in data.items() if i in fields})
          else:
            res.append(data)
          if limit == len(res):
              break
  print('Dataset num items: %d' % len(res))
  return res

read_raw_data('Health_and_Personal_Care.jsonl.gz', limit = 3)

Dataset num items: 3


[{'rating': 4.0,
  'title': '12 mg is 12 on the periodic table people! Mg for magnesium',
  'text': 'This review is more to clarify someone else’s review bc they didn’t understand understand the labeling!  It shows 1000mg as advertised & another little label says 12mg bc 12 is on the periodic table for magnesium!  I realize not everyone takes chemistry, but 4 ppl liked his review & so misinformation is spreading.  This works. If however you are on opiate level medications that are causing constipation you should talk to your pain dr or your gastrointestinal dr & ask for a medication called Linzess which works must better & must faster, but is unnecessary for most people.  If magnesium is working for you just make sure to take it with food & drink 6-8 glasses of water per day.  Staying hydrated will really help.  Before switching to Linzess I used to take one 1,000 mg pill am & pm every day with meals & always with an 8 ounce glass of water or other liquid.',
  'images': [],
  'asin': '

In [7]:
import pandas as pd

def count_products_popularity(raw_data_entries):
  db_sample_df = pd.json_normalize(
      raw_data_entries
  )
  popularity_df = (
    db_sample_df
    .groupby('asin')['user_id']
    .count()
    .reset_index(name='popularity')
    .sort_values(by='popularity', ascending=False)
  )
  return popularity_df

count_products_popularity(
    read_raw_data('Health_and_Personal_Care.jsonl.gz', limit=1000, fields = None)
).head()

Dataset num items: 1000


Unnamed: 0,asin,popularity
350,B07HSF5HTX,7
9,B000G2BESO,7
827,B09F7PH4QR,4
343,B07GHG9P9P,4
351,B07J32R15F,3


In [8]:
from langchain_huggingface import HuggingFaceEmbeddings

model_name = "sentence-transformers/all-mpnet-base-v2"

embedding_model = HuggingFaceEmbeddings(
    model_name=model_name,
    cache_folder=models_dir,
    model_kwargs={"device": device}  # 👈 force GPU, if available
)

corpus_texts = [
    "The quick brown fox jumps over the lazy dog.",
    "Artificial intelligence is transforming industries."
]

# Generate embeddings for documents
passage_embeddings = embedding_model.embed_documents(corpus_texts)

print(f"Generated {len(passage_embeddings)} embeddings")
print(f"Each embedding vector has dimension {len(passage_embeddings[0])}")


Generated 2 embeddings
Each embedding vector has dimension 768


In [9]:
from collections import defaultdict


def load_corpus(db, max_reviews_per_product = None, product_filter = None, min_text_length = 50 ):
    corpus_texts = []
    product_entry_counter = defaultdict(int)
    if product_filter is None:
      product_filter = set([item['asin'] for item in db])
      print(f'Num products in filter {len(product_filter)}')
    for item in db:
      if item['asin'] not in product_filter:
        continue
      if max_reviews_per_product is not None and product_entry_counter[item['asin']] > max_reviews_per_product:
        continue
      if len(item['text']) > min_text_length:
        product_entry_counter[item['asin']] += 1
        corpus_texts.append({'doc_id': item['asin'], 'text': item['text']})
    return corpus_texts

def train_embeds(corpus_texts, embedder, sentence_embedding_path, overwrite=False):
    if os.path.exists(sentence_embedding_path) and not overwrite:
        print('corpus loading from %s' % sentence_embedding_path)
        passage_embeddings = np.load(sentence_embedding_path)
    else:
        print('num rows %d' % len(corpus_texts))
        passage_embeddings = embedding_model.embed_documents(corpus_texts)
        passage_embeddings = np.array([embedding for embedding in passage_embeddings]).astype("float32")
        with open(sentence_embedding_path, 'wb') as f:
            np.save(f, passage_embeddings)
        print('corpus saved to %s' % sentence_embedding_path)
    print('Num embeddings %d' % passage_embeddings.shape[0])
    return passage_embeddings

db = read_raw_data(
    'Health_and_Personal_Care.jsonl.gz',
    limit = -1, fields = ['rating', 'text', 'title', 'asin', 'user_id']
)
products_popularity_df = count_products_popularity(db)
print(f'Num rows {products_popularity_df.shape[0]}')
products_popularity_df.head()

Dataset num items: 494121
Num rows 62597


Unnamed: 0,asin,popularity
850,B000G2BESO,2840
4562,B004C7MTLA,2196
312,B0001ZWPI4,2095
6228,B0077L8YFI,1992
15616,B00XQBHJOU,1952


In [10]:
products_filter = set(products_popularity_df.head(10000)['asin'].values)
corpus = load_corpus(db, max_reviews_per_product=4, product_filter=products_filter)
print(f'Corpus length {len(corpus)}')
corpus[0]

Corpus length 49622


{'doc_id': 'B07TDSJZMR',
 'text': 'This review is more to clarify someone else’s review bc they didn’t understand understand the labeling!  It shows 1000mg as advertised & another little label says 12mg bc 12 is on the periodic table for magnesium!  I realize not everyone takes chemistry, but 4 ppl liked his review & so misinformation is spreading.  This works. If however you are on opiate level medications that are causing constipation you should talk to your pain dr or your gastrointestinal dr & ask for a medication called Linzess which works must better & must faster, but is unnecessary for most people.  If magnesium is working for you just make sure to take it with food & drink 6-8 glasses of water per day.  Staying hydrated will really help.  Before switching to Linzess I used to take one 1,000 mg pill am & pm every day with meals & always with an 8 ounce glass of water or other liquid.'}

In [11]:
data_version = 2
embeds = train_embeds(
    [i['text'] for i in corpus], embedding_model,
    os.path.join(root_data_dir, f'corpus_embeds_{data_version}.npy'),
    overwrite=False
)
print(embeds.shape)

corpus loading from /content/drive/MyDrive/ml_course_data/corpus_embeds_2.npy
Num embeddings 49622
(49622, 768)


In [12]:
query_string = 'headache'

query_vector = embedding_model.embed_documents([query_string])
len(query_vector[0])

768

In [13]:
from sklearn.metrics.pairwise import cosine_similarity

class Catalog:
  def __init__(self, corpus):
      self.texts = np.array([i['text'] for i in corpus])
      self.doc_ids = np.array([i['doc_id'] for i in corpus])

  def get_items(self, item_indices):
      docs = [{'doc_id': str(self.doc_ids[i]), 'text': str(self.texts[i])} for i in item_indices]
      return docs

class VectorIndex:
  def __init__(self, embedder_model, corpus, corpus_embeddings):
      self.vectorizer = embedder_model
      self.vector_index = corpus_embeddings
      self.items = np.array([i['doc_id'] for i in corpus])
      print("Index built successfully. ✅")

  def retrieve(self, query_string, threshold=0.1, top_k=10):
      query_vector = self.vectorizer.embed_documents([query_string])
      cosine_similarities = cosine_similarity(np.array(query_vector).reshape(1, -1), embeds).flatten()
      top_k_indices = cosine_similarities.argsort()[::-1][:top_k]  # descending order
      top_k_scores = [round(i, 4) for i in cosine_similarities[top_k_indices]]
      return [{'idx': i, 'score': s} for s, i in zip(top_k_scores, top_k_indices)]

vector_index = VectorIndex(embedding_model, corpus, corpus_embeddings=embeds)
content_catalog = Catalog(corpus)
user_query = 'headache'
products = vector_index.retrieve(user_query)
print(products)

Index built successfully. ✅
[{'idx': np.int64(42991), 'score': np.float64(0.545)}, {'idx': np.int64(27989), 'score': np.float64(0.5297)}, {'idx': np.int64(17146), 'score': np.float64(0.4861)}, {'idx': np.int64(9721), 'score': np.float64(0.4744)}, {'idx': np.int64(8462), 'score': np.float64(0.4726)}, {'idx': np.int64(30643), 'score': np.float64(0.4462)}, {'idx': np.int64(14362), 'score': np.float64(0.4373)}, {'idx': np.int64(33485), 'score': np.float64(0.4372)}, {'idx': np.int64(23111), 'score': np.float64(0.4312)}, {'idx': np.int64(12438), 'score': np.float64(0.4306)}]


In [14]:
product_ids = [i['idx'] for i in products]
candidates = content_catalog.get_items(product_ids)
for i in candidates:
  print(i)

{'doc_id': 'B076HF85SK', 'text': 'Awful terrible doesn’t help gave me a big headache for over a werk'}
{'doc_id': 'B002C6467S', 'text': "This is the only medicine that cures my headaches it's the best"}
{'doc_id': 'B00890YO0U', 'text': 'Great product for any type of pain including a headache'}
{'doc_id': 'B01GIP0SW0', 'text': 'Stuffed up nose, this works really well to clear you up'}
{'doc_id': 'B074HSHL59', 'text': 'Need somemtype of migraine relief for the Mrs shemloves the spray.'}
{'doc_id': 'B01NAXDO1T', 'text': 'Wow works on my sore neck and headaches since I was badly injured.'}
{'doc_id': 'B07FYKC837', 'text': 'This product really helps me with my headaches.  I highly recommend it.'}
{'doc_id': 'B07JFYBN1C', 'text': 'I swear by these headache wraps. We purchased 2! The pink one & this black one. The pressure & ice felt when using this almost always receives our migraine head pain. This is something we will always have in our home. We tell everyone that has a migraine about this

## Generation

In [15]:
from llm import get_openai_client, generate

openai_client = get_openai_client(envs['OPENAI_API_KEY'])

knowledgebase = '\n'.join(["item_id: %s; review: %s" % (i['doc_id'], i['text']) for i in candidates])

system_prompt = f"""
  You are a helpful assistant for medicine shopping. Answer to user query based on provided context

  {knowledgebase}
"""

user_prompt = f"User query: {user_query}"


In [18]:
genai_resp = generate(openai_client, user_prompt=user_prompt, system_prompt=system_prompt)
print(genai_resp.text)

Based on the reviews provided, here are some options for headache relief:

1. **Item ID: B002C6467S**
   - Review: This is the only medicine that cures my headaches it's the best

2. **Item ID: B00890YO0U**
   - Review: Great product for any type of pain including a headache

3. **Item ID: B07FYKC837**
   - Review: This product really helps me with my headaches. I highly recommend it

4. **Item ID: B07JFYBN1C**
   - Review: I swear by these headache wraps. We purchased 2! The pink one & this black one. The pressure & ice felt when using this almost always relieves our migraine head pain.

These products have received positive feedback for providing relief from headaches.


# Homework

Use huggingface for an inference

use [InferenceClient](https://huggingface.co/docs/huggingface_hub/main/en/package_reference/inference_client#huggingface_hub.InferenceClient.chat_completion.example) for free

An alternative: run ollama locally on your computer

https://ollama.com/library/llama3.2:1b