In [2]:
import os
import sys

import numpy as np



# Resolve where the course data lives: Google Drive when running in Colab
# (the default when RUN_ENV is unset), otherwise a local directory.
run_env = os.getenv('RUN_ENV', 'COLLAB')
if run_env == 'COLLAB':
  from google.colab import drive
  ROOT_DIR = '/content/drive'
  drive.mount(ROOT_DIR)
  print('Google drive connected')
  root_data_dir = os.path.join(ROOT_DIR, 'MyDrive', 'ml_course_data')
  # Make helper modules stored on the drive importable; skip the append when
  # the cell is re-run so sys.path does not accumulate duplicates.
  src_dir = os.path.join(ROOT_DIR, 'MyDrive', 'src')
  if src_dir not in sys.path:
    sys.path.append(src_dir)
else:
  # NOTE: ROOT_DIR is only defined in the Colab branch above.
  root_data_dir = os.getenv('DATA_DIR', '/srv/data')

# Validate the directory BEFORE listing it, so a missing path surfaces as the
# explicit RuntimeError below instead of a FileNotFoundError from os.listdir.
if not os.path.exists(root_data_dir):
  raise RuntimeError('Data dir not exists')
print('Data dir content %s: %s' % (root_data_dir, ', '.join(os.listdir(root_data_dir))))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google drive connected
Data dir content /content/drive/MyDrive/ml_course_data: nyt-ingredients-snapshot-2015.csv, insurance.csv, non_linear.csv, client_segmentation.csv, eigen.pkl, clustering.pkl, boosting_toy_dataset.csv, politic_meme.jpg, gray_goose.jpg, memes, optimal_push_time, sklearn_data, my_little_recsys, corpora, logs, nltk_data, recsys_data, MNIST, hymenoptera_data, pet_projects, ocr_dataset_sample.csv, geo_points.csv.gzip, scored_corpus.csv, labeled_data_corpus.csv, memes_stat_dataset.zip, als_model.pkl, raw_data.zip, json_views.tar.gz, sales_timeseries_dataset.csv.gz, brand_tweets_valid.csv, brand_tweets.csv, Health_and_Personal_Care.jsonl.gz


In [3]:
import os
# Parse KEY=VALUE pairs from the secrets file kept on the mounted drive.
envs = {}
with open(os.path.join(ROOT_DIR, 'MyDrive', 'secrets', 'secrets.env'), 'r') as f:
  for raw_line in f:
    line = raw_line.strip()
    # Blank lines, comments and lines without '=' carry no key/value pair.
    if not line or line.startswith('#') or '=' not in line:
      continue
    # Split on the first '=' only: secret values (e.g. base64-padded keys)
    # may themselves contain '='.
    key, value = line.split('=', 1)
    envs[key] = value
# Print only the number of entries so secret values never reach the output.
print(len(envs))

2


In [4]:

import shutil
from IPython.display import clear_output

# Install the notebook's runtime dependencies into the current environment.
# Only huggingface_hub and sentence-transformers are pinned to exact versions.
!pip install accelerate -U
!pip install transformers[torch]
!pip install huggingface_hub==0.25.2
!pip install sentence-transformers==2.2.2
# !pip install nltk==3.6.2
!pip install  backoff
!pip install  openai
# pip output is verbose: wipe it and leave a single confirmation line.
clear_output()
print('Libs installed')

Libs installed


# RAG dev

Data source: [Amazon Reviews 2023](https://amazon-reviews-2023.github.io/)

Plan:
* RAG over one dataset (healthcare)
* RAG over two datasets (healthcare + electronics)

# EDA

In [5]:
import gzip
import json

def read_raw_data(file_name, limit: int, fields = None):
  """Read records from a gzipped JSON-lines file located in ``root_data_dir``.

  Args:
    file_name: name of the ``.jsonl.gz`` file, relative to the notebook-level
      ``root_data_dir``.
    limit: maximum number of records to return. ``None`` or any value below 1
      (the notebook passes ``-1``) never matches the record count, so the
      whole file is read.
    fields: optional collection of keys to keep in every record; ``None``
      returns the records unchanged.

  Returns:
    A list of dicts in file order. The number of records is also printed.
  """
  file_path = os.path.join(root_data_dir, file_name)
  res = []
  # JSON text is UTF-8; do not rely on the platform's default encoding.
  with gzip.open(file_path, 'rt', encoding='utf-8') as gz_file:
      for line in gz_file:
          line = line.strip()
          # Tolerate blank lines (e.g. a trailing newline at end of file).
          if not line:
              continue
          data = json.loads(line)
          if fields is not None:
            data = {key: value for key, value in data.items() if key in fields}
          res.append(data)
          if limit == len(res):
              break
  print('Dataset num items: %d' % len(res))
  return res

# Smoke-test the reader: display the first three raw review records.
read_raw_data('Health_and_Personal_Care.jsonl.gz', limit = 3)

Dataset num items: 3


[{'rating': 4.0,
  'title': '12 mg is 12 on the periodic table people! Mg for magnesium',
  'text': 'This review is more to clarify someone else’s review bc they didn’t understand understand the labeling!  It shows 1000mg as advertised & another little label says 12mg bc 12 is on the periodic table for magnesium!  I realize not everyone takes chemistry, but 4 ppl liked his review & so misinformation is spreading.  This works. If however you are on opiate level medications that are causing constipation you should talk to your pain dr or your gastrointestinal dr & ask for a medication called Linzess which works must better & must faster, but is unnecessary for most people.  If magnesium is working for you just make sure to take it with food & drink 6-8 glasses of water per day.  Staying hydrated will really help.  Before switching to Linzess I used to take one 1,000 mg pill am & pm every day with meals & always with an 8 ounce glass of water or other liquid.',
  'images': [],
  'asin': '

TODO: define a sampling strategy

In [6]:
import pandas as pd
# Flatten the first 1000 raw reviews into a DataFrame.
db_sample_df = pd.json_normalize(
    read_raw_data('Health_and_Personal_Care.jsonl.gz', limit=1000, fields=None)
)
# Count reviews per product (non-null user_id values per asin) and show the
# five most reviewed products of the sample.
(
  db_sample_df
  .groupby('asin')
  .agg(popularity=('user_id', 'count'))
  .reset_index()
  .sort_values(by='popularity', ascending=False)
  .head()
)

Dataset num items: 1000


Unnamed: 0,asin,popularity
9,B000G2BESO,7
350,B07HSF5HTX,7
343,B07GHG9P9P,4
827,B09F7PH4QR,4
351,B07J32R15F,3


# Train embeddings

In [None]:
def get_pytorch_model(root_dir, model_name='all-mpnet-base-v2'):
  """Load a SentenceTransformer, caching it under ``<root_dir>/models``.

  On the first call the model is created from ``model_name`` and saved to
  ``<root_dir>/models/<model_name>``; later calls load it from that path.

  Args:
    root_dir: directory that holds (or will hold) the ``models`` folder.
    model_name: model identifier passed to ``SentenceTransformer``; also the
      name of the cache sub-directory.

  Returns:
    The loaded ``SentenceTransformer`` instance.

  Example:
    model = get_pytorch_model(root_data_dir)
  """
  from sentence_transformers import SentenceTransformer

  models_dir = os.path.join(root_dir, 'models')
  # makedirs(exist_ok=True) creates missing parents too and is safe to re-run,
  # unlike the check-then-mkdir pair it replaces.
  os.makedirs(models_dir, exist_ok=True)
  model_path = os.path.join(models_dir, model_name)

  if not os.path.exists(model_path):
      print('huggingface model loading...')
      embedder = SentenceTransformer(model_name)
      embedder.save(model_path)
      # Drop the output produced while fetching the model.
      clear_output()
  else:
      print('pretrained model loading...')
      embedder = SentenceTransformer(model_name_or_path=model_path)
  print('model loadind done')

  return embedder

def load_corpus(db):
    """Return the ``'text'`` value of every record in ``db``, in order."""
    return [record['text'] for record in db]

def train_embeds(corpus_texts, embedder, sentence_embedding_path, overwrite=False):
    """Encode ``corpus_texts`` with ``embedder``, caching the result on disk.

    Args:
        corpus_texts: sequence of texts to encode.
        embedder: object exposing ``encode(texts, show_progress_bar=...)``.
        sentence_embedding_path: ``.npy`` file used as the cache.
        overwrite: when True, re-encode even if the cache file exists.

    Returns:
        A float32 array with one row per text (or the cached array as saved).
    """
    if os.path.exists(sentence_embedding_path) and not overwrite:
        print('corpus loading from %s' % sentence_embedding_path)
        passage_embeddings = np.load(sentence_embedding_path)
        # Rows are matched to the corpus by position, so a cache built from a
        # different corpus would silently return wrong items downstream.
        if passage_embeddings.shape[0] != len(corpus_texts):
            print(
                'WARNING: cached embeddings have %d rows but corpus has %d texts; '
                'pass overwrite=True to rebuild'
                % (passage_embeddings.shape[0], len(corpus_texts))
            )
    else:
        print('num rows %d' % len(corpus_texts))
        encoded = embedder.encode(corpus_texts, show_progress_bar=True)
        # Single conversion to a float32 matrix (no per-row list rebuild).
        passage_embeddings = np.asarray(encoded, dtype="float32")
        with open(sentence_embedding_path, 'wb') as f:
            np.save(f, passage_embeddings)
        print('corpus saved to %s' % sentence_embedding_path)
    print('Num embeddings %d' % passage_embeddings.shape[0])
    return passage_embeddings

# Sentence-embedding model, cached under <root_data_dir>/models after first use.
model = get_pytorch_model(root_data_dir)
# limit = -1 never equals the number of records read, so the whole file is
# loaded; only the four listed fields are kept per review.
db = read_raw_data(
    'Health_and_Personal_Care.jsonl.gz',
    limit = -1, fields = ['rating', 'text', 'title', 'asin']
)
# Review texts in the same order as `db`.
corpus = load_corpus(db)
# Suffix of the cached embeddings file name.
data_version = 0
# overwrite=False: reuse the saved .npy file when it exists instead of re-encoding.
embeds = train_embeds(
    corpus, model,
    os.path.join(root_data_dir, f'corpus_embeds_{data_version}.npy'),
    overwrite=False
)

model loadind done
Dataset num items: 494121
num rows 494121


Batches:   0%|          | 0/15442 [00:00<?, ?it/s]

In [None]:
import datetime
import hashlib

import backoff
import openai
from openai import OpenAI


# OpenAI client used by gpt_query; the API key is read from the `envs`
# mapping rather than being hard-coded in the notebook.
client = OpenAI(
    api_key=envs["OPENAI_API_KEY"]
)


# Retry with exponential backoff on OpenAI API errors and on the RuntimeError
# raised below for rejected responses. No max_tries is set, so retries are
# unbounded.
@backoff.on_exception(backoff.expo, openai.APIError)
@backoff.on_exception(backoff.expo, openai.RateLimitError)
# openai.Timeout is the HTTP timeout *configuration* class in the v1 SDK, not
# an exception; the exception raised on timeouts is openai.APITimeoutError.
@backoff.on_exception(backoff.expo, openai.APITimeoutError)
@backoff.on_exception(backoff.expo, RuntimeError)
def gpt_query(gpt_params, verbose: bool = False, avoid_fuckup: bool = False) -> dict:
    """Send a chat-completion request and return the answer with usage stats.

    Args:
        gpt_params: keyword arguments forwarded to
            ``client.chat.completions.create`` (must include ``messages``).
        verbose: when True, print the content of the second message
            (``messages[1]``) before sending.
        avoid_fuckup: when True, a response containing ``[``, ``?`` or ``{``
            raises ``RuntimeError``, which triggers a retry via the decorator.

    Returns:
        dict with keys ``recs`` (response text), ``completion_tokens``,
        ``prompt_tokens``, ``total_tokens`` and ``id`` (12 hex characters
        derived from the current timestamp and the response text).
    """
    print('connecting OpenAI...')
    if verbose:
        print(gpt_params["messages"][1]["content"])
    response = client.chat.completions.create(
        **gpt_params
    )
    gpt_response = response.choices[0].message.content
    if avoid_fuckup:
        if '[' in gpt_response or '?' in gpt_response or '{' in gpt_response:
            raise RuntimeError
    usage = response.usage
    res = {
        'recs': gpt_response,
        # Previously written under a duplicated 'prompt_tokens' key and lost.
        'completion_tokens': usage.completion_tokens,
        'prompt_tokens': usage.prompt_tokens,
        'total_tokens': usage.total_tokens,
    }
    # md5 is used only to derive a short identifier, not for security.
    seed_phrase = f'{str(datetime.datetime.now().timestamp())}{gpt_response}'
    schedule_id = hashlib.md5(seed_phrase.encode('utf-8')).hexdigest()[:12]
    res['id'] = schedule_id
    return res

def gen_candadates(db, ids):
  """Render the records ``db[i]`` for ``i`` in ``ids`` as one line each.

  Every line has the form ``item_id: <asin>; review: <text>``; lines are
  joined with newlines in the order given by ``ids``.
  """
  rows = []
  for idx in ids:
    record = db[idx]
    rows.append("item_id: %s; review: %s" % (record['asin'], record['text']))
  return '\n'.join(rows)

def generate(db, ids, verbose=False):
    """Ask the chat model for a recommendation based on the selected reviews.

    Builds the prompt from the records ``db[i]`` for ``i`` in ``ids``,
    optionally prints it, and returns the dict produced by ``gpt_query``.
    """
    gpt_promt = promt_generation(gen_candadates(db, ids))
    if verbose:
        print(gpt_promt)

    system_message = {
        "role": "system",
        "content": "You are a helpful assistant for medicine shopping",
    }
    user_message = {"role": "user", "content": gpt_promt}

    # Sampling settings plus the two-message conversation.
    gpt_params = dict(
        model='gpt-3.5-turbo',
        max_tokens=500,
        temperature=0.7,
        top_p=0.5,
        frequency_penalty=0.5,
        messages=[system_message, user_message],
    )
    # The prompt was already printed above when verbose, so gpt_query stays quiet.
    return gpt_query(gpt_params, verbose=False)

def top_similar(query_embed, candidates_embeds, top=10):
  """Return the indices of the ``top`` candidates most similar to the query.

  Similarity is the cosine between ``query_embed`` and each row of
  ``candidates_embeds``. Candidates are ranked by the signed similarity, so
  vectors pointing in the opposite direction rank last, not first.

  Args:
    query_embed: query vector (flattened before use).
    candidates_embeds: 2-D array with one candidate vector per row.
    top: number of indices to return.

  Returns:
    list of ``int`` row indices, most similar first.
  """
  query_vec = np.asarray(query_embed).reshape(-1)
  candidates = np.asarray(candidates_embeds)
  if candidates.size == 0:
    return []

  dots = candidates @ query_vec
  # Row-wise squared norms without materialising a full-size temporary.
  cand_norms = np.sqrt(np.einsum('ij,ij->i', candidates, candidates))
  query_norm = np.sqrt(query_vec @ query_vec)
  denom = cand_norms * query_norm
  # Zero vectors get similarity 0 instead of a division by zero.
  denom = np.where(denom == 0, 1, denom)
  sims = dots / denom

  top_similar_idx = [int(i) for i in np.argsort(-sims)[:top]]
  return top_similar_idx

def retrieve(query):
  """Embed ``query`` with the notebook-level ``model`` and return the indices
  that ``top_similar`` selects from the notebook-level ``embeds``."""
  return top_similar(model.encode(query, show_progress_bar=False), embeds)

def get_retrieved_content(db, ids):
  """Return one ``{asin: text}`` dict per index in ``ids``, in that order."""
  res = []
  for idx in ids:
    record = db[idx]
    res.append({record['asin']: record['text']})
  return res

def promt_generation(candidates):
  """Build the user prompt sent to the chat model.

  ``candidates`` is interpolated verbatim between a one-line introduction and
  the instructions (pick the best item_id, rephrase reviews, stay under
  50 words). The returned string keeps the indentation of the template below.
  """
  # TODO: use jinja2
  promt = f"""
      Next rows below is an item_id reviews.
      {candidates}
      Utilize reviews to determine the best item_id.
      Avoid including actual reviews; rephrase them succinctly.
      Keep the recommendation under 50 words. Avoid starting with "Based on reviews"; opt for a more creative approach!
      Recommendation:
  """
  return promt

# Retrieve the reviews closest to a free-text query and display them.
query = "headache tablets"
ids = retrieve(query)
get_retrieved_content(db, ids)

In [None]:

# Generate a recommendation from the reviews retrieved in the previous cell
# and print it one sentence per line.
res = generate(db, ids)
print(res['recs'].replace('. ', '.\n'))

In [None]:
# Same retrieve -> generate flow for another query; output is printed one
# sentence per line.
query = "getting cold"
ids = retrieve(query)
res = generate(db, ids)
print(res['recs'].replace('. ', '.\n'))

In [None]:
# Same retrieve -> generate flow for another query; output is printed one
# sentence per line.
query = "bad cough"
ids = retrieve(query)
res = generate(db, ids)
print(res['recs'].replace('. ', '.\n'))

In [None]:
# Inspect the reviews that the most recent `ids` (last retrieve call) point to.
get_retrieved_content(db, ids)

In [None]:
# Same retrieve -> generate flow for another query; output is printed one
# sentence per line.
query = "insomnia"
ids = retrieve(query)
res = generate(db, ids)
print(res['recs'].replace('. ', '.\n'))

In [None]:
# Inspect the reviews that the most recent `ids` (last retrieve call) point to.
get_retrieved_content(db, ids)

In [None]:
# Same retrieve -> generate flow for another query; output is printed one
# sentence per line.
query = "depression"
ids = retrieve(query)
res = generate(db, ids)
print(res['recs'].replace('. ', '.\n'))

In [None]:
# Inspect the reviews that the most recent `ids` (last retrieve call) point to.
get_retrieved_content(db, ids)