# Recommendation system

Download the dataset [Health_and_Personal_Care.jsonl.gz](https://drive.google.com/file/d/12N52kB4D1iqgzSuoWEfNSY3KqVRp10wL/view?usp=drive_link)

and put it into the `data` directory.

In [5]:
%load_ext autoreload
%autoreload 2

import sys
import os

import numpy as np
from google.colab import drive

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)  # гарантируем воспроизводимость

run_env = os.getenv('RUN_ENV', 'COLLAB')
if run_env == 'COLLAB':
  from google.colab import drive
  ROOT_DIR = '/content/drive'
  drive.mount(ROOT_DIR)
  print('Google drive connected')
  DRIVE_DATA_DIR = 'ml_course_data'
  root_data_dir = os.path.join(ROOT_DIR, 'MyDrive', DRIVE_DATA_DIR)
  sys.path.append(os.path.join(ROOT_DIR, 'MyDrive', 'src'))
else:
  root_data_dir = os.getenv('DATA_DIR', '/srv/data')

if not os.path.exists(root_data_dir):
  raise RuntimeError('Отсутствует директория с данными')
else:
  print('Содержимое директории %s: %s', root_data_dir, os.listdir(root_data_dir))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google drive connected
Содержимое директории %s: %s /content/drive/MyDrive/ml_course_data ['nyt-ingredients-snapshot-2015.csv', 'insurance.csv', 'non_linear.csv', 'client_segmentation.csv', 'eigen.pkl', 'clustering.pkl', 'boosting_toy_dataset.csv', 'politic_meme.jpg', 'gray_goose.jpg', 'memes', 'optimal_push_time', 'sklearn_data', 'my_little_recsys', 'corpora', 'logs', 'nltk_data', 'recsys_data', 'MNIST', 'hymenoptera_data', 'pet_projects', 'ocr_dataset_sample.csv', 'geo_points.csv.gzip', 'scored_corpus.csv', 'labeled_data_corpus.csv', 'memes_stat_dataset.zip', 'als_model.pkl', 'raw_data.zip', 'json_views.tar.gz', 'test_data.csv', 'sales_timeseries_dataset.csv.gz', 'brand_tweets_valid.csv', 'brand_tweets.csv', 'Health_and_Personal_Care.jsonl.gz', 'models', 'corpus_embeds_0

In [6]:
from utils import read_raw_data

# Read a capped sample of review records from the gzipped JSONL dump that
# lives in the data directory resolved by the setup cell.
file_name = 'Health_and_Personal_Care.jsonl.gz'
data_path = os.path.join(root_data_dir, file_name)
json_data = read_raw_data(data_path, limit=1000)

# Sanity check: report how many records were actually loaded.
num_reviews = len(json_data)
print(num_reviews)

Dataset num items: 1000 from /content/drive/MyDrive/ml_course_data/Health_and_Personal_Care.jsonl.gz
1000


In [7]:
# Peek at the first raw review record to see which fields are available.
json_data[0]

{'rating': 4.0,
 'title': '12 mg is 12 on the periodic table people! Mg for magnesium',
 'text': 'This review is more to clarify someone else’s review bc they didn’t understand understand the labeling!  It shows 1000mg as advertised & another little label says 12mg bc 12 is on the periodic table for magnesium!  I realize not everyone takes chemistry, but 4 ppl liked his review & so misinformation is spreading.  This works. If however you are on opiate level medications that are causing constipation you should talk to your pain dr or your gastrointestinal dr & ask for a medication called Linzess which works must better & must faster, but is unnecessary for most people.  If magnesium is working for you just make sure to take it with food & drink 6-8 glasses of water per day.  Staying hydrated will really help.  Before switching to Linzess I used to take one 1,000 mg pill am & pm every day with meals & always with an 8 ounce glass of water or other liquid.',
 'images': [],
 'asin': 'B07TD

In [8]:
import pandas as pd

# Flatten the raw reviews into (category, user, product, rating) rows; every
# row is tagged with the constant category 'health'.
interaction_columns = ['category', 'CustomerID', 'ProductID', 'target']
interaction_rows = [
    ('health', review['user_id'], review['parent_asin'], review['rating'])
    for review in json_data
]
user_item_data_df = pd.DataFrame(interaction_rows, columns=interaction_columns)

user_item_data_df.head()

Unnamed: 0,category,CustomerID,ProductID,target
0,health,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,B07TDSJZMR,4.0
1,health,AEVWAM3YWN5URJVJIZZ6XPD2MKIA,B08637FWWF,5.0
2,health,AHSPLDNW5OOUK2PLH7GXLACFBZNQ,B07KJVGNN5,5.0
3,health,AEZGPLOYTSAPR3DHZKKXEFPAXUAA,B092RP73CX,4.0
4,health,AEQAYV7RXZEBXMQIQPL6KCT2CFWQ,B08KYJLF5T,1.0


# Recommender baseline

In [9]:
# Popularity baseline: count the non-null CustomerID entries per product ...
entries_per_product = (
    user_item_data_df
    .groupby('ProductID')
    .agg(num_entries=('CustomerID', 'count'))
)
# ... and rank products from the most to the least frequently reviewed.
popularity_df = entries_per_product.sort_values(by='num_entries', ascending=False)

popularity_df.head()

Unnamed: 0_level_0,num_entries
ProductID,Unnamed: 1_level_1
B07XVVVB8W,22
B08X9LB1WC,7
B0BSRPX53Z,7
B0B4328BFW,5
B08KBQNDJC,4


In [10]:
from recsys.utils import prepare_evaluation_df

# Build the evaluation set from the interactions table. The helper's result is
# converted with .to_pandas() so the rest of the notebook works with a pandas
# DataFrame. NOTE(review): judging by the cell output the helper also samples
# negative (target=0) user/product pairs - confirm in recsys.utils.
evaluation_frame = prepare_evaluation_df(user_item_data_df)
evaluation_df = evaluation_frame.to_pandas()

evaluation_df.head()

Transformation started...
Negative candidates: 380264, Positive samples: 1000
Num negatives 0.6702412868632708


Unnamed: 0,category,CustomerID,ProductID,target
0,health,AE25NQAZI3725GZIL5FS52ZIKWKQ,B007QESMDK,1
1,health,AE25NQAZI3725GZIL5FS52ZIKWKQ,B08JVF5B7F,0
2,health,AE25NQAZI3725GZIL5FS52ZIKWKQ,B08LSNW25R,0
3,health,AE25NQAZI3725GZIL5FS52ZIKWKQ,B077B5NV9Y,0
4,health,AE25NQAZI3725GZIL5FS52ZIKWKQ,B07CT9HTM9,0


# Features

Prepare sparse TF-IDF features from the product titles.

In [13]:
file_name = 'meta_Health_and_Personal_Care.jsonl.gz'
data_path = os.path.join(root_data_dir, file_name)

# Build the lookup once. Testing membership against the ProductID array inside
# the comprehension rescans the whole array for every metadata record, whereas
# a set gives constant-time lookups.
known_product_ids = set(user_item_data_df['ProductID'])

# Keep only the metadata of products that occur in the interactions table.
json_meta_data = [
    item for item in read_raw_data(data_path)
    if item['parent_asin'] in known_product_ids
]
print(len(json_meta_data))

Dataset num items: 60293 from /content/drive/MyDrive/ml_course_data/meta_Health_and_Personal_Care.jsonl.gz
851


In [14]:
# Flatten the product metadata records into a table; nested fields become
# dotted column names (e.g. `details.*`).
catalog_df = pd.json_normalize(json_meta_data)

catalog_df.head(3)

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,...,details.Coating Description,details.Photo Filter Effect Type,details.Filter Type,details.Is Foldable,details.Target Species,details.Cartoon Character,details.Filter Class,details.Test type,details.Allergen Information,details.Mounting Type
0,Health & Personal Care,"GoodSense Premium Saline, Nasal Moisturizing S...",4.1,29,"[INGREDIENTS: Compare to Ocean ingredients., U...",[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],Good Sense,...,,,,,,,,,,
1,Health & Personal Care,Premium Dry brush for take a bath and Lymphati...,2.7,7,[],[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],Smiley smith,...,,,,,,,,,,
2,Health & Personal Care,Cord Locks Silicone Toggles for Drawstrings El...,4.2,632,[],[],,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'Cord Locks Silicone Toggles', 'url...",Abodhu,...,,,,,,,,,,


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Settings for the TF-IDF vectorizer applied to product titles.
tfidf_params = dict(
    stop_words='english',
    ngram_range=(1, 2),  # unigrams and bigrams
    max_features=100,    # cap the vocabulary size
    max_df=0.8,          # drop terms found in more than 80% of the titles
    min_df=1,            # keep terms found in at least one title
)
vectorizer = TfidfVectorizer(**tfidf_params)

# One title per metadata record, in the same order as json_meta_data.
product_titles = [meta['title'] for meta in json_meta_data]

# fit_transform returns a sparse matrix; it is densified here.
tfidf_matrix = vectorizer.fit_transform(product_titles).toarray()
feature_names = vectorizer.get_feature_names_out()
print(tfidf_matrix.shape)

(851, 100)


In [16]:
# Map each product's parent_asin to its TF-IDF row. The matrix rows follow the
# order of json_meta_data, so the two sequences can be walked in lockstep.
feature_store = {
    meta['parent_asin']: title_vector
    for meta, title_vector in zip(json_meta_data, tfidf_matrix)
}
print(len(feature_store))

851


In [19]:
try:
  import catboost
except:
  !pip install catboost
  clear_output()

In [20]:
from IPython.display import clear_output
from recsys.model import get_model, get_data

# Create the model and the training pool (evaluation rows combined with the
# per-product features), then fit the model on that pool.
model = get_model()
data_pool, target = get_data(evaluation_df, feature_store)
model.fit(data_pool)

# Drop whatever the training step printed before reporting completion.
clear_output()
print('model training finished', model, sep='\n')

model training finished
<catboost.core.CatBoostClassifier object at 0x7b0e13ca03a0>


In [21]:
from sklearn.metrics import f1_score, accuracy_score, recall_score, roc_auc_score

# NOTE(review): data_pool is the same pool the model was fitted on, so this is
# a training-set metric rather than a held-out estimate.
proba = model.predict_proba(data_pool)

# Column 1 of the probability matrix is used as the ranking score
# (presumably the positive class - confirm against the model's class order).
class1_scores = proba[:, 1]
auc = roc_auc_score(target, class1_scores)
print('roc_auc: %.4f' % auc)

roc_auc: 0.6129


# Search

This section works only in local mode: it requires the search service to be running at `http://localhost:8000`.

In [23]:
import requests

def get_search_results(query, num=10, url="http://localhost:8000/search", timeout=10):
    """Query the search service and return its decoded JSON response.

    The call is best-effort: if the service is unreachable, times out, answers
    with an HTTP error status, or returns a body that is not valid JSON, the
    failure is reported with a printed message and an empty dict is returned
    instead of raising.

    Args:
        query: Free-text search query, sent as the ``text`` field.
        num: Number of results to request, sent as the ``num`` field.
        url: Endpoint of the search service.
        timeout: Seconds to wait for the service before giving up.

    Returns:
        The decoded JSON payload, or ``{}`` when the request failed.
    """
    payload = {
        "text": query,
        "num": num
    }
    try:
        response = requests.post(url, json=payload, timeout=timeout)
        # Treat HTTP error statuses as failures rather than returning an
        # error payload as if it were a list of candidates.
        response.raise_for_status()
        return response.json()
    except (requests.RequestException, ValueError) as exc:
        # ValueError covers JSON decoding errors raised by response.json().
        print(f"Search request to {url} failed: {exc}")
        return {}

# Try the search helper with a sample query; an empty dict means the request
# to the search service did not succeed.
candidates = get_search_results('cough')
print(candidates)  # Print the search results


{}


# RAG pipeline

In [None]:
from llm import recs_generation

In [None]:
candidates = get_search_results('headache tablets', num=30)

# get_search_results returns an empty result when the search request fails.
# In that case there is nothing to summarize and no 'asin' column to sort by,
# so both steps are skipped instead of raising.
if candidates:
    print(recs_generation(candidates))
    candidates_df = pd.json_normalize(candidates).sort_values(by='asin')
else:
    print('No search candidates returned; skipping recommendation generation')
    # Column names mirror the ones shown for a successful search response.
    candidates_df = pd.DataFrame(columns=['content', 'asin'])

candidates_df

connecting OpenAI...
Recommendation: Opt for item_id B09GBMG83Z with 300 small, easy-to-swallow Vitamin C tablets, despite slight chalkiness. Positive sentiment outweighs minor issue.


Unnamed: 0,content,asin
10,I purchased these tablets while traveling in L...,B0017TO05Y...
29,I rate this product 5 stars because it has hel...,B002ALZOVW...
27,I have seen these in those Zee Medical boxes a...,B002C6467S...
3,*Their heading has been revised and the price ...,B002C6467S...
11,This is a great headache relief product. I hav...,B002C6467S...
15,I have to cut big tablets that are in a oblong...,B0035T4B30...
18,I would have liked an instructional paper sent...,B00770BWGW...
1,I saw and bought this product at Vitamin Shopp...,B00CJEON1C...
13,"Picture shows tablets, description doesn't say...",B00I2LOHNU...
12,These were not my favorite. The tablets were k...,B00KZH9U9K...
