In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gzip 
import os
import json

from tqdm import tqdm_notebook
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.base import BaseEstimator
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


In [2]:
# Загрузка данных
# Industrial and Scientific: 5-core (77,071 reviews), ratings only (1,758,333 ratings), metadata (167,524 products)
# metadata - http://deepyeti.ucsd.edu/jianmo/amazon/index.html

!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Industrial_and_Scientific.csv
#!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Industrial_and_Scientific_5.json.gz

--2022-06-24 06:46:42--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Industrial_and_Scientific.csv
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 73390279 (70M) [application/octet-stream]
Saving to: ‘Industrial_and_Scientific.csv’


2022-06-24 06:46:44 (64.1 MB/s) - ‘Industrial_and_Scientific.csv’ saved [73390279/73390279]



In [3]:
data_r = pd.read_csv("./Industrial_and_Scientific.csv", names=["item_id", "person_id", "mark", "time"], parse_dates=['time'])
data_r.head()

Unnamed: 0,item_id,person_id,mark,time
0,176496920,A3FANY5GOT5X0W,5.0,1358899200
1,176496920,AT6HRPPYOPHMB,5.0,1352073600
2,176496920,A4IX7B38LIN1E,4.0,1350432000
3,176496920,A12Q4LR8N17AOZ,5.0,1490745600
4,176496920,A1GJXZZPOZ3OD9,1.0,1490054400


In [4]:
data_r.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1758333 entries, 0 to 1758332
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   item_id    object 
 1   person_id  object 
 2   mark       float64
 3   time       object 
dtypes: float64(1), object(3)
memory usage: 53.7+ MB


In [5]:
for col in ['item_id', 'person_id']:
    print(col, len(data_r[col].unique()))

item_id 165764
person_id 1246131


In [6]:
# Сколько оценок оставляют пользователи 
a = data_r.groupby("person_id")["item_id"].count()
print(a.min(),a.mean(), a.max())

1 1.411033831916548 156


In [7]:
# Сколько оценок остаполучают товары
a = data_r.groupby("item_id")["person_id"].count()
print(a.min(),a.mean(), a.max())

1 10.607447938032383 14331


In [8]:
# Давольно много пользователей с одной-двумя оценками. Их будет трудно делить на трей-тест. 
# К тому же думается, что user-based для них не даст интересных результатов. 
# user-based будем применять для пользователей с количеством отывов больще n
# Ну и пямять экономим.

n = 10

data_r["person_marks"] = data_r.groupby("person_id")["item_id"].transform("count")
data_reduced = data_r[data_r["person_marks"] > n]
print(data_reduced.shape[0], "items were taken from", data_r.shape[0])

59219 items were taken from 1758333


In [9]:
data_reduced.mark.value_counts()

5.0    42781
4.0     9566
3.0     3515
1.0     1912
2.0     1445
Name: mark, dtype: int64

In [10]:
for col in ['item_id', 'person_id']:
    print(col, len(data_reduced[col].unique()))

item_id 25967
person_id 3648


In [11]:
rmse = lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred))

def train_test_split_rs(X, ratio=0.2, user_col='userId', item_col='movieId',
                     rating_col='rating', time_col='timestamp'):
    # сортируем оценки по времени
    X.sort_values(by=[time_col], inplace=True)
    # список всех юзеров
    userIds = X[user_col].unique()
    X_train_data = []
    X_test_data = []
    y_train = []
    y_test = []
    for userId in tqdm_notebook(userIds):
        curUser = X[X[user_col] == userId]
        # определяем позицию, по которой делим выборку и размещаем данные по массивам
        idx = int(curUser.shape[0] * (1 - ratio))
        X_train_data.append(curUser[[user_col, item_col]].iloc[:idx, :].values)
        X_test_data.append(curUser[[user_col, item_col]].iloc[idx:, :].values)
        y_train.append(curUser[rating_col].values[:idx])
        y_test.append(curUser[rating_col].values[idx:])
    # cтекуем данные по каждому пользователю в общие массивы
    X_train = pd.DataFrame(np.vstack(X_train_data), columns=[user_col, item_col])
    X_test = pd.DataFrame(np.vstack(X_test_data), columns=[user_col, item_col])
    y_train = np.hstack(y_train)
    y_test = np.hstack(y_test)
    return X_train, X_test, y_train, y_test

In [12]:
%%time
X_train, X_test, y_train, y_test = train_test_split_rs(data_reduced, ratio=0.2, user_col='person_id', item_col='item_id',
                     rating_col='mark', time_col='time')

#X_train, X_test, y_train, y_test = train_test_split(data_reduced[['person_id', 'item_id']], data_reduced['mark'], stratify=data_reduced['person_id'], test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  del sys.path[0]


  0%|          | 0/3648 [00:00<?, ?it/s]

(45612, 2) (13607, 2)
CPU times: user 1min 8s, sys: 725 ms, total: 1min 9s
Wall time: 1min 8s


# Collaborative filtering User-based model

In [13]:
class UserBased(BaseEstimator):
    def fit(self, X, y, user_col='userId', item_col='movieId'):
        self.false_predict = 0
        X = X.copy()
        # сохраним текущих пользователей и имеющиеся предметы
        self.users = X[user_col].unique()
        self.items = X[item_col].unique()
        
        X['y'] = y
        # рассчитаем среднее значение рейтинга для пользователя и предмета
        self.mean_y_user = X.groupby(user_col)['y'].mean()
        self.mean_y_item = X.groupby(item_col)['y'].mean()
        
        # вычитаем среднюю оценку пользователя
        X['y'] -= X[user_col].apply(lambda x: self.mean_y_user[x])
        
        # создаём векторы для каждого пользователя из просмотренных фильмов
        # для неизвестных фильмов ставим оценку 0
        self.user_ratings = pd.pivot_table(X, values='y', index=user_col,
                                           columns=item_col, fill_value=0.00001) #0 -self.mean_y_user.mean()
        
        
        # считаем попарную схожесть между юзерами
        self.user_sim = cosine_similarity(self.user_ratings)
        
        # также сделаем словарь - {значение user_col: index в user_ratings}
        self.user_pos = dict()
        for user in self.users:
            self.user_pos[user] = np.argwhere(self.user_ratings.index.values == user)[0][0]
        return self
    
    def predict_rating(self, pr_user, pr_item):
        # если в обучающей выборке нет такого предмета
        # или пользователя, то вернём 0
        if not pr_item in self.items or not pr_user in self.users:
            self.false_predict += 1
            return 0
        
        # считаем числитель и знаменатель дроби из формулы предсказания
        numerator = self.user_sim[self.user_pos[pr_user]].dot(
                        self.user_ratings.loc[:, pr_item])   
        # вычитаем 1, так как схожесть пользователя с самим собой равна 1,
        # но модель не должна это учитывать
        denominator = np.abs(self.user_sim[self.user_pos[pr_user]]).sum() - 1
        
        ans = self.mean_y_user[pr_user] + numerator / denominator
        
        return ans
    
    def predict(self, X, user_col='userId', item_col='movieId'):
        y = X[[user_col, item_col]].apply(lambda row: self.predict_rating(row[0], row[1]), axis=1)
        print(self.false_predict, " items wasn't in train.")
        self.false_predict = 0
        return y


In [14]:
%%time
print('start fitting...')
ub = UserBased().fit(X_train, y_train, user_col='person_id', item_col='item_id')


start fitting...
CPU times: user 33.1 s, sys: 4.35 s, total: 37.4 s
Wall time: 22 s


In [15]:
print('start predicting...')
pred = ub.predict(X_test, user_col='person_id', item_col='item_id')
print('rmse = {}'.format(mean_squared_error(y_test, pred, squared=False)))


start predicting...
5123  items wasn't in train.
rmse = 2.9072999963338804


In [16]:
un_lst = list(X_train['item_id'].unique())
X_test["in_train"] = X_test.apply(lambda a: 1 if a["item_id"] in un_lst else 0, axis=1)
#print(X_test_.shape, X_test.shape)
X_test["y"] = y_test

In [17]:
X_test_cleared = X_test[X_test["in_train"] == 1]
y_test_cleared = X_test_cleared['y']

In [18]:
print('start predicting...')
pred = ub.predict(X_test_cleared, user_col='person_id', item_col='item_id')
print('rmse = {}'.format(mean_squared_error(y_test_cleared, pred, squared=False)))

start predicting...
0  items wasn't in train.
rmse = 0.8800472539235036


# Collaborative filtering with Surprise

In [19]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Installing collected packages: surprise
Successfully installed surprise-0.1
[0m

In [20]:
from surprise import Dataset
from surprise import KNNBasic, SVD
from surprise import Reader
from surprise.model_selection import cross_validate, split
from surprise.model_selection import train_test_split
from surprise import accuracy


In [21]:
# Подгружаем данные. Lля этого алгоритма также урезанную версию. Все опять не помещаются в память.

reader = Reader(rating_scale=(1, 5))
data_suprise_cf = Dataset.load_from_df(data_reduced[[ 'person_id', 'item_id', 'mark']], reader)

In [22]:
%%time

#A basic collaborative filtering algorithm.
algo_knn = KNNBasic()

# test set is made of 20% of the ratings.
trainset, testset = train_test_split(data_suprise_cf, test_size=0.2)

# Train the algorithm on the trainset, and predict ratings for the testset
algo_knn.fit(trainset)

predictions = algo_knn.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)


Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9840
CPU times: user 1.45 s, sys: 91.2 ms, total: 1.54 s
Wall time: 1.53 s


0.9840224001155745

# Surprise SVD

In [23]:
# Подгружаем данные, для этого алгоритма уже все.

reader = Reader(rating_scale=(1, 5))

data_suprise = Dataset.load_from_df(data_r[[ 'person_id', 'item_id', 'mark']], reader)

In [24]:
%%time

algo_svd = SVD()

trainset, testset = train_test_split(data_suprise, test_size=0.2)

algo_svd.fit(trainset)

predictions = algo_svd.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

RMSE: 1.1649
CPU times: user 3min 24s, sys: 923 ms, total: 3min 25s
Wall time: 3min 25s


1.1648709642994037

In [25]:
predictions[:3]

[Prediction(uid='A33OGXNP18KEUU', iid='B01G82SJRY', r_ui=5.0, est=4.7814127792756835, details={'was_impossible': False}),
 Prediction(uid='A1GENHNZ6DDIQ9', iid='B001VXY3F8', r_ui=5.0, est=4.688906744232365, details={'was_impossible': False}),
 Prediction(uid='A343CMN9MBKSQG', iid='B00SX3T2LO', r_ui=5.0, est=4.591066460149551, details={'was_impossible': False})]

# Content-based NLP

In [26]:
#!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_Industrial_and_Scientific.json.gz
load_files = False

In [27]:
# Даныых много, поэтому при повторном запуске будем загружать только нужные,  которые сохраним себе в датафрейм.

if load_files:
    
    product_data = []
    with gzip.open('./meta_Industrial_and_Scientific.json.gz') as f:
        for l in f:
            product_data.append(json.loads(l.strip()))

    # total length of list, this number equals total number of products
    print(len(product_data))
    # first row of the list
    print(product_data[0])
    
    product_asin_list = [item['asin'] for item in product_data]
    product_title_list = [item['title'] for item in product_data]

    pd.DataFrame(zip(product_asin_list, product_title_list), columns=["asin", "title"]).to_csv("./product_data.csv")
    print("done")

In [28]:
if not load_files:
    product_data_loaded = pd.read_csv("../input/amazonworkdata/product_data.csv", index_col=0)
    product_asin_list = product_data_loaded['asin'].values.tolist()
    product_title_list = product_data_loaded['title'].values.tolist()
    print(product_data_loaded.head())


         asin                                              title
0  0176496920       Turning Technologies Response Card (RCRF-03)
1  0692782109  R-Cat 692782109 EKG Badge with Arrhythmia Pock...
2  0781776848  Anatomical Chart Company's Illustrated Pocket ...
3  0781786606   Joints of the Lower Extremities Anatomical Chart
4  0840026080       Turning Technologies ResponseCard XR RCXR-01


In [29]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.1.tar.gz (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m769.5 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
Collecting huggingface-hub>=0.8.1
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.5/101.5 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l- \ | done
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.1-py3-none-any.whl size=125774 sha256=8a4c5cd73d6c2d956ae7c8e817a4db93c4344420d646f63cb1f124b9b720a193
  Stored in directory: /root/.cache/pip/wheels/58/27/2f/708b4f002c226e57b6243769da345c650633175c7634f93365
Successfully built sentence-transformers
Installing collected pack

In [30]:
from sentence_transformers import SentenceTransformer

if load_files:
    model = SentenceTransformer('sentence-transformers/LaBSE')


In [31]:
%%time

outfile = "./embeddings.npy"
infile ="../input/amazonworkdata/embeddings.npy"
# Процесс энкодиинга всех данных занимает много времени. Посэтому при повторных запусках будем подшружать уже готовые.
# CPU times: user 4h 53min 31s, sys: 44.7 s, total: 4h 54min 16s
# Wall time: 2h 27min 45s

if load_files:
    
    embeddings = model.encode(product_title_list)
    
    with open(outfile, 'wb') as f:
        np.save(f, embeddings)
else:
    with open(infile, 'rb') as f:
        embeddings = np.load(f)

CPU times: user 37.7 ms, sys: 674 ms, total: 711 ms
Wall time: 5.62 s


In [32]:
print(embeddings.shape)

(167442, 768)


In [33]:
def top_n_nearesst_index(embeddings, item_to_predict = 0, n = 5): 

    cs_list = []
    #print(embeddings[item_to_predict].reshape(1, -1))
    
    for item in embeddings:
        #cs = cosine_similarity(item.reshape(1, -1), embeddings[item_to_predict].reshape(1, -1))
        
        # np cosine_similarity works faster
        cs = np.correlate(item, embeddings[item_to_predict])

        cs_list.append(cs[0])
    
    cs_list_sorted = sorted(cs_list)

    n_largest_cos_sim = cs_list_sorted[-n-1]
        
    ans =  np.argwhere(cs_list >= n_largest_cos_sim)
       
    return ans.reshape(1, -1)[0][1:]


In [34]:
# Посмотрим результат для выборки товаров:
item_rec_list = []
for item_num in tqdm_notebook([0, 2, 14, 44, 77, 188]):
    item_nearesst_indexes = top_n_nearesst_index(embeddings, item_num, 5)
    a = [product_title_list[item_num]]
    print()
    print(product_title_list[item_num], "id = ", item_num)
    print()
    for item in item_nearesst_indexes:
        print("id = ", item)
        print(product_title_list[item])
        a.append(product_title_list[item])
    item_rec_list.append(a)
    print("===================")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


  0%|          | 0/6 [00:00<?, ?it/s]


Turning Technologies Response Card (RCRF-03) id =  0

id =  4
Turning Technologies ResponseCard XR RCXR-01
id =  95004
TurningPoint ResponseCard XR NXT Classroom 32 RCXR-02 Clicker Set LCD RF Response System complete w/ bag, software, RRRF-04 Receiver
id =  109487
RF Industries - RFT-1202-2T - RF Industries RFT-1202-2T
id =  113827
Carling Technologies V4D1A60B-AEC00-000 Switch Rocker
id =  157630
RT Off-Road Vehicle Recovery Kit - RT33013

Anatomical Chart Company's Illustrated Pocket Anatomy: The Spinal Nerves &amp; the Autonomic Nervous System Study Guide id =  2

id =  8
The Spinal Nerves Anatomical Chart
id =  10
The Spinal Nerves Anatomical Chart
id =  4500
The Nervous System Anatomical Chart Laminated
id =  11400
Illustrated Pocket Anatomy - Vertebral Column and Spine Disorders
id =  27145
Muscular System Chart &amp; Skeletal System Anatomical Chart with Study Guide

VersaTiles Math Small Group Kit (Grade 4) id =  14

id =  29537
School Smart Modeling Clay, Multiple Color (Pack