In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.preprocessing import normalize
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import math

In [2]:
import implicit

In [3]:
train_joke_df = pd.read_csv(r'..\data\recsys-in-practice\train_joke_df.csv')

In [59]:
train_joke_df['UID'] = train_joke_df['UID'].astype(str)
train_joke_df['JID'] = train_joke_df['JID'].astype(str)

In [6]:
train_df, valid_df = train_test_split(positives, test_size=0.15, random_state=42)

In [22]:
positives = train_df[train_df["Rating"] > 5].copy()
positives['Rating'] = np.ones((len(positives)))
positives

Unnamed: 0,UID,JID,Rating
5,17990,36,1.0
12,3437,29,1.0
14,782,41,1.0
18,23812,57,1.0
29,6627,28,1.0
...,...,...,...
1448343,20585,30,1.0
1448345,14140,14,1.0
1448350,15184,65,1.0
1448355,1658,50,1.0


In [21]:
negatives = train_df[train_df["Rating"] < -5].copy()
negatives['Rating'] = np.ones((len(negatives)))
negatives

Unnamed: 0,UID,JID,Rating
4,11365,38,-6.60
7,8665,35,-8.35
19,15949,24,-9.66
21,17533,45,-5.39
24,4468,14,-6.50
...,...,...,...
1448338,18497,7,-8.98
1448340,20299,67,-9.56
1448342,18920,51,-8.79
1448353,20587,16,-8.59


In [80]:
history = train_joke_df.groupby('UID').agg({'JID':list}).rename(columns={'JID':'history'})
history

Unnamed: 0_level_0,history
UID,Unnamed: 1_level_1
1,"[53, 30, 15, 7, 29, 38, 20, 32, 54, 59, 3, 82,..."
10,"[9, 39, 34, 57, 63, 23, 45, 43, 6, 31, 1, 18, ..."
100,"[69, 28, 54, 39, 8, 36, 46, 53, 32, 20, 66, 16..."
1000,"[60, 62, 8, 32, 54, 63, 26, 38, 57, 30, 37, 12..."
10000,"[14, 18, 68, 66, 53, 40, 49, 69, 12, 50, 23, 2..."
...,...
9995,"[32, 5, 13, 48, 27, 77, 21, 63, 36, 38, 11, 65..."
9996,"[70, 39, 7, 51, 13, 89, 38, 25, 26, 17, 69, 31..."
9997,"[20, 8, 38, 17, 55, 1, 22, 29, 53, 49, 85, 45,..."
9998,"[85, 67, 40, 19, 31, 94, 27, 52, 1, 56, 26, 47..."


In [69]:
class Encoder:
    def __init__(self, arr):
        self.idx = {}
        self.pid = {}
        for idx, pid in enumerate(arr):
            self.idx[pid] = idx
            self.pid[idx] = pid

    def toIdx(self, x):
        if type(x) == str:
            pid = x
            return self.idx[pid]
        return [self.idx[pid] for pid in x]

    def toPid(self, x):
        if type(x) == int:
            idx = x
            return self.pid[idx]
        return [self.pid[idx] for idx in x]

    def __len__(self):
        return len(self.idx)

In [70]:
user_encoder = Encoder(np.unique(train_joke_df['UID']))

In [72]:
item_encoder = Encoder(np.unique(train_joke_df['JID']))

In [73]:
item_encoder.toIdx('1')

0

In [74]:
item_encoder.toPid(0)

'1'

In [75]:
len(item_encoder)

100

In [115]:
def make_coo_row(history, encoder: Encoder):
    idx = []
    values = []

    items = []
    for trans in history:
        items.extend([i for i in trans])

    #display(items)
    for item in items:
        idx.append(encoder.toIdx(item))
        values.append(1.0)

    #display(idx)
    #display(values)
    return sparse.coo_matrix(
        (np.array(values).astype(np.float32), ([0] * len(idx), idx)), shape=(1, len(encoder)),
    )

In [116]:
make_coo_row(history.loc['1'].values, item_encoder).toarray()

array([[1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1.,
        1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1.,
        1., 0., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 0., 1.,
        1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0.,
        0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 1., 0., 0.]], dtype=float32)

In [120]:
rows = []
for i, hist in enumerate(tqdm(history.values)):
    rows.append(make_coo_row(hist, item_encoder))

  0%|          | 0/24983 [00:00<?, ?it/s]

In [121]:
X_sparse = sparse.vstack(rows).tocsr()

In [122]:
X_sparse

<24983x100 sparse matrix of type '<class 'numpy.float32'>'
	with 1448364 stored elements in Compressed Sparse Row format>

In [9]:
X_sparse_pos = to_csr(positives)
X_sparse_neg = to_csr(negatives)

(22663, 100)

In [129]:
model = implicit.als.AlternatingLeastSquares(factors=16, regularization=0.0, iterations=8)
model.fit(X_sparse.T)



  0%|          | 0/8 [00:00<?, ?it/s]

In [130]:
model.similar_items(0)

(array([    0, 13644,  4476, 16531, 11516, 22630,  4347, 17384,   700,
         9983]),
 array([1.        , 0.88279974, 0.8800397 , 0.8745552 , 0.8613354 ,
        0.85230607, 0.8467042 , 0.84244394, 0.83925617, 0.8350803 ],
       dtype=float32))

In [None]:
model_neg = implicit.als.AlternatingLeastSquares(factors=16, regularization=0.0, iterations=8)
model_neg.fit(X_sparse_neg.T)

In [43]:
valid_df

Unnamed: 0,UID,JID,Rating
1359204,8706,26,7.82
896526,11106,8,7.48
1359992,9738,15,6.26
121837,12447,64,5.78
1152944,6160,32,8.35
...,...,...,...
1420145,10778,31,9.03
979625,20954,12,6.75
1299559,21544,46,9.03
1398909,11978,38,8.30


In [50]:
sim_items = model_pos.similar_items(joke_id)
sim_items[0]

array([   25,  7473,  6666, 14336, 22520, 10301,  9573, 13342, 20084,
       18025])

In [127]:
df = train_df
#model = model_pos

#for i, row in tqdm(valid_df.values):
user_id = 8706 - 1#int(row['UID']) - 1
joke_id = 26 - 1 #int(row['JID']) - 1

ratings = []
sim_items = model.similar_items(joke_id)
sim_users = model.similar_users(user_id)

for sim_item in sim_items[0]:
    u_df = df.loc[(df['UID'] == user_id + 1) & (df['JID'] == sim_item + 1)]
    display(u_df)
            

for sim_user in sim_users[0]:
    j_df = df.loc[(df['UID'] == sim_user + 1) & (df['JID'] == joke_id + 1)]
    display(j_df)
    
    
#if i > 10:
#    break

TypeError: 'NoneType' object is not subscriptable

In [42]:
train_joke_df.loc[(train_joke_df['UID'] == 1) & (train_joke_df['Rating'] > 5)].sort_values('JID')

Unnamed: 0,UID,JID,Rating
1055207,1,2,8.79
1418149,1,14,8.45
1150603,1,27,7.82
80144,1,29,9.13
839221,1,35,5.05
138284,1,54,8.3
349879,1,61,8.59
375964,1,68,8.3
1003269,1,69,5.68


In [41]:
train_joke_df.loc[(train_joke_df['UID'] == 17671) & (train_joke_df['Rating'] > 5)].sort_values('JID')

Unnamed: 0,UID,JID,Rating
770624,17671,7,7.38
378259,17671,14,7.96
398403,17671,19,8.59
972905,17671,26,8.54
1152645,17671,27,8.01
963301,17671,28,7.14
552741,17671,29,8.25
854006,17671,36,6.8
825523,17671,47,6.31
191168,17671,53,6.36


In [19]:
model_pos.similar_items(0)

(array([    0, 17670, 22296, 21780, 11494, 15553,  9187,  3695,  1841,
        14449]),
 array([0.9999999 , 0.9211741 , 0.8317306 , 0.8303811 , 0.82221484,
        0.81230956, 0.8023888 , 0.79893845, 0.7965063 , 0.79022676],
       dtype=float32))

In [20]:
model_pos.similar_users(0)

(array([ 0,  2, 63, 40,  1, 22, 32,  8, 58, 54]),
 array([1.        , 0.97699744, 0.97001505, 0.9645086 , 0.95583564,
        0.95353734, 0.9501611 , 0.94154257, 0.93922323, 0.93723714],
       dtype=float32))