In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import colors
from matplotlib.ticker import PercentFormatter
from tqdm import tqdm

import timeit
import time
import math
from sklearn.model_selection import train_test_split
# ./indexer
from indexer import AppendIndexer
import ALS

# Annoy
from annoy import AnnoyIndex

#SKLearn 
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import average_precision_score
from sklearn.metrics import pairwise_distances

# scipy
from scipy.spatial import distance
from scipy.sparse import lil_matrix
import scipy
import scipy.sparse
import itertools
import random

In [2]:
class ViewMatrix:
    def __init__(self, path):
        self.path = path
        self.original = True
        self.item_indexer = AppendIndexer.load('./chetor.com/view_matrix/item_indexer.indexer')
        self.user_indexer = AppendIndexer.load('./chetor.com/view_matrix/user_indexer.indexer')
        
    def load_matrix(path):
        
        matrix = ViewMatrix(path)

        try:
            matrix.view_matrix = ViewMatrix \
                .load_sparse_lil(path)
        except:
            print('Error: loading', path)

        return matrix

    def load_sparse_lil(filename):
        loader = np.load(filename, allow_pickle=True)
        result = lil_matrix(tuple(loader["shape"]), dtype=str(loader["dtype"]))
        result.data = loader["data"]
        result.rows = loader["rows"]
        return result
    
    def make_dense(self, user_min_view, item_min_view):
        self.original = False
        while True:
            removed_rows_cnt = self.trim_users_with_few_views(user_min_view)
            removed_columns_cnt = self.trim_columns_with_few_views(item_min_view)
            if not removed_columns_cnt and not removed_rows_cnt:
                break

    def trim_users_with_few_views(self, user_min_view):
        removing_row_indices = list(np.where(self.view_matrix.getnnz(1) < user_min_view)[0])
        print('Number of users which should be deleted:', len(removing_row_indices))
        self.trim_user_indices(to_remove_indices=removing_row_indices)
        return len(removing_row_indices)

    def trim_columns_with_few_views(self, column_min_view):
        removing_column_indices = list(np.where(self.view_matrix.getnnz(0) < column_min_view)[0])
        print('Number products which should be deleted:', len(removing_column_indices))
        self.trim_column_indices(to_remove_indices=removing_column_indices)
        return len(removing_column_indices)
    
    def trim_user_indices(self, to_remove_indices):
        self.user_indexer.remove_indexes(to_remove_indices)
        self.view_matrix = ViewMatrix.delete_row_lil(self.view_matrix, to_remove_indices)
    
    def trim_column_indices(self, to_remove_indices):
        self.item_indexer.remove_indexes(to_remove_indices)
        self.view_matrix = ViewMatrix.delete_column_lil(self.view_matrix, to_remove_indices)
    
    def delete_column_lil(mat: lil_matrix, *i) -> lil_matrix:
        mat = mat.transpose()
        mat = ViewMatrix.delete_row_lil(mat, *i)
        return mat.transpose()
    
    def delete_row_lil(mat: lil_matrix, *i) -> lil_matrix:
        if not isinstance(mat, lil_matrix):
            raise ValueError("works only for LIL format -- use .tolil() first")
        mat = mat.copy()
        mat.rows = np.delete(mat.rows, i)
        mat.data = np.delete(mat.data, i)
        mat._shape = (mat.rows.shape[0], mat._shape[1])
        return mat
    def to_csr(self):
        train_data = self.view_matrix.astype(np.float64)
        train_data = train_data.tocoo()
        train_data.data = np.log10(train_data.data) + 1
        train_data = train_data.tocsr()
        return train_data

In [3]:
def ALSReady(path: str, l = 2):
    now = time.time()
    matrix = ViewMatrix.load_matrix(path)
    print('View matrix loaded in', time.time() - now, 'seconds.')

    now = time.time()
    sparce_matrix = matrix.to_csr()
    matrix.make_dense(user_min_view = l, 
                      item_min_view = l)
    implicit_matrix = matrix.to_csr()
    print('matrix has been made dense in', time.time() - now, 'seconds.')
    return matrix, sparce_matrix, implicit_matrix

def CFTrain(matrix, implicit_matrix, _alpha = 15, _facs = 20, _itr = 15, save = False):
    
    now = time.time()
    als_model = ALS.Als(num_factors = _facs,
                        iterations = _itr,
                        num_threads = 10,
                        alpha = _alpha)

    
    als_model.fit(implicit_matrix)
    alsTime = time.time() - now
    print('ALS model is fitted in', alsTime, 'seconds.')
    if save:
        print('Saving Data ...')
        matrix.item_indexer.dump('./chetor.com/alisResult/ALS/ali_item_indexer_factorized.indexer')
        matrix.user_indexer.dump('./chetor.com/alisResult/ALS/ali_user_indexer_factorized.indexer')
        np.save('./chetor.com/alisResult/ALS/ali_items_vectors.npy', als_model.item_vectors)
        np.save('./chetor.com/alisResult/ALS/ali_users_vectors.npy', als_model.user_vectors)

    return als_model.item_vectors, als_model.user_vectors, alsTime

implicit matrix ro migire va ye bakhshisho baramun test o train mikone ke ye bakhshi az cell ha sefr shodan:

In [4]:
def generate_test_set_precision_recall(implicit_matrix, _test_size = 0.05, test_cells = 0.2):
    train, test = train_test_split(implicit_matrix, shuffle=False, test_size = _test_size)
    print('test_shape', test.shape, 'train_shape (which we cant process bc of RAM)', train.shape)
    rows,cols = test.nonzero()
    delete_index = [(rows[i], cols[i]) for i in random.sample(range(1, len(rows)), int(len(rows)*test_cells))]
    print("total cells", len(rows), "number of deleted cells", len(delete_index))
    
    x_train = scipy.sparse.csr_matrix(test.shape)
    for i, j in zip(rows, cols):
        if (i, j) in delete_index:
            continue
        x_train[i, j] = test[i, j]
    return x_train, test, delete_index

In [5]:
def precision_at_k_old(test, test_approx, k = 10):
    metric = []
    metric2 = []
    for i in tqdm(range(len(test))):
        t = n_argmax(test[i], k)
        nonz = np.nonzero(test[i])[0]
        ta = n_argmax(test_approx[i], k)   
        rec_rel = 0
        for i in nonz:
            if i in ta:
                rec_rel += 1

        metric.append(rec_rel/len(nonz))
        metric2.append(rec_rel/k)
    return np.mean(np.array(metric)), np.mean(np.array(metric2))

def hit_rate_at_k(deleted, x_train, test_approx, k = 10):
    found = []
    for i in tqdm(range(len(test_approx))):
        ta = list(n_argmax(test_approx[i], 30))
        nonz = np.nonzero(x_train[i])[0]
        same_old = []
        for l in range(len(ta)):
            if ta[l] in nonz:
                same_old.append(l)

        for l in same_old[::-1]:
            ta.pop(l)
        ta = ta[:k]        
        for j in ta:
            found.append((i, j))
    same = 0
    for i in tqdm(deleted):
        if i in found:
            same += 1
    print(same, len(deleted), same/len(deleted))
    return same/len(deleted)

def visited_at_k(x_train, test_approx, k = 10):
    same_old = []
    for i in tqdm(range(len(test_approx))):
        ta = list(n_argmax(test_approx[i], k))
        nonz = np.nonzero(x_train[i])[0]
        for l in range(len(ta)):
            if ta[l] in nonz:
                same_old.append(l)
    rows,cols = x_train.nonzero()
    return len(same_old)/len(rows)

def n_argmax(a, n):
    ranked = np.argsort(a)
    largest_indices = ranked[::-1][:n]
    return largest_indices

### Ready for ALS

In [6]:
matrix, sparce_matrix, implicit_matrix = ALSReady('./chetor.com/view_matrix/lil_matrix.npz', l=2)

View matrix loaded in 6.775495767593384 seconds.
Number of users which should be deleted: 1863687
Number products which should be deleted: 1308
Number of users which should be deleted: 265
Number products which should be deleted: 4
Number of users which should be deleted: 1
Number products which should be deleted: 1
Number of users which should be deleted: 0
Number products which should be deleted: 0
matrix has been made dense in 17.475834846496582 seconds.


In [7]:
implicit_matrix.shape

(233197, 6877)

In [8]:
# implicit_matrix = implicit_matrix.toarray()

In [9]:
# implicit_matrix = scipy.sparse.csr_matrix(implicit_matrix)

### Running ALS

In [10]:
# item_vectors, user_vectors, alsTime = \
# CFTrain(None, train, _alpha = 10, _facs = 20, _itr = 20)

### metrics

precision@k, recall@k, hit_rate@k, visited_rate@k

In [22]:
x_train, test, deleted = generate_test_set_precision_recall(implicit_matrix, _test_size = 0.07)

test_shape (16324, 6877) train_shape (which we cant process bc of RAM) (216873, 6877)
total cells 39169 number of deleted cells 7833


In [23]:
print(x_train.shape, test.shape, len(deleted), type(test), type(x_train))

(16324, 6877) (16324, 6877) 7833 <class 'scipy.sparse.csr.csr_matrix'> <class 'scipy.sparse.csr.csr_matrix'>


In [24]:
item_vectors, user_vectors, alsTime = \
CFTrain(None, x_train, _alpha = 10, _facs = 20, _itr = 20)

ALS model is fitted in 1.5320508480072021 seconds.


In [25]:
print(item_vectors.shape, user_vectors.shape)
test_approx = np.matmul(user_vectors, item_vectors.T)
print(test_approx.shape, test.shape)

(6877, 20) (16324, 20)
(16324, 6877) (16324, 6877)


--------------------------------------------------------------------------
hit_rate@k

In [28]:
hit_rate_at_k(deleted, x_train.toarray(), test_approx, k = 10)

100%|██████████| 16324/16324 [00:07<00:00, 2160.77it/s]
100%|██████████| 7833/7833 [21:34<00:00,  6.05it/s]

2209 7833 0.28201200051066005





0.28201200051066005

In [29]:
hit_rate_at_k(deleted, x_train.toarray(), test_approx, k = 5)

100%|██████████| 16324/16324 [00:07<00:00, 2148.24it/s]
100%|██████████| 7833/7833 [11:21<00:00, 11.50it/s]

1752 7833 0.22366909230180007





0.22366909230180007

In [30]:
hit_rate_at_k(deleted, x_train.toarray(), test_approx, k = 3)

100%|██████████| 16324/16324 [00:07<00:00, 2242.78it/s]
100%|██████████| 7833/7833 [06:25<00:00, 20.31it/s]

1474 7833 0.1881782203498021





0.1881782203498021

In [31]:
hit_rate_at_k(deleted, x_train.toarray(), test_approx, k = 2)

100%|██████████| 16324/16324 [00:07<00:00, 2252.05it/s]
100%|██████████| 7833/7833 [04:11<00:00, 31.14it/s]

1247 7833 0.1591982637559045





0.1591982637559045

--------------------------------------------------------------------------
visited@k

In [32]:
visited_at_k(x_train.toarray(), test_approx, k = 10)

100%|██████████| 16324/16324 [00:05<00:00, 2734.43it/s]


0.4924049017104927

In [33]:
visited_at_k(x_train.toarray(), test_approx, k = 5)

100%|██████████| 16324/16324 [00:05<00:00, 2838.86it/s]


0.4137094715343375

In [34]:
visited_at_k(x_train.toarray(), test_approx, k = 2)

100%|██████████| 16324/16324 [00:05<00:00, 2903.11it/s]


0.2935601225427623

-------------------------------------------------------------------------------------
Average precision

In [36]:
def prepare_data_for_AP(implicit_matrix, _test_size = 0.05):
    train, test = train_test_split(implicit_matrix, shuffle=False, test_size = _test_size)
    print('test_shape', test.shape, 'train_shape (which we cant process bc of RAM)', train.shape)
    rows, cols = test.nonzero()
    for

In [37]:
prepare_data_for_AP(implicit_matrix, _test_size = 0.05)

test_shape (11660, 6877) train_shape (which we cant process bc of RAM) (221537, 6877)
[0 0 1 1 2 2 3 3 4 4]


In [None]:
def generate_test_set_precision_recall(implicit_matrix, _test_size = 0.05, test_cells = 0.2):
    train, test = train_test_split(implicit_matrix, shuffle=False, test_size = _test_size)
    print('test_shape', test.shape, 'train_shape (which we cant process bc of RAM)', train.shape)
    rows, cols = test.nonzero()
    delete_index = [(rows[i], cols[i]) for i in random.sample(range(1, len(rows)), int(len(rows)*test_cells))]
    print("total cells", len(rows), "number of deleted cells", len(delete_index))
    
    x_train = scipy.sparse.csr_matrix(test.shape)
    for i, j in zip(rows, cols):
        if (i, j) in delete_index:
            continue
        x_train[i, j] = test[i, j]
    return x_train, test, delete_index

-------------------------------------------------------------------------------------
MAE per iteration

In [4]:
matrix, sparce_matrix, implicit_matrix = ALSReady('./chetor.com/view_matrix/lil_matrix.npz', l=2)

View matrix loaded in 7.785526514053345 seconds.
Number of users which should be deleted: 1863687
Number products which should be deleted: 1308
Number of users which should be deleted: 265
Number products which should be deleted: 4
Number of users which should be deleted: 1
Number products which should be deleted: 1
Number of users which should be deleted: 0
Number products which should be deleted: 0
matrix has been made dense in 20.18146324157715 seconds.


In [6]:
train, test = train_test_split(implicit_matrix, shuffle=False)

In [7]:
print(test.shape)

(58300, 6877)


In [14]:
def itr_MAE(test_set, itr):
    item_vectors, user_vectors, alsTime = \
    CFTrain(None, test_set, _alpha = 10, _facs = 20, _itr = itr)
    approximated_matrix = np.matmul(user_vectors, item_vectors.T)
    absolute_error1 = np.absolute(test[:15000] - approximated_matrix[:15000])
    absolute_error2 = np.absolute(test[15000:30000] - approximated_matrix[15000:30000])
    absolute_error3 = np.absolute(test[30000:45000] - approximated_matrix[30000:45000])
    absolute_error4 = np.absolute(test[45000:] - approximated_matrix[45000:])
    return (np.mean(absolute_error1)+np.mean(absolute_error2)+np.mean(absolute_error3)+np.mean(absolute_error4))/4

In [None]:
MAEs = []
for i in range(5):
    MAEs.append(itr_MAE(test, i))
    print(MAEs[-1])

ALS model is fitted in 0.08308863639831543 seconds.


In [9]:
item_vectors, user_vectors, alsTime = \
CFTrain(None, test, _alpha = 10, _facs = 20, _itr = 1)

ALS model is fitted in 0.3134288787841797 seconds.


In [None]:
approximated_matrix = np.matmul(user_vectors, item_vectors.T)
absolute_error1 = np.absolute(test[:15000] - approximated_matrix[:15000])
absolute_error2 = np.absolute(test[15000:30000] - approximated_matrix[15000:30000])
absolute_error3 = np.absolute(test[30000:45000] - approximated_matrix[30000:45000])
absolute_error4 = np.absolute(test[45000:] - approximated_matrix[45000:])
(np.mean(absolute_error1)+np.mean(absolute_error2)+np.mean(absolute_error3)+np.mean(absolute_error4))/4