In [10]:
from typing import Dict

import pandas as pd
from scipy.sparse import csr_matrix
import numpy as np

class UserItemMatrix:
    def __init__(self, sales_data: pd.DataFrame):
        """Class initialization. You can make necessary
        calculations here.

        Args:
            sales_data (pd.DataFrame): Sales dataset.

        Example:
            sales_data (pd.DataFrame):

                user_id  item_id  qty   price
            0        1      118    1   626.66
            1        1      285    1  1016.57
            2        2     1229    3   518.99
            3        4     1688    2   940.84
            4        5     2068    1   571.36
            ...

        """
        self._sales_data = sales_data.copy()

        self._user_count = []
        self._item_count = []

        self._user_map = []
        self._item_map = []

        self._matrix = []

    @property
    def user_count(self) -> int:
        """
        Returns:
            int: the number of users in sales_data.
        """
        self._user_count = len(self._sales_data['user_id'].unique())

        return self._user_count

    @property
    def item_count(self) -> int:
        """
        Returns:
            int: the number of items in sales_data.
        """
        self._item_count = len(self._sales_data['item_id'].unique())

        return self._item_count

    @property
    def user_map(self) -> Dict[int, int]:
        """Creates a mapping from user_id to matrix rows indexes.

        Example:
            sales_data (pd.DataFrame):

                user_id  item_id  qty   price
            0        1      118    1   626.66
            1        1      285    1  1016.57
            2        2     1229    3   518.99
            3        4     1688    2   940.84
            4        5     2068    1   571.36

            user_map (Dict[int, int]):
                {1: 0, 2: 1, 4: 2, 5: 3}
                
        Returns:
            Dict[int, int]: User map
        """

        self._user_map = dict(zip(list(self._sales_data['user_id'].unique()), list(
            self._sales_data.index)))

        return self._user_map

    @property
    def item_map(self) -> Dict[int, int]:
        """Creates a mapping from item_id to matrix rows indexes.

        Example:
            sales_data (pd.DataFrame):

                user_id  item_id  qty   price
            0        1      118    1   626.66
            1        1      285    1  1016.57
            2        2     1229    3   518.99
            3        4     1688    2   940.84
            4        5     2068    1   571.36

            item_map (Dict[int, int]):
                {118: 0, 285: 1, 1229: 2, 1688: 3, 2068: 4}

        Returns:
            Dict[int, int]: Item map
        """
        self._item_map = dict(zip(list(self._sales_data.sort_values(['item_id'])['item_id'].unique()), 
            list(self._sales_data.index)))

        return self._item_map

    @property
    def csr_matrix(self) -> csr_matrix:
        """User items matrix in form of CSR matrix.

        User row_ind, col_ind as
        rows and cols indecies(mapped from user/item map).

        Returns:
            csr_matrix: CSR matrix
        """

        # Преобразование словарей в списки индексов

        self._sales_data["user_id"] = self._sales_data["user_id"].map(
            self.user_map)
        self._sales_data["item_id"] = self._sales_data["item_id"].map(
            self.item_map)
        self._matrix = csr_matrix((self._sales_data['qty'].values, 
                        (self._sales_data["user_id"].values, self._sales_data["item_id"].values)))

        return self._matrix


In [12]:
next(zip([1,2],[1,2]))

In [23]:
list(dict(zip([1,2],[1,2])).keys())

[1, 2]

In [70]:
a = np.array([0,1,2,3,4,5,6,7,8])

In [72]:
a.reshape(3,3)

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [83]:
n = max(a), max(a) +1

In [85]:
len(a)

9

In [3]:
val = {1: 0, 0:0, 2: 1, 4: 2, 5: 3}

In [4]:
a = csr_matrix((np.array([3,4, 4,5,6]), 
                        (np.array(list(val.values())), np.array(list(val.keys())))),
         )

In [97]:
csr_matrix((np.array([3,4,5,6,7,8,9,9,9]), 
                        (np.array([0,0,1,2,3,5,5,6,7]), np.array([0,0,1,2,3,5,5,6,7]))),
                                  shape=(9, 9)))

  (0, 0)	7
  (1, 1)	5
  (2, 2)	6
  (3, 3)	7
  (5, 5)	17
  (6, 6)	9
  (7, 7)	9


In [16]:
from scipy.sparse import csr_matrix
import numpy as np


class Normalization:
    '''docstring'''
    @staticmethod
    def by_column(matrix: csr_matrix) -> csr_matrix:
        """Normalization b1_y column

        Args:
            matrix (csr_matrix): User-Item matrix of size (N, M)

        Returns:
            csr_matrix: Normalized matrix of size (N, M)
        """

        norm_matrix = csr_matrix(matrix.multiply(1/matrix.sum(0)))
        return norm_matrix

    @staticmethod
    def by_row(matrix: csr_matrix) -> csr_matrix:
        """Normalization b1_y row

        Args:
            matrix (csr_matrix): User-Item matrix of size (N, M)

        Returns:
            csr_matrix: Normalized matrix of size (N, M)
        """
        norm_matrix = csr_matrix(matrix.multiply(1/matrix.sum(1)))
        return norm_matrix

    @staticmethod
    def tf_idf(matrix: csr_matrix) -> csr_matrix:
        """Normalization using tf_-idf

        Args:
            matrix (csr_matrix): User-Item matrix of size (N, M)

        Returns:
            csr_matrix: Normalized matrix of size (N, M)
        """
        tf_ = csr_matrix(matrix.multiply(1/matrix.sum(1)))
        idf = csr_matrix(matrix.get_shape()[
                         0] / (matrix > 0).sum(0) - 1).log1p()
        norm_matrix = tf_.multiply(idf)

        return norm_matrix

    @staticmethod
    def bm_25(
        matrix: csr_matrix, k1_: float = 2.0, b1_: float = 0.75
    ) -> csr_matrix:
        """Normalization b1_ased on b1_M-25

        Args:
            matrix (csr_matrix): User-Item matrix of size (N, M)

        Returns:
            csr_matrix: Normalized matrix of size (N, M)
        """
        length_d = matrix.sum(1)
        tf_ = matrix.multiply(1 / length_d)
        idf = csr_matrix(
            (matrix.get_shape()[0]) / (matrix > 0).sum(0) - 1).log1p()
        avd_d = np.mean(length_d)

        delta = k1_*((1 - b1_) + b1_ * (length_d/avd_d))
        tf__inv = tf_.multiply(1 / delta).power(-1)
        tf__inv.data += 1
        norm_matrix = tf__inv.power(-1) * (k1_+1)
        norm_matrix = norm_matrix.multiply(idf)

        return norm_matrix


In [88]:
print(Normalization().by_row(a))

  (0, 0)	0.5714285714285714
  (0, 1)	0.42857142857142855
  (1, 2)	1.0
  (2, 4)	1.0
  (3, 5)	1.0


In [89]:
print(Normalization().by_column(a))

  (0, 0)	1.0
  (0, 1)	1.0
  (1, 2)	1.0
  (2, 4)	1.0
  (3, 5)	1.0


  norm_matrix = csr_matrix(matrix.multiply(1/matrix.sum(0)))


In [67]:
print(Normalization().tf_idf(a))

  (0, 1)	0.5941261547656673
  (0, 0)	0.7921682063542231
  (1, 2)	1.3862943611198906
  (2, 4)	1.3862943611198906
  (3, 5)	1.3862943611198906


  idf = csr_matrix(matrix.get_shape()[0] / (matrix > 0).sum(0) -1).log1p()


In [68]:
%%time
print(Normalization().bm_25(a))

  (0, 1)	0.6281150652213693
  (0, 0)	0.7973457763303945
  (1, 2)	1.6051829444546102
  (2, 4)	1.452308378316076
  (3, 5)	1.3260206932451128
CPU times: user 9.18 ms, sys: 92 µs, total: 9.27 ms
Wall time: 8.45 ms


  idf = csr_matrix((matrix.get_shape()[0]) / (matrix > 0).sum(0) -1).log1p()


In [51]:
data = pd.read_csv('data.csv')

In [52]:
data

Unnamed: 0,user_id,item_id,qty,price
0,1,12224,1,1183.19
1,1,14816,1,688.33
2,1,17872,1,696.20
3,1,23481,1,1934.48
4,2,1197,3,970.94
...,...,...,...,...
1209327,39860,19476,1,794.56
1209328,39860,20481,1,835.51
1209329,39860,26729,1,630.62
1209330,39860,27243,1,483.45


In [53]:
data = Normalization().bm_25(UserItemMatrix(data).csr_matrix)

In [54]:
data

<38114x27757 sparse matrix of type '<class 'numpy.float64'>'
	with 1209332 stored elements in Compressed Sparse Row format>

In [61]:
U.shape

(38114, 17)

In [63]:
import numpy as np
from scipy.sparse import csr_matrix
import pickle
from scipy import sparse, linalg, stats
from scipy.sparse.linalg import svds, aslinearoperator, LinearOperator


def items_embeddings(ui_matrix: csr_matrix, dim: int) -> np.ndarray:
    """Build items embedding using factorization model.
    The order of items should be the same in the output matrix.

    Args:
        ui_matrix (pd.DataFrame): User-Item matrix of size (N, M)
        dim (int): Dimention of embedding vectors

    Returns:
        np.ndarray: Items embeddings matrix of size (M, dim)
    """
    items_vec, S, V = svds(ui_matrix, k = dim)
    
    
    return items_vec


In [None]:
items_embeddings(data, 10)

In [45]:
with open('test.pkl','wb') as f:
    pickle.dump(items_embeddings(data, data.shape[0]), f)
