In [39]:
import os
import sys

import time
from typing import Dict, TypeVar, Set, Optional, Any
from itertools import chain

import numpy as np
import pandas as pd
from implicit.als import AlternatingLeastSquares
from loguru import logger
from scipy.sparse import csr_matrix
from tqdm.auto import tqdm

In [325]:
pd.options.display.max_rows = 300

In [11]:
BASE_DIR = "/Users/artemvopilov/Programming/yandex_cup_2023"

In [255]:
DATA_DIR = f"{BASE_DIR}/data"

TRAIN_DF_PATH = f"{DATA_DIR}/train.csv"
TEST_DF_PATH = f"{DATA_DIR}/test.csv"

NORMED_EMBEDDINGS_DIR = f"{BASE_DIR}/normed_embeddings"
PCA_EMBEDDINGS_DIR = f"{BASE_DIR}/pca_embeddings"
VAE_EMBEDDINGS_DIR = f"{BASE_DIR}/vae_embeddings"
NORMED_LSTM_EMBEDDINGS_DIR = f"{BASE_DIR}/normed_lstm_embeddings"
VAE_LSTM_EMBEDDINGS_DIR = f"{BASE_DIR}/vae_lstm_embeddings"

### Read data

In [278]:
train_df = pd.read_csv(TRAIN_DF_PATH)
test_df = pd.read_csv(TEST_DF_PATH)

In [279]:
track_id_to_embedding = {}
for fn in tqdm(os.listdir(PCA_EMBEDDINGS_DIR)):
    fp = f"{PCA_EMBEDDINGS_DIR}/{fn}"

    track_id = fn.split('.')[0]
    embedding = np.mean(np.load(fp).astype(np.float32), axis=0)
    # embedding = np.load(fp).astype(np.float32)[-1]
    track_id_to_embedding[int(track_id)] = embedding

  0%|          | 0/76714 [00:00<?, ?it/s]

### Als trainer

In [101]:
T_id_type = TypeVar('T_id_type', int, str)


class AlsTrainer:
    _params: Dict[str, Any]
    _init_user_embeddings: Optional[Dict]
    _init_item_embeddings: Optional[Dict]
    _als: AlternatingLeastSquares
    _buyeruid_to_index: Dict[int, int]
    _item_id_to_index: Dict[str, int]

    def __init__(self, params: Dict[str, Any], init_user_embeds: Optional[Dict], init_item_embeds: Optional[Dict]) -> None:
        self._params = params
        self._init_user_embeddings = init_user_embeds
        self._init_item_embeddings = init_item_embeds
        self.reset()

    @property
    def params(self) -> Dict[str, Any]:
        return self._params

    @property
    def init_user_embeddings(self) -> Optional[Dict]:
        return self._init_user_embeddings

    @property
    def init_item_embeddings(self) -> Optional[Dict]:
        return self._init_item_embeddings

    def reset(self) -> None:
        self._reset()
        if self.is_user_step():
            self._set_item_embeddings()
        elif self.is_item_step():
            self._set_user_embeddings()

    def train(self, dataset: pd.DataFrame) -> None:
        if self.is_user_step():
            self.user_step(dataset)
        elif self.is_item_step():
            self.item_step(dataset)
        else:
            self.full_train(dataset)

    def full_train(self, dataset: pd.DataFrame) -> None:
        start_time = time.time()
        train_buyeruid_to_index = self._index(set(dataset['buyeruid']), self._buyeruid_to_index)
        train_item_id_to_index = self._index(set(dataset['item_id']), self._item_id_to_index)
        self._buyeruid_to_index = {**self._buyeruid_to_index, **train_buyeruid_to_index}
        self._item_id_to_index = {**self._item_id_to_index, **train_item_id_to_index}
        user_item_matrix = self._build_interactions_matrix(dataset, self._buyeruid_to_index, self._item_id_to_index)
        self._als.fit(user_item_matrix)
        logger.info(f'Training took {(time.time() - start_time) / 60} minutes, '
                    f'users: {len(self.get_buyeruids())}, items: {len(self.get_item_ids())}')

    def user_step(self, dataset: pd.DataFrame) -> None:
        start_time = time.time()
        step_buyeruid_to_index = self._index(set(dataset['buyeruid']), self._buyeruid_to_index)
        self._buyeruid_to_index = {**self._buyeruid_to_index, **step_buyeruid_to_index}

        if self._als.user_factors is None:
            self._als.user_factors = np.zeros((len(step_buyeruid_to_index), self._als.factors), dtype=self._als.dtype)
        step_user_item_matrix = self._build_interactions_matrix(dataset, self._buyeruid_to_index, self._item_id_to_index)
        step_user_indices = list(step_buyeruid_to_index.values())
        self._als.partial_fit_users(step_user_indices, step_user_item_matrix[step_user_indices])
        self._als._XtX = None
        logger.info(f'User step took {(time.time() - start_time) / 60} minutes, '
                    f'users: {len(self.get_buyeruids())}, items: {len(self.get_item_ids())}')

    def item_step(self, dataset: pd.DataFrame) -> None:
        start_time = time.time()
        step_item_id_to_index = self._index(set(dataset['item_id']), self._item_id_to_index)
        self._item_id_to_index = {**self._item_id_to_index, **step_item_id_to_index}
        if self._als.item_factors is None:
            self._als.item_factors = np.zeros((len(step_item_id_to_index), self._als.factors), dtype=self._als.dtype)
        step_item_user_matrix = self._build_interactions_matrix(dataset, self._buyeruid_to_index, self._item_id_to_index).T.tocsr()
        step_item_indices = list(step_item_id_to_index.values())
        self._als.partial_fit_items(step_item_indices, step_item_user_matrix[step_item_indices])
        self._als._YtY = None
        logger.info(f'Item step took {(time.time() - start_time) / 60} minutes, '
                    f'users: {len(self.get_buyeruids())}, items: {len(self.get_item_ids())}')


    def user_inference(self, dataset: pd.DataFrame) -> Dict[int, np.ndarray]:
        start_time = time.time()
        buyeruid_to_index = self._index(set(dataset['buyeruid']))
        user_item_matrix = self._build_interactions_matrix(dataset, buyeruid_to_index, self._item_id_to_index)
        buyeruids = list(buyeruid_to_index.keys())
        user_indices = [buyeruid_to_index[buyeruid] for buyeruid in buyeruids]
        user_embeddings = self._als.recalculate_user(user_indices, user_item_matrix[user_indices])
        buyeruid_to_embedding = dict(zip(buyeruids, user_embeddings.tolist()))
        logger.info(f'Inference took {(time.time() - start_time) / 60} minutes, '
                    f'users: {len(buyeruid_to_embedding)}')
        return buyeruid_to_embedding

    def is_user_step(self) -> bool:
        return self._init_item_embeddings is not None

    def is_item_step(self) -> bool:
        return self._init_user_embeddings is not None

    def get_buyeruids(self) -> Set[int]:
        return set(self._buyeruid_to_index.keys())

    def get_item_ids(self) -> Set[str]:
        return set(self._item_id_to_index.keys())

    def get_user_embeddings(self, buyeruids: Set[int] = None) -> Dict[int, np.ndarray]:
        buyeruids = buyeruids & self.get_buyeruids() if buyeruids is not None else self.get_buyeruids()
        return {buyeruid: self.get_user_embedding(buyeruid) for buyeruid in buyeruids}

    def get_user_embedding(self, buyeruid: int) -> np.ndarray:
        return self._als.user_factors[self._buyeruid_to_index[buyeruid]]

    def get_item_embeddings(self, item_ids: Set[str] = None) -> Dict[str, np.ndarray]:
        item_ids = item_ids & self.get_item_ids() if item_ids is not None else self.get_item_ids()
        return {item_id: self.get_item_embedding(item_id) for item_id in item_ids}

    def get_item_embedding(self, item_id: str) -> np.ndarray:
        return self._als.item_factors[self._item_id_to_index[item_id]]

    def _reset(self) -> None:
        self._als = AlternatingLeastSquares(**self._params)
        self._buyeruid_to_index = {}
        self._item_id_to_index = {}
        logger.info('Als reset')

    def _set_user_embeddings(self) -> None:
        self._buyeruid_to_index = self._index(set(self._init_user_embeddings.keys()))
        user_embeddings = self._build_embeddings_matrix(self._buyeruid_to_index, self._init_user_embeddings)
        self._als.user_factors = user_embeddings
        logger.info('User embeddings set')

    def _set_item_embeddings(self) -> None:
        self._item_id_to_index = self._index(set(self._init_item_embeddings.keys()))
        item_embeddings = self._build_embeddings_matrix(self._item_id_to_index, self._init_item_embeddings)
        self._als.item_factors = item_embeddings
        logger.info('Item embeddings set')

    def _build_embeddings_matrix(
            self,
            id_to_index: Dict[T_id_type, int],
            id_to_embeddings: Dict
    ) -> np.ndarray:
        embeddings = np.zeros((len(id_to_index), self._als.factors), dtype=self._als.dtype)
        for id_, index in tqdm(id_to_index.items()):
            embedding = np.array(id_to_embeddings[id_], dtype=self._als.dtype)
            embeddings[index, :] = embedding
        return embeddings

    @staticmethod
    def _build_interactions_matrix(
            dataset: pd.DataFrame,
            buyeruid_to_index: Dict[int, int],
            item_id_to_index: Dict[str, int]
    ) -> csr_matrix:
        targets = dataset['target']
        users_indices = dataset['buyeruid'].map(buyeruid_to_index).tolist()
        item_indices = dataset['item_id'].map(item_id_to_index).tolist()
        return csr_matrix((targets, (users_indices, item_indices)))

    @staticmethod
    def _index(ids: Set[T_id_type], id_to_index: Dict[T_id_type, int] = None) -> Dict[T_id_type, int]:
        if id_to_index is None:
            id_to_index = {}
        new_id_to_index = {}
        for id_ in ids:
            index = id_to_index.get(id_, len(id_to_index) + len(new_id_to_index))
            new_id_to_index[id_] = index
        return new_id_to_index

### Dataset

In [24]:
class AlsTargetLimits:
    DEFAULT_MIN: int = 0
    DEFAULT_MAX: int = sys.maxsize

    neg_min: int
    neg_max: int
    pos_min: int
    pos_max: int

    def __init__(
            self,
            neg_min: int = DEFAULT_MIN,
            neg_max: int = DEFAULT_MAX,
            pos_min: int = DEFAULT_MIN,
            pos_max: int = DEFAULT_MAX
    ) -> None:
        self.neg_min = neg_min
        self.neg_max = neg_max
        self.pos_min = pos_min
        self.pos_max = pos_max

In [25]:
class AlsDatasetParams:
    target_implicit: bool
    target_weighted: bool
    user_target_limits: AlsTargetLimits
    item_target_limits: AlsTargetLimits
    inference_target_limits: AlsTargetLimits

    def __init__(
            self,
            target_implicit: bool,
            target_weighted: bool,
            user_target_limits: AlsTargetLimits,
            item_target_limits: AlsTargetLimits,
            inference_target_limits: AlsTargetLimits
    ) -> None:
        self.target_implicit = target_implicit
        self.target_weighted = target_weighted
        self.user_target_limits = user_target_limits
        self.item_target_limits = item_target_limits
        self.inference_target_limits = inference_target_limits            

In [27]:
class AlsDatasetBuilder:
    _params: AlsDatasetParams

    def __init__(self, params: AlsDatasetParams) -> None:
        self._params = params

    def build_train(self, dataset: pd.DataFrame, als_trainer: AlsTrainer) -> pd.DataFrame:
        if als_trainer.is_user_step():
            return self.build_user_step(dataset, als_trainer.get_item_ids())
        elif als_trainer.is_item_step():
            return self.build_item_step(dataset, als_trainer.get_buyeruids())
        else:
            return self.build_full_train(dataset)

    def build_full_train(self, dataset: pd.DataFrame) -> pd.DataFrame:
        als_dataset = self._extract_user_item_targets(dataset)
        als_dataset = self._filter_by_target_limits(als_dataset, 'item_id', self._params.item_target_limits)
        als_dataset = self._filter_by_target_limits(als_dataset, 'buyeruid', self._params.user_target_limits)
        logger.info(f'Built als train dataset, size: {len(als_dataset)}')
        return als_dataset

    def build_user_step(self, dataset: pd.DataFrame, item_ids: Set[str]) -> pd.DataFrame:
        als_dataset = self._extract_user_item_targets(dataset)
        als_dataset = self._filter_by_ids(als_dataset, 'item_id', item_ids)
        als_dataset = self._filter_by_target_limits(als_dataset, 'buyeruid', self._params.user_target_limits)
        logger.info(f'Built user step dataset, size: {len(als_dataset)}')
        return als_dataset

    def build_item_step(self, dataset: pd.DataFrame, buyeruids: Set[int]) -> pd.DataFrame:
        als_dataset = self._extract_user_item_targets(dataset)
        als_dataset = self._filter_by_ids(als_dataset, 'buyeruid', buyeruids)
        als_dataset = self._filter_by_target_limits(als_dataset, 'item_id', self._params.item_target_limits)
        logger.info(f'Built item step dataset, size: {len(als_dataset)}')
        return als_dataset

    def build_inference(self, dataset: pd.DataFrame, item_ids: Set[str], buyeruids: Set[int] = None) -> pd.DataFrame:
        als_dataset = self._extract_user_item_targets(dataset)
        als_dataset = self._filter_by_ids(als_dataset, 'item_id', item_ids)
        if buyeruids:
            als_dataset = self._filter_by_ids(als_dataset, 'buyeruid', buyeruids)
        als_dataset = self._filter_by_target_limits(als_dataset, 'buyeruid', self._params.inference_target_limits)
        logger.info(f'Built inference dataset, size: {len(als_dataset)}')
        return als_dataset

    def _extract_user_item_targets(self, dataset: pd.DataFrame) -> pd.DataFrame:
        if not self._params.target_implicit:
            dataset = dataset.drop(dataset[dataset['target'] == 0].index)
        user_item_targets = dataset.groupby(['buyeruid', 'item_id'], as_index=False).agg({'target': 'sum'})
        if not self._params.target_implicit:
            user_item_targets.drop(user_item_targets[user_item_targets['target'] == 0].index, inplace=True)
        else:
            user_item_targets['target'] = user_item_targets['target'].map(lambda t: -1 if t == 0 else t)
        if not self._params.target_weighted:
            user_item_targets['target'] = np.sign(user_item_targets['target'])
        return user_item_targets

    def _filter_by_target_limits(self, dataset: pd.DataFrame, id_name: str, limits: AlsTargetLimits) -> pd.DataFrame:
        target_counts = self._calculate_target_counts(dataset, id_name)
        filtered_ids = target_counts[(
                (target_counts['negative_count'] >= limits.neg_min)
                & (target_counts['negative_count'] <= limits.neg_max)
                & (target_counts['positive_count'] >= limits.pos_min)
                & (target_counts['positive_count'] <= limits.pos_max))].index
        return dataset[dataset[id_name].isin(filtered_ids)]

    @staticmethod
    def _filter_by_ids(dataset: pd.DataFrame, id_name: str, ids: Set[T_id_type]) -> pd.DataFrame:
        return dataset[dataset[id_name].isin(ids)]

    @staticmethod
    def _calculate_target_counts(dataset: pd.DataFrame, id_name: str) -> pd.DataFrame:
        dataset = dataset.assign(target_is_positive=dataset['target'] > 0)
        target_counts = dataset.groupby(id_name).agg(
            positive_count=('target_is_positive', 'sum'),
            count=('target_is_positive', 'count'))
        target_counts['negative_count'] = target_counts['count'] - target_counts['positive_count']
        return target_counts

### Prepare data

In [38]:
train_df.head()

Unnamed: 0,track,tags
0,49734,56926325596
1,67845,692839145155
2,25302,62840116168
3,57796,28186
4,13676,623177


In [157]:
tags = [list(map(int, t.split(','))) for t in train_df['tags'].values]
tags

[[5, 6, 9, 26, 32, 55, 96],
 [6, 9, 28, 39, 145, 155],
 [0, 6, 28, 40, 116, 168],
 [28, 186],
 [6, 23, 177],
 [43, 183, 252],
 [0, 10, 48],
 [35, 112, 191],
 [0, 16],
 [6, 9, 32, 85, 122],
 [6, 145, 187, 241],
 [0, 8, 40, 248],
 [0, 1, 8, 12, 13],
 [0, 7, 8, 38, 80],
 [6, 145, 241],
 [1, 5, 15, 35, 64, 70, 99, 165],
 [1, 5, 104, 172],
 [0, 2, 8, 32, 51],
 [1, 15, 25, 71, 92, 99],
 [0, 8, 30, 51],
 [3, 35, 55, 73, 112, 146, 198],
 [6, 122],
 [0, 1, 2, 8, 128],
 [6, 145, 170],
 [6, 215],
 [0, 80, 100, 156],
 [0, 4, 7, 16, 88],
 [0, 2, 8, 9, 24, 40, 141],
 [0, 4, 7, 8],
 [0, 7, 57],
 [6, 215],
 [6, 158],
 [2, 6, 9, 26, 32, 47, 103, 117, 151],
 [0, 28, 182],
 [0, 5, 8, 10, 80],
 [0, 2, 8, 51],
 [9, 45, 47],
 [1, 5, 45, 75, 119],
 [9, 28, 35, 112],
 [6, 9, 81, 117, 175, 187, 241],
 [0, 1, 43, 98, 225],
 [0, 57, 63, 142],
 [0, 10, 12, 97, 100, 132],
 [35, 143, 244],
 [1, 2, 13, 196],
 [0, 10, 73],
 [0, 3, 22, 31, 40, 84],
 [9, 32, 85, 155],
 [9, 47],
 [9, 23, 134, 169],
 [5, 89, 104, 164],
 

In [158]:
unique_tags = sorted(list(set(chain.from_iterable(tags))))
unique_tags

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [159]:
len(unique_tags)

256

In [160]:
tracks = sorted(train_df['track'])
tracks

[0,
 2,
 3,
 4,
 5,
 6,
 8,
 11,
 12,
 13,
 14,
 17,
 18,
 19,
 21,
 22,
 25,
 26,
 27,
 28,
 29,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 58,
 59,
 62,
 63,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 73,
 74,
 75,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 85,
 87,
 89,
 94,
 96,
 97,
 99,
 102,
 103,
 104,
 105,
 108,
 109,
 110,
 112,
 114,
 115,
 118,
 119,
 120,
 122,
 125,
 126,
 128,
 129,
 131,
 132,
 133,
 136,
 139,
 140,
 144,
 147,
 149,
 150,
 152,
 154,
 155,
 156,
 157,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 167,
 170,
 172,
 173,
 174,
 176,
 177,
 182,
 183,
 184,
 185,
 186,
 187,
 188,
 189,
 190,
 191,
 192,
 193,
 195,
 196,
 203,
 204,
 205,
 206,
 207,
 208,
 209,
 210,
 211,
 212,
 214,
 215,
 220,
 221,
 222,
 224,
 228,
 229,
 230,
 233,
 234,
 235,
 236,
 237,
 239,
 242,
 245,
 246,
 248,
 249,
 250,
 252,
 254,
 256,
 258,
 260,
 261,
 262,
 264,
 266,
 268,
 271,
 273,
 274,
 275,
 276,
 277,
 278,
 279,
 280,
 281,


In [161]:
len(tracks)

51134

In [162]:
als_events = pd.DataFrame({
    'buyeruid': np.repeat(tracks, len(unique_tags)),
    'item_id': np.tile(unique_tags, len(tracks))
})

In [163]:
als_events.head()

Unnamed: 0,buyeruid,item_id
0,0,0
1,0,1
2,0,2
3,0,3
4,0,4


In [164]:
track_to_tags = {tr: set(map(int, t.split(','))) for tr, t in zip(train_df['track'].values, train_df['tags'].values)}
track_to_tags

{49734: {5, 6, 9, 26, 32, 55, 96},
 67845: {6, 9, 28, 39, 145, 155},
 25302: {0, 6, 28, 40, 116, 168},
 57796: {28, 186},
 13676: {6, 23, 177},
 29968: {43, 183, 252},
 38652: {0, 10, 48},
 23887: {35, 112, 191},
 44661: {0, 16},
 26449: {6, 9, 32, 85, 122},
 16511: {6, 145, 187, 241},
 32609: {0, 8, 40, 248},
 43932: {0, 1, 8, 12, 13},
 13941: {0, 7, 8, 38, 80},
 20065: {6, 145, 241},
 53370: {1, 5, 15, 35, 64, 70, 99, 165},
 62174: {1, 5, 104, 172},
 52322: {0, 2, 8, 32, 51},
 41853: {1, 15, 25, 71, 92, 99},
 58614: {0, 8, 30, 51},
 22115: {3, 35, 55, 73, 112, 146, 198},
 34257: {6, 122},
 63054: {0, 1, 2, 8, 128},
 850: {6, 145, 170},
 2980: {6, 215},
 31505: {0, 80, 100, 156},
 27354: {0, 4, 7, 16, 88},
 38840: {0, 2, 8, 9, 24, 40, 141},
 71885: {0, 4, 7, 8},
 7290: {0, 7, 57},
 5201: {6, 215},
 22749: {6, 158},
 23811: {2, 6, 9, 26, 32, 47, 103, 117, 151},
 2248: {0, 28, 182},
 57495: {0, 5, 8, 10, 80},
 36136: {0, 2, 8, 51},
 45537: {9, 45, 47},
 42745: {1, 5, 45, 75, 119},
 1621

In [165]:
%%time

targets = als_events.apply(lambda row: 1 if row['item_id'] in track_to_tags[row['buyeruid']] else -1, axis=1).astype(int)

CPU times: user 2min 4s, sys: 2.51 s, total: 2min 7s
Wall time: 2min 8s


In [166]:
targets

0          -1
1           1
2          -1
3          -1
4          -1
           ..
13090299   -1
13090300   -1
13090301   -1
13090302   -1
13090303   -1
Length: 13090304, dtype: int64

In [167]:
als_events['target'] = targets

In [168]:
als_events.shape

(13090304, 3)

In [169]:
als_events.head()

Unnamed: 0,buyeruid,item_id,target
0,0,0,-1
1,0,1,1
2,0,2,-1
3,0,3,-1
4,0,4,-1


In [170]:
als_events.dtypes

buyeruid    int64
item_id     int64
target      int64
dtype: object

In [171]:
als_events = als_events.astype({'item_id': str})

In [172]:
als_events.dtypes

buyeruid     int64
item_id     object
target       int64
dtype: object

### Train

In [259]:
%env OPENBLAS_NUM_THREADS 1

env: OPENBLAS_NUM_THREADS=1


In [299]:
als_params = {
    'factors': 32,
    'regularization': 0.1,
    'alpha': 1.0,
    'iterations': 1
}

In [300]:
als_trainer = AlsTrainer(params=als_params, init_user_embeds=track_id_to_embedding, init_item_embeds=None)

[32m2023-11-12 13:14:54.885[0m | [1mINFO    [0m | [36m__main__[0m:[36m_reset[0m:[36m126[0m - [1mAls reset[0m


  0%|          | 0/76714 [00:00<?, ?it/s]

[32m2023-11-12 13:14:55.143[0m | [1mINFO    [0m | [36m__main__[0m:[36m_set_user_embeddings[0m:[36m132[0m - [1mUser embeddings set[0m


In [301]:
als_dataset_params = AlsDatasetParams(
    target_implicit=False, 
    target_weighted=False,
    user_target_limits=AlsTargetLimits(),
    item_target_limits=AlsTargetLimits(),
    inference_target_limits=AlsTargetLimits()
)

In [302]:
als_dataset_builder = AlsDatasetBuilder(params=als_dataset_params)

In [303]:
als_dataset = als_dataset_builder.build_train(als_events, als_trainer)

[32m2023-11-12 13:15:02.943[0m | [1mINFO    [0m | [36m__main__[0m:[36mbuild_item_step[0m:[36m33[0m - [1mBuilt item step dataset, size: 13090304[0m


In [304]:
als_dataset.head()

Unnamed: 0,buyeruid,item_id,target
0,0,0,-1
1,0,1,1
2,0,10,-1
3,0,100,-1
4,0,101,-1


In [305]:
als_dataset.shape

(13090304, 3)

In [306]:
als_trainer.train(als_dataset)

[32m2023-11-12 13:15:11.689[0m | [1mINFO    [0m | [36m__main__[0m:[36mitem_step[0m:[36m80[0m - [1mItem step took 0.14512253204981487 minutes, users: 76714, items: 256[0m


### Inference

In [307]:
len(track_id_to_embedding)

76714

In [308]:
all_tracks = list(track_id_to_embedding.keys())

In [309]:
tag_embeddings = als_trainer.get_item_embeddings()
len(tag_embeddings)

256

In [310]:
track_embeddings = np.array([track_id_to_embedding[t] for t in all_tracks])
tag_embeddings = np.array([tag_embeddings[str(t)] for t in unique_tags])

In [311]:
predictions = track_embeddings.dot(tag_embeddings.T)

In [312]:
predictions.shape

(76714, 256)

### Save predictions

In [313]:
predictions_df = pd.DataFrame([
    {'track': track, 'prediction': ','.join([str(p) for p in probs])}
    for track, probs in 
    zip(all_tracks, predictions)
])

In [314]:
predictions_df.head()

Unnamed: 0,track,prediction
0,531,"0.08433442,0.1453033,-0.03661689,-0.06968812,0..."
1,33632,"-0.085686214,0.15694469,-0.02607475,1.1257827e..."
2,75667,"-0.24536896,-0.021887776,0.013294909,-0.009623..."
3,65474,"0.3186001,-0.042323507,-0.031489726,-0.0449816..."
4,23421,"-0.30761883,0.38376418,-0.07070528,-0.07843117..."


In [315]:
predictions_df.shape

(76714, 2)

In [316]:
predictions_df.to_csv('prediction_pca_als_2.csv', index=False)

In [254]:
# predictions_df.to_csv('prediction_vae_als.csv', index=False)

In [277]:
# predictions_df.to_csv('prediction_normder_lstm_als.csv', index=False)