In [21]:
import os, sys, tempfile, logging, time
import csv
from datetime import date
from dscollaborative.recommender import UserItemRatingMatrix, ImplicitModel, Features, Cleaner, Duplicates
from dscollaborative.recommender import reranker_x_and_y, reranker_x_only
from dstools.utils import normalize_path, save_list_to_file, file_to_list
import json
import numpy as np
import pandas as pd
from pathlib import Path
from scipy import sparse
import scipy
import pickle
import random
import time

from sqlalchemy import create_engine

# The connection URL is read from the environment so that database credentials
# are never stored in (or committed with) the notebook. Example:
#   export RECO_DW_URL='postgresql://<user>:<password>@<host>:5432/unext_analytics_dw'
_prod_db_url = os.environ.get("RECO_DW_URL")
if not _prod_db_url:
    raise RuntimeError(
        "Environment variable RECO_DW_URL is not set; it must hold the "
        "SQLAlchemy URL of the analytics data warehouse.")
prod_engine = create_engine(_prod_db_url)

logging.basicConfig(level=logging.INFO)

In [10]:
# Load the mapping {character name: [SID, ...]} from a local pickle file.
# NOTE(review): pickle.load can execute arbitrary code — only load files from a trusted source.
with open("kids_character_mapping.pkl", "rb") as fp:   # unpickling (file is opened read-only, binary)
    character_sids_dict = pickle.load(fp)
character_sids_dict

{'12歳。〜ちっちゃなムネのトキメキ〜': ['SID0036686', 'SID0036687'],
 'B-伝説！バトルビーダマン': ['SID0003163'],
 'BLEACH': ['SID0003144'],
 'BORUTO': ['SID0029042', 'SID0053510'],
 'D.Gray-man': ['SID0022319'],
 'DAYS': ['SID0025580', 'SID0054283'],
 'DRAGON QUEST ダイの大冒険': ['SID0027069',
  'SID0044873',
  'SID0050918',
  'SID0054518',
  'SID0054519',
  'SID0054520'],
 'Dr.スランプ アラレちゃん': ['SID0013234', 'SID0013235', 'SID0013236', 'SID0013237'],
 'EDENS ZERO': ['SID0056092'],
 'FAIRY TAIL': ['SID0003079',
  'SID0029354',
  'SID0032234',
  'SID0037381',
  'SID0057458'],
 'H2': ['SID0042432'],
 'HUNTER×HUNTER': ['SID0002812', 'SID0002956', 'SID0011645'],
 'LEGO': ['SID0017681',
  'SID0017704',
  'SID0017986',
  'SID0022373',
  'SID0024363',
  'SID0025485',
  'SID0026085',
  'SID0026086',
  'SID0027621',
  'SID0027622',
  'SID0028869',
  'SID0029280',
  'SID0029843',
  'SID0029844',
  'SID0029967',
  'SID0032303',
  'SID0032352'],
 'Mrペンペン': ['SID0004900', 'SID0012549'],
 'NARUTO': ['SID0014715',
  'SID0016111',
  '

In [4]:
def batch(iterable, n=1):
    """Yield consecutive slices of ``iterable``, each holding at most ``n`` elements.

    ``iterable`` must support ``len()`` and slicing (list, str, array, ...).
    """
    size = len(iterable)
    yield from (iterable[start:min(start + n, size)] for start in range(0, size, n))

In [34]:
class Ranker:
    """Ranks and scores items for users with a pre-trained ``ImplicitModel``."""

    def __init__(self, model_path):
        """Instantiate an ``ImplicitModel`` and load the model stored at ``model_path``."""
        self.collab_model = ImplicitModel()
        self.collab_model.load_model(model_path)

    def _resolve_ids(self, target_users, target_items):
        """Return (user indices, item indices) for the targets kept by filter_users / filter_items."""
        matrix = self.collab_model.model.user_item_matrix
        user_ids = [matrix.user2id[uid] for uid in self.collab_model.filter_users(target_users)]
        item_ids = [matrix.item2id[sid] for sid in self.collab_model.filter_items(target_items)]
        return user_ids, item_ids

    def rank(self, target_users, target_items, N=200, filter_already_liked_items=True, batch_size=10000):
        """
        Generate, user by user, the target items ordered by the reranker.

        :param target_users: UID list, [uid1, uid2, ...]; users rejected by ``filter_users`` are dropped
        :param target_items: SID list  [sid1, sid2, ...]; items rejected by ``filter_items`` are dropped
        :param N: maximum number of recommendations kept per user
        :param filter_already_liked_items: when True, the users' rows of the user-item
            matrix are passed to the reranker together with ``filter_seen=True``
        :param batch_size: number of users handed to the reranker at once
        :yield: user_id, reco list [SID1, SID2, ...]

        Nothing is yielded when no user or no item survives the filtering.
        NOTE(review): the original docstring says ``None`` means "all users/items";
        that depends on ``filter_users`` / ``filter_items`` — confirm.
        NOTE(review): ``reranker_x`` is not among this notebook's imports (only
        ``reranker_x_and_y`` and ``reranker_x_only`` are) — confirm which one is intended.
        """
        user_ids, item_ids = self._resolve_ids(target_users, target_items)

        if len(user_ids) == 0 or len(item_ids) == 0:
            logging.debug("not filtered_users_ids or not item_vector")
            # bare return: inside a generator a returned value never reaches the caller
            return

        model = self.collab_model.model
        item_vector = model.item_factors.take(item_ids, axis=0)
        id2user = model.user_item_matrix.id2user
        # columns of the user-item matrix restricted to the target items (only needed when filtering)
        seen_matrix = model.user_item_matrix.matrix[:, item_ids] if filter_already_liked_items else None

        for sub_user_list in batch(user_ids, batch_size):
            sub_users = model.user_factors.take(sub_user_list, axis=0)
            if seen_matrix is not None:
                subset_users_seen_items = seen_matrix[sub_user_list, :].todense()
                ordered_indexes, _ = reranker_x(sub_users, item_vector, item_ids,
                                                subset_users_seen_items, filter_seen=True)
            else:
                ordered_indexes, _ = reranker_x(sub_users, item_vector, item_ids)

            for user_index, reco in zip(sub_user_list, self.collab_model.defactorize_sids(ordered_indexes)):
                yield id2user[user_index], list(reco)[:N]

    def get_users_in_model(self, target_users):
        """Return the users kept by ``filter_users`` and print the before -> after counts."""
        filtered_users = self.collab_model.filter_users(target_users)
        print(f"{len(target_users)} -> {len(filtered_users)}")
        return filtered_users

    def fet_score(self, target_users, target_items, batch_size=10000):
        """
        Generate, batch by batch, each user's mean score over the target items.

        The score of a (user, item) pair is the dot product of the user factor
        vector and the item factor vector; it is averaged over the kept items.

        :param target_users: UID list, [uid1, uid2, ...]; users rejected by ``filter_users`` are dropped
        :param target_items: SID list  [sid1, sid2, ...]; items rejected by ``filter_items`` are dropped
        :param batch_size: number of users scored per yielded array
        :yield: 1-D array with one mean score per user of the batch, in the order
            returned by ``filter_users``

        Nothing is yielded when no user or no item survives the filtering.
        """
        user_ids, item_ids = self._resolve_ids(target_users, target_items)

        if len(user_ids) == 0 or len(item_ids) == 0:
            logging.debug("not filtered_users_ids or not item_vector")
            # bare return: inside a generator a returned value never reaches the caller
            return

        item_vector = self.collab_model.model.item_factors.take(item_ids, axis=0)

        for sub_user_list in batch(user_ids, batch_size):
            user_vectors = self.collab_model.model.user_factors.take(sub_user_list, axis=0)
            scores = user_vectors.dot(item_vector.T)
            yield np.mean(scores, axis=1)


In [35]:
# Build the Ranker from the saved model file (Ranker.__init__ loads it from this path).
model = Ranker('../data/als_model.2021-08-17')

In [12]:
# Small hand-picked sample used to sanity-check the scoring logic step by step.
target_users = ['C0000001151', 'C0000002033', 'C0000002736', 'C0000003151', 'PM015288972']
# SIDs of one character, taken from the character -> SIDs mapping loaded earlier in the notebook.
target_items = character_sids_dict['NARUTO']
target_items[:5]

['SID0014715', 'SID0016111', 'SID0017723', 'SID0017724', 'SID0017725']

In [17]:
# Keep only the target SIDs accepted by the model, translate them to the model's
# item indices, and gather the matching rows of the item-factor matrix.
filtered_items = model.collab_model.filter_items(target_items)
filtered_items_ids = [
    model.collab_model.model.user_item_matrix.item2id[sid]
    for sid in filtered_items
]
item_vector = model.collab_model.model.item_factors.take(filtered_items_ids, axis=0)
item_vector.shape

(9, 128)

In [19]:
# Same procedure for users: keep the UIDs accepted by the model, translate them to
# the model's user indices, and gather the matching rows of the user-factor matrix.
filtered_users = model.collab_model.filter_users(target_users)
filtered_users_ids = [
    model.collab_model.model.user_item_matrix.user2id[uid]
    for uid in filtered_users
]
user_vectors = model.collab_model.model.user_factors.take(filtered_users_ids, axis=0)
user_vectors.shape

(3, 128)

In [22]:
# Dot product of every user vector with every item vector:
# one row per user, one column per item.
scores = user_vectors.dot(item_vector.T)
scores.shape

(3, 9)

In [23]:
# Show the raw user x item score matrix (small for this hand-picked sample).
scores

array([[ 1.86445281e-01,  1.94047570e-01,  1.18854560e-01,
         1.09066501e-01,  1.07480153e-01,  1.22063398e-01,
         1.11321300e-01,  1.62817270e-01,  2.28578866e-01],
       [-8.14333260e-02,  1.30084753e-02, -3.87715548e-03,
        -8.50763172e-03, -6.28020614e-03,  9.76845622e-05,
        -7.59238005e-03,  3.02970409e-03, -1.33772939e-02],
       [-9.09167081e-02,  1.96700752e-01, -9.37194973e-02,
        -1.11245431e-01, -1.02001935e-01, -1.08131357e-01,
        -1.19227931e-01, -1.06158778e-01,  2.36220136e-01]], dtype=float32)

In [25]:
# Average over the item axis to get a single score per user.
np.mean(scores, axis=1)

array([ 0.14896387, -0.01165913, -0.03316453], dtype=float32)

In [36]:
# Same computation through Ranker.fet_score; the SIDs come from the
# character -> SIDs mapping loaded earlier in the notebook.
list(model.fet_score(target_users, character_sids_dict['NARUTO']))

[array([ 0.07438   , -0.04667298, -0.03743152, ..., -0.02305913,
        -0.02837129, -0.06134889], dtype=float32)]

## batch processing

In [9]:
# Read the super-user UIDs, one per line; efficient_reading treats the first
# line as a header by default, so it is not part of the result.
target_users = [line.rstrip() for line in efficient_reading("../data/superusers.csv")]

target_users[:5]

['CM020027571', 'PM011378827', 'PM011697362', 'P0001491232', 'P0000687634']

In [25]:
# Keep only the users the model accepts (get_users_in_model prints the before -> after counts).
model_users = model.get_users_in_model(target_users)

47292 -> 4519


In [30]:
# Re-display the character -> SIDs mapping before scoring every character.
character_sids_dict

{'12歳。〜ちっちゃなムネのトキメキ〜': ['SID0036686', 'SID0036687'],
 'B-伝説！バトルビーダマン': ['SID0003163'],
 'BLEACH': ['SID0003144'],
 'BORUTO': ['SID0029042', 'SID0053510'],
 'D.Gray-man': ['SID0022319'],
 'DAYS': ['SID0025580', 'SID0054283'],
 'DRAGON QUEST ダイの大冒険': ['SID0027069',
  'SID0044873',
  'SID0050918',
  'SID0054518',
  'SID0054519',
  'SID0054520'],
 'Dr.スランプ アラレちゃん': ['SID0013234', 'SID0013235', 'SID0013236', 'SID0013237'],
 'EDENS ZERO': ['SID0056092'],
 'FAIRY TAIL': ['SID0003079',
  'SID0029354',
  'SID0032234',
  'SID0037381',
  'SID0057458'],
 'H2': ['SID0042432'],
 'HUNTER×HUNTER': ['SID0002812', 'SID0002956', 'SID0011645'],
 'LEGO': ['SID0017681',
  'SID0017704',
  'SID0017986',
  'SID0022373',
  'SID0024363',
  'SID0025485',
  'SID0026085',
  'SID0026086',
  'SID0027621',
  'SID0027622',
  'SID0028869',
  'SID0029280',
  'SID0029843',
  'SID0029844',
  'SID0029967',
  'SID0032303',
  'SID0032352'],
 'Mrペンペン': ['SID0004900', 'SID0012549'],
 'NARUTO': ['SID0014715',
  'SID0016111',
  '

In [73]:
# Mean model score of every in-model user for each character's SIDs.
# scored_characters[i] is the character behind character_scores[i], so rows stay
# identifiable even though characters yielding no scores are skipped.
character_scores = []
scored_characters = []
# have to make sure all users are in the model before
for character, sids in character_sids_dict.items():
    batch_scores = list(model.fet_score(model_users, sids))

    # skip empty SIDs (fet_score yields nothing when no SID is kept by the model)
    if not batch_scores:
        continue

    # One concatenation per character instead of one per batch; float64 matches
    # the dtype the batch-by-batch concatenation onto an empty array produced.
    scores = np.concatenate(batch_scores, axis=None).astype(np.float64)
    if scores.size != 0:
        character_scores.append(scores)
        scored_characters.append(character)

In [74]:
# Sanity check of the stacked result: one row per scored character, one column per user.
np.array(character_scores).shape

(281, 4519)

In [76]:
# Inspect the per-character score arrays.
character_scores

[array([-0.02291957, -0.04808552,  0.01807972, ..., -0.01291187,
        -0.01753542,  0.00531442]),
 array([-0.00323607,  0.00061185,  0.00897361, ..., -0.00174869,
        -0.00130691, -0.00336467]),
 array([ 0.14795324,  0.24278997,  0.10431772, ..., -0.15334444,
        -0.07732883,  0.1033435 ]),
 array([ 0.14144954,  0.11859411, -0.0991697 , ..., -0.11316235,
        -0.08406777,  0.03536111]),
 array([ 0.05630678,  0.04220729,  0.15841681, ...,  0.00235904,
         0.19473878, -0.05747926]),
 array([ 0.04164778, -0.01612507,  0.01779002, ...,  0.16850479,
         0.08747819,  0.00280699]),
 array([ 0.07329465,  0.16591452,  0.04662048, ...,  0.00481821,
        -0.05563949,  0.02577998]),
 array([ 0.08940393,  0.05354747, -0.04474488, ..., -0.02078481,
        -0.07258218,  0.15257408]),
 array([-0.04170677, -0.00411749, -0.06483145, ..., -0.01392621,
         0.01479657,  0.03489715]),
 array([-0.03610528,  0.16454944, -0.04985578, ...,  0.08997347,
         0.07892321,  0.00

In [12]:
# Scratch array used to compare np.stack with np.concatenate.
a = np.array([1,2,3])

In [16]:
# np.stack adds a new axis: two length-3 arrays stacked on axis=1 give shape (3, 2).
np.stack((a,a), axis=1).shape

(3, 2)

In [17]:
# axis=None flattens the inputs before joining them into one 1-D array.
np.concatenate((a, a), axis=None)

array([1, 2, 3, 1, 2, 3])

In [7]:
def efficient_reading(input_path, with_header=True, header_format=None):
    """
    Lazily yield the lines of a text file one at a time.

    :param input_path: path of the file to read
    :param with_header: when True, the first line is consumed as a header and not yielded
    :param header_format: if given (and with_header is True), the header with trailing
        whitespace stripped must equal this string
    :yield: every remaining line, trailing newline included
    :raises AssertionError: when the header differs from ``header_format``
    """
    with open(input_path, 'r') as r:
        if with_header:
            header = r.readline().rstrip()
            logging.debug(f"reading file whose format is  {header}")
            if header_format and header != header_format:
                # raised explicitly (not via `assert`) so the check survives `python -O`
                raise AssertionError(
                    f"Header Format is WRONG: expected {header_format!r}, got {header!r}")
        # iterating the file object reads line by line until end of file
        yield from r