In [1]:
from google.cloud import storage
from google.oauth2 import service_account
import gensim
from gensim.parsing.preprocessing import preprocess_string
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel

import pickle

import math
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
import os

import random
from datetime import datetime
from time import time
import scipy.sparse as sp

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from box import Box

# import warnings
# warnings.filterwarnings(action='ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
config = {
    ###### TopicModeling  ######
    'num_topics':20,
    'random_state':42,
    'passes':10,
    
    ###### lightgcn  ######
    'num_epochs' : 150,
    "reg" : 1e-5,
    'lr' : 0.0001,
    "emb_dim" : 20,
    "n_layers" : 3,
    'batch_size' : 50,
    "node_dropout" : 0.2,
    'valid_samples' : 2, # 검증에 사용할 sample 수
    'seed' : 22,
    'n_batch' : 10,
}

device = 'cuda' if torch.cuda.is_available() else 'cpu'
config = Box(config)

## Preparing data

In [3]:
def preprocess(df):
    df = df[df['uri_first']==1]
    df['timestamp']=pd.to_datetime(df['local_time']).astype(int)//10**9
    df = df[['hashed_ip', 'products', 'timestamp']]

    df['user']=df['hashed_ip']
    df['item']=df['products']
    df['time']=df['timestamp']

    df.sort_values(['user', 'timestamp'])

    del df['hashed_ip'], df['products'], df['timestamp']
    user_interaction_counts = df['user'].value_counts()
    selected_users = user_interaction_counts[user_interaction_counts >= 5].index
    df = df[df['user'].isin(selected_users)]

    return df

In [4]:
def load_data(): 
    # LOAD ITEM2IDX PICKLE
    
    SERVICE_ACCOUNT_FILE = "/home/user/level3-416207-893f91c9529e (1).json"
    credentials = service_account.Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE)
    project_id = "level3-416207"
    storage_client = storage.Client(credentials=credentials, project=project_id)
    bucket_name = 'crwalnoti'
    bucket = storage_client.bucket(bucket_name)

    item2idx_name = '240320/item_to_idx.pickle'
    inter_name = '240320/inter_240129.csv'

    # prepare item2idx
    blob_item2idx = bucket.blob(item2idx_name)
    with blob_item2idx.open(mode='rb') as f:
        item2idx = pickle.load(f)
    
    # prepare interaction_df
    blob_inter = bucket.blob(inter_name)
    with blob_inter.open(mode='rb') as f:
        interaction_df = pd.read_csv(f)
        
    interaction_df = preprocess(interaction_df)

    return item2idx, interaction_df

## TopicModeling_DataSet

In [5]:
class MakeTMDataSet():
    def __init__(self):
        self.item2idx , self.df = load_data()
        
        self.df["item_idx"] = self.df["item"].map(self.item2idx)   
        self.df['item_name'] = self.df['item'].map(self.item2name())
        
        # inter_dict & df user 순서 주의
        self.inter_dict = self.df.groupby('user', sort=False)['item_name'].apply(set).apply(list).to_dict()
        self.user_ids = list(self.inter_dict.keys())
        self.user2idx = {user_id: index for index, user_id in enumerate(self.user_ids)}
        
        self.df["user_idx"] = self.df["user"].map(self.user2idx)
        
        self.num_item, self.num_user = len(self.item2idx), len(self.user2idx)
        
        self.dictionary, self.corpus = self.TM_traindata()
        
    def item2name(self):
        with open('/home/user/pickle/product_info_df.pickle', 'rb') as fr:
            product_info = pickle.load(fr)
            
        product_data = product_info.copy()
        product_data['title'] = product_data['title'].map(lambda x: x.replace("'",'').replace(',','').replace('(', ' ').replace(')', ' '))
        product_data['title'] = product_data['title'].map(lambda x: x.lower())
        product_data['title'] = product_data['title'].map(lambda x: x.split(' '))
        product_data['title'] = product_data['title'].map(lambda x: ' '.join(x).split())
        product_data['title'] = product_data['title'].map(lambda x: ' '.join(x))
        
        dict_products = product_data[['id','title']].set_index('id').to_dict()['title']
        
        return dict_products
    
    def TM_traindata(self):
        documents = list(self.inter_dict.values())
        dictionary = Dictionary(documents)
        corpus = [dictionary.doc2bow(document) for document in documents]
        return dictionary, corpus
    
    def get_dictionary(self):
        return self.dictionary
    
    def get_corpus(self):
        return self.corpus
 

## TRAIN TM

In [6]:
TM_dataset = MakeTMDataSet()
dictionary = TM_dataset.get_dictionary()
corpus = TM_dataset.get_corpus()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['timestamp']=pd.to_datetime(df['local_time']).astype(int)//10**9


In [7]:
# 빈 리스트인 value가 존재하는지 확인
empty_values = {user_id: item_list for user_id, item_list in TM_dataset.inter_dict.items() if len(item_list) == 0}
print(f"len(empty_values): {len(empty_values)}")

len(empty_values): 0


In [8]:
lda_model=LdaModel(corpus=corpus, id2word=dictionary, 
                   num_topics=config.num_topics, 
                   random_state=config.random_state, 
                   passes=config.passes )

##### Check User Topic Vector

In [None]:
user_topic_vectors = [lda_model.get_document_topics(bow) for bow in corpus]
user_topic_vectors

In [None]:
# 임의 user vector 확인
doc_id = 153  # 사용자 ID
doc_bow = corpus[doc_id]
doc_topics = lda_model.get_document_topics(doc_bow, minimum_probability=0)

print(f"Document #{doc_id} Topics:")
for topic, prob in doc_topics:
    print(f"Topic {topic}: {prob}")

## Topic Modeling's VECTOR --> LightGCN

In [12]:
class MakeLightGCNDataSet():
    def __init__(self, TM_dataset, lda_model, config):
        self.config = config
        self.TM_dataset = TM_dataset
        self.lda_model = lda_model
        
        self.df = self.TM_dataset.df
        self.user2idx = self.TM_dataset.user2idx
        self.item2idx = self.TM_dataset.item2idx
        self.num_user, self.num_item = self.TM_dataset.num_user, self.TM_dataset.num_item
        
        self.exist_users = [i for i in range(self.num_user)]
        self.exist_items = [i for i in range(self.num_item)]
        
        self.user_train, self.user_valid = self.generate_sequence_data()
        self.R_train, self.R_valid, self.R_total = self.generate_dok_matrix()
        self.ngcf_adj_matrix = self.generate_ngcf_adj_matrix()
        
        self.user_topic_tensor = self.get_TM_user_vector()
        
        self.n_train = len(self.R_train)
        self.batch_size = self.config.batch_size
        
    def generate_sequence_data(self) -> dict:
        """
        split train/valid
        중복 허용
        """
        users = defaultdict(list)
        user_train = {}
        user_valid = {}
        for user, item, time in zip(self.df['user_idx'], self.df['item_idx'], self.df['time']):
            users[user].append(item)
        
        for user in users:
            np.random.seed(self.config.seed)
            user_total = users[user]
            valid_indices = random.sample(range(len(user_total)), 2)
            valid = [user_total[idx] for idx in valid_indices]
            train = [user_total[idx] for idx in range(len(user_total)) if idx not in valid_indices]
            user_train[user] = train
            user_valid[user] = valid
        
        return user_train, user_valid
    
    def generate_dok_matrix(self):
        R_train = sp.dok_matrix((self.num_user, self.num_item), dtype=np.float32)
        R_valid = sp.dok_matrix((self.num_user, self.num_item), dtype=np.float32)
        R_total = sp.dok_matrix((self.num_user, self.num_item), dtype=np.float32)
        user_list = self.exist_users   # user2idx에 있는 value값
        for user in user_list:
            train_items = self.user_train[user]
            valid_items = self.user_valid[user]
            
            for train_item in train_items:
                R_train[user, train_item] = 1.0
                R_total[user, train_item] = 1.0
            
            for valid_item in valid_items:
                R_valid[user, valid_item] = 1.0
                R_total[user, valid_item] = 1.0
        
        return R_train, R_valid, R_total

    def generate_ngcf_adj_matrix(self):
        adj_mat = sp.dok_matrix((self.num_user + self.num_item, self.num_user + self.num_item), dtype=np.float32)
        adj_mat = adj_mat.tolil() # to_list
        R = self.R_train.tolil()

        adj_mat[:self.num_user, self.num_user:] = R
        adj_mat[self.num_user:, :self.num_user] = R.T
        adj_mat = adj_mat.todok() # to_dok_matrix

        def normalized_adj_single(adj):
            rowsum = np.array(adj.sum(1))
            d_inv = np.power(rowsum, -.5).flatten()  
            d_inv[np.isinf(d_inv)] = 0.
            d_mat_inv = sp.diags(d_inv)
            norm_adj = d_mat_inv.dot(adj).dot(d_mat_inv)

            return norm_adj.tocoo()

        ngcf_adj_matrix = normalized_adj_single(adj_mat)
        return ngcf_adj_matrix.tocsr()

    def get_TM_user_vector(self):
        user_topic_matrix = np.zeros((self.num_user, self.config.num_topics))
        corpus = self.TM_dataset.get_corpus()
        
        user_topic_vectors = [self.lda_model.get_document_topics(bow, minimum_probability=0.0) 
                              for bow in corpus]
        for i, user_vec in enumerate(user_topic_vectors):
            """
                i: user idx
                user_vec: (topic, prob)
            """
            for topic, prob in user_vec:
                user_topic_matrix[i, topic] = prob

        # numpy array --> torch tensor
        user_topic_tensor = torch.tensor(user_topic_matrix, dtype=torch.float32)
        
        return user_topic_tensor

    def sampling(self):
        users = random.sample(self.exist_users, self.config.batch_size)

        def sample_pos_items_for_u(u, num):
            pos_items = self.user_train[u]
            pos_batch = random.sample(pos_items, num)
            return pos_batch
        
        def sample_neg_items_for_u(u, num):
            neg_items = list(set(self.exist_items) - set(self.user_train[u]))
            neg_batch = random.sample(neg_items, num)
            return neg_batch
        
        pos_items, neg_items = [], []
        for user in users:
            pos_items += sample_pos_items_for_u(user, 1)
            neg_items += sample_neg_items_for_u(user, 1)
        
        return users, pos_items, neg_items
        
    def get_train_valid_data(self):
        return self.user_train, self.user_valid

    def get_R_data(self):
        return self.R_train, self.R_valid, self.R_total

    def get_ngcf_adj_matrix_data(self):
        return self.ngcf_adj_matrix
        

In [13]:
class LightGCN(nn.Module):
    def __init__(self, n_users, n_items, emb_dim, n_layers, reg, node_dropout, adj_mtx, user_topic_tensor):
        super().__init__()
        # initialize Class attributes
        self.n_users = n_users
        self.n_items = n_items
        self.emb_dim = emb_dim
        self.l = adj_mtx   # 인접 행렬, user-item interaction
        self.user_topic_tensor = user_topic_tensor.to(device)
        
        # PyTorch sparse tensor
        self.graph = self._convert_sp_mat_to_sp_tensor(self.l)

        self.reg = reg   # regularization
        self.n_layers = n_layers   # num of conv layers
        self.node_dropout = node_dropout

        # Initialize weights
        # TM + Xavier
        self.weight_dict = self._init_weights()
        print("Weights initialized.")

    def _init_weights(self):
        print("Initializing weights...")
        weight_dict = nn.ParameterDict() 

        initializer = torch.nn.init.xavier_uniform_
        
        weight_dict['user_embedding'] = nn.Parameter(self.user_topic_tensor)
        weight_dict['item_embedding'] = nn.Parameter(initializer(torch.empty(self.n_items, self.emb_dim).to(device)))

        return weight_dict

    # convert sparse matrix into sparse PyTorch tensor
    def _convert_sp_mat_to_sp_tensor(self, X):
        """
        Convert scipy sparse matrix to PyTorch sparse matrix

        Arguments:
        ----------
        X = Adjacency matrix, scipy sparse matrix
        """
        coo = X.tocoo().astype(np.float32)
        i = torch.LongTensor(np.mat([coo.row, coo.col]))   # 0이 아닌 원소의 위치 정보
        v = torch.FloatTensor(coo.data)   # 원소 값 정보
        # coo.shape: 원본 sparse matrix shape
        res = torch.sparse.FloatTensor(i, v, coo.shape).to(device)
        return res

    # apply node_dropout
    def _droupout_sparse(self, X):
        """
        Drop individual locations in X
        
        Arguments:
        ---------
        X = adjacency matrix (PyTorch sparse tensor)
        dropout = fraction of nodes to drop
        noise_shape = number of non non-zero entries of X
        """
        node_dropout_mask = ((self.node_dropout) + torch.rand(X._nnz())).floor().bool().to(device)
        i = X.coalesce().indices()   # 0이 아닌 원소 위치
        v = X.coalesce()._values()   # 0이 아닌 요소 값
        i[:,node_dropout_mask] = 0   # masking
        v[node_dropout_mask] = 0
        X_dropout = torch.sparse.FloatTensor(i, v, X.shape).to(X.device)

        return  X_dropout.mul(1/(1-self.node_dropout))

    def forward(self, u, i, j):
        """
        Computes the forward pass
        
        Arguments:
        ---------
        u = user
        i = positive item (user interacted with item)
        j = negative item (user did not interact with item)
        """
        # apply drop-out mask
        graph = self._droupout_sparse(self.graph) if self.node_dropout > 0 else self.graph

        ego_embeddings = torch.cat([self.weight_dict['user_embedding'], self.weight_dict['item_embedding']], 0)

        for k in range(self.n_layers):
            ego_embeddings = torch.sparse.mm(graph, ego_embeddings)
        
        u_emb, i_emb = ego_embeddings.split([self.n_users, self.n_items], 0)

        self.u_emb = u_emb
        self.i_emb = i_emb
        
        u_emb_batch = u_emb[u]  # (batch_size, emb_dim)
        pos_i_emb_batch = i_emb[i]  # (batch_size, emb_dim)
        neg_i_emb_batch = i_emb[j]  # (batch_size, emb_dim)
        

        # pos_scores & neg_scores
        pos_scores = torch.sum(u_emb_batch * pos_i_emb_batch, dim=1)  # (batch_size)
        neg_scores = torch.sum(u_emb_batch * neg_i_emb_batch, dim=1)  # (batch_size)
        
        # Concatenate pos & neg
        scores = torch.cat([pos_scores, neg_scores])  # (2*batch_size)
        labels = torch.cat([torch.ones_like(pos_scores), torch.zeros_like(neg_scores)])  # (2*batch_size)

        # Calculate BCEWithLogitsLoss
        loss_fn = torch.nn.BCEWithLogitsLoss()
        loss = loss_fn(scores, labels)

        return loss

In [14]:
lightgcn_dataset=MakeLightGCNDataSet(TM_dataset, lda_model, config)
ngcf_adj_matrix = lightgcn_dataset.get_ngcf_adj_matrix_data()
R_train, R_valid, R_total = lightgcn_dataset.get_R_data()

  d_inv = np.power(rowsum, -.5).flatten()


In [19]:
model = LightGCN(
    n_users = lightgcn_dataset.num_user,
    n_items = lightgcn_dataset.num_item,
    emb_dim = config.emb_dim,
    n_layers = config.n_layers,
    reg = config.reg,
    node_dropout = config.node_dropout,
    adj_mtx = ngcf_adj_matrix,
    user_topic_tensor = lightgcn_dataset.user_topic_tensor,
    ).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)

Initializing weights...
Weights initialized.


### train

In [16]:
def train(model, make_graph_data_set, optimizer, n_batch):
    model.train()
    loss_val = 0
    for step in range(1, n_batch + 1):
        user, pos, neg = make_graph_data_set.sampling()
        optimizer.zero_grad()
        loss = model(user, pos, neg)
        loss.backward()
        optimizer.step()
        loss_val += loss.item()
    loss_val /= n_batch
    return loss_val

def split_matrix(X, n_splits=10):
    splits = []
    chunk_size = X.shape[0] // n_splits
    for i in range(n_splits):
        start = i * chunk_size
        end = X.shape[0] if i == n_splits - 1 else (i + 1) * chunk_size
        splits.append(X[start:end])
    return splits

def compute_ndcg_k(pred_items, test_items, test_indices, k):
    
    r = (test_items * pred_items).gather(1, test_indices)
    f = torch.from_numpy(np.log2(np.arange(2, k+2))).float().to(device)
    
    dcg = (r[:, :k]/f).sum(1)                                               
    dcg_max = (torch.sort(r, dim=1, descending=True)[0][:, :k]/f).sum(1)   
    ndcg = dcg/dcg_max                                                     
    
    ndcg[torch.isnan(ndcg)] = 0
    return ndcg

def evaluate(u_emb, i_emb, Rtr, Rte, k = 10):

    # split matrices
    ue_splits = split_matrix(u_emb)
    tr_splits = split_matrix(Rtr)
    te_splits = split_matrix(Rte)

    recall_k, ndcg_k= [], []
    # compute results for split matrices
    for ue_f, tr_f, te_f in zip(ue_splits, tr_splits, te_splits):

        scores = torch.mm(ue_f, i_emb.t())

        test_items = torch.from_numpy(te_f.todense()).float().to(device)
        non_train_items = torch.from_numpy(1-(tr_f.todense())).float().to(device)
        scores = scores * non_train_items

        _, test_indices = torch.topk(scores, dim=1, k=k)
        
        pred_items = torch.zeros_like(scores).float()
        pred_items.scatter_(dim=1, index=test_indices, src=torch.ones_like(test_indices).float().to(device))

        topk_preds = torch.zeros_like(scores).float()
        topk_preds.scatter_(dim=1, index=test_indices[:, :k], src=torch.ones_like(test_indices).float())
        
        TP = (test_items * topk_preds).sum(1)                      
        rec = TP/test_items.sum(1)
   
        ndcg = compute_ndcg_k(pred_items, test_items, test_indices, k)

        recall_k.append(rec)
        ndcg_k.append(ndcg)

    return torch.cat(ndcg_k).mean(), torch.cat(recall_k).mean()

In [None]:
best_hit = 0
for epoch in range(1, config.num_epochs + 1):
    tbar = tqdm(range(1))
    for _ in tbar:
        train_loss = train(
            model = model, 
            make_graph_data_set = lightgcn_dataset, 
            optimizer = optimizer,
            n_batch = config.n_batch,
            )
        with torch.no_grad():
            ndcg, hit = evaluate(
                u_emb = model.u_emb.detach(), 
                i_emb = model.i_emb.detach(), 
                Rtr = R_train, 
                Rte = R_valid, 
                k = 10,
                )
        # if best_hit < hit:
        #     best_hit = hit
        #     torch.save(model.state_dict(), os.path.join(config.model_path, config.model_name))
        tbar.set_description(f'Epoch: {epoch:3d}| Train loss: {train_loss:.5f}| NDCG@10: {ndcg:.5f}| HIT@10: {hit:.5f}')