In [1]:
import numpy as np
import pandas as pd 
import torch
import transformers
from transformers import AutoTokenizer, BertForSequenceClassification
import mojimoji
import re
import collections
import torchtext
from tqdm import tqdm
from torchtext.data import Field, Dataset, Example
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from ipywidgets import IntProgress

from sklearn import preprocessing
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans

In [2]:
# Handle for the default CUDA device.
# NOTE(review): no cell visible in this notebook reads the `cuda` variable afterwards.
cuda = torch.device('cuda') 

In [3]:
def deal_long_document(inputs, index, net_1, tfidf_weight):
    """Embed a document that exceeds BERT's 512-token window, chunk by chunk.

    The token sequence is cut into consecutive windows of at most 512 tokens.
    Every window is run through ``net_1``; the rows of ``hidden_states[0]``
    are combined with the matching slice of the document's TF-IDF weights and
    the per-window vectors are summed.

    The previous version derived the number of windows from ``shape[0]`` (the
    batch dimension, always 1 here), so only the first 512 tokens were ever
    embedded. The window count now follows the sequence dimension.

    Args:
        inputs: Mapping with ``input_ids``, ``token_type_ids`` and
            ``attention_mask`` tensors of shape ``(1, seq_len)``.
        index: Position of this document inside ``tfidf_weight``.
        net_1: Callable model; ``net_1(**chunk).hidden_states[0][0]`` must be
            a ``(chunk_len, 768)`` tensor.
        tfidf_weight: Per-document sequences of per-token TF-IDF weights.

    Returns:
        torch.Tensor: 1-D tensor of size 768 holding the TF-IDF-weighted sum
        of token vectors over the whole document (not length-normalised).
    """
    window = 512
    seq_len = inputs["input_ids"].shape[1]
    weights = torch.Tensor(tfidf_weight[index])
    doc_vec = torch.zeros(768)
    # Inference only: no autograd graph is needed for the accumulated vector.
    with torch.no_grad():
        # Stepping by `window` yields ceil(seq_len / window) chunks, so a length
        # that is an exact multiple of 512 does not produce an empty trailing chunk.
        for start in range(0, seq_len, window):
            stop = start + window  # slicing past the end is clamped by torch
            chunk = {
                key: inputs[key][:, start:stop]
                for key in ("input_ids", "token_type_ids", "attention_mask")
            }
            outputs = net_1(**chunk)
            doc_vec += weights[start:stop].matmul(outputs.hidden_states[0][0])
    return doc_vec

def Calculate_item_embedding(source, target):
    """Build TF-IDF-weighted BERT vectors for a MovieLens/Amazon domain pair.

    Exactly one of ``source``/``target`` must be ``"ML"``; the other must be
    ``"AM"`` or ``"AB"``. For every item description, the token rows of
    ``hidden_states[0]`` are weighted by the token TF-IDF scores, summed, and
    divided by the token count.

    Args:
        source: Source-domain code (``"ML"``, ``"AM"`` or ``"AB"``).
        target: Target-domain code (``"ML"``, ``"AM"`` or ``"AB"``).

    Returns:
        ``(source_dict, target_dict)``: two dicts mapping item id to a list of
        floats. ML items are keyed by ``movieId``, Amazon items by ``deal_id``.
        If neither domain is ``"ML"``, prints an error and returns ``(0., 0.)``.

    Raises:
        ValueError: If the non-ML domain is not ``"AM"`` or ``"AB"``.
    """
    # Validate before downloading the model or touching any file. Previously the
    # missing-ML case crashed with UnboundLocalError long before the error branch
    # at the end of the function could run.
    if source != "ML" and target != "ML":
        print("Error: No ML dataset.")
        return 0., 0.

    amazon_prefix = {"AM": "AmzM", "AB": "AmzB"}
    if source == "ML":
        amazon_domain, ml_role, amazon_role = target, "S", "T"
    else:
        amazon_domain, ml_role, amazon_role = source, "T", "S"
    if amazon_domain not in amazon_prefix:
        raise ValueError(
            f"Unsupported domain pair ({source!r}, {target!r}): "
            "the non-ML domain must be 'AM' or 'AB'."
        )

    # Download the pretrained BERT model.
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    net_1 = transformers.BertForPreTraining.from_pretrained('bert-base-uncased', output_hidden_states=True)
    net_1.eval()

    df_movies = pd.read_csv(f"./Datasets/Side_MLas{ml_role}.csv")
    df_AmazonMovie_sub = pd.read_csv(f"./Datasets/Side_{amazon_prefix[amazon_domain]}as{amazon_role}.csv")

    # NOTE(review): absolute, machine-specific path. allow_pickle=True unpickles
    # arbitrary objects, so only load this file from a trusted location.
    dict_id2descripton = np.load("/home/lizhi/CDRS-GNN/" + "Dict_MovieId2description.npy", allow_pickle=True).item()
    df_movies["description"] = df_movies.movieId.map(lambda x: dict_id2descripton[x])
    df_movies["tokenized_word_id"] = df_movies.description.map(
        lambda x: tokenizer(x, return_tensors="pt")["input_ids"].tolist())

    df_AmazonMovie_sub["description"] = df_AmazonMovie_sub.description.map(str)
    df_AmazonMovie_sub["tokenized_word_id"] = df_AmazonMovie_sub.description.map(
        lambda x: tokenizer(str(x), return_tensors="pt")["input_ids"].tolist())

    tfidf_weight = CalculateTFidf(df_movies, df_AmazonMovie_sub)

    # ML documents first, Amazon documents second; tfidf_weight uses the same order.
    documents = np.concatenate(
        (df_movies.description.values, df_AmazonMovie_sub.description.values), axis=0)

    vec_list = []
    # Inference only: avoid building an autograd graph for every document.
    with torch.no_grad():
        for i in tqdm(range(len(documents))):
            inputs = tokenizer(documents[i], return_tensors="pt")
            n_tokens = inputs["input_ids"].shape[1]
            if n_tokens <= 512:
                outputs = net_1(**inputs)
                doc_vec = torch.Tensor(tfidf_weight[i]).matmul(outputs.hidden_states[0][0])
            else:
                doc_vec = deal_long_document(inputs, i, net_1, tfidf_weight)
            vec_list.append((doc_vec / n_tokens).tolist())

    n_ml = len(df_movies)
    dict_MLMovie2vec = dict(zip(df_movies.movieId.values, vec_list[:n_ml]))
    dict_AmazonMovie2vec = dict(zip(df_AmazonMovie_sub.deal_id.values, vec_list[n_ml:]))

    if source == "ML":
        return dict_MLMovie2vec, dict_AmazonMovie2vec
    return dict_AmazonMovie2vec, dict_MLMovie2vec

def CalculateTFidf(df_movies, df_AmazonMovie_sub):
    """Compute one TF-IDF weight per token position for every document.

    Both frames must carry a ``tokenized_word_id`` column whose cells are
    nested lists ``[[id, id, ...]]``. The documents of ``df_movies`` come
    first in the result, followed by those of ``df_AmazonMovie_sub``.

    Unlike the previous version, the token lists stored in the input frames
    are not modified: the string conversion works on a copy.

    Args:
        df_movies: Frame providing the first group of tokenised documents.
        df_AmazonMovie_sub: Frame providing the second group.

    Returns:
        list[list[float]]: For each document, a list as long as its token
        sequence. Tokens that the TF-IDF model leaves out of its output get
        a weight of ``0.``.
    """
    data = np.concatenate(
        (df_movies.tokenized_word_id.values, df_AmazonMovie_sub.tokenized_word_id.values),
        axis=0,
    )

    # gensim's Dictionary expects string tokens; build fresh lists so the
    # DataFrame cells keep their integer ids.
    data_str = [[str(token_id) for token_id in row[0]] for row in data]

    dictionary = Dictionary(data_str)
    corpus = [dictionary.doc2bow(doc) for doc in data_str]
    model = TfidfModel(corpus)

    tfidf_weight = []
    for doc, bow in tqdm(zip(data_str, corpus), total=len(data_str)):
        # Transform each document once and look weights up by dictionary id.
        weight_by_term = dict(model[bow])
        tfidf_weight.append(
            [weight_by_term.get(term_id, 0.) for term_id in dictionary.doc2idx(doc)]
        )
    return tfidf_weight

In [19]:
def deal_long_document_lastlayer(inputs, index, net_1):
    """Mean last-layer token vector of a sequence longer than 512 tokens.

    The sequence is processed in consecutive windows of at most 512 tokens.
    The rows of ``hidden_states[-1]`` are summed over all windows and divided
    by the total token count, so the result has the same meaning as the
    ``.mean(dim=0)`` that callers apply to short sequences.

    The previous version derived the number of windows from ``shape[0]`` (the
    batch dimension, always 1 here), so it only ever averaged the first 512
    tokens.

    Args:
        inputs: Mapping with ``input_ids``, ``token_type_ids`` and
            ``attention_mask`` tensors of shape ``(1, seq_len)``.
        index: Unused; kept so existing call sites keep working.
        net_1: Callable model; ``net_1(**chunk).hidden_states[-1][0]`` must be
            a ``(chunk_len, 768)`` tensor on the same device as the inputs.

    Returns:
        torch.Tensor: 1-D tensor of size 768 on the device of
        ``inputs["input_ids"]``.
    """
    window = 512
    input_ids = inputs["input_ids"]
    seq_len = input_ids.shape[1]
    # Allocate next to the inputs instead of hard-coding .cuda(), so the helper
    # also works when the caller runs on CPU.
    token_sum = torch.zeros(768, device=input_ids.device)
    with torch.no_grad():
        for start in range(0, seq_len, window):
            stop = start + window  # slicing past the end is clamped by torch
            chunk = {
                key: inputs[key][:, start:stop]
                for key in ("input_ids", "token_type_ids", "attention_mask")
            }
            token_sum += net_1(**chunk).hidden_states[-1][0].sum(dim=0)
    # max(..., 1) only guards the degenerate empty-sequence case.
    return token_sum / max(seq_len, 1)

In [9]:
# NOTE(review): this only binds a Python variable named CUDA_VISIBLE_DEVICES; it does
# not write to os.environ, so on its own it cannot change which GPUs are visible.
CUDA_VISIBLE_DEVICES=1
# Make CUDA device index 1 the current device for the tensors/models moved with .cuda().
torch.cuda.set_device(1)

In [24]:
def Calculate_item_embedding_lastlayer(source, target):
    """Build sentence-averaged last-layer BERT vectors for an ML/Amazon pair.

    Each description is split on ``"."``; fragments shorter than 5 characters
    are skipped. Every remaining sentence is embedded as the mean of its
    ``hidden_states[-1]`` token rows, and the document vector is the mean of
    its sentence vectors. The model and inputs are moved to CUDA.

    Changes from the previous version:
      * the domain pair is validated first, so the "No ML dataset" message is
        actually reachable instead of an UnboundLocalError;
      * the 512 limit is checked against the token count, not the character
        count of the sentence;
      * the document vector is divided by the number of sentences that were
        embedded, not by the number of split fragments (which also counted
        the skipped ones).

    Args:
        source: Source-domain code (``"ML"``, ``"AM"`` or ``"AB"``).
        target: Target-domain code (``"ML"``, ``"AM"`` or ``"AB"``).

    Returns:
        ``(source_dict, target_dict)``: two dicts mapping item id to a list of
        floats. ML items are keyed by ``movieId``, Amazon items by ``deal_id``.
        If neither domain is ``"ML"``, prints an error and returns ``(0., 0.)``.

    Raises:
        ValueError: If the non-ML domain is not ``"AM"`` or ``"AB"``.
    """
    if source != "ML" and target != "ML":
        print("Error: No ML dataset.")
        return 0., 0.

    amazon_prefix = {"AM": "AmzM", "AB": "AmzB"}
    if source == "ML":
        amazon_domain, ml_role, amazon_role = target, "S", "T"
    else:
        amazon_domain, ml_role, amazon_role = source, "T", "S"
    if amazon_domain not in amazon_prefix:
        raise ValueError(
            f"Unsupported domain pair ({source!r}, {target!r}): "
            "the non-ML domain must be 'AM' or 'AB'."
        )

    # Download the pretrained BERT model.
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    net_1 = transformers.BertForPreTraining.from_pretrained('bert-base-uncased', output_hidden_states=True)
    net_1.eval()
    net_1.cuda()

    df_movies = pd.read_csv(f"./Datasets/Side_MLas{ml_role}.csv")
    df_AmazonMovie_sub = pd.read_csv(f"./Datasets/Side_{amazon_prefix[amazon_domain]}as{amazon_role}.csv")

    # NOTE(review): absolute, machine-specific path. allow_pickle=True unpickles
    # arbitrary objects, so only load this file from a trusted location.
    dict_id2descripton = np.load("/home/lizhi/CDRS-GNN/" + "Dict_MovieId2description.npy", allow_pickle=True).item()
    df_movies["description"] = df_movies.movieId.map(lambda x: dict_id2descripton[x])
    df_AmazonMovie_sub["description"] = df_AmazonMovie_sub.description.map(str)

    # ML documents first, Amazon documents second.
    documents = np.concatenate(
        (df_movies.description.values, df_AmazonMovie_sub.description.values), axis=0)

    vec_list = []
    # Inference only: avoid building an autograd graph for every sentence.
    with torch.no_grad():
        for i in tqdm(range(len(documents))):
            doc_vec = torch.zeros(768).cuda()
            n_embedded = 0
            for sentence in documents[i].split("."):
                if len(sentence) < 5:
                    continue
                encoded = tokenizer(sentence, return_tensors="pt")
                inputs = {key: value.cuda() for key, value in encoded.items()}
                # BERT's positional limit applies to tokens, not characters.
                if inputs["input_ids"].shape[1] <= 512:
                    sentence_vec = net_1(**inputs).hidden_states[-1][0].mean(dim=0)
                else:
                    sentence_vec = deal_long_document_lastlayer(inputs, i, net_1)
                doc_vec += sentence_vec
                n_embedded += 1
            # A document with no usable sentence keeps the all-zero vector.
            vec_list.append((doc_vec.cpu() / max(n_embedded, 1)).tolist())

    n_ml = len(df_movies)
    dict_MLMovie2vec = dict(zip(df_movies.movieId.values, vec_list[:n_ml]))
    dict_AmazonMovie2vec = dict(zip(df_AmazonMovie_sub.deal_id.values, vec_list[n_ml:]))

    if source == "ML":
        return dict_MLMovie2vec, dict_AmazonMovie2vec
    return dict_AmazonMovie2vec, dict_MLMovie2vec

In [4]:
def ItemClustering(dict_itemId2vec_A, dict_itemId2vec_M, num_clusters):
    """Jointly cluster the items of two domains and average each cluster.

    The vectors of both dicts are stacked (``M`` first, then ``A``),
    L2-normalised and clustered with KMeans. Cluster centroids are then
    recomputed as the mean of the original (un-normalised) vectors.

    Compared with the previous version, unused intermediate dicts and
    ``value_counts`` calls are gone, the vector width is taken from the data
    instead of being fixed at 768, and a cluster with no members keeps an
    all-zero vector instead of becoming NaN through a 0/0 division.

    Args:
        dict_itemId2vec_A: Mapping item id -> vector for the first domain.
        dict_itemId2vec_M: Mapping item id -> vector for the second domain.
        num_clusters: Number of KMeans clusters.

    Returns:
        tuple: ``(dict_itemId2Cluster_A, dict_itemId2Cluster_M,
        dict_clusterId2vec)``: item id -> cluster label for each input dict
        (same order as the arguments), and cluster id -> mean vector.
    """
    n_M = len(dict_itemId2vec_M)

    # Row order of the joint matrix: all M items, then all A items.
    Jointed_items = list(dict_itemId2vec_M.values())
    Jointed_items.extend(dict_itemId2vec_A.values())
    jointed_norm = preprocessing.normalize(Jointed_items, norm='l2')

    kmeans = KMeans(n_clusters=num_clusters, random_state=6298).fit(jointed_norm)
    lables = kmeans.predict(jointed_norm)

    dict_itemId2Cluster_A = dict(zip(dict_itemId2vec_A.keys(), lables[n_M:]))
    dict_itemId2Cluster_M = dict(zip(dict_itemId2vec_M.keys(), lables[:n_M]))

    count = np.zeros([num_clusters])
    cluster_matrix = np.zeros([num_clusters, jointed_norm.shape[1]])
    for item_vecs, item_clusters in (
        (dict_itemId2vec_A, dict_itemId2Cluster_A),
        (dict_itemId2vec_M, dict_itemId2Cluster_M),
    ):
        for key, cluster in item_clusters.items():
            cluster_matrix[cluster] += item_vecs[key]
            count[cluster] += 1

    # Only divide clusters that received at least one item.
    nonempty = count > 0
    cluster_matrix[nonempty] /= count[nonempty][:, None]
    dict_clusterId2vec = dict(zip(np.arange(num_clusters), cluster_matrix))

    return dict_itemId2Cluster_A, dict_itemId2Cluster_M, dict_clusterId2vec

In [5]:
def Calculate_item_embedding_2(source, target):
    """Build TF-IDF-weighted BERT vectors for an Amazon-to-Amazon domain pair.

    Reads ``./Datasets/Side_<domain>asS.csv`` for the source and
    ``./Datasets/Side_<domain>asT.csv`` for the target. For every item
    description, the token rows of ``hidden_states[0]`` are weighted by the
    token TF-IDF scores, summed, and divided by the token count.

    Compared with the previous version, unsupported domains are rejected up
    front (they used to fail later with UnboundLocalError), and the unused
    load of ``Dict_MovieId2description.npy`` from an absolute path is removed.

    Args:
        source: Source-domain code, ``"AM"`` or ``"AB"``.
        target: Target-domain code, ``"AM"`` or ``"AB"``.

    Returns:
        ``(source_dict, target_dict)``: two dicts mapping ``deal_id`` to a
        list of floats.

    Raises:
        ValueError: If ``source`` or ``target`` is not ``"AM"`` or ``"AB"``.
    """
    side_prefix = {"AM": "AmzM", "AB": "AmzB"}
    for role, domain in (("source", source), ("target", target)):
        if domain not in side_prefix:
            raise ValueError(f"Unsupported {role} domain {domain!r}; expected 'AM' or 'AB'.")

    # Download the pretrained BERT model.
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    net_1 = transformers.BertForPreTraining.from_pretrained('bert-base-uncased', output_hidden_states=True)
    net_1.eval()

    df_movies = pd.read_csv(f"./Datasets/Side_{side_prefix[source]}asS.csv")
    df_AmazonMovie_sub = pd.read_csv(f"./Datasets/Side_{side_prefix[target]}asT.csv")

    def _tokenize(text):
        # Nested list [[id, ...]], the layout CalculateTFidf expects.
        return tokenizer(str(text), return_tensors="pt")["input_ids"].tolist()

    df_movies["description"] = df_movies.description.map(str)
    df_movies["tokenized_word_id"] = df_movies.description.map(_tokenize)

    df_AmazonMovie_sub["description"] = df_AmazonMovie_sub.description.map(str)
    df_AmazonMovie_sub["tokenized_word_id"] = df_AmazonMovie_sub.description.map(_tokenize)

    tfidf_weight = CalculateTFidf(df_movies, df_AmazonMovie_sub)

    # Source documents first, target documents second; tfidf_weight uses the same order.
    documents = np.concatenate(
        (df_movies.description.values, df_AmazonMovie_sub.description.values), axis=0)

    vec_list = []
    # Inference only: avoid building an autograd graph for every document.
    with torch.no_grad():
        for i in tqdm(range(len(documents))):
            inputs = tokenizer(documents[i], return_tensors="pt")
            n_tokens = inputs["input_ids"].shape[1]
            if n_tokens <= 512:
                outputs = net_1(**inputs)
                doc_vec = torch.Tensor(tfidf_weight[i]).matmul(outputs.hidden_states[0][0])
            else:
                doc_vec = deal_long_document(inputs, i, net_1, tfidf_weight)
            vec_list.append((doc_vec / n_tokens).tolist())

    n_source = len(df_movies)
    dict_source2vec = dict(zip(df_movies.deal_id.values, vec_list[:n_source]))
    dict_target2vec = dict(zip(df_AmazonMovie_sub.deal_id.values, vec_list[n_source:]))

    return dict_source2vec, dict_target2vec

In [6]:
# domains = {"ML", "AM", "AB"}
source = "AM"
target = "ML"
# Returned in (source, target) order: here (Amazon vectors, ML vectors).
dict_itemId2vec_s, dict_itemId2vec_t = Calculate_item_embedding(source, target)

# Derive every output name from source/target. The previous hard-coded
# "MLtoAM" / "AMasT" / "MLasS" names contradicted the domains chosen above and
# stored the ML vectors in a file labelled as Amazon (and vice versa).
save_path = f"./Dictionary/{source}to{target}/"

# Persist the embeddings first so they survive an interrupted clustering step.
np.save(save_path+f"Dict_item2vec_{target}asT.npy", dict_itemId2vec_t)
np.save(save_path+f"Dict_item2vec_{source}asS.npy", dict_itemId2vec_s)

dict_itemId2Cluster_s, dict_itemId2Cluster_t, dict_clusterId2vec = ItemClustering(dict_itemId2vec_s, dict_itemId2vec_t, 200)

np.save(save_path+f"Dict_item2cluster_{source}asS", dict_itemId2Cluster_s)
np.save(save_path+f"Dict_item2cluster_{target}asT", dict_itemId2Cluster_t)
np.save(save_path+f"Dict_cluster2vec_{source}to{target}", dict_clusterId2vec)

Some weights of BertForPreTraining were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: 

In [11]:
# domains = {"ML", "AM", "AB"}
source = "AM"
target = "ML"
# Returned in (source, target) order: here (Amazon vectors, ML vectors).
dict_itemId2vec_s, dict_itemId2vec_t = Calculate_item_embedding(source, target)

# All outputs of this run go under a directory named after the domain pair.
save_path = f"./Dictionary/{source}to{target}/"

# Persist the item embeddings before the clustering step runs.
np.save(save_path+f"Dict_item2vec_{target}asT.npy",dict_itemId2vec_t)
np.save(save_path+f"Dict_item2vec_{source}asS.npy", dict_itemId2vec_s) 

# Joint KMeans over both domains with 200 clusters.
dict_itemId2Cluster_s, dict_itemId2Cluster_t, dict_clusterId2vec = ItemClustering(dict_itemId2vec_s, dict_itemId2vec_t, 200)

# Item -> cluster assignments per domain, then cluster id -> mean vector.
np.save(save_path+f"Dict_item2cluster_{source}asS",dict_itemId2Cluster_s)
np.save(save_path+f"Dict_item2cluster_{target}asT",dict_itemId2Cluster_t)
np.save(save_path+f"Dict_cluster2vec_{source}to{target}",dict_clusterId2vec)

KeyboardInterrupt: 

In [25]:
# domains = {"ML", "AM", "AB"}
source = "AB"
target = "ML"
# Last-layer, sentence-averaged variant; returned in (source, target) order.
dict_itemId2vec_s, dict_itemId2vec_t = Calculate_item_embedding_lastlayer(source, target)

# All outputs of this run go under a directory named after the domain pair.
save_path = f"./Dictionary/{source}to{target}/"

# The "_lastlayer" suffix keeps these files apart from the TF-IDF-weighted outputs.
np.save(save_path+f"Dict_item2vec_{target}asT_lastlayer.npy",dict_itemId2vec_t)
np.save(save_path+f"Dict_item2vec_{source}asS_lastlayer.npy", dict_itemId2vec_s) 

# Joint KMeans over both domains with 200 clusters.
dict_itemId2Cluster_s, dict_itemId2Cluster_t, dict_clusterId2vec = ItemClustering(dict_itemId2vec_s, dict_itemId2vec_t, 200)

# Item -> cluster assignments per domain, then cluster id -> mean vector.
np.save(save_path+f"Dict_item2cluster_{source}asS_lastlayer",dict_itemId2Cluster_s)
np.save(save_path+f"Dict_item2cluster_{target}asT_lastlayer",dict_itemId2Cluster_t)
np.save(save_path+f"Dict_cluster2vec_{source}to{target}_lastlayer",dict_clusterId2vec)

Some weights of BertForPreTraining were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 47%|████▋     | 10689/22581 [11:50<1:12:35,  2.73it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (538 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 22581/22581 [2:21:04<00:00,  2.67it/s]   


In [53]:
# domains = {"ML", "AM", "AB"}
source = "ML"
target = "AB"
# Returned in (source, target) order: here (ML vectors, Amazon vectors).
dict_itemId2vec_s, dict_itemId2vec_t = Calculate_item_embedding(source, target)

# All outputs of this run go under a directory named after the domain pair.
save_path = f"./Dictionary/{source}to{target}/"

# Persist the item embeddings before the clustering step runs.
np.save(save_path+f"Dict_item2vec_{target}asT.npy",dict_itemId2vec_t)
np.save(save_path+f"Dict_item2vec_{source}asS.npy", dict_itemId2vec_s) 

# Joint KMeans over both domains with 200 clusters.
dict_itemId2Cluster_s, dict_itemId2Cluster_t, dict_clusterId2vec = ItemClustering(dict_itemId2vec_s, dict_itemId2vec_t, 200)

# Item -> cluster assignments per domain, then cluster id -> mean vector.
np.save(save_path+f"Dict_item2cluster_{source}asS",dict_itemId2Cluster_s)
np.save(save_path+f"Dict_item2cluster_{target}asT",dict_itemId2Cluster_t)
np.save(save_path+f"Dict_cluster2vec_{source}to{target}",dict_clusterId2vec)

Some weights of BertForPreTraining were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Token indices sequence length is longer than the specified maximum sequence length for this model (599 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 24912/24912 [00:43<00:00, 568.79it/s] 
100%|██████████| 24912/24912 [39:20<00:00, 10.55it/s]  


In [58]:
# domains = {"ML", "AM", "AB"}
source = "AM"
target = "AB"
# Amazon-to-Amazon variant; returned in (source, target) order.
dict_itemId2vec_s, dict_itemId2vec_t = Calculate_item_embedding_2(source, target)

# All outputs of this run go under a directory named after the domain pair.
save_path = f"./Dictionary/{source}to{target}/"

# Persist the item embeddings before the clustering step runs.
np.save(save_path+f"Dict_item2vec_{target}asT.npy",dict_itemId2vec_t)
np.save(save_path+f"Dict_item2vec_{source}asS.npy", dict_itemId2vec_s) 

# Joint KMeans over both domains with 200 clusters.
dict_itemId2Cluster_s, dict_itemId2Cluster_t, dict_clusterId2vec = ItemClustering(dict_itemId2vec_s, dict_itemId2vec_t, 200)

# Item -> cluster assignments per domain, then cluster id -> mean vector.
np.save(save_path+f"Dict_item2cluster_{source}asS",dict_itemId2Cluster_s)
np.save(save_path+f"Dict_item2cluster_{target}asT",dict_itemId2Cluster_t)
np.save(save_path+f"Dict_cluster2vec_{source}to{target}",dict_clusterId2vec)

Some weights of BertForPreTraining were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Token indices sequence length is longer than the specified maximum sequence length for this model (591 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 18291/18291 [00:50<00:00, 360.65it/s]
100%|██████████| 18291/18291 [42:55<00:00,  7.10it/s]  


In [59]:
# domains = {"ML", "AM", "AB"}
source = "AB"
target = "AM"
# Amazon-to-Amazon variant; returned in (source, target) order.
dict_itemId2vec_s, dict_itemId2vec_t = Calculate_item_embedding_2(source, target)

# All outputs of this run go under a directory named after the domain pair.
save_path = f"./Dictionary/{source}to{target}/"

# Persist the item embeddings before the clustering step runs.
np.save(save_path+f"Dict_item2vec_{target}asT.npy",dict_itemId2vec_t)
np.save(save_path+f"Dict_item2vec_{source}asS.npy", dict_itemId2vec_s) 

# Joint KMeans over both domains with 200 clusters.
dict_itemId2Cluster_s, dict_itemId2Cluster_t, dict_clusterId2vec = ItemClustering(dict_itemId2vec_s, dict_itemId2vec_t, 200)

# Item -> cluster assignments per domain, then cluster id -> mean vector.
np.save(save_path+f"Dict_item2cluster_{source}asS",dict_itemId2Cluster_s)
np.save(save_path+f"Dict_item2cluster_{target}asT",dict_itemId2Cluster_t)
np.save(save_path+f"Dict_cluster2vec_{source}to{target}",dict_clusterId2vec)

Some weights of BertForPreTraining were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Token indices sequence length is longer than the specified maximum sequence length for this model (624 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 19460/19460 [00:57<00:00, 336.49it/s]
100%|██████████| 19460/19460 [47:08<00:00,  6.88it/s]  


# Dataset with only overlapping users 

In [27]:
def Calculate_item_embedding_2(source, target):
    """Build TF-IDF-weighted BERT vectors from the ``_wovlpu`` side files.

    Reads ``./Datasets/Side_<domain>asS_wovlpu.csv`` for the source and
    ``./Datasets/Side_<domain>asT_wovlpu.csv`` for the target (the notebook
    heading above this cell reads "Dataset with only overlapping users").
    For every item description, the token rows of ``hidden_states[0]`` are
    weighted by the token TF-IDF scores, summed, and divided by the token
    count.

    Compared with the previous version, unsupported domains are rejected up
    front; they used to fail later with UnboundLocalError.

    NOTE(review): this definition shadows the earlier function of the same
    name once the cell has been executed.

    Args:
        source: Source-domain code, ``"AM"`` or ``"AB"``.
        target: Target-domain code, ``"AM"`` or ``"AB"``.

    Returns:
        ``(source_dict, target_dict)``: two dicts mapping ``deal_id`` to a
        list of floats.

    Raises:
        ValueError: If ``source`` or ``target`` is not ``"AM"`` or ``"AB"``.
    """
    side_prefix = {"AM": "AmzM", "AB": "AmzB"}
    for role, domain in (("source", source), ("target", target)):
        if domain not in side_prefix:
            raise ValueError(f"Unsupported {role} domain {domain!r}; expected 'AM' or 'AB'.")

    # Download the pretrained BERT model.
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    net_1 = transformers.BertForPreTraining.from_pretrained('bert-base-uncased', output_hidden_states=True)
    net_1.eval()

    df_movies = pd.read_csv(f"./Datasets/Side_{side_prefix[source]}asS_wovlpu.csv")
    df_AmazonMovie_sub = pd.read_csv(f"./Datasets/Side_{side_prefix[target]}asT_wovlpu.csv")

    def _tokenize(text):
        # Nested list [[id, ...]], the layout CalculateTFidf expects.
        return tokenizer(str(text), return_tensors="pt")["input_ids"].tolist()

    df_movies["description"] = df_movies.description.map(str)
    df_movies["tokenized_word_id"] = df_movies.description.map(_tokenize)

    df_AmazonMovie_sub["description"] = df_AmazonMovie_sub.description.map(str)
    df_AmazonMovie_sub["tokenized_word_id"] = df_AmazonMovie_sub.description.map(_tokenize)

    tfidf_weight = CalculateTFidf(df_movies, df_AmazonMovie_sub)

    # Source documents first, target documents second; tfidf_weight uses the same order.
    documents = np.concatenate(
        (df_movies.description.values, df_AmazonMovie_sub.description.values), axis=0)

    vec_list = []
    # Inference only: avoid building an autograd graph for every document.
    with torch.no_grad():
        for i in tqdm(range(len(documents))):
            inputs = tokenizer(documents[i], return_tensors="pt")
            n_tokens = inputs["input_ids"].shape[1]
            if n_tokens <= 512:
                outputs = net_1(**inputs)
                doc_vec = torch.Tensor(tfidf_weight[i]).matmul(outputs.hidden_states[0][0])
            else:
                doc_vec = deal_long_document(inputs, i, net_1, tfidf_weight)
            vec_list.append((doc_vec / n_tokens).tolist())

    n_source = len(df_movies)
    dict_source2vec = dict(zip(df_movies.deal_id.values, vec_list[:n_source]))
    dict_target2vec = dict(zip(df_AmazonMovie_sub.deal_id.values, vec_list[n_source:]))

    return dict_source2vec, dict_target2vec

In [28]:
def ItemClustering(dict_itemId2vec_A, dict_itemId2vec_M, num_clusters):
    """Jointly cluster the items of two domains and average each cluster.

    The vectors of both dicts are stacked (``M`` first, then ``A``),
    L2-normalised and clustered with KMeans. Cluster centroids are then
    recomputed as the mean of the original (un-normalised) vectors.

    Compared with the previous version, unused intermediate dicts and
    ``value_counts`` calls are gone, the vector width is taken from the data
    instead of being fixed at 768, and a cluster with no members keeps an
    all-zero vector instead of becoming NaN through a 0/0 division.

    NOTE(review): this cell redefines a function of the same name that
    appears earlier in the notebook.

    Args:
        dict_itemId2vec_A: Mapping item id -> vector for the first domain.
        dict_itemId2vec_M: Mapping item id -> vector for the second domain.
        num_clusters: Number of KMeans clusters.

    Returns:
        tuple: ``(dict_itemId2Cluster_A, dict_itemId2Cluster_M,
        dict_clusterId2vec)``: item id -> cluster label for each input dict
        (same order as the arguments), and cluster id -> mean vector.
    """
    n_M = len(dict_itemId2vec_M)

    # Row order of the joint matrix: all M items, then all A items.
    Jointed_items = list(dict_itemId2vec_M.values())
    Jointed_items.extend(dict_itemId2vec_A.values())
    jointed_norm = preprocessing.normalize(Jointed_items, norm='l2')

    kmeans = KMeans(n_clusters=num_clusters, random_state=6298).fit(jointed_norm)
    lables = kmeans.predict(jointed_norm)

    dict_itemId2Cluster_A = dict(zip(dict_itemId2vec_A.keys(), lables[n_M:]))
    dict_itemId2Cluster_M = dict(zip(dict_itemId2vec_M.keys(), lables[:n_M]))

    count = np.zeros([num_clusters])
    cluster_matrix = np.zeros([num_clusters, jointed_norm.shape[1]])
    for item_vecs, item_clusters in (
        (dict_itemId2vec_A, dict_itemId2Cluster_A),
        (dict_itemId2vec_M, dict_itemId2Cluster_M),
    ):
        for key, cluster in item_clusters.items():
            cluster_matrix[cluster] += item_vecs[key]
            count[cluster] += 1

    # Only divide clusters that received at least one item.
    nonempty = count > 0
    cluster_matrix[nonempty] /= count[nonempty][:, None]
    dict_clusterId2vec = dict(zip(np.arange(num_clusters), cluster_matrix))

    return dict_itemId2Cluster_A, dict_itemId2Cluster_M, dict_clusterId2vec

In [34]:
# domains = {"ML", "AM", "AB"}
source = "AM"
target = "AB"
# Amazon-to-Amazon variant; returned in (source, target) order.
dict_itemId2vec_s, dict_itemId2vec_t = Calculate_item_embedding_2(source, target)

# All outputs of this run go under a directory named after the domain pair.
save_path = f"./Dictionary/{source}to{target}/"

# NOTE(review): unlike the cluster files below, these two names carry no "_wovlpu"
# suffix, so they are the same paths the other AM -> AB cell in this notebook writes.
# Confirm that overwriting those embeddings is intended.
np.save(save_path+f"Dict_item2vec_{target}asT.npy",dict_itemId2vec_t)
np.save(save_path+f"Dict_item2vec_{source}asS.npy", dict_itemId2vec_s) 

# Joint KMeans over both domains; this run uses 100 clusters.
dict_itemId2Cluster_s, dict_itemId2Cluster_t, dict_clusterId2vec = ItemClustering(dict_itemId2vec_s, dict_itemId2vec_t, 100)

# Item -> cluster assignments per domain, then cluster id -> mean vector.
np.save(save_path+f"Dict_item2cluster_{source}asS_wovlpu",dict_itemId2Cluster_s)
np.save(save_path+f"Dict_item2cluster_{target}asT_wovlpu",dict_itemId2Cluster_t)
np.save(save_path+f"Dict_cluster2vec_{source}to{target}_wovlpu",dict_clusterId2vec)

Some weights of BertForPreTraining were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Token indices sequence length is longer than the specified maximum sequence length for this model (591 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 9704/9704 [00:24<00:00, 391.52it/s]
100%|██████████| 9704/9704 [20:33<00:00,  7.87it/s]


In [35]:
# domains = {"ML", "AM", "AB"}
source = "AB"
target = "AM"
# Amazon-to-Amazon variant; returned in (source, target) order.
dict_itemId2vec_s, dict_itemId2vec_t = Calculate_item_embedding_2(source, target)

# All outputs of this run go under a directory named after the domain pair.
save_path = f"./Dictionary/{source}to{target}/"

# NOTE(review): unlike the cluster files below, these two names carry no "_wovlpu"
# suffix, so they are the same paths the other AB -> AM cell in this notebook writes.
# Confirm that overwriting those embeddings is intended.
np.save(save_path+f"Dict_item2vec_{target}asT.npy",dict_itemId2vec_t)
np.save(save_path+f"Dict_item2vec_{source}asS.npy", dict_itemId2vec_s) 

# Joint KMeans over both domains; this run uses 100 clusters.
dict_itemId2Cluster_s, dict_itemId2Cluster_t, dict_clusterId2vec = ItemClustering(dict_itemId2vec_s, dict_itemId2vec_t, 100)

# Item -> cluster assignments per domain, then cluster id -> mean vector.
np.save(save_path+f"Dict_item2cluster_{source}asS_wovlpu",dict_itemId2Cluster_s)
np.save(save_path+f"Dict_item2cluster_{target}asT_wovlpu",dict_itemId2Cluster_t)
np.save(save_path+f"Dict_cluster2vec_{source}to{target}_wovlpu",dict_clusterId2vec)

Some weights of BertForPreTraining were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Token indices sequence length is longer than the specified maximum sequence length for this model (624 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 6231/6231 [00:16<00:00, 386.09it/s]
100%|██████████| 6231/6231 [14:09<00:00,  7.34it/s]
