# Data Collection

In [None]:
from tqdm import tqdm

# Total number of users / items to keep per domain. The user totals include
# the users shared between both domains.
number_of_source_users = 2500
number_of_source_items = 5000

number_of_target_users = 2500
number_of_target_items = 5000
# Upper bound on how many users present in both domains are selected first.
target_overlap = 1000

# dataset_name prefixes every output folder/file; source_name and target_name
# pick the input data ("amazon-<name>/amazon-<name>.inter" and ".item").
dataset_name = "amazon-custom"
source_name = "books"
target_name = "movies-tv"

In [2]:
import os

# Output folders, one per domain; created up front so later cells can write into them.
source_folder_path = f"{dataset_name}-{source_name}"
target_folder_path = f"{dataset_name}-{target_name}"

for folder_path in (source_folder_path, target_folder_path):
    # exist_ok=True makes re-running the cell harmless.
    os.makedirs(folder_path, exist_ok=True)

In [3]:
# Raw lines of the source-domain interaction file (newline characters removed).
# NOTE(review): the first element is whatever the file's first line is; if the
# file starts with a header row, that row is part of this list.
source_file = []

with open(f"amazon-{source_name}/amazon-{source_name}.inter", "r", encoding="utf-8") as f:
    source_file = f.read().split("\n")

# split("\n") leaves a trailing "" only when the file ends with a newline.
# Drop exactly that element, so a file without a final newline keeps its last record.
if source_file and source_file[-1] == "":
    source_file.pop()

In [4]:
# Raw lines of the target-domain interaction file (newline characters removed).
# NOTE(review): the first element is whatever the file's first line is; if the
# file starts with a header row, that row is part of this list.
target_file = []

with open(f"amazon-{target_name}/amazon-{target_name}.inter", "r", encoding="utf-8") as f:
    target_file = f.read().split("\n")

# split("\n") leaves a trailing "" only when the file ends with a newline.
# Drop exactly that element, so a file without a final newline keeps its last record.
if target_file and target_file[-1] == "":
    target_file.pop()

## User and Item Selection

In [5]:
# Number of rows per user id (text before the first tab) in the source file.
source_user_count = {}

for line in tqdm(source_file):
    user = line[:line.index("\t")]
    # dict.get supplies 0 for a first occurrence, so no exception handling is
    # needed and unrelated errors are never swallowed.
    source_user_count[user] = source_user_count.get(user, 0) + 1

100%|██████████| 22507156/22507156 [00:25<00:00, 885556.79it/s]


In [6]:
# Number of rows per item id (text between the first and second tab) in the
# source file. Item ids whose first character is "B" are left out of the tally.
source_item_count = {}

for line in tqdm(source_file):
    item_start = line.index("\t") + 1
    item = line[item_start:line.index("\t", item_start)]

    if item[0] != "B":
        # dict.get supplies 0 for a first occurrence; no try/except required.
        source_item_count[item] = source_item_count.get(item, 0) + 1

100%|██████████| 22507156/22507156 [00:29<00:00, 769680.01it/s] 


In [7]:
# Number of rows per user id (text before the first tab) in the target file.
target_user_count = {}

for line in tqdm(target_file):
    user = line[:line.index("\t")]
    # dict.get supplies 0 for a first occurrence, so no exception handling is
    # needed and unrelated errors are never swallowed.
    target_user_count[user] = target_user_count.get(user, 0) + 1

100%|██████████| 4607048/4607048 [00:04<00:00, 925160.22it/s] 


In [8]:
# Number of rows per item id (text between the first and second tab) in the
# target file. Item ids whose first character is "B" are left out of the tally.
target_item_count = {}

for line in tqdm(target_file):
    item_start = line.index("\t") + 1
    item = line[item_start:line.index("\t", item_start)]

    if item[0] != "B":
        # dict.get supplies 0 for a first occurrence; no try/except required.
        target_item_count[item] = target_item_count.get(item, 0) + 1

100%|██████████| 4607048/4607048 [00:03<00:00, 1177576.69it/s]


In [9]:
import heapq

# Every user id of each domain, ordered from most to fewest interactions.
# Users with equal counts stay in the dict's iteration order (stable sort).
source_users = sorted(source_user_count, key=source_user_count.get, reverse=True)
target_users = sorted(target_user_count, key=target_user_count.get, reverse=True)

In [10]:
# Pick up to `target_overlap` users that appear in BOTH domains, walking the
# two activity rankings in lock-step so the most active users are taken first.

# Flag for every user id seen in either domain: True only when it occurs in both.
source_user_set = set(source_users)
overlap_users_dict = dict.fromkeys(source_users, False)

for user in tqdm(target_users):
    overlap_users_dict[user] = user in source_user_set

L = min(len(source_users), len(target_users))
overlap_users = []
selected_users = set()  # O(1) duplicate check instead of scanning the list

for i in tqdm(range(L)):
    if len(overlap_users) >= target_overlap:
        break

    # Rank i of the source list is considered before rank i of the target list.
    for candidate in (source_users[i], target_users[i]):
        # The size limit is re-checked before each append, so the second
        # candidate of an iteration can never push the list past target_overlap.
        if (
            len(overlap_users) < target_overlap
            and overlap_users_dict[candidate]
            and candidate not in selected_users
        ):
            selected_users.add(candidate)
            overlap_users.append(candidate)

idx = len(overlap_users)
print(idx)

100%|██████████| 8026325/8026325 [00:05<00:00, 1433687.72it/s]
100%|██████████| 2088621/2088621 [00:01<00:00, 1122785.05it/s]
  0%|          | 662/2088621 [00:00<01:12, 28731.68it/s]

1000





In [11]:
# Fill each domain up to its user budget with the most active users that are
# NOT already in overlap_users.
# Overlap users are filtered out rather than having their counts negated, so
# the count dicts stay untouched, the cell can be re-run safely, and an
# overlap user can never be selected a second time as a filler.
overlap_user_set = set(overlap_users)

remainder_source_users = heapq.nlargest(
    number_of_source_users - len(overlap_users),
    (user for user in source_user_count if user not in overlap_user_set),
    key=source_user_count.get,
)
remainder_target_users = heapq.nlargest(
    number_of_target_users - len(overlap_users),
    (user for user in target_user_count if user not in overlap_user_set),
    key=target_user_count.get,
)

# Final per-domain user selections: shared users first, then domain-only users.
source_users = overlap_users + remainder_source_users
target_users = overlap_users + remainder_target_users

In [12]:
# The most frequently interacted items of each domain, most popular first,
# truncated to the configured item budget.
source_items = sorted(source_item_count, key=source_item_count.get, reverse=True)[:number_of_source_items]
target_items = sorted(target_item_count, key=target_item_count.get, reverse=True)[:number_of_target_items]

## Data Extraction

In [13]:
# Number of raw lines loaded per domain; used as the upper bound when filtering rows.
source_inters, target_inters = len(source_file), len(target_file)

In [14]:
# Selection flag per source user id: every id found in the file starts as
# False, then the chosen users are switched to True.
source_user_dict = {line[:line.index("\t")]: False for line in tqdm(source_file)}
source_user_dict.update(dict.fromkeys(source_users, True))

100%|██████████| 22507156/22507156 [00:20<00:00, 1098269.00it/s]


In [15]:
# Selection flag per source item id: every id found in the file starts as
# False, then the chosen items are switched to True.
source_item_dict = {}

for line in tqdm(source_file):
    item_start = line.index("\t") + 1
    item_end = line.index("\t", item_start)
    source_item_dict[line[item_start:item_end]] = False

source_item_dict.update(dict.fromkeys(source_items, True))

100%|██████████| 22507156/22507156 [00:17<00:00, 1288117.49it/s]


In [16]:
# Keep only the source rows whose user AND item were both selected.
line_count = 0
file_data = []

for line in tqdm(source_file):
    # Never collect more rows than source_inters.
    if len(file_data) >= source_inters:
        break

    user_end = line.index("\t")
    item_end = line.index("\t", user_end + 1)
    user, item = line[:user_end], line[user_end + 1:item_end]

    if source_user_dict[user] and source_item_dict[item]:
        file_data.append(line)

idx = len(file_data)

100%|██████████| 22507156/22507156 [00:35<00:00, 637434.86it/s]


In [17]:
# Selection flag per target user id: every id found in the file starts as
# False, then the chosen users are switched to True.
target_user_dict = {}

# Scan every row, including the last one: the row filter looks up the user of
# each row in this dict, so a user seen only on the final row must have a key.
for line in tqdm(target_file):
    target_user_dict[line[:line.index("\t")]] = False

for user in target_users:
    target_user_dict[user] = True

100%|██████████| 4607047/4607047 [00:03<00:00, 1217788.14it/s]


In [18]:
# Selection flag per target item id: every id found in the file starts as
# False, then the chosen items are switched to True.
target_item_dict = {}

for line in tqdm(target_file):
    item_start = line.index("\t") + 1
    item_end = line.index("\t", item_start)
    target_item_dict[line[item_start:item_end]] = False

target_item_dict.update(dict.fromkeys(target_items, True))

100%|██████████| 4607048/4607048 [00:03<00:00, 1271059.02it/s]


In [19]:
# Keep only the target rows whose user AND item were both selected.
file_data_2 = []

for line in tqdm(target_file):
    # Never collect more rows than target_inters.
    if len(file_data_2) >= target_inters:
        break

    user_end = line.index("\t")
    item_end = line.index("\t", user_end + 1)
    user, item = line[:user_end], line[user_end + 1:item_end]

    if target_user_dict[user] and target_item_dict[item]:
        file_data_2.append(line)

idx = len(file_data_2)

100%|██████████| 4607048/4607048 [00:06<00:00, 662562.51it/s]


In [20]:
# Re-attach the line terminator to each kept row so the lists can be passed
# straight to writelines(). Running this cell twice appends a second newline.
file_data = [f"{row}\n" for row in file_data]
file_data_2 = [f"{row}\n" for row in file_data_2]

In [21]:
# Distinct user / item ids that actually survive the filtering, per domain.
# The ids are sorted so that list positions (and the index mappings derived
# from them later) are identical on every run; iterating a set of strings
# directly gives an order that changes with the interpreter's hash seed.
final_source_users = sorted({line[:line.index("\t")] for line in file_data})
final_source_items = sorted({line[line.index("\t")+1:line.index("\t", line.index("\t")+1)] for line in file_data})
final_target_users = sorted({line[:line.index("\t")] for line in file_data_2})
final_target_items = sorted({line[line.index("\t")+1:line.index("\t", line.index("\t")+1)] for line in file_data_2})

## Write Interaction Data

In [22]:
# Column header written as the first line of both interaction files.
inter_header = "user_id:token\titem_id:token\trating:float\ttimestamp:float\n"

# (output path, rows) for each domain; rows already end with a newline.
inter_outputs = (
    (f"{dataset_name}-{source_name}/{dataset_name}-{source_name}.inter", file_data),
    (f"{dataset_name}-{target_name}/{dataset_name}-{target_name}.inter", file_data_2),
)

for inter_path, inter_rows in inter_outputs:
    with open(inter_path, "w", encoding="utf-8") as f:
        f.write(inter_header)
        f.writelines(inter_rows)

In [None]:
# Not read in the visible cells; kept so any later reference still resolves.
count_overlap = 0

# Ids present in both final selections. Set intersection avoids scanning one
# list once per element of the other; sorting makes the order repeatable.
overlap_users = sorted(set(final_source_users) & set(final_target_users))
overlap_items = sorted(set(final_source_items) & set(final_target_items))

In [None]:
# Size summary of the extracted datasets: per-domain counts first, then a
# blank line, then the cross-domain overlap.
per_domain_stats = (
    ("source interactions:", file_data),
    ("# users in source:", final_source_users),
    ("# items in source:", final_source_items),
    ("target interactions:", file_data_2),
    ("# users in target:", final_target_users),
    ("# items in target:", final_target_items),
)
overlap_stats = (
    ("# overlap users:", overlap_users),
    ("# overlap items:", overlap_items),
)

for label, collection in per_domain_stats:
    print(label, len(collection))
print()
for label, collection in overlap_stats:
    print(label, len(collection))

source interactions: 104080
# users in source: 2329
# items in source: 4883
target interactions: 120985
# users in target: 2346
# items in target: 4929

# overlap users: 816
# overlap items: 0


# Item Info Collection

In [None]:
# Rebuild the per-item flags, this time marking only the items that survived
# the interaction filtering (final_*_items) as True.
source_item_dict = {}

for line in tqdm(source_file):
    item_start = line.index("\t") + 1
    item_end = line.index("\t", item_start)
    source_item_dict[line[item_start:item_end]] = False

source_item_dict.update(dict.fromkeys(final_source_items, True))

target_item_dict = {}

for line in tqdm(target_file):
    item_start = line.index("\t") + 1
    item_end = line.index("\t", item_start)
    target_item_dict[line[item_start:item_end]] = False

target_item_dict.update(dict.fromkeys(final_target_items, True))

100%|██████████| 22507156/22507156 [00:18<00:00, 1190714.35it/s]
100%|██████████| 4607048/4607048 [00:03<00:00, 1249095.41it/s]


In [None]:
# Raw lines of the source-domain item metadata file (newline characters removed).
source_item_info_file = []

with open(f"amazon-{source_name}/amazon-{source_name}.item", "r", encoding="utf-8") as f:
    source_item_info_file = f.read().split("\n")

# split("\n") leaves a trailing "" only when the file ends with a newline.
# Drop exactly that element, so a file without a final newline keeps its last record.
if source_item_info_file and source_item_info_file[-1] == "":
    source_item_info_file.pop()

In [None]:
# Raw lines of the target-domain item metadata file (newline characters removed).
target_item_info_file = []

with open(f"amazon-{target_name}/amazon-{target_name}.item", "r", encoding="utf-8") as f:
    target_item_info_file = f.read().split("\n")

# split("\n") leaves a trailing "" only when the file ends with a newline.
# Drop exactly that element, so a file without a final newline keeps its last record.
if target_item_info_file and target_item_info_file[-1] == "":
    target_item_info_file.pop()

In [None]:
# Metadata rows (newline-terminated) of the source items that were selected,
# in the order they appear in the metadata file.
file_info = []

for line in tqdm(source_item_info_file):
    # Skip lines without any tab, and stop collecting once the item budget is reached.
    if "\t" not in line or len(file_info) >= number_of_source_items:
        continue

    item = line[:line.index("\t")]

    # setdefault registers ids that never occurred in the interactions as
    # "not selected" and returns the flag, without a catch-all exception handler.
    if source_item_dict.setdefault(item, False):
        file_info.append(line+"\n")

i = len(file_info)

100%|██████████| 2370606/2370606 [00:02<00:00, 898469.35it/s]


In [None]:
# Metadata rows (newline-terminated) of the target items that were selected,
# in the order they appear in the metadata file.
file_info_2 = []

for line in tqdm(target_item_info_file):
    # Skip lines without any tab, and stop collecting once the item budget is
    # reached (same explicit cap as the source-domain cell). The cap is checked
    # up front, so reaching it never alters the flags in target_item_dict.
    if "\t" not in line or len(file_info_2) >= number_of_target_items:
        continue

    item = line[:line.index("\t")]

    # setdefault registers ids that never occurred in the interactions as
    # "not selected" and returns the flag, without a catch-all exception handler.
    if target_item_dict.setdefault(item, False):
        file_info_2.append(line+"\n")

i = len(file_info_2)

100%|██████████| 208329/208329 [00:00<00:00, 1035623.81it/s]


## Write Item Data

In [None]:
# The first line of each metadata file is reused as the header of the output.
# The stored lines carry no line terminator, so one is appended here;
# otherwise the first item row would be written onto the same line as the header.
source_item_header = source_item_info_file[0] + "\n"
target_item_header = target_item_info_file[0] + "\n"

with open(f"{dataset_name}-{source_name}/{dataset_name}-{source_name}.item", "w", encoding="utf-8") as f:
    f.write(source_item_header)
    f.writelines(file_info)

with open(f"{dataset_name}-{target_name}/{dataset_name}-{target_name}.item", "w", encoding="utf-8") as f:
    f.write(target_item_header)
    f.writelines(file_info_2)

# User ID Embedding

In [None]:
import torch
import torch.nn as nn

In [None]:
# Width of each user-id embedding vector, and the user vocabulary size per domain.
user_id_embed_size = 8
num_source_users, num_target_users = len(final_source_users), len(final_target_users)

In [None]:
# user id -> position of that id in the corresponding final_*_users list.
final_source_users_dict = dict(zip(final_source_users, range(len(final_source_users))))
final_target_users_dict = dict(zip(final_target_users, range(len(final_target_users))))

In [None]:
# Freshly initialised lookup table with one row per source user.
source_user_embedding = nn.Embedding(num_source_users, user_id_embed_size)
# One single-element row per user holding that user's integer index.
source_user_tensors = torch.LongTensor([[final_source_users_dict[user]] for user in final_source_users])
source_user_embedding_outputs = source_user_embedding(source_user_tensors)

# Convert to NumPy once and drop the singleton middle axis: one vector per user.
source_user_vectors = source_user_embedding_outputs.detach().numpy()[:, 0]

# user id -> "[v0, v1, ...]". Each scalar is formatted with str(), which yields
# plain numbers on every NumPy version; str(list(array)) would emit
# "np.float32(...)" tokens on NumPy >= 2.
source_user_embedding_dict = {
    user: "[" + ", ".join(map(str, source_user_vectors[i])) + "]"
    for i, user in enumerate(final_source_users)
}

In [None]:
# Freshly initialised lookup table with one row per target user.
target_user_embedding = nn.Embedding(num_target_users, user_id_embed_size)
# One single-element row per user holding that user's integer index.
target_user_tensors = torch.LongTensor([[final_target_users_dict[user]] for user in final_target_users])
target_user_embedding_outputs = target_user_embedding(target_user_tensors)

# Convert to NumPy once and drop the singleton middle axis: one vector per user.
target_user_vectors = target_user_embedding_outputs.detach().numpy()[:, 0]

# user id -> "[v0, v1, ...]". Each scalar is formatted with str(), which yields
# plain numbers on every NumPy version; str(list(array)) would emit
# "np.float32(...)" tokens on NumPy >= 2.
target_user_embedding_dict = {
    user: "[" + ", ".join(map(str, target_user_vectors[i])) + "]"
    for i, user in enumerate(final_target_users)
}

# Item ID Embedding

In [None]:
# Width of each item-id embedding vector, and the item vocabulary size per domain.
item_id_embed_size = 8
num_source_items, num_target_items = len(final_source_items), len(final_target_items)

In [None]:
# item id -> position of that id in the corresponding final_*_items list.
final_source_items_dict = dict(zip(final_source_items, range(len(final_source_items))))
final_target_items_dict = dict(zip(final_target_items, range(len(final_target_items))))

In [None]:
# Freshly initialised lookup table with one row per source item.
source_item_embedding = nn.Embedding(num_source_items, item_id_embed_size)
# One single-element row per item holding that item's integer index.
source_item_tensors = torch.LongTensor([[final_source_items_dict[item]] for item in final_source_items])
source_item_embedding_outputs = source_item_embedding(source_item_tensors)

# Convert to NumPy once and drop the singleton middle axis: one vector per item.
source_item_vectors = source_item_embedding_outputs.detach().numpy()[:, 0]

# item id -> "[v0, v1, ...]". Each scalar is formatted with str(), which yields
# plain numbers on every NumPy version; str(list(array)) would emit
# "np.float32(...)" tokens on NumPy >= 2.
source_item_embedding_dict = {
    item: "[" + ", ".join(map(str, source_item_vectors[i])) + "]"
    for i, item in enumerate(final_source_items)
}

In [None]:
# Freshly initialised lookup table with one row per target item.
target_item_embedding = nn.Embedding(num_target_items, item_id_embed_size)
# One single-element row per item holding that item's integer index.
target_item_tensors = torch.LongTensor([[final_target_items_dict[item]] for item in final_target_items])
target_item_embedding_outputs = target_item_embedding(target_item_tensors)

# Convert to NumPy once and drop the singleton middle axis: one vector per item.
target_item_vectors = target_item_embedding_outputs.detach().numpy()[:, 0]

# item id -> "[v0, v1, ...]". Each scalar is formatted with str(), which yields
# plain numbers on every NumPy version; str(list(array)) would emit
# "np.float32(...)" tokens on NumPy >= 2.
target_item_embedding_dict = {
    item: "[" + ", ".join(map(str, target_item_vectors[i])) + "]"
    for i, item in enumerate(final_target_items)
}

# Item Feature Embedding

In [None]:
# Number of PCA components kept for each item's text-feature embedding.
item_feature_embed_size = 96

In [None]:
import pandas as pd

# One DataFrame per domain built from the selected metadata rows.
# Column names are the header fields with their ":<type>" suffix removed.
# The line terminator is stripped from every row (and from the header, if it
# carries one) BEFORE splitting on tabs; otherwise the last column's values
# would all end in "\n" and empty values in that column could not be detected.
source_item_info_df = pd.DataFrame(
    [line.rstrip("\n").split("\t") for line in file_info],
    index=list(range(len(file_info))),
    columns=[header[:header.index(":")] for header in source_item_header.rstrip("\n").split("\t")],
)
target_item_info_df = pd.DataFrame(
    [line.rstrip("\n").split("\t") for line in file_info_2],
    index=list(range(len(file_info_2))),
    columns=[header[:header.index(":")] for header in target_item_header.rstrip("\n").split("\t")],
)

In [None]:
# Per column of the source item table: how many cells are the empty string.
for column in source_item_info_df.columns:
    empty_cells = (source_item_info_df[column].values == '').sum()
    print(column, '\t', empty_cells)

item_id 	 0
sales_type 	 22
sales_rank 	 22
categories 	 0
title 	 3
price 	 3
brand 	 0


In [None]:
# Per column of the target item table: how many cells are the empty string.
for column in target_item_info_df.columns:
    empty_cells = (target_item_info_df[column].values == '').sum()
    print(column, '\t', empty_cells)

item_id 	 0
categories 	 0
title 	 7
price 	 141
sales_type 	 15
sales_rank 	 15
brand 	 0


In [None]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np

In [None]:
# Tokenizer and encoder are loaded from the same pretrained checkpoint.
bert_checkpoint = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Encode every source item title with BERT and keep the vector at sequence
# position 0 of the last hidden state as the title embedding.
# The loop runs over the rows of source_item_info_df itself, so the number of
# embeddings always equals the number of table rows they are later attached to.
source_titles = source_item_info_df['title'].tolist()
raw_source_embeds = [None for _ in range(len(source_titles))]

# Inference only: no_grad avoids building an autograd graph for each forward pass.
with torch.no_grad():
    for i, text in enumerate(tqdm(source_titles)):
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        outputs = model(**inputs)
        last_hidden_states = outputs.last_hidden_state
        sentence_embedding = last_hidden_states[:, 0, :]

        # A single title is encoded per call, so row 0 is that title's vector.
        raw_source_embeds[i] = sentence_embedding.numpy()[0]

raw_source_embeds = np.array(raw_source_embeds)

100%|██████████| 4883/4883 [05:53<00:00, 13.82it/s]


In [None]:
# Encode every target item title with BERT and keep the vector at sequence
# position 0 of the last hidden state as the title embedding.
# The loop runs over the rows of target_item_info_df itself, so the number of
# embeddings always equals the number of table rows they are later attached to.
target_titles = target_item_info_df['title'].tolist()
raw_target_embeds = [None for _ in range(len(target_titles))]

# Inference only: no_grad avoids building an autograd graph for each forward pass.
with torch.no_grad():
    for i, text in enumerate(tqdm(target_titles)):
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        outputs = model(**inputs)
        last_hidden_states = outputs.last_hidden_state
        sentence_embedding = last_hidden_states[:, 0, :]

        # A single title is encoded per call, so row 0 is that title's vector.
        raw_target_embeds[i] = sentence_embedding.numpy()[0]

raw_target_embeds = np.array(raw_target_embeds)

100%|██████████| 4929/4929 [05:31<00:00, 14.88it/s]


In [None]:
from sklearn.decomposition import PCA

# Reduce the raw title embeddings of each domain to item_feature_embed_size
# dimensions. Each domain gets its own independently fitted PCA.
# NOTE(review): no random_state is passed; if the solver chosen by the library
# is randomised, results may differ between runs - confirm.
source_pca = PCA(n_components=item_feature_embed_size)
source_embeds = source_pca.fit_transform(raw_source_embeds)

target_pca = PCA(n_components=item_feature_embed_size)
target_embeds = target_pca.fit_transform(raw_target_embeds)

In [None]:
# Preview the first ten rows of the PCA-reduced target embeddings.
target_embeds[:10]

array([[ 1.30776033e-01, -1.23344612e+00,  1.31465328e+00,
        -1.31461346e+00,  6.77446485e-01, -2.65696645e-01,
         2.73569196e-01,  5.26296556e-01,  5.19613981e-01,
        -1.65825672e-02,  7.97276437e-01, -9.14452076e-01,
        -4.12098831e-03, -5.46520688e-02,  4.24890310e-01,
         5.69931090e-01, -9.08495486e-02,  9.77629662e-01,
         5.19261718e-01,  3.50735843e-01,  1.60124972e-01,
         1.25519171e-01,  3.25486183e-01, -1.80529058e-01,
        -5.40410399e-01,  1.39987931e-01,  4.96246517e-01,
        -1.35664284e-01, -5.46946049e-01,  1.02850057e-01,
        -2.72000641e-01,  2.27551579e-01,  6.00183941e-02,
         1.20579243e-01, -2.29417413e-01, -3.31028283e-01,
         1.85851052e-01, -1.29347965e-01,  9.39319193e-01,
         7.32162893e-02, -2.77887434e-01,  3.09895366e-01,
         2.95598537e-01,  3.59506756e-01, -1.02152152e-03,
         1.14432752e-01, -5.49303293e-02,  4.01810735e-01,
         2.51524687e-01, -3.61168623e-01,  5.70712320e-0

In [None]:
# Serialise every reduced embedding as the text "[v0, v1, ...]".
# Each scalar is formatted with str(), which yields plain numbers on every
# NumPy version; str(list(array)) relies on the scalars' repr, which is
# "np.float32(...)" on NumPy >= 2 and would end up in the written files.
source_embed_strings = ["[" + ", ".join(map(str, embed)) + "]" for embed in source_embeds]
target_embed_strings = ["[" + ", ".join(map(str, embed)) + "]" for embed in target_embeds]

In [None]:
# Attach the serialised embeddings as a new "embedding" column, row by row in
# table order. The list length must equal the number of rows in the table.
source_item_info_df["embedding"] = source_embed_strings
target_item_info_df["embedding"] = target_embed_strings

In [None]:
# Inspect the serialised embedding stored at row position 10 of the target item table.
target_item_info_df["embedding"].iloc[10]

'[-0.79790837, 1.1822076, -2.046185, 0.19508852, -0.46135, -1.1716177, -0.11049411, 1.1119266, -0.78546935, 0.25084594, 0.9832248, -0.15839067, -1.070212, -0.40561548, 0.073240094, 1.3139889, -0.58082503, -0.6962951, -1.0285282, -0.09831694, -0.30196044, -0.16789795, -0.44542292, -0.26558676, 0.35197935, -0.14041618, 0.8943763, 0.7828354, 1.5009838, 0.15514252, -0.031917065, 0.7771328, -0.16944723, -0.06827194, 0.15999149, -0.15538998, 0.19089337, 0.05703289, -0.24208161, -0.29998136, 0.2883188, 0.73559856, 0.21446285, -0.22850575, -0.67184657, -0.5077956, 0.13192934, 0.19091752, 0.21380551, -0.5031922, -0.32895693, -0.6337297, -0.24552242, -0.24258648, 0.34703502, -0.19248514, -0.40397152, 0.13388415, -0.889413, 0.36752573, 0.43154228, 0.19985655, -0.07477614, -0.38727483, -0.11246091, 0.0701897, -0.033646878, 0.03562703, -0.06825423, -0.22877163, 0.114732936, 0.59362787, 0.11627021, -0.65211415, 0.6727902, 0.03391691, -0.28423858, 0.6802506, -0.060859006, -0.4035406, 0.075534135, -0.

In [None]:
# item id -> position of that id in the corresponding final_*_items list.
source_item_positions = range(len(final_source_items))
target_item_positions = range(len(final_target_items))

final_source_items_dict = dict(zip(final_source_items, source_item_positions))
final_target_items_dict = dict(zip(final_target_items, target_item_positions))

# Transform Interaction Data

In [None]:
# source_inter_csv = [None for _ in range(len(file_data))]

# for i, line in tqdm(enumerate(file_data)):
#     user, item, rating, timestamp = line[:-1].split("\t")

#     source_inter_csv[i] = [str(final_source_users_dict[user]), str(final_source_items_dict[item]), str(int(float(rating))), f'\"{source_user_embedding_dict[user]}\"', f'\"{source_item_embedding_dict[item] + source_item_info_df[source_item_info_df["item_id"] == item]["embedding"].iloc[0]}\"']

In [None]:
# target_inter_csv = [None for _ in range(len(file_data_2))]

# for i, line in tqdm(enumerate(file_data_2)):
#     user, item, rating, timestamp = line[:-1].split("\t")

#     target_inter_csv[i] = [str(final_target_users_dict[user]), str(final_target_items_dict[item]), str(int(float(rating))), f'\"{target_user_embedding_dict[user]}\"', f'\"{target_item_embedding_dict[item] + target_item_info_df[target_item_info_df["item_id"] == item]["embedding"].iloc[0]}\"']

In [None]:
# DDTCDR_header = "userId,itemId,rating,user_embedding,item_embedding\n"

# with open("experiment/book.csv", "w", encoding="utf-8") as f:
#     f.write(DDTCDR_header)
    
#     for i in range(len(source_inter_csv)):
#         f.write(",".join(source_inter_csv[i])+"\n")

# with open("experiment/movie.csv", "w", encoding="utf-8") as f:
#     f.write(DDTCDR_header)
    
#     for i in range(len(target_inter_csv)):
#         f.write(",".join(target_inter_csv[i])+"\n")

# Item-Only Embedding

In [None]:
# Build [item_id, embedding_string] pairs for every final item of each domain.
# A dict lookup (item_id -> embedding) is prepared once per domain, instead of
# filtering the whole DataFrame with a boolean mask for every single item.
# setdefault keeps the FIRST row when an item_id occurs more than once.
# An item id missing from the table raises KeyError.
source_embedding_by_item = {}
for item_id, embedding in zip(source_item_info_df["item_id"], source_item_info_df["embedding"]):
    source_embedding_by_item.setdefault(item_id, embedding)

target_embedding_by_item = {}
for item_id, embedding in zip(target_item_info_df["item_id"], target_item_info_df["embedding"]):
    target_embedding_by_item.setdefault(item_id, embedding)

source_item_feature_vectors = [[item, source_embedding_by_item[item]] for item in final_source_items]
target_item_feature_vectors = [[item, target_embedding_by_item[item]] for item in final_target_items]


In [None]:
# Header line shared by both embedding files.
vector_header = "item_id\tembedding\n"

# (output path, [item_id, embedding] pairs) for each domain.
embed_outputs = (
    (f"{dataset_name}-{source_name}/{dataset_name}-{source_name}.embed", source_item_feature_vectors),
    (f"{dataset_name}-{target_name}/{dataset_name}-{target_name}.embed", target_item_feature_vectors),
)

for embed_path, feature_rows in embed_outputs:
    with open(embed_path, "w", encoding="utf-8") as f:
        f.write(vector_header)
        # One tab-separated line per item.
        f.writelines("\t".join(row) + "\n" for row in feature_rows)