In [1]:
import pickle
import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict
%matplotlib inline
import pandas as pd

# https://github.com/technoapurva/Steam-Bundle-Recommendation/tree/master

# Backup
# https://www.kaggle.com/datasets/nikdavis/steam-store-games
# https://cseweb.ucsd.edu/~jmcauley/datasets.html#steam_data

In [2]:
# !wget https://drive.google.com/drive/folders/1c95okO8Xzrzl4I7S8rWkHZA4FTLY4J45?usp=sharing
    
items_set=pickle.load(open('data/processed_data/item_set','rb'))
bundle_item_map=pickle.load(open('data/processed_data/bundle_item_map','rb'))
user_bundle_map=pickle.load(open('data/processed_data/user_bundle_map','rb'))
user_item_map=pickle.load(open('data/processed_data/user_item_map','rb'))
item_data=pickle.load(open('data/processed_data/all_items','rb'))
item_id_lookup = pickle.load(open('data/processed_data/item_id_lookup','rb'))

In [3]:
# Inicializa um dicionário vazio para mapear 'appid' para os dados do item
item_data_map = dict()

# Inicializa um conjunto vazio para coletar tags únicas associadas aos itens
tags_set = set()

# Itera sobre cada item em item_data
for item in item_data:
    # Converte 'appid' para inteiro e mapeia para o item em item_data_map
    item_data_map[int(item['appid'])] = item

    # Itera sobre as tags no item atual e as adiciona a tags_set
    for tag in item['tags']:
        tags_set.add(tag)

# Inicializa um dicionário vazio para mapear tags únicas para índices inteiros
tags_map = dict()

# Atribui índices a cada tag única em tags_set
for i, tag in enumerate(tags_set):
    tags_map[tag] = i

# Define uma função para converter uma lista de tags em uma matriz de características binárias
def get_feat(tags):
    # Inicializa uma matriz de características binárias com zeros
    feat = np.zeros(len(tags_map))

    # Define o índice correspondente como 1 para cada tag na lista de entrada
    for tag in tags:
        feat[tags_map[tag]] = 1

    return feat

# Inicializa uma lista vazia para armazenar dados combinados de usuários e bundles
all_data = []

# Itera sobre os itens no mapeamento de usuário para bundles
for user, bundles in user_bundle_map.items():
    for bundle in bundles:
        # Adiciona uma tupla contendo o usuário e o bundle à lista all_data
        all_data.append((user, bundle))

# Inicializa uma lista vazia para armazenar dados combinados de usuários e itens
all_item_data = []

# Itera sobre os itens no mapeamento de usuário para itens
for user, items in user_item_map.items():
    for item in items:
        # Adiciona uma tupla contendo o usuário e o item à lista all_item_data
        all_item_data.append((user, item))

import random

# Embaralha aleatoriamente a lista de dados combinados de usuários e bundles
random.shuffle(all_data)

# Obtém o tamanho total dos dados após o embaralhamento
data_size = len(all_data)

# Divide os dados embaralhados em conjuntos de treinamento e teste para o modelo BPR para bundles
training_data = all_data[:int(0.8 * data_size)]
test_data = all_data[int(0.8 * data_size):]

# Divide os dados de itens em conjuntos de treinamento e teste para o modelo BPR para itens
training_data_2 = all_item_data[:int(0.8 * len(all_item_data))]
test_data_2 = all_item_data[int(0.8 * len(all_item_data)):]


In [4]:
def check_tuple(tuple_1, tuple_2, user_bundle_map):
    # Verifica se os itens das tuplas não estão presentes nos bundles dos usuários correspondentes
    return tuple_1[1] not in user_bundle_map[tuple_2[0]] and tuple_2[1] not in user_bundle_map[tuple_1[0]]

def graph_sampling(n_samples, training_data, user_bundle_map):
    # Listas para armazenar usuários, itens positivos e itens negativos para amostragem
    sgd_users = []
    sgd_pos_items, sgd_neg_items = [], []
    
    i = 0
    while n_samples > 0:
        if i % 100000 == 0:
            print(i)
        i += 1
        
        # Seleciona duas tuplas aleatórias do conjunto de treinamento
        tuple_1 = training_data[np.random.randint(len(training_data))]
        tuple_2 = training_data[np.random.randint(len(training_data))]
        
        # Limita o número de iterações para evitar loops infinitos
        iteration = 100
        
        # Enquanto as tuplas não atenderem à condição de verificação, escolhe novas tuplas
        while not check_tuple(tuple_1, tuple_2, user_bundle_map):
            tuple_2 = training_data[np.random.randint(len(training_data))]
            iteration -= 1
            
            # Se atingir o limite de iterações, interrompe o loop
            if iteration == 0:
                break
        
        # Se atingir o limite de iterações, continua para a próxima iteração
        if iteration == 0:
            continue   
        
        # Adiciona itens positivos e negativos, assim como usuários correspondentes
        sgd_neg_items.append(tuple_2[1])
        sgd_pos_items.append(tuple_1[1])
        sgd_users.append(tuple_1[0])
        
        sgd_neg_items.append(tuple_1[1])
        sgd_pos_items.append(tuple_2[1])
        sgd_users.append(tuple_2[0])
        
        # Decrementa o número de amostras restantes
        n_samples -= 2
    
    # Retorna as listas resultantes após a amostragem
    return sgd_users, sgd_pos_items, sgd_neg_items

In [5]:
sgd_train_users_items, sgd_train_pos_items, sgd_train_neg_items = graph_sampling(len(training_data_2)*30, training_data_2, user_item_map)


0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000
2400000
2500000
2600000
2700000
2800000
2900000
3000000
3100000
3200000
3300000
3400000
3500000
3600000
3700000
3800000
3900000
4000000
4100000
4200000
4300000
4400000
4500000
4600000
4700000
4800000
4900000
5000000
5100000
5200000
5300000
5400000
5500000
5600000
5700000
5800000
5900000
6000000
6100000
6200000
6300000
6400000
6500000
6600000
6700000
6800000
6900000
7000000
7100000
7200000
7300000
7400000
7500000
7600000
7700000
7800000
7900000
8000000
8100000
8200000
8300000
8400000
8500000
8600000
8700000
8800000
8900000
9000000
9100000
9200000
9300000
9400000
9500000
9600000
9700000
9800000
9900000
10000000
10100000
10200000
10300000
10400000
10500000
10600000
10700000
10800000


In [6]:
from collections import defaultdict

def get_test_data_items(test_data, train_data):
    # Listas para armazenar usuários, itens positivos e itens negativos para testes
    users = []
    pos_items = []
    neg_items = []

    # Converte os dados de treinamento e teste em dicionários
    train_dict, train_users, train_items = data_to_dict(train_data)
    test_dict, test_users, test_items = data_to_dict(test_data)

    z = 0
    # Itera sobre os usuários no conjunto de teste
    for i, user in enumerate(test_dict.keys()):
        if (i % 1000 == 0):
            print(i)

        # Verifica se o usuário também está no conjunto de treinamento
        if user in train_users:
            # Itera sobre os itens positivos para o usuário no conjunto de teste
            for pos_item in test_dict[user]:
                # Verifica se o item positivo está no conjunto de treinamento
                if pos_item in train_items:
                    # Itera sobre os itens no conjunto de treinamento para gerar itens negativos
                    for neg_item in train_items:
                        # Verifica se o item negativo não está no conjunto de teste e no conjunto de treinamento
                        if neg_item not in test_dict[user] and neg_item not in train_dict[user]:
                            # Adiciona usuários, itens positivos e itens negativos às listas
                            users.append(user)
                            pos_items.append(pos_item)
                            neg_items.append(neg_item)

    # Retorna as listas resultantes para testes
    return users, pos_items, neg_items

def data_to_dict(data):
    # Converte os dados em um dicionário, onde a chave é o usuário e o valor é uma lista de itens associados a esse usuário
    data_dict = defaultdict(list)
    items = set()
    for (user, item) in data:
        data_dict[user].append(item)
        items.add(item)
    return data_dict, set(data_dict.keys()), items


In [7]:
test_users_cold, test_pos_items_cold, test_neg_items_cold = get_test_data_items(test_data_2, training_data_2)


0
1000
2000
3000
4000
5000
6000
7000
8000


# Novo SVD para Steam

In [33]:
import numpy as np
import pandas as pd

def svdpp(train, n_factors, lr=0.05, reg=0.02, miter=10):
    global_mean = train['playtime'].mean()
    n_users = train['userId'].max() + 1
    n_items = train['gameId'].max() + 1
    bu = np.zeros(n_users)
    bi = np.zeros(n_items)
    p = np.random.normal(0.1, 0.1, (n_users, n_factors))
    q = np.random.normal(0.1, 0.1, (n_items, n_factors))
    Y = np.random.normal(0.1, 0.1, (n_users, n_factors))
    error = []

    for t in range(miter):
        sq_error = 0
        for index, row in train.iterrows():
            u = row['userId']
            i = row['gameId']
            r_ui = row['playtime']
            Nu = train[train['userId'] == u]['gameId'].values
            Y_sum = np.zeros(n_factors)
            for j in Nu:
                Y_sum = Y_sum + Y[j]
            P_plus_y = p[u] + (1 / np.sqrt(len(Nu))) * Y_sum
            pred = global_mean + bu[u] + bi[i] + q[i].T @ P_plus_y
            e_ui = r_ui - pred
            sq_error = sq_error + pow(e_ui, 2)
            bu[u] = bu[u] + lr * e_ui
            bi[i] = bi[i] + lr * e_ui
            for f in range(n_factors):
                p[u][f] = p[u][f] + lr * (e_ui * q[i][f] - reg * p[u][f])
                temp_uf = q[i][f]
                q[i][f] = q[i][f] + lr * (e_ui * P_plus_y[f] - reg * q[i, f])
                for j in Nu:
                    Y[j][f] = Y[j][f] + lr * (e_ui * (1 / np.sqrt(len(Nu)))) * temp_uf - reg * Y[j, f]
        error.append(np.sqrt(sq_error / len(train)))

    def recommend_games(user_id, n_recommendations=10):
        played_games = train[train['userId'] == user_id]['gameId'].values
        predicted_playtimes = global_mean + bu[user_id] + bi + q.T @ (p[user_id] + (1 / np.sqrt(len(played_games))) * Y[played_games].sum(axis=0))
        ranked_games = np.argsort(predicted_playtimes)[::-1]
        recommended_games = [game_id for game_id in ranked_games if game_id not in played_games][:n_recommendations]
        return recommended_games

    return recommend_games, error


# Observed and unobserved data for each user
observed = dict()
unobserved = dict()

for u in user_bundle_map:
    observed[u] = get_item_ids(user_bundle_map[u], u)
    unobserved[u] = list(items_set - set(observed[u]))

# Training the model
item_bias, p, q, error = svdpp(user_bundle_map, items_set, n_factors=10, lr=0.05, reg=0.02, miter=30)

# Making predictions using the trained model
predictions = recommend_games(observed=observed, all_users=list(user_bundle_map.keys()), p=p, q=q, item_bias=item_bias, N=10)


Iter # 0
Iter # 1
Iter # 2
Iter # 3
Iter # 4
Iter # 5
Iter # 6
Iter # 7
Iter # 8
Iter # 9
Iter # 10
Iter # 11
Iter # 12
Iter # 13
Iter # 14
Iter # 15
Iter # 16
Iter # 17
Iter # 18
Iter # 19
Iter # 20
Iter # 21
Iter # 22
Iter # 23
Iter # 24
Iter # 25
Iter # 26
Iter # 27
Iter # 28
Iter # 29


In [None]:
predictions

In [32]:
# import numpy as np
# import pandas as pd

# def svdpp(train, n_factors, lr=0.05, reg=0.02, miter=10):
#     global_mean = train['playtime'].mean()
#     n_users = train['userId'].max() + 1
#     n_items = train['gameId'].max() + 1
#     bu = np.zeros(n_users)
#     bi = np.zeros(n_items)
#     p = np.random.normal(0.1, 0.1, (n_users, n_factors))
#     q = np.random.normal(0.1, 0.1, (n_items, n_factors))
#     Y = np.random.normal(0.1, 0.1, (n_users, n_factors))
#     error = []

#     for t in range(miter):
#         sq_error = 0
#         for index, row in train.iterrows():
#             u = row['userId']
#             i = row['gameId']
#             r_ui = row['playtime']
#             Nu = train[train['userId'] == u]['gameId'].values
#             Y_sum = np.zeros(n_factors)
#             for j in Nu:
#                 Y_sum = Y_sum + Y[j]
#             P_plus_y = p[u] + (1 / np.sqrt(len(Nu))) * Y_sum
#             pred = global_mean + bu[u] + bi[i] + q[i].T @ P_plus_y
#             e_ui = r_ui - pred
#             sq_error = sq_error + pow(e_ui, 2)
#             bu[u] = bu[u] + lr * e_ui
#             bi[i] = bi[i] + lr * e_ui
#             for f in range(n_factors):
#                 p[u][f] = p[u][f] + lr * (e_ui * q[i][f] - reg * p[u][f])
#                 temp_uf = q[i][f]
#                 q[i][f] = q[i][f] + lr * (e_ui * P_plus_y[f] - reg * q[i, f])
#                 for j in Nu:
#                     Y[j][f] = Y[j][f] + lr * (e_ui * (1 / np.sqrt(len(Nu)))) * temp_uf - reg * Y[j, f]
#         error.append(np.sqrt(sq_error / len(train)))

#     def recommend_games(user_id, n_recommendations=10):
#         played_games = train[train['userId'] == user_id]['gameId'].values
#         predicted_playtimes = global_mean + bu[user_id] + bi + q.T @ (p[user_id] + (1 / np.sqrt(len(played_games))) * Y[played_games].sum(axis=0))
#         ranked_games = np.argsort(predicted_playtimes)[::-1]
#         recommended_games = [game_id for game_id in ranked_games if game_id not in played_games][:n_recommendations]
#         return recommended_games

#     return recommend_games, error


# # Observed and unobserved data for each user
# observed = dict()
# unobserved = dict()

# for u in user_bundle_map:
#     observed[u] = get_item_ids(user_bundle_map[u], u)
#     unobserved[u] = list(items_set - set(observed[u]))

# # Training the BPRMF model
# item_bias, p, q, error = svdpp(user_bundle_map, items_set, n_factors=10, lr=0.05, reg=0.02, miter=30)

# # Making predictions using the trained model
# predictions = recommend_games(observed=observed, all_users=list(user_bundle_map.keys()), p=p, q=q, item_bias=item_bias, N=10)


# SVD Original

In [10]:
# def svdpp(train, n_factors, lr=0.05, reg=0.02, miter=10):
#     global_mean = train['rating'].mean()
#     n_users = df['userId'].max()+1
#     n_items = df['movieId'].max()+1
#     bu = np.zeros(n_users)
#     bi = np.zeros(n_items)
#     p = np.random.normal(0.1, 0.1, (n_users, n_factors))
#     q = np.random.normal(0.1, 0.1, (n_items, n_factors))
#     Y = np.random.normal(0.1, 0.1, (n_users, n_factors))
#     error = []
#     for t in range(miter):
#         sq_error = 0
#         for index, row in train.iterrows():
#             u = row['userId']
#             i = row['movieId']
#             r_ui = row['rating']
#             Nu = train[train['userId'] == u]['movieId'].values
#             Y_sum = np.zeros(n_factors)
#             for j in Nu:
#                 Y_sum = Y_sum + Y[j]
#             P_plus_y = p[u] + (1 /np.sqrt(len(Nu))) * Y_sum
#             pred = global_mean + bu[u] + bi[i] + q[i].T @ P_plus_y
#             e_ui = r_ui - pred
#             sq_error = sq_error + pow(e_ui, 2)
#             bu[u] = bu[u] + lr * e_ui
#             bi[i] = bi[i] + lr * e_ui
#             for f in range(n_factors):
#                 p[u][f] = p[u][f] + lr * (e_ui * q[i][f] - reg * p[u][f])
#                 temp_uf = q[i][f]
#                 q[i][f] = q[i][f] + lr * (e_ui * P_plus_y[f] - reg * q[i, f])
#                 for j in Nu:
#                     Y[j][f] = Y[j][f] + lr * (e_ui *(1/np.sqrt(len(Nu)))) * temp_uf - reg * Y[j,f]
#         error.append(np.sqrt(sq_error/len(train)))

#     return global_mean, bu, bi, P_plus_y, p, q, error



# BPRMF

In [29]:
import random
import numpy as np
import pickle
from collections import defaultdict

# Function to get items associated with a user in the training set
def get_item_ids(bundle_items, user):
    return list(bundle_items)

# Function to draw a pair of items (i, j) for a specific user
def draw(userId, observed):
    i = random.choice(observed)
    j = random.choice(list(items_set - set(observed)))
    return i, j

# BPRMF - Training the BPRMF model
def train_bprmf(user_bundle_map, item_set, n_factors, lr=0.05, reg=0.02, miter=30):
    n_users = len(user_bundle_map)
    n_items = len(item_set)
    item_bias = np.zeros(n_items)
    p = np.random.normal(0, 0.1, (n_users, n_factors))
    q = np.random.normal(0, 0.1, (n_items, n_factors))

    error = []
    for t in range(miter):
        print('Iter #', t)
        sq_error = 0
        random_users = random.sample(list(user_bundle_map.keys()), k=len(user_bundle_map))
        for u in random_users:
            observed[u] = get_item_ids(user_bundle_map[u], u)
            i, j = draw(u, observed[u])

            x_uij = item_bias[i] - item_bias[j] + (np.dot(p[u], q[i]) - np.dot(p[u], q[j]))
            sq_error += x_uij

            eps = 1 / (1 + np.exp(x_uij))

            item_bias[i] += lr * (eps - reg * item_bias[i])
            item_bias[j] += lr * (-eps - reg * item_bias[j])

            # Adjust the factors
            u_f = p[u]
            i_f = q[i]
            j_f = q[j]

            # Compute and apply factor updates
            p[u] += lr * ((i_f - j_f) * eps - reg * u_f)
            q[i] += lr * (u_f * eps - reg * i_f)
            q[j] += lr * (-u_f * eps - reg * j_f)

        error.append(sq_error / len(random_users))

    return item_bias, p, q, error

# Function to make predictions using the trained model
def predict(observed, all_users, p, q, item_bias, N=10):
    w = item_bias.T + np.dot(p, q.T)
    ranking = []

    for u, user in enumerate(all_users):
        partial_ranking = list()
        candidate_items = sorted(range(len(w[u])), key=lambda k: w[u][k], reverse=True)

        for i in candidate_items:
            if i not in observed[user]:
                partial_ranking.append((user, i, w[u][i]))

            if len(partial_ranking) == N:
                break

        ranking += partial_ranking

    return ranking

# Observed and unobserved data for each user
observed = dict()
unobserved = dict()

for u in user_bundle_map:
    observed[u] = get_item_ids(user_bundle_map[u], u)
    unobserved[u] = list(items_set - set(observed[u]))

# Training the BPRMF model
item_bias, p, q, error = train_bprmf(user_bundle_map, items_set, n_factors=10, lr=0.05, reg=0.02, miter=30)

# Making predictions using the trained model
predictions = predict(observed=observed, all_users=list(user_bundle_map.keys()), p=p, q=q, item_bias=item_bias, N=10)


Iter # 0
Iter # 1
Iter # 2
Iter # 3
Iter # 4
Iter # 5
Iter # 6
Iter # 7
Iter # 8
Iter # 9
Iter # 10
Iter # 11
Iter # 12
Iter # 13
Iter # 14
Iter # 15
Iter # 16
Iter # 17
Iter # 18
Iter # 19
Iter # 20
Iter # 21
Iter # 22
Iter # 23
Iter # 24
Iter # 25
Iter # 26
Iter # 27
Iter # 28
Iter # 29


In [None]:
# # Passando os dados para um arquivo
# train.to_csv('train.dat', index=False, header=False, sep='\t')
# test.to_csv('test.dat', index=False, header=False, sep='\t')


# observed = dict()
# unobserved = dict()
# all_users = df['userId'].unique().tolist() # usar conj. total
# all_items = df['itemId'].unique().tolist() # usar conj. total

# for u in all_users:
#     observed[u] = get_item_ids(train, u) # usar conj. de treinamento
#     unobserved[u] = list(set(all_items)-set(observed[u]))

# def draw(userId):
#     i = random.choice(observed[userId])
#     j = random.choice(unobserved[userId])
#     return i, j

# # BPRMF
# def train_bprmf(train, n_factors, lr=0.05, reg=0.02, miter=30):
#     n_users = df['userId'].max()+1
#     n_items = df['itemId'].max()+1
#     item_bias = np.zeros(n_items)
#     p = np.random.normal(0, 0.1, (n_users, n_factors))
#     q = np.random.normal(0, 0.1, (n_items, n_factors))

#     error = []
#     for t in range(miter):
#         print('Iter #', t)
#         sq_error = 0
#         random_users = random.choices(train['userId'].unique(), k=len(train))
#         for u in random_users:
#             i, j = draw(u)
#             x_uij = item_bias[i] - item_bias[j] + (np.dot(p[u], q[i]) - np.dot(p[u], q[j]))
#             sq_error += x_uij

#             eps = 1 / (1 + np.exp(x_uij))

#             item_bias[i] += lr * (eps - reg * item_bias[i])
#             item_bias[j] += lr * (-eps - reg * item_bias[j])

#             # Adjust the factors
#             u_f = p[u]
#             i_f = q[i]
#             j_f = q[j]

#             # Compute and apply factor updates
#             p[u] += lr * ((i_f - j_f) * eps - reg * u_f)
#             q[i] += lr * (u_f * eps - reg * i_f)
#             q[j] += lr * (-u_f * eps - reg * j_f)

#         error.append(sq_error/len(random_users))

#     return item_bias, p, q, error

# def predict(N=10):
#     w = b.T + np.dot(p, q.T)
#     ranking = []

#     for u, user in enumerate(all_users):
#         partial_ranking = list()
#         candidate_items = sorted(range(len(w[u])), key=lambda k: w[u][k], reverse=True)

#         for i in candidate_items:
#             if i not in observed[user]:
#                 partial_ranking.append((user, i, w[u][i]))

#             if len(partial_ranking) == N:
#                 break

#         ranking += partial_ranking

#     return pd.DataFrame(ranking, columns=['userId', 'movieId', 'score'])