In [1]:
import pickle, csv
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict, Counter
%matplotlib inline
import torch

import collections
import networkx as nx
from itertools import combinations
import utils
import seaborn

In [2]:
%load_ext autoreload
%autoreload 2

In [None]:
def _load_pickle(path):
    """Unpickle the object stored at `path` and close the file afterwards."""
    # NOTE(review): pickle.load can execute arbitrary code; only point this at
    # files produced by this project's own preprocessing step.
    with open(path, 'rb') as f:
        return pickle.load(f)


# core interaction data
items_set = _load_pickle('../data/processed_data/item_set')  # 2819 Games (that appear in >= 1 bundle)
bundle_item_map = _load_pickle('../data/processed_data/bundle_item_map')  # 615 bundles
user_bundle_map = _load_pickle('../data/processed_data/user_bundle_map')  # 29634 Users who purchased >= 1 bundle
user_item_map = _load_pickle('../data/processed_data/user_item_map')  # 29634 Users who purchased >= 1 bundle

# metadata
user_id_lookup = _load_pickle('../data/processed_data/user_id_lookup')
bundle_diversity_map = _load_pickle('../data/processed_data/bundle_diversity_map')
bundle_price_map = _load_pickle('../data/processed_data/bundle_price_map')
item_data = _load_pickle('../data/processed_data/all_items')
item_id_lookup = _load_pickle('../data/processed_data/item_id_lookup')  # 2819
item_name_map = _load_pickle('../data/processed_data/item_name_map')  # 2819
bundle_discount_map = _load_pickle('../data/processed_data/bundle_discount_map')

### Generate node features

In [32]:
import json, ast

# Ids of the items kept in this study (the values of item_id_lookup).
# A set makes the per-review membership test below O(1); the original
# list/view forced a linear scan for every review.
items_id_set = set(item_id_lookup.values())
cnt = 0      # reviews whose item is in items_id_set
bad_cnt = 0  # reviews skipped because their item is not in items_id_set
# One length-3 accumulator per item; utils.extract_feat_from_review is added
# into it for every matching review (presumably it returns 3 values -- see utils).
item_id_to_review_feat = {i: np.zeros(3) for i in items_id_set}
with open('../raw_data/australian_user_reviews.json') as f:
    for line in f:
        # Each line is parsed as a Python literal rather than with json.
        data = ast.literal_eval(line)
        for review in data['reviews']:
            item_id = int(review['item_id'])
            if item_id in items_id_set:
                cnt += 1
                item_id_to_review_feat[item_id] += utils.extract_feat_from_review(review)
            else:
                bad_cnt += 1

In [33]:
# First pass over the game metadata: gather the vocabulary of genres and
# sentiment labels for the items of interest, and the largest genre count.
cnt = 0
genre_set = set()
sentiment_set = set()
max_genre_cnt = 0
with open('../raw_data/steam_games.json') as f:
    for line in f:
        game = ast.literal_eval(line)
        # Skip records without an id as well as games outside items_id_set.
        if 'id' not in game or int(game['id']) not in items_id_set:
            continue
        cnt += 1
        if 'genres' in game:
            game_genres = game['genres']
            genre_set.update(game_genres)
            if len(game_genres) > max_genre_cnt:
                max_genre_cnt = len(game_genres)
        if 'sentiment' in game:
            sentiment_set.add(game['sentiment'])

In [34]:
# Sort the vocabularies so the integer ids are reproducible from run to run
# (set iteration order is not guaranteed), and number them with enumerate
# instead of a quadratic list.index lookup.
genres = sorted(genre_set)
sentiment = sorted(sentiment_set)
genre_to_genre_id = {name: idx for idx, name in enumerate(genres, 1)}  # 0 is reserved for abstain
sentiment_to_sent_id = {name: idx for idx, name in enumerate(sentiment, 1)}  # ids also start at 1

In [35]:
# Layout of the 10-dim metadata vector: columns 0..8 hold genre ids (0 = no
# genre in that slot) and column 9 holds the sentiment id (0 = no sentiment).
SENTIMENT_COL = 9
item_id_to_meta_feat = {i: np.zeros(10) for i in items_id_set}
with open('../raw_data/steam_games.json') as f:
    for line in f:
        game = ast.literal_eval(line)
        if 'id' not in game:
            continue
        item_id = int(game['id'])
        if item_id in items_id_set:
            if 'genres' in game:
                # zip with range() caps the genres at the 9 available columns,
                # so a long genre list can neither spill into the sentiment
                # column nor index past the end of the vector.
                for col, g in zip(range(SENTIMENT_COL), game['genres']):
                    item_id_to_meta_feat[item_id][col] = genre_to_genre_id[g]
            if 'sentiment' in game:
                item_id_to_meta_feat[item_id][SENTIMENT_COL] = sentiment_to_sent_id[game['sentiment']]

In [36]:
def merge_feat(item_id_to_meta_feat, item_id_to_review_feat, item_id_lookup):
    """Concatenate metadata and review features into one 13-dim vector per item.

    Args:
        item_id_to_meta_feat: dict mapping an item id to a length-10 vector.
        item_id_to_review_feat: dict mapping an item id to a length-3 vector.
        item_id_lookup: dict mapping an item index (used as the output key)
            to the item id used by the two feature dicts.

    Returns:
        dict mapping every key of `item_id_lookup` to a length-13 numpy array:
        positions 0..9 are the metadata features, positions 10..12 the review
        features. A part stays all-zero when the item id is missing from the
        corresponding input dict.
    """
    item_features = {i: np.zeros(13) for i in item_id_lookup.keys()}
    # .items() works on both Python 2 and Python 3 (iteritems is Python-2-only).
    for i, item_id in item_id_lookup.items():
        if item_id in item_id_to_meta_feat:
            item_features[i][:10] = item_id_to_meta_feat[item_id]
        if item_id in item_id_to_review_feat:
            item_features[i][10:] = item_id_to_review_feat[item_id]
    return item_features

In [37]:
# Combine the 10 metadata features and 3 review features into a single
# 13-dim vector per item, keyed by the keys of item_id_lookup.
item_features = merge_feat(item_id_to_meta_feat, item_id_to_review_feat, item_id_lookup)

In [38]:
# np.save('item_features', item_features)

For bundles 258 and 452, the item lists are empty. This does not matter, because no user purchased either of these two bundles.

## Prepare Data For GCN

In [76]:
# NOTE: this cell mutates bundle_item_map in place (it appends the fake
# bundles). Reload bundle_item_map before re-running it, otherwise M below
# would also count the fake bundles added by the previous run.
S = 700  # number of fake bundles
N = len(items_set)  # 0,1,...,2818
M = len(bundle_item_map)  # 0, 1, ..., 614

# get items that only appear once
items_appearance_cnt = Counter()
for b in range(M):
    for i in bundle_item_map[b]:
        items_appearance_cnt[i] += 1
one_appearance = set(i for i in items_appearance_cnt if items_appearance_cnt[i] == 1)
assert len(one_appearance) == 2245, len(one_appearance)

# Dedicated, seeded generator so the fake bundles are reproducible and the
# global numpy random state is left untouched.
fake_bundle_rng = np.random.RandomState(0)
all_items = sorted(items_set)  # fixed order so the seed fully determines the draw

for i in range(S):
    if len(one_appearance) > 1:
        # Prefer items that so far appear in a single bundle: draw 1..6 of them
        # (fewer when only a handful are left).
        size = fake_bundle_rng.randint(1, min(len(one_appearance), 7))
        random_items = set(fake_bundle_rng.choice(sorted(one_appearance), size, replace=False))
    else:
        # Zero or one single-appearance item left: draw 2..6 items from the
        # full catalogue and add the last remaining one, if any.
        size = fake_bundle_rng.randint(2, 7)
        random_items = set(fake_bundle_rng.choice(all_items, size, replace=False))
        random_items |= one_appearance
    # Fake bundle ids continue right after the real ones (M, M+1, ...).
    bundle_item_map[M + i] = random_items
    one_appearance -= random_items

assert max(bundle_item_map.keys()) == M + S - 1, max(bundle_item_map.keys())

# Hypergraph with bundles as nodes: E_star[i] is the set of bundle ids (real
# and fake) that contain item i. Built by walking every bundle once instead of
# testing every (item, bundle) pair.
V_star = range(M+S)
E_star = [set() for _ in range(N)]
for j in V_star:
    for i in bundle_item_map[j]:
        if 0 <= i < N:  # only items 0..N-1 get a hyperedge
            E_star[i].add(j)

In [77]:
# Histogram of hyperedge sizes, i.e. how many bundles (real + fake) each item
# belongs to. 2245 items were in exactly 1 real bundle before the fake bundles
# were added; in the recorded output below no item is left with only 1 bundle.
Counter(map(len, E_star))

Counter({2: 2416, 3: 343, 4: 49, 5: 8, 6: 2, 7: 1})

In [78]:
def transform_hlink(node_set):
    """Expand one hyperedge into ordinary edges.

    Returns a list with one 2-element set for every unordered pair of nodes
    drawn from `node_set` (a clique expansion); the list is empty when
    `node_set` has fewer than two nodes.
    """
    return [set(pair) for pair in combinations(node_set, 2)]

In [80]:
def get_adj_list(V, E):
    """Build an undirected adjacency list from pairwise edges.

    Args:
        V: iterable of hashable node ids; every node becomes a key of the result.
        E: iterable of edges; each edge is an iterable (e.g. a 2-element set)
            holding exactly two endpoints.

    Returns:
        dict mapping each node in `V` to the set of its neighbours. Nodes
        without any edge map to an empty set.

    Raises:
        ValueError: if an edge does not have exactly two endpoints (hyperedges
            must be expanded into pairs before calling this function).
        KeyError: if an endpoint is not contained in `V`.
    """
    adj_list = {v: set() for v in V}
    for e in E:
        endpoints = list(e)
        if len(endpoints) != 2:
            raise ValueError(
                'expected an edge with exactly 2 endpoints, got %d: %r'
                % (len(endpoints), e))
        a, b = endpoints
        # Undirected graph: record the edge in both directions.
        adj_list[a].add(b)
        adj_list[b].add(a)
    return adj_list

In [81]:
def get_feat_data(v_star_set, v_features, bundle_items=None, scale=300.):
    """Compute one feature row per bundle as the scaled mean of its item features.

    Args:
        v_star_set: sized collection of bundle nodes. Only its length is used:
            rows are produced for bundle ids 0 .. len(v_star_set) - 1.
        v_features: mapping from an item to its length-13 feature vector.
        bundle_items: mapping from a bundle id to the collection of items it
            contains. Defaults to the notebook-level `bundle_item_map`.
        scale: constant every averaged feature vector is divided by
            (default 300., the value originally hard-coded here).

    Returns:
        numpy array of shape (len(v_star_set), 13). Row b is the mean of the
        feature vectors of the items in bundle b divided by `scale`; bundles
        with no items get an all-zero row.
    """
    if bundle_items is None:
        bundle_items = bundle_item_map
    features = np.zeros((len(v_star_set), 13))
    for v_star in range(len(v_star_set)):
        members = bundle_items[v_star]
        feat = np.zeros(13)
        for item in members:
            feat += v_features[item]
        if len(members) > 0:  # guard against division by zero for empty bundles
            feat /= len(members)
        features[v_star] = feat / scale
    return features

In [82]:
# Turn the hypergraph into an ordinary graph: a hyperedge joining more than
# two bundles becomes one edge per pair of bundles; smaller ones are kept as is.
E_star_reg = []
for hyperedge in E_star:
    if len(hyperedge) <= 2:
        E_star_reg.append(hyperedge)
    else:
        E_star_reg.extend(transform_hlink(hyperedge))

In [87]:
# Binary labels for the bundle nodes: the M real bundles (ids 0..M-1) are the
# positive class, the S fake bundles (ids M..M+S-1) the negative class.
class_map = {v: 1 for v in range(M)}
class_map.update({v: 0 for v in range(M, M + S)})

In [88]:
feat_data = get_feat_data(V_star, item_features)
# Read the labels in explicit node order so that row k of labels_steam always
# belongs to node k, regardless of dict iteration order. (np.array over
# dict.values() also does not produce a 1-D array on Python 3.)
labels_steam = np.array([class_map[v] for v in V_star]).reshape(-1, 1)
adj_list_steam = get_adj_list(V_star, E_star_reg)
# Sanity check: neither the labels nor the features may contain NaN.
np.isnan(labels_steam).any(), np.isnan(feat_data).any()

In [89]:
# Sanity check: there should be one feature row and one adjacency entry per bundle node.
feat_data.shape, len(adj_list_steam)

((1315, 13), 1315)

In [90]:
# Pickles are binary data: write them in 'wb' mode (required on Python 3 and
# safe on Python 2) and let `with` flush and close each file.
for fname, obj in (('labels_steam.p', labels_steam),
                   ('feat_data_steam.p', feat_data),
                   ('adj_list_steam.p', adj_list_steam)):
    with open(fname, 'wb') as f:
        pickle.dump(obj, f)

In [92]:
# Inspect the value range of the scaled bundle features.
feat_data.max(), feat_data.min()

(3.1416666666666666, 0.0)

In [106]:
# Node features and labels as tensors.
x = torch.tensor(feat_data, dtype=torch.float)
y = torch.tensor(labels_steam.flatten(), dtype=torch.long)

# Directed edge list: each undirected edge appears once per direction because
# the adjacency list stores both endpoints. .items() works on Python 2 and 3.
edge_idx = [[node, neighbor]
            for node, adj in adj_list_steam.items()
            for neighbor in adj]
# reshape(-1, 2) keeps the result 2-D (shape (2, 0)) even when there are no edges.
edge_idx = torch.tensor(edge_idx, dtype=torch.long).reshape(-1, 2).T

# Save as numpy arrays; file objects are passed so the names get no '.npy'
# suffix, and `with` makes sure every file is flushed and closed.
for fname, tensor in (('gnn_x', x), ('gnn_y', y), ('gnn_edge', edge_idx)):
    with open(fname, 'wb') as f:
        np.save(f, tensor.numpy())

## Prepare Data For CMM

In [124]:
# One line per bundle (real and fake): "b<bundle id>:<item>,<item>,...".
# Bundles are written in increasing id order so the file is reproducible.
with open('cmm_bundle_items.txt', 'w') as f:
    for bundle in sorted(bundle_item_map):
        items = [str(item) for item in bundle_item_map[bundle]]
        f.write('b'+str(bundle)+':'+','.join(items)+'\n')  # bundle id to game id

In [42]:
# Bundle ids are shifted by this offset in the edge list, presumably to keep
# them disjoint from the item ids (0..2818) -- confirm with the consumer of the file.
BUNDLE_ID_OFFSET = 3000
# Bipartite edge list, one "<bundle node>\t<item>" line per (bundle, item) pair,
# written in increasing bundle id order.
with open('snap_edge_list.txt', 'w') as f:
    for bundle in sorted(bundle_item_map):
        for item in bundle_item_map[bundle]:
            f.write(str(bundle+BUNDLE_ID_OFFSET)+'\t'+str(item)+'\n')  # bundle id to game id