In [1]:
import pickle
import random

import faiss
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

def get_sparsity(df):
    """Print the number of rows in ``df`` and its fill ratio.

    The second printed value is ``len(df)`` divided by
    (#unique AppID x #unique UserID), i.e. the fraction of the user/app grid
    that has a row.  NOTE(review): that quantity is the matrix *density*; the
    label printed is 'sparsity' — confirm which one readers expect.

    Args:
        df: DataFrame with ``AppID`` and ``UserID`` columns.

    Returns:
        None. The two figures are only printed.
    """
    n_rows = len(df)
    distinct_apps = df['AppID'].unique()
    distinct_users = df['UserID'].unique()
    print('reviews/ratings:', n_rows)
    print('sparsity:', n_rows / (len(distinct_apps) * len(distinct_users)))

## Matrix Factorization

In [2]:
# 38,162 review rows covering 882 users and 485 apps.
# The frame is built in one chained expression: `.assign` returns a new
# DataFrame, so the column writes never land on a column-subset of the pickled
# frame (which is what triggers pandas' SettingWithCopyWarning).
# AppID keeps whatever dtype it has in the pickle; UserID is forced to int64.
df = (
    pd.read_pickle('data/reviews_38162.pkl')[['AppID', 'UserID', 'Like']]
    .assign(
        UserID=lambda frame: frame['UserID'].astype('int64'),
        # Constant marker: every row present in the data is an interaction.
        Interacted=1,
    )
)
df.head()

Unnamed: 0,AppID,UserID,Like,Interacted
0,730,76561197969379991,1,1
1,730,76561198118543045,0,1
2,730,76561197971801273,0,1
3,730,76561198084359238,1,1
4,730,76561198123845513,0,1


In [None]:
# train_test_split
#
# Per-user split: for every user the first 80% of their rows (in file order)
# go to the training set and the remaining rows go to the test set.
#
# The source frame is bound to its own name (`split_source`) on purpose: the
# global `df` built earlier carries the `Interacted` column that the pivot
# cell needs, and rebinding `df` here would break that cell on a full re-run.
# NOTE(review): this cell reads '../preprocess/data/...' while the load cell
# reads 'data/...' — confirm both paths point at the same file.
split_source = pd.read_pickle('../preprocess/data/reviews_38162.pkl')[['UserID', 'AppID', 'Like']]
train_ratio = 0.8

# Collect the per-user slices and concatenate once; calling pd.concat inside
# the loop re-copies the accumulated frame on every iteration.
train_parts, test_parts = [], []
# sort=False keeps users in order of first appearance, matching a loop over
# `UserID.unique()`.
for _, single_user_data in split_source.groupby('UserID', sort=False):
    cut = int(len(single_user_data) * train_ratio)
    train_parts.append(single_user_data.iloc[:cut])
    test_parts.append(single_user_data.iloc[cut:])

# An empty source yields empty frames instead of a concat error.
train_df = pd.concat(train_parts, axis=0).reset_index(drop=True) if train_parts else split_source.iloc[:0].copy()
test_df = pd.concat(test_parts, axis=0).reset_index(drop=True) if test_parts else split_source.iloc[:0].copy()

# Every row must end up in exactly one of the two sets.
assert len(train_df) + len(test_df) == len(split_source)

train_df.to_pickle('data/train_data.pkl')
test_df.to_pickle('data/test_data.pkl')

# Last expression of the cell so the sizes are actually displayed.
len(train_df), len(test_df), len(train_df) + len(test_df)

In [4]:
# Build the user x app matrices (rows: UserID, columns: AppID).
# pivot_table averages duplicate (user, app) rows; pairs with no row are
# filled with 0.  In `rating_matrix` that 0 is the same value as a stored
# Like == 0, so "disliked" and "never reviewed" are indistinguishable there;
# `interaction_matrix` is what tells them apart.
interaction_matrix = pd.pivot_table(
    df, values='Interacted', index='UserID', columns='AppID'
).fillna(0)
rating_matrix = pd.pivot_table(
    df, values='Like', index='UserID', columns='AppID'
).fillna(0)
# float32 tensor copy of the rating matrix, used by matrix_factorization.
trainVector = torch.tensor(rating_matrix.to_numpy(), dtype=torch.float32)

100%|██████████| 882/882 [00:00<00:00, 12115.08it/s]
100%|██████████| 882/882 [00:00<00:00, 4355.49it/s]


In [None]:
from sklearn.decomposition import NMF

def matrix_factorization(matrix):
    """Factorize a user x app matrix with NMF and return id -> embedding maps.

    The matrix fed to NMF is the elementwise product of the module-level
    ``trainVector`` and ``matrix.values``.
    NOTE(review): this makes the function depend on a global defined in
    another cell, and ``trainVector`` must have the same shape as ``matrix``
    — confirm that coupling is intended.

    Args:
        matrix: DataFrame whose index holds user ids and whose columns hold
            app ids.

    Returns:
        (user_id_emb, app_id_emb): two dicts mapping each index label /
        column label of ``matrix`` to its float32 embedding row.
    """
    # n_components is the embedding dimension; set verbose=1 to see the
    # training progress.
    nmf = NMF(n_components=512, init='random', max_iter=500, random_state=25, verbose=0)

    # fit_transform returns the user factors; components_ holds the app
    # factors with one column per app, hence the transpose.
    user_factors = np.asarray(nmf.fit_transform(trainVector*matrix.values), dtype=np.float32)
    app_factors = np.asarray(nmf.components_.T.astype('float32'))
    print(user_factors.shape)

    return (
        dict(zip(matrix.index, user_factors)),
        dict(zip(matrix.columns, app_factors)),
    )

# Factorize the rating matrix; each result is a dict mapping an id to its embedding vector.
user_id_emb, app_id_emb = matrix_factorization(rating_matrix)

In [38]:
# Persist both embedding dictionaries as pickles.
for out_path, payload in (
    ('data/User/user_id_emb.pkl', user_id_emb),
    ('data/App/app_id_emb.pkl', app_id_emb),
):
    with open(out_path, 'wb') as f:
        pickle.dump(payload, f)

## 找鄰居 (Find Neighbors)

In [44]:
# Look up the rating vectors of each user's neighbours.
def get_nbrRating(user_id_nbr):
    """Sum the rating rows of every user's neighbours.

    Reads the module-level ``rating_matrix`` (rows are looked up by user id
    with ``.loc``).

    Args:
        user_id_nbr: dict mapping a user id to an iterable of neighbour
            user ids.

    Returns:
        dict mapping each user id to the elementwise sum of its neighbours'
        rows of ``rating_matrix``, in the same key order as ``user_id_nbr``.
    """
    user_id_nbrRating = {}
    for uid, neighbours in user_id_nbr.items():
        neighbour_rows = [rating_matrix.loc[n_uid].values for n_uid in neighbours]
        user_id_nbrRating[uid] = np.sum(neighbour_rows, axis=0)

    return user_id_nbrRating

# Find the nearest neighbours (ids) of every user and app, and sum the rating
# vectors of each user's neighbours.
def _drop_self(ids, nbr_idx):
    """Remove each row's own id from its hit list, leaving one fewer column.

    Inner-product search does not guarantee that a vector is its own top hit
    (a longer vector can score higher), so the self entry is located by id
    rather than assumed to sit in column 0.  When a row does not contain its
    own id at all, the last (lowest-scoring) hit is dropped instead so every
    row keeps the same length.
    """
    keep = nbr_idx != ids[:, None]
    self_missing = keep.all(axis=1)
    keep[self_missing, -1] = False
    return nbr_idx[keep].reshape(len(ids), nbr_idx.shape[1] - 1)


def _search_neighbours(id_emb, k):
    """Index the embeddings of ``id_emb`` with faiss and return (n, k-1) neighbour ids."""
    # faiss expects int64 ids and a C-contiguous float32 matrix.
    ids = np.asarray(list(id_emb.keys()), dtype=np.int64)
    emb = np.ascontiguousarray(list(id_emb.values()), dtype=np.float32)

    index = faiss.IndexIDMap(faiss.IndexFlatIP(emb.shape[1]))
    index.add_with_ids(emb, ids)

    # Query with the indexed vectors themselves; only the ids are needed.
    _, nbr_idx = index.search(emb, k)
    return _drop_self(ids, nbr_idx)


def get_nbr(k=6, user_id_emb=None, app_id_emb=None):
    """Find neighbours of every user and app by inner-product similarity.

    Args:
        k: number of hits requested per id. One hit (the id itself) is
            removed, so each id ends up with ``k - 1`` neighbours.
        user_id_emb: optional dict {user_id: embedding}. Loaded from
            './data/User/user_id_emb.pkl' when omitted.
        app_id_emb: optional dict {app_id: embedding}. Loaded from
            './data/App/app_id_emb.pkl' when omitted.

    Returns:
        (user_id_nbr, app_id_nbr, user_id_nbrRating) where the first two map
        an id to an ndarray of ``k - 1`` neighbour ids and the third is the
        result of ``get_nbrRating(user_id_nbr)``.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f'k must be at least 1, got {k}')

    # Only fall back to the pickles when the caller did not pass embeddings;
    # previously the arguments were accepted but always overwritten.
    if user_id_emb is None:
        user_id_emb = pd.read_pickle('./data/User/user_id_emb.pkl')
    if app_id_emb is None:
        app_id_emb = pd.read_pickle('./data/App/app_id_emb.pkl')

    user_nbr_idx = _search_neighbours(user_id_emb, k)
    app_nbr_idx = _search_neighbours(app_id_emb, k)

    # {id: ndarray of neighbour ids}; rows follow the dicts' key order.
    user_id_nbr = dict(zip(user_id_emb.keys(), user_nbr_idx))
    app_id_nbr = dict(zip(app_id_emb.keys(), app_nbr_idx))
    user_id_nbrRating = get_nbrRating(user_id_nbr)

    return user_id_nbr, app_id_nbr, user_id_nbrRating

# Six hits are requested per id; get_nbr drops one hit per row, so each id ends up with 5 neighbours.
user_id_nbr, app_id_nbr, user_id_nbrRating = get_nbr(k=6)

In [45]:
# Persist the neighbour tables and the summed neighbour ratings as pickles.
for out_path, payload in (
    ('data/User/user_id_nbr.pkl', user_id_nbr),
    ('data/App/app_id_nbr.pkl', app_id_nbr),
    ('data/User/user_id_nbrRating.pkl', user_id_nbrRating),
):
    with open(out_path, 'wb') as f:
        pickle.dump(payload, f)

## 找互動過的使用者、遊戲 (Find Interacted Users and Games)

In [7]:
# Smallest number of rows any single user / any single app has in `df`.
# These bound how many items can be sampled per user / per app later on.
user_review_counts = df['UserID'].value_counts().to_numpy()
app_review_counts = df['AppID'].value_counts().to_numpy()

min_user_interacted_apps = user_review_counts.min()
print('min user interacted apps: ', min_user_interacted_apps)
min_app_interacted_users = app_review_counts.min()
print('min app interacted users: ', min_app_interacted_users)

min user interacted apps:  30
min app interacted users:  36


In [8]:
# `nonzero` returns positional indices; they are translated back to app ids
# through the matrix's column labels.

def get_user_interacted_apps(matrix, k=5, app_id_emb=None, seed=None):
    """Sample ``k`` interacted apps (ids and embeddings) for every user.

    Args:
        matrix: user x app DataFrame; a non-zero cell marks an interaction.
        k: number of apps drawn, without replacement, per user.
        app_id_emb: optional dict {app_id: embedding}. Loaded from
            'data/App/app_id_emb.pkl' when omitted.
        seed: optional seed. When given, a private ``random.Random(seed)`` is
            used so the draw is reproducible; when omitted the module-level
            ``random`` generator is used.

    Returns:
        (user_interacted_appID, user_interacted_appEmb): dicts keyed by user
        id holding the sampled app ids and the matching embeddings, in the
        same order.

    Raises:
        ValueError: if a user has fewer than ``k`` interacted apps.
    """
    if app_id_emb is None:
        app_id_emb = pd.read_pickle('data/App/app_id_emb.pkl')
    rng = random if seed is None else random.Random(seed)
    app_ids_by_position = list(matrix.columns)

    user_interacted_appID = {}
    user_interacted_appEmb = {}
    # Walk the raw rows once instead of a label lookup per user.
    for uid, row in zip(matrix.index, matrix.to_numpy()):
        # Positions of the apps this user interacted with -> app ids.
        candidates = [app_ids_by_position[idx] for idx in row.nonzero()[0]]
        if k > len(candidates):
            raise ValueError(
                f'user {uid} has only {len(candidates)} interacted apps, cannot sample {k}'
            )
        # Draw k of the user's interacted apps at random.
        chosen = rng.sample(candidates, k)
        user_interacted_appID[uid] = chosen
        user_interacted_appEmb[uid] = [app_id_emb[aid] for aid in chosen]

    return user_interacted_appID, user_interacted_appEmb

def get_app_interacted_users(matrix, k=15, user_id_emb=None, seed=None):
    """Sample ``k`` interacting users (ids and embeddings) for every app.

    Args:
        matrix: user x app DataFrame; a non-zero cell marks an interaction.
        k: number of users drawn, without replacement, per app.
        user_id_emb: optional dict {user_id: embedding}. Loaded from
            'data/User/user_id_emb.pkl' when omitted.
        seed: optional seed. When given, a private ``random.Random(seed)`` is
            used so the draw is reproducible; when omitted the module-level
            ``random`` generator is used.

    Returns:
        (app_interacted_userID, app_interacted_usersEmb): dicts keyed by app
        id holding the sampled user ids and the matching embeddings, in the
        same order.

    Raises:
        ValueError: if an app has fewer than ``k`` interacting users.
    """
    if user_id_emb is None:
        user_id_emb = pd.read_pickle('data/User/user_id_emb.pkl')
    rng = random if seed is None else random.Random(seed)
    user_ids_by_position = list(matrix.index)

    app_interacted_userID = {}
    app_interacted_usersEmb = {}
    # The transposed array yields one column (= one app) per iteration.
    for aid, column in zip(matrix.columns, matrix.to_numpy().T):
        # Positions of the users who interacted with this app -> user ids.
        candidates = [user_ids_by_position[idx] for idx in column.nonzero()[0]]
        if k > len(candidates):
            raise ValueError(
                f'app {aid} has only {len(candidates)} interacting users, cannot sample {k}'
            )
        # Draw k of the app's interacting users at random.
        chosen = rng.sample(candidates, k)
        app_interacted_userID[aid] = chosen
        app_interacted_usersEmb[aid] = [user_id_emb[uid] for uid in chosen]

    return app_interacted_userID, app_interacted_usersEmb

# Sample sizes: apps kept per user, and users kept per app.
# Sampling is without replacement, so neither value may exceed the smallest
# interaction count printed above (30 apps per user, 36 users per app).
interacted_apps = 30
interacted_users = 35

user_interacted_appID, user_interacted_appEmb = get_user_interacted_apps(
    interaction_matrix, k=interacted_apps
)
app_interacted_userID, app_interacted_userEmb = get_app_interacted_users(
    interaction_matrix, k=interacted_users
)

In [14]:
# Persist the sampled interaction ids and embeddings as pickles.
for out_path, payload in (
    ('data/User/user_interacted_appID.pkl', user_interacted_appID),
    ('data/App/app_interacted_userID.pkl', app_interacted_userID),
    ('data/User/user_interacted_appEmb.pkl', user_interacted_appEmb),
    ('data/App/app_interacted_userEmb.pkl', app_interacted_userEmb),
):
    with open(out_path, 'wb') as f:
        pickle.dump(payload, f)

## 評論 (Reviews)

In [32]:
# Load the review embeddings and keep only the two ids plus the embedding column.
review_columns = ['AppID', 'UserID', 'ReviewEmbedding']
review_df = pd.read_pickle('data/reviews_embedding.pkl')[review_columns]
review_df.tail()

Unnamed: 0,AppID,UserID,ReviewEmbedding
38157,9160,76561197964009814,"[-0.30888498, -0.22816569, 0.16695397, 0.09922..."
38158,9160,76561197977081885,"[-0.058153145, -0.45727348, 0.11072462, 0.0453..."
38159,9160,76561197993676094,"[0.21907169, -0.3302766, 0.37175688, 0.1595228..."
38160,9160,76561198010396848,"[-0.343751, -0.6017795, 0.13571796, 0.17510062..."
38161,9160,76561197973200565,"[0.09694105, -0.22228487, 0.48190293, 0.183993..."


In [60]:
# Get each user's review vectors and each item's review vectors.

# Reviews written by each user (every user is expected to have at least 30).
user_apps = pd.read_pickle('data/User/user_interacted_appID.pkl')
user_group = review_df.groupby('UserID')
# Safety cap: never ask for more reviews than the least active user wrote.
min_reviews = min( 30, min(review_df.UserID.value_counts().values) )
# Randomly draw `min_reviews` review embeddings for every user.
reviews_emb = []
for uid in user_apps.keys():
    written = list(user_group.get_group(uid)['ReviewEmbedding'].values[:])
    reviews_emb.append(random.sample(written, min_reviews))
user_id_ReviewsEmb = dict(zip(user_apps.keys(), reviews_emb))

# Reviews received by each app (every app is expected to have at least 36).
app_users = pd.read_pickle('data/App/app_interacted_userID.pkl')
app_group = review_df.groupby('AppID')
# Safety cap: never ask for more reviews than the least reviewed app has.
min_reviews = min( 30, min(review_df.AppID.value_counts().values) )
# Randomly draw `min_reviews` review embeddings for every app.
reviews_emb = []
for aid in app_users.keys():
    received = list(app_group.get_group(aid)['ReviewEmbedding'].values[:])
    reviews_emb.append(random.sample(received, min_reviews))
app_id_ReviewsEmb = dict(zip(app_users.keys(), reviews_emb))

# Persist both review-embedding dictionaries as pickles.
for out_path, payload in (
    ('data/User/user_id_ReviewsEmb.pkl', user_id_ReviewsEmb),
    ('data/App/app_id_ReviewsEmb.pkl', app_id_ReviewsEmb),
):
    with open(out_path, 'wb') as f:
        pickle.dump(payload, f)

In [5]:
# Neighbour count: 5 each for users and apps.
# All reviews stored for a neighbour are averaged, so each neighbour is
# represented by a single vector (768-dimensional per the original note).
def get_user_nbrReviewsEmb(k=5):
    """Average every neighbour's review embeddings, per user.

    Reads 'data/User/user_id_nbr.pkl' (user id -> neighbour ids) and
    'data/User/user_id_ReviewsEmb.pkl' (user id -> list of review embeddings).

    Args:
        k: currently unused; it belonged to an earlier variant that averaged
            a random sample of k reviews per neighbour instead of all of them.

    Returns:
        dict mapping each user id to a list with one mean review embedding
        per neighbour, in neighbour order.
    """
    user_nbr = pd.read_pickle('data/User/user_id_nbr.pkl')
    user_revEmb = pd.read_pickle('data/User/user_id_ReviewsEmb.pkl')

    user_id_nbrReviewsEmb = {
        uid: [np.mean(user_revEmb[nuid], 0) for nuid in nbr_ids]
        for uid, nbr_ids in user_nbr.items()
    }

    return user_id_nbrReviewsEmb

def get_item_nbrReviewsEmb():
    """Average every neighbour's review embeddings, per app.

    Reads 'data/App/app_id_nbr.pkl' (app id -> neighbour ids) and
    'data/App/app_id_ReviewsEmb.pkl' (app id -> list of review embeddings).

    Returns:
        dict mapping each app id to a list with one mean review embedding
        per neighbour, in neighbour order.
    """
    app_nbr = pd.read_pickle('data/App/app_id_nbr.pkl')
    app_revEmb = pd.read_pickle('data/App/app_id_ReviewsEmb.pkl')

    app_id_nbrReviewsEmb = {
        aid: [np.mean(app_revEmb[naid], 0) for naid in nbr_ids]
        for aid, nbr_ids in app_nbr.items()
    }

    return app_id_nbrReviewsEmb

# -------------------------------------------------------------------
# Build the neighbour review embeddings for users and apps, then pickle them.
user_id_nbrReviewsEmb = get_user_nbrReviewsEmb()
app_id_nbrReviewsEmb = get_item_nbrReviewsEmb()

for out_path, payload in (
    ('data/User/user_id_nbrReviewsEmb.pkl', user_id_nbrReviewsEmb),
    ('data/App/app_id_nbrReviewsEmb.pkl', app_id_nbrReviewsEmb),
):
    with open(out_path, 'wb') as f:
        pickle.dump(payload, f)