In [4]:
import pandas as pd 
import numpy as np

In [5]:
df = pd.read_csv('usersha1-artmbid-artname-plays.tsv', sep='\t')
df.columns = ['user','item','artist','plays']
df.dropna(inplace=True)
df = df.loc[df.plays != 0]

In [6]:
df.reset_index(inplace=True)

In [7]:
df.drop('index', axis=1, inplace=True)

In [8]:
# 오기입된 데이터 처리 
df.tail(50)

Unnamed: 0,user,item,artist,plays
17309265,ffff9ef87a7d9494ada2f9ade4b9ff637c0759ac,4d7928cd-7ed2-4282-8c29-c0c9f966f1bd,alice cooper,59
17309266,ffff9ef87a7d9494ada2f9ade4b9ff637c0759ac,298909e4-ebcb-47b8-95e9-cc53b087fc65,lamb of god,58
17309267,ffff9ef87a7d9494ada2f9ade4b9ff637c0759ac,ef58d4c9-0d40-42ba-bfab-9186c1483edd,dragonforce,57
17309268,"sep 20, 2008",ada7a83c-e3e1-40f1-93f9-3e73dbc9298a,arctic monkeys,158
17309269,"sep 20, 2008",e3e4e534-30cd-4979-bcc5-a75c49802f57,late of the pier,131
17309270,"sep 20, 2008",6bbd8596-7388-456b-aa6a-db5d52d1bcd6,bonaparte,93
17309271,"sep 20, 2008",e48995e0-6ff9-43ba-b942-9f0875fcc712,1000 robota,54
17309272,"sep 20, 2008",6a65d878-fcd0-42cf-aff9-ca1d636a8bcc,foals,54
17309273,"sep 20, 2008",b3d6ae73-8c0c-4227-a069-5ee3a77301e6,bromheads jacket,51
17309274,"sep 20, 2008",0ae49abe-d6af-44fa-8ab0-b9ace5690e6f,bombay bicycle club,46


In [9]:
df.drop(labels = range(17309268,17309315), axis=0, inplace=True)

In [10]:
print('User의 수: ', len(np.unique(list(df['user']))))
print("Artist의 수 : ", len(np.unique(list(df['item']))))

MemoryError: Unable to allocate 2.58 GiB for an array with shape (17309268,) and data type <U40

In [None]:
# 분석데이터 준비

def prepare_analy_dataset(df):
    """
    데이터 로드 함수 
    
    uids: train user
    iids: train item
    users: 전체 user          
    items: 전체 item
    df_train: train data
    df_test: test data
    """
    # user 10000명 샘플링 
    unique_user_lst = list(np.unique(df['user'])) #358857명 
    sample_user_idx = np.random.choice(len(unique_user_lst), 10000, replace=False)
    sample_user_lst = [unique_user_lst[idx] for idx in sample_user_idx]
    
    df = df[df['user'].isin(sample_user_lst)]
    df = df.reset_index(drop=True)

    # 2개 이상의 데이터 정보( row)가 있는 user만 사용 
    df_count = df.groupby(['user']).count()
    df['count'] = df.groupby('user')['user'].transform('count') # transform은 apply와 유사한 기능
    df = df[df['count'] > 1]

    # user, item 아이디 부여 
    # cat.codes = 자동으로 숫자형 리턴 
    # astype('category').cat.codes =  numerical 데이터로 변경하고 싶은 cat 컬럼을 정한 뒤, 강제 형변환을 시키고 cat.codes를 통해 숫자형 리턴
    # => categorical 데이터를 숫자형으로 변환하는 방법 
    df['user_id'] = df['user'].astype("category").cat.codes 
    df['item_id'] = df['item'].astype("category").cat.codes

    # lookup 테이블 생성 
    item_lookup = df[['item_id', 'item']].drop_duplicates()
    item_lookup['item_id'] = item_lookup.item_id.astype(str)

    # train, test 데이터 생성 
    df = df[['user_id', 'item_id', 'plays']] 
    df_train, df_test = train_test_split(df)

    # 전체 user, item 리스트 생성 
    users = list(np.sort(df.user_id.unique())) 
    items = list(np.sort(df.item_id.unique())) 

    # train user, item 리스트 생성 
    rows = df_train['user_id'].astype(int)   
    cols = df_train['item_id'].astype(int)
    values = list(df_train.plays) 
    
    uids = np.array(rows.tolist())
    iids = np.array(cols.tolist())

    # 각 user마다 negative item 생성 
    df_neg = get_negatives(uids, iids, items, df_test)

    return uids, iids, df_train, df_test, df_neg, users, items, item_lookup

def get_negatives(uids, iids, items, df_test):
    """
    negative item 리스트 생성함수
    """
    negativeList = []
    test_u = df_test['user_id'].values.tolist() 
    test_i = df_test['item_id'].values.tolist() 
 
    test_ratings = list(zip(test_u, test_i)) # test (user, item)세트 
    zipped = set(zip(uids, iids))            # train (user, item)세트

    for (u, i) in test_ratings:
        
        negatives = []
        negatives.append((u, i))
        for t in range(100):
            j = np.random.randint(len(items))     # neg_item j 1개 샘플링 
            while (u, j) in zipped:               # j가 train에 있으면 다시뽑고, 없으면 선택 
                j = np.random.randint(len(items)) 
            negatives.append(j)
        negativeList.append(negatives) # [(0,pos), neg, neg, ...]

    df_neg = pd.DataFrame(negativeList)

    return df_neg

def mask_first(x):

    result = np.ones_like(x) 
    result[0] = 0  # [0,1,1,....]
    
    return result

def train_test_split(df):
    """
    train, test 나누는 함수
    """
    df_test = df.copy(deep=True)
    df_train = df.copy(deep=True)
    
    # df_test
    # user_id와 holdout_item_id(user가 플레이한 아이템 중 1개)뽑기 
    df_test = df_test.groupby(['user_id']).first() 
    df_test['user_id'] = df_test.index
    df_test = df_test[['user_id', 'item_id', 'plays']]
    df_test = df_test.reset_index(drop=True)
    
    # df_train 
    # user_id 리스트에 make_first()적용 
    mask = df.groupby(['user_id'])['user_id'].transform(mask_first).astype(bool)
    df_train = df.loc[mask]  

    return df_train, df_test

def get_train_instances(uids, iids, num_neg, num_items):
    """
    모델에 사용할 train 데이터 생성 함수 
    """
    user_input, item_input, labels = [],[],[]
    zipped = set(zip(uids, iids)) # train (user, item) 세트

    for (u, i) in zip(uids, iids):
        
        # pos item 추가 
        user_input.append(u) #[u]
        item_input.append(i) #[pos_i]
        labels.append(1)     #[1]

        # neg item 추가 
        for t in range(num_neg):
            
            j = np.random.randint(num_items)     # neg_item j num_neg(4)개 샘플링
            while (u, j) in zipped:              # u가 j를 이미 선택했다면 
                j = np.random.randint(num_items) # 다시 샘플링 
                
            user_input.append(u) # [u1, u1,  u1,  ...]
            item_input.append(j) # [pos_i, neg_j1, neg_j2, ...]
            labels.append(0)     # [1, 0,  0,  ...]

    return user_input, item_input, labels

In [None]:
uids, iids, df_train, df_test, df_neg, users, items, item_lookup = prepare_analy_dataset(df)

# Train data

In [None]:
df_train.head(10)

Unnamed: 0,user_id,item_id,plays
1,0,28683,726
2,0,32013,687
3,0,34622,609
4,0,46095,583
5,0,48914,581
6,0,20443,573
7,0,29986,521
8,0,9114,471
9,0,10890,438
10,0,12510,419


In [None]:
df_train.shape

(473430, 3)

In [None]:
# model train 데이터
# df_train의 각 row당 negative item을 num neg개씩 랜덤으로 선택
user_input, item_input, labels = get_train_instances(uids, iids, num_neg=4, num_items=len(items))

In [None]:
# 예시 
print('df_train의 첫번째 행: (user_id, item_id)=', (uids[0], iids[0])) 
print('df_train의 두번째 행: (user_id, item_id)=', (uids[1], iids[1])) 

df_train의 첫번째 행: (user_id, item_id)= (0, 28683)
df_train의 두번째 행: (user_id, item_id)= (0, 32013)


In [None]:
for i, (user_id, item_id, label) in enumerate(zip(user_input[0:10], item_input[0:10], labels[0:10])):
    if i==0 or i==5: # num neg를 4개로 설정했으므로 positive 사이 neg가 4개씩 반복됨
        
        print('(user_id, postive_item_id, label):', (user_id, item_id, label))
    else:
        print('(user_id, negative_item_id, label):', (user_id, item_id, label))



(user_id, postive_item_id, label): (0, 28683, 1)
(user_id, negative_item_id, label): (0, 8046, 0)
(user_id, negative_item_id, label): (0, 23547, 0)
(user_id, negative_item_id, label): (0, 7897, 0)
(user_id, negative_item_id, label): (0, 42783, 0)
(user_id, postive_item_id, label): (0, 32013, 1)
(user_id, negative_item_id, label): (0, 39565, 0)
(user_id, negative_item_id, label): (0, 33051, 0)
(user_id, negative_item_id, label): (0, 11717, 0)
(user_id, negative_item_id, label): (0, 45809, 0)


# Test Data
- 각 user별로 user_id와 user가 플레이한 item 1개로 이뤄진다.

In [None]:
df_test.head(10)

Unnamed: 0,user_id,item_id,plays
0,0,44628,748
1,1,676,2055
2,2,36603,3872
3,3,27315,465
4,4,6009,30
5,5,9114,3083
6,6,12992,336
7,7,38098,831
8,8,13386,1537
9,9,13540,964


In [None]:
df_test.shape

(9998, 3)

# Df_neg 데이터
- 각 user별로 negative item (user가 플레이하지않은 item 100개를 랜덤으로 선택한다.)
- column 0 : df_test 데이터의 (user_id, item_id)
- column 1 ~ 100 : negative item
> df_neg 데이터는 모델 평가시 TOP - K metric을 계산할 때 사용된다.

In [None]:
df_neg.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,"(0, 44628)",968,23454,31156,46532,18331,184,36945,21340,34367,...,20928,43167,36533,44251,22077,27670,29592,43904,36200,26319
1,"(1, 676)",47614,31197,31025,15016,32888,2856,11330,36514,44514,...,48377,37754,47826,6290,35355,37306,38820,13314,5395,48093
2,"(2, 36603)",14211,3423,49637,48034,17505,3396,3065,34075,12547,...,40937,33774,2291,30277,39246,31413,32417,30936,40442,10853
3,"(3, 27315)",7715,48807,563,14035,49028,287,12252,42432,40292,...,44231,43478,39099,4607,25475,38803,37336,34913,29701,3652
4,"(4, 6009)",42462,41633,7572,41475,14764,9544,11550,15129,43116,...,42971,28351,2923,35045,19875,40502,15852,32822,28965,10431


In [None]:
df_neg.shape

(9998, 101)

# GMF (Generalized matrix factorization)

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model

class GMF:

    def __init__(self, user_num, item_num):

        latent_features = 8

        # User embedding
        user = Input(shape=(1,), dtype='int32')
        user_embedding = Embedding(user_num, latent_features, input_length=user.shape[1])(user)
        user_embedding = Flatten()(user_embedding)

        # Item embedding
        item = Input(shape=(1,), dtype='int32')
        item_embedding = Embedding(item_num, latent_features, input_length=item.shape[1])(item)
        item_embedding = Flatten()(item_embedding)

        # Merge
        concatenated = Multiply()([user_embedding, item_embedding])

        # Output
        output_layer = Dense(1, kernel_initializer='lecun_uniform', name='output_layer')(concatenated) # 1,1 / h(8,1)초기화

        # Model
        self.model = Model([user, item], output_layer)
        self.model.compile(optimizer='adam', loss='binary_crossentropy')

    def get_model(self):
        model = self.model
        return model


# MLP

In [None]:
import tensorflow as tf 
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model

class MLP:
    
    def __init__(self, user_num, item_num):
        
        # User embedding
        user = Input(shape(1,), dtype='int32')
        user_embedding = Embedding(user_num, 32, input_length = item.shape[1])(user)
        user_embedding = Flatten()(user_embedding)
        
        # Item embedding
        item = Input(shape(1,), dtype='int32')
        item_embedding = Embedding(item_num, 32, input_length=item.shape[1])(item)
        item_embedding = Flatten()(item_embedding)

        # Merge
        concatenated = Concatenate()([user_embedding, item_embedding])
        dropout = Dropout(rate=0.2)(concatenated)

        # Layer1
        layer_1 = Dense(units=64, activation='relu', name='layer1')(dropout)  # (64,1)
        dropout1 = Dropout(rate=0.2, name='dropout1')(layer_1)                # (64,1)
        batch_norm1 = BatchNormalization(name='batch_norm1')(dropout1)        # (64,1)

        # Layer2
        layer_2 = Dense(units=32, activation='relu', name='layer2')(batch_norm1)  # (32,1)
        dropout2 = Dropout(rate=0.2, name='dropout2')(layer_2)                    # (32,1)
        batch_norm2 = BatchNormalization(name='batch_norm2')(dropout2)            # (32,1)

        # Layer3
        layer_3 = Dense(units=16, activation='relu', name='layer3')(batch_norm2)  # (16,1)

        # Layer4
        layer_4 = Dense(units=8, activation='relu', name='layer4')(layer_3)  # (8,1)

        # Output
        output_layer = Dense(1, kernel_initializer='lecun_uniform', name='output_layer')(layer_4)  # (1,1) / h(8,1)초기화

        # Model
        self.model = Model([user, item], output_layer)
        self.model.compile(optimizer='adam', loss='binary_crossentropy')

    def get_model(self):
        model = self.model
        return model


# NCF

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model

class NeuMF:

    def __init__(self, user_num, item_num):

        latent_features = 8

        # Input
        user = Input(shape=(1,), dtype='int32')
        item = Input(shape=(1,), dtype='int32')

        # User embedding for GMF
        gmf_user_embedding = Embedding(user_num, latent_features, input_length=user.shape[1])(user)
        gmf_user_embedding = Flatten()(gmf_user_embedding)

        # Item embedding for GMF
        gmf_item_embedding = Embedding(item_num, latent_features, input_length=item.shape[1])(item)
        gmf_item_embedding = Flatten()(gmf_item_embedding)

        # User embedding for MLP
        mlp_user_embedding = Embedding(user_num, 32, input_length=user.shape[1])(user)
        mlp_user_embedding = Flatten()(mlp_user_embedding)

        # Item embedding for MLP
        mlp_item_embedding = Embedding(item_num, 32, input_length=item.shape[1])(item)
        mlp_item_embedding = Flatten()(mlp_item_embedding)

        # GMF layers
        gmf_mul =  Multiply()([gmf_user_embedding, gmf_item_embedding])

        # MLP layers
        mlp_concat = Concatenate()([mlp_user_embedding, mlp_item_embedding])
        mlp_dropout = Dropout(0.2)(mlp_concat)

        # Layer1
        mlp_layer_1 = Dense(units=64, activation='relu', name='mlp_layer1')(mlp_dropout)  # (64,1)
        mlp_dropout1 = Dropout(rate=0.2, name='dropout1')(mlp_layer_1)                    # (64,1)
        mlp_batch_norm1 = BatchNormalization(name='batch_norm1')(mlp_dropout1)            # (64,1)

        # Layer2
        mlp_layer_2 = Dense(units=32, activation='relu', name='mlp_layer2')(mlp_batch_norm1)  # (32,1)
        mlp_dropout2 = Dropout(rate=0.2, name='dropout2')(mlp_layer_2)                        # (32,1)
        mlp_batch_norm2 = BatchNormalization(name='batch_norm2')(mlp_dropout2)                # (32,1)

        # Layer3
        mlp_layer_3 = Dense(units=16, activation='relu', name='mlp_layer3')(mlp_batch_norm2)  # (16,1)

        # Layer4
        mlp_layer_4 = Dense(units=8, activation='relu', name='mlp_layer4')(mlp_layer_3)       # (8,1)

        # merge GMF + MLP
        merged_vector = tf.keras.layers.concatenate([gmf_mul, mlp_layer_4])

        # Output layer
        output_layer = Dense(1, kernel_initializer='lecun_uniform', name='output_layer')(merged_vector) # 1,1 / h(8,1)초기화

        # Model
        self.model = Model([user, item], output_layer)
        self.model.compile(optimizer= 'adam', loss= 'binary_crossentropy')

    def get_model(self):
        model = self.model
        return model


In [None]:
nmf = NeuMF(len(users), len(items))
model = nmf.get_model()
model.fit([user_data_shuff, item_data_shuff], label_data_shuff, epochs=20,batch_size=256, verbose=1)

NameError: name 'users' is not defined