In [76]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import minmax_scale

import time
import warnings
warnings.filterwarnings('ignore')

In [112]:
# Read in data
def loadData(filename,path="ml-100k/"):
    data = []
    y = []
    users=set()
    items=set()
    with open(path+filename) as f:
        for line in f:
            (user,movieid,rating,ts)=line.split('\t')
            data.append({ "user_id": str(user), "movie_id": str(movieid)})
            y.append(float(rating))
            users.add(user)
            items.add(movieid)

    return (data, np.array(y), users, items)

(train_data, y_train, train_users, train_items) = loadData("ua.base")
(test_data, y_test, test_users, test_items) = loadData("ua.test")

In [118]:
#data properties
print('samples: ', len(train_data))
print(train_data[0], y_train[0])
print('label\'s categories: ', set(y_train))
'''
click_through rate형태로 변형하기 위해서
별점 5는 클릭할 것이다 = 1
별점 1,2,3,4은 클릭하지 않을 것이다 =-1
'''
y_train[y_train <5]=-1 
y_train[y_train ==5]=1 

y_test[y_test <5]=-1 
y_test[y_test ==5]=1 
print('label\'s categories: ', set(y_train))

samples:  90570
{'user_id': '1', 'movie_id': '1'} 5.0
label's categories:  {1.0, 2.0, 3.0, 4.0, 5.0}
label's categories:  {1.0, -1.0}


In [120]:
print('[train] number of +1/number of y : ', (y_train==1).sum()/len(y_train))
print('[test] number of +1/number of y : ', (y_test==1).sum()/len(y_test))


[train] number of +1/number of y :  0.2103124654963012
[test] number of +1/number of y :  0.22831389183457051


In [121]:
v = DictVectorizer()
X_train = v.fit_transform(train_data)
X_test = v.transform(test_data)

#X_train 
X_train.toarray()

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# 1. pyFM

In [122]:
from pyfm import pylibfm

# check time
start_time = time.time()

# Build and train a Factorization Machine
pyFM = pylibfm.FM(num_factors=10, num_iter=10, verbose=True, task="classification", initial_learning_rate=0.001, learning_rate_schedule="optimal")

pyFM.fit(X_train,y_train)

# check time
print("--- %s seconds ---" % (time.time() - start_time))

Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training log loss: 0.51638
-- Epoch 2
Training log loss: 0.50573
-- Epoch 3
Training log loss: 0.50028
-- Epoch 4
Training log loss: 0.49527
-- Epoch 5
Training log loss: 0.49068
-- Epoch 6
Training log loss: 0.48649
-- Epoch 7
Training log loss: 0.48264
-- Epoch 8
Training log loss: 0.47907
-- Epoch 9
Training log loss: 0.47578
-- Epoch 10
Training log loss: 0.47270
--- 28.40037512779236 seconds ---


In [126]:
# prediction
preds = pyFM.predict(X_test)
preds = minmax_scale(preds)

In [127]:
preds.mean()

0.2649964419563346

In [128]:
#classification
preds[preds < 0.5] = -1
preds[preds >= 0.5] = 1

print('acc:', accuracy_score(y_test, preds))

acc: 0.7782608695652173


In [136]:
(preds==1).sum()/len(preds)

0.05790031813361612

# 2. fast FM

In [173]:
from fastFM import sgd

# check time
start_time = time.time()

#fastFM = sgd.FMClassification(n_iter=1000, init_stdev=0.1, rank=2, step_size=0.02)
fastFM = sgd.FMClassification(n_iter=1000, init_stdev=0.1, l2_reg_w=0,\
                          l2_reg_V=0, rank=2, step_size=0.1)
fastFM.fit(X_train, y_train)

preds_fm = fastFM.predict(X_test)

print('acc:', accuracy_score(y_test, preds_fm))
# check time
print("--- %s seconds ---" % (time.time() - start_time))

acc: 0.7716861081654295
--- 0.02376389503479004 seconds ---


In [174]:
(y_test==-1).sum(), (preds_fm==1).sum()
# 클래스 불균형 문제가 발생.

(7277, 0)

In [175]:
# probability 형태로도 예측 가능
pred_proba = fastFM.predict_proba(X_test)
pred_proba = minmax_scale(Ypred_proba)

#classification
alpha = 0.5
pred_proba[pred_proba < alpha] = -1
pred_proba[pred_proba >= alpha] = 1

print('acc:', accuracy_score(y_test, pred_proba))
print([(y_test==-1).sum(), (pred_proba==1).sum()])
# 클래스 불균형 문제가 발생은 안함.

acc: 0.6453870625662779
[7277, 2281]


# 3. SVM polynomial kernel

In [51]:
from sklearn.svm import SVC

In [143]:
# check time
start_time = time.time()

clf = SVC(C=1.0, kernel='poly', degree=2, gamma = 'auto')
classifier = clf.fit(X_train,y_train)

preds_SVC = classifier.predict(X_test)
#print(y_pred, y_pred_proba)
print('acc:', accuracy_score(y_test, preds_SVC))

# check time
print("--- %s seconds ---" % (time.time() - start_time))

acc: 0.7716861081654295
--- 81.49648785591125 seconds ---


In [145]:
(preds_SVC==1).sum()

0

- classification으로 하니 클래스 불균형 문제 발생.
    - click 정보가 압도적으로 적다보니 ....

- fastFM은 0.0249초 걸림.
    - parameter에 의존성이 좀 있음
- SVC는 81.496초가 걸림.
    - parameter 설정할게 없고, degree만 정해주면 최상의 값 찾아줌
- accuracy는 0.057대로 동일하지만 fast FM이 압도적으로 빠름