In [2]:
import time
import warnings

import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import minmax_scale

warnings.filterwarnings('ignore')

In [3]:
from fastFM.datasets import make_user_item_regression

# Seed for the train/test split so the metrics below are reproducible on re-run.
SPLIT_SEED = 42

# This sets up a small test dataset (100 users x 100 items requested).
X, y, _ = make_user_item_regression(n_user=100, n_item=100)

# Convert dataset to binary classification task:
# targets below the mean become -1, all others +1 (dtype kept equal to y's).
y_labels = np.where(y < np.mean(y), -1, 1).astype(y.dtype)

# Hold out a test split; the fixed random_state makes the split deterministic.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_labels, random_state=SPLIT_SEED
)


In [4]:
# Report the share of positive (+1) labels in each split.
for split_name, split_labels in (('train', y_train), ('test', y_test)):
    positive_share = (split_labels == 1).sum() / len(split_labels)
    print('[%s] number of +1/number of y : ' % split_name, positive_share)


[train] number of +1/number of y :  0.5085333333333333
[test] number of +1/number of y :  0.492


# 1. pyFM

In [6]:
from pyfm import pylibfm

# Start the timer. perf_counter() is the clock intended for measuring elapsed
# intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()

# Build and train a Factorization Machine
pyFM = pylibfm.FM(
    num_factors=10,
    num_iter=10,
    verbose=True,
    task="classification",
    initial_learning_rate=0.001,
    learning_rate_schedule="optimal",
)

pyFM.fit(X_train, y_train)

# Report elapsed time; the timed region covers model construction and fit only.
print("--- %s seconds ---" % (time.perf_counter() - start_time))

Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training log loss: 0.68751
-- Epoch 2
Training log loss: 0.67524
-- Epoch 3
Training log loss: 0.66345
-- Epoch 4
Training log loss: 0.65210
-- Epoch 5
Training log loss: 0.64118
-- Epoch 6
Training log loss: 0.63065
-- Epoch 7
Training log loss: 0.62048
-- Epoch 8
Training log loss: 0.61074
-- Epoch 9
Training log loss: 0.60130
-- Epoch 10
Training log loss: 0.59223
--- 2.3165318965911865 seconds ---


In [56]:
# Predict once and min-max rescale the scores. Neither step depends on the
# threshold, so both are computed a single time instead of once per alpha.
pyfm_scores = minmax_scale(pyFM.predict(X_test))

for alpha in [0.4, 0.5, 0.55, 0.6, 0.65, 0.7]:

    # classification: scaled score >= alpha -> +1, otherwise -1.
    # A single np.where avoids the order-dependent in-place masking, which
    # would misclassify if the threshold were ever <= -1.
    preds = np.where(pyfm_scores >= alpha, 1.0, -1.0)

    print('-------------')
    print('alpha: ', alpha)
    print('acc:', accuracy_score(y_test, preds))
    # share of test samples predicted as +1 at this threshold
    print('raito: ', (preds==1).sum()/len(preds))

-------------
alpha:  0.4
acc: 0.8096
raito:  0.6824
-------------
alpha:  0.5
acc: 0.9456
raito:  0.5032
-------------
alpha:  0.55
acc: 0.9132
raito:  0.41
-------------
alpha:  0.6
acc: 0.8296
raito:  0.3216
-------------
alpha:  0.65
acc: 0.7552
raito:  0.2472
-------------
alpha:  0.7
acc: 0.6848
raito:  0.1768


# 2. fastFM

In [57]:
from fastFM import sgd

# Start the timer. perf_counter() is the clock intended for measuring elapsed
# intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()

# SGD-trained factorization machine classifier, with L2 regularisation
# switched off for both the linear weights (w) and the factor matrix (V).
fastFM = sgd.FMClassification(
    n_iter=100000,
    init_stdev=0.1,
    l2_reg_w=0,
    l2_reg_V=0,
    rank=2,
    step_size=0.1,
)
fastFM.fit(X_train, y_train)

# Predicted class labels for the held-out split.
preds_fm = fastFM.predict(X_test)

print('acc:', accuracy_score(y_test, preds_fm))
# share of test samples predicted as +1
print('raito: ', (preds_fm==1).sum()/len(preds_fm))

# Report elapsed time; the timed region covers fit, predict and the metric prints.
print("--- %s seconds ---" % (time.perf_counter() - start_time))

acc: 0.9696
raito:  0.496
--- 0.018447160720825195 seconds ---


In [59]:
# For comparison: accuracy of the fitted fastFM model on the training split.
train_preds_fm = fastFM.predict(X_train)
accuracy_score(y_train, train_preds_fm)

0.9874666666666667

In [58]:
# fastFM can also predict in probability form. Predict once and min-max
# rescale; neither step depends on the threshold, so both are computed a
# single time instead of once per alpha.
fm_scores = minmax_scale(fastFM.predict_proba(X_test))

for alpha in [0.4, 0.5, 0.55, 0.6, 0.65, 0.7]:

    # classification: scaled score >= alpha -> +1, otherwise -1.
    # A single np.where avoids the order-dependent in-place masking, which
    # would misclassify if the threshold were ever <= -1.
    pred_proba = np.where(fm_scores >= alpha, 1.0, -1.0)

    print('-------------')
    print('alpha: ', alpha)
    print('acc:', accuracy_score(y_test, pred_proba))
    # share of test samples predicted as +1 at this threshold
    print('raito: ', (pred_proba==1).sum()/len(pred_proba))

-------------
alpha:  0.4
acc: 0.9696
raito:  0.5024
-------------
alpha:  0.5
acc: 0.9696
raito:  0.496
-------------
alpha:  0.55
acc: 0.972
raito:  0.4928
-------------
alpha:  0.6
acc: 0.9724
raito:  0.4868
-------------
alpha:  0.65
acc: 0.9756
raito:  0.482
-------------
alpha:  0.7
acc: 0.9748
raito:  0.4764


# 3. SVM polynomial kernel

In [29]:
from sklearn.svm import SVC

In [37]:
# Start the timer. perf_counter() is the clock intended for measuring elapsed
# intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()

# Support vector classifier with a degree-2 polynomial kernel.
clf = SVC(C=1.0, kernel='poly', degree=2, gamma=2)
classifier = clf.fit(X_train, y_train)

# Predicted class labels for the held-out split.
preds_SVC = classifier.predict(X_test)

print('acc:', accuracy_score(y_test, preds_SVC))
# share of test samples predicted as +1
print('raito: ', (preds_SVC==1).sum()/len(preds_SVC))

# Report elapsed time; the timed region covers fit, predict and the metric prints.
print("--- %s seconds ---" % (time.perf_counter() - start_time))

acc: 0.9664
raito:  0.492
--- 1.160851001739502 seconds ---


- fastFM은 0.018초 걸림.
    - parameter에 의존성이 좀 있음. iteration number 등.
- SVC는 약 1.16초가 걸림.
    - parameter 설정할게 없고, degree만 정해주면 거의 최상의 값 찾아줌
- accuracy는 0.96대로 동일하지만 fastFM이 압도적으로 빠름