In [1]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import minmax_scale

import time
import warnings
warnings.filterwarnings('ignore')

In [2]:
from fastFM.datasets import make_user_item_regression

# This sets up a small test dataset.
X, y, _ = make_user_item_regression(n_user=1000, n_item=100)

# Convert dataset to binary classification task.
y_labels = np.ones_like(y)
y_labels[y < np.mean(y)] = -1
#print(y_labels)
X_train, X_test, y_train, y_test = train_test_split(X, y_labels)
#print(X_train.toarray())


In [3]:
print('[train] number of +1/number of y : ', (y_train==1).sum()/len(y_train))
print('[test] number of +1/number of y : ', (y_test==1).sum()/len(y_test))


[train] number of +1/number of y :  0.4921333333333333
[test] number of +1/number of y :  0.49452


# 1. pyFM

In [4]:
from pyfm import pylibfm

# check time
start_time = time.time()

# Build and train a Factorization Machine
pyFM = pylibfm.FM(num_factors=10, num_iter=10, verbose=True, task="classification", initial_learning_rate=0.001, learning_rate_schedule="optimal")

pyFM.fit(X_train,y_train)

# check time
print("--- %s seconds ---" % (time.time() - start_time))

Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training log loss: 0.66384
-- Epoch 2
Training log loss: 0.61586
-- Epoch 3
Training log loss: 0.58063
-- Epoch 4
Training log loss: 0.55406
-- Epoch 5
Training log loss: 0.53334
-- Epoch 6
Training log loss: 0.51677
-- Epoch 7
Training log loss: 0.50304
-- Epoch 8
Training log loss: 0.49143
-- Epoch 9
Training log loss: 0.48133
-- Epoch 10
Training log loss: 0.47245
--- 23.32673478126526 seconds ---


In [7]:
alpha = 0.5

# prediction
preds = pyFM.predict(X_test)
preds = minmax_scale(preds)

#classification
preds[preds < alpha] = -1
preds[preds >= alpha] = 1

print('-----------------')
print('alpha: ', alpha)
print('acc:', accuracy_score(y_test, preds))
print('raito: ', (preds==1).sum()/len(preds))
print('-----------------')

-----------------
alpha:  0.5
acc: 0.7924
raito:  0.46996
-----------------


# 2. fast FM

In [13]:
from fastFM import sgd

# check time
start_time = time.time()

# fastFM = sgd.FMClassification(n_iter=1000, init_stdev=0.1, rank=2, step_size=0.02)
fastFM = sgd.FMClassification(n_iter=500000, init_stdev=0.1, l2_reg_w=0,\
                          l2_reg_V=0, rank=2, step_size=0.1)
fastFM.fit(X_train, y_train)

preds_fm = fastFM.predict(X_test)

print('acc:', accuracy_score(y_test, preds_fm))
print('raito: ', (preds_fm==1).sum()/len(preds_fm))
# check time
print("--- %s seconds ---" % (time.time() - start_time))


acc: 0.97492
raito:  0.49168
--- 0.0913231372833252 seconds ---


In [9]:
# cf)
accuracy_score(y_train, fastFM.predict(X_train))

0.95676

In [10]:
for alpha in [0.5]:

    # probability 형태로도 예측 가능
    pred_proba = fastFM.predict_proba(X_test)
    pred_proba = minmax_scale(pred_proba)

    #classification
    pred_proba[pred_proba < alpha] = -1
    pred_proba[pred_proba >= alpha] = 1
    
    print('----------------')
    print('alpha: ', alpha)
    print('acc:', accuracy_score(y_test, pred_proba))
    print('raito: ', (pred_proba==1).sum()/len(pred_proba))
    print('----------------')

----------------
alpha:  0.5
acc: 0.95092
raito:  0.50448
----------------


# 3. SVM polynomial kernel

In [11]:
from sklearn.svm import SVC

In [12]:
# check time
start_time = time.time()

clf = SVC(C=1.0, kernel='poly', degree=2, gamma = 2)
classifier = clf.fit(X_train,y_train)

preds_SVC = classifier.predict(X_test)

print('acc:', accuracy_score(y_test, preds_SVC))
print('raito: ', (preds_SVC==1).sum()/len(preds_SVC))

# check time
print("--- %s seconds ---" % (time.time() - start_time))

acc: 0.97208
raito:  0.4974
--- 179.4590449333191 seconds ---


- fastFM은 0.018초 걸림.
    - parameter에 의존성이 좀 있음. interation number 등.
- SVC는 1.214초가 걸림.
    - parameter 설정할게 없고, degree만 정해주면 거의 최상의 값 찾아줌
- accuracy는 0.96대로 동일하지만 fast FM이 압도적으로 빠름