In [1]:
import numpy as np
import pandas as pd

import gc
import os

from tqdm import tqdm_notebook, tqdm
import time

from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score, mean_squared_error

import matplotlib.pylab as plt
%matplotlib inline

# Compact integer dtypes applied to the id/rating columns when the
# MovieLens .dat files are parsed (passed as `dtype=` to pd.read_csv).
dtypes = dict(
    user='uint32',
    movie='uint16',
    rating='uint8',
)

# Widen the notebook container to 95% of the browser window.
# The helpers are imported from IPython.display, the public module;
# the IPython.core.display import path is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

# Datasets preparation helpers
### Building movielens dataset

**Основные паки признаков:**
- OHE юзеров и фильмов
- Время (число месяцев, прошедшее от первой записи)
- OHE жанров фильмов
- Признаки пользователей (только для ml-1m)
- Факт оценивания фильма (OHE)

In [2]:
# Build the various feature tables for MovieLens
def build_movielens(folder, test_size, with_genres=True, with_users_info=True, with_rated_movies=True):
    """Load a MovieLens dump from ``folder`` and return ``(train, test)`` DataFrames.

    Parameters
    ----------
    folder : str
        Directory holding ``ratings.dat`` and, optionally, ``movies.dat`` and
        ``users.dat`` (``::``-separated). A trailing slash is not required.
    test_size : float
        Fraction of the rows sampled (without replacement) into the test frame.
    with_genres : bool
        Add ``genre_<i>`` indicator columns when ``movies.dat`` is present.
    with_users_info : bool
        Add ``gender``, ``age``, ``occupation`` and ``zip`` columns when
        ``users.dat`` is present.
    with_rated_movies : bool
        Add a ``ratedMovies`` column: the list of movies the user rated in the
        *train* part. Test users that never occur in train get NaN there.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame

    Notes
    -----
    Seeds the global NumPy RNG with 42, so the split is reproducible.
    """
    print('load ratings....')
    ratings = pd.read_csv(os.path.join(folder, 'ratings.dat'), sep='::', header=None, engine='python',
                          names=['user', 'movie', 'rating', 'timestamp'], dtype=dtypes)

    print('calculation of monthes....')
    ratings['timestamp'] = pd.to_datetime(ratings.timestamp, unit='s')
    min_date = ratings.timestamp.min()
    # Number of 28-day periods since the first rating, scaled to [0, 1].
    months = (ratings.timestamp - min_date).dt.days // 28
    max_month = months.max()
    if max_month > 0:
        # Guard: when every rating falls into the first period the column stays 0
        # instead of turning into NaN through 0 / 0.
        months = months / max_month
    ratings['monthes'] = months.astype('float16')
    # `axis` is keyword-only in recent pandas, so name the columns explicitly.
    dataset = ratings.drop(columns='timestamp')
    del ratings
    gc.collect()

    if with_genres and os.path.isfile(os.path.join(folder, 'movies.dat')):
        print('load movies....')
        movies = pd.read_csv(os.path.join(folder, 'movies.dat'), sep='::', engine='python',
                             names=['movie', 'title', 'genres'],
                             usecols=['movie', 'genres'], header=None, dtype=dtypes)

        print('build genres ohe....')
        # NOTE(review): CountVectorizer's default tokenizer presumably splits hyphenated
        # genre names (e.g. "Sci-Fi") into separate tokens, so the number of genre_<i>
        # columns can exceed the number of genres - confirm this is intended.
        sparse_genres = CountVectorizer().fit_transform(movies.genres.map(lambda x: x.replace('|', ' ')))
        colnames = ['genre_{}'.format(col) for col in range(sparse_genres.shape[1])]
        sparse_genres = pd.DataFrame(sparse_genres.todense().astype('uint8'), columns=colnames)
        # Both frames carry the default positional index, so concat aligns row by row.
        movies = pd.concat([movies[['movie']], sparse_genres], axis=1)
        del sparse_genres
        gc.collect()

        print('join dataframes....')
        # Inner join: ratings of movies missing from movies.dat are dropped.
        dataset = pd.merge(dataset, movies, on='movie', how='inner')
        del movies
        gc.collect()
    else:
        print('genres skipped')

    if with_users_info and os.path.isfile(os.path.join(folder, 'users.dat')):
        print('load users info....')
        users = pd.read_csv(os.path.join(folder, 'users.dat'), sep='::',
                            header=None, names=['user', 'gender', 'age', 'occupation', 'zip'],
                            engine='python')
        users['age'] = (users['age'] / users['age'].max()).astype('float16')
        users['gender'] = users['gender'].apply(lambda x: 1 if x == 'M' else 0).astype('int8')
        users['occupation'] = users['occupation'].astype('int8')
        # Replace each zip code by its rank among the sorted unique codes.
        users['zip'] = np.unique(users['zip'].values, return_inverse=True)[1]
        users['zip'] = users['zip'].astype('int16')
        dataset = pd.merge(dataset, users, on='user', how='left')
        del users
        gc.collect()
    else:
        print('users info skipped')

    np.random.seed(42)
    print('train/test split...')
    test_indexes = np.random.choice(dataset.index, int(test_size * dataset.shape[0]), replace=False)
    test = dataset.loc[test_indexes]
    train = dataset.drop(test_indexes)
    del dataset
    gc.collect()

    if with_rated_movies:
        print('building rated movies history (on train)....')
        # The history is built from train rows only.
        rated_movies = train.groupby('user')['movie'].agg(list)
        train['ratedMovies'] = train.user.map(rated_movies)
        test['ratedMovies'] = test.user.map(rated_movies)
        del rated_movies
        gc.collect()
    else:
        print('rated movies history skipped')

    print('preprocessing done....')
    return train, test

## Helpers

Используются для приведения полученного датасета в заданный формат

In [3]:
# Helper for encoding categorical features
def get_offset_stats(train, test):
    """Compute the width of every categorical block used to offset feature indices.

    Each ``*_len`` value is ``max(id) + 1`` over train and test together, so a raw
    id can be used directly as a position inside its block. ``genre_len`` is the
    number of columns whose name contains ``'genre'``. ``occupation_len`` and
    ``zip_len`` are only present when the frames carry an ``occupation`` column.

    Returns
    -------
    dict mapping block name to a plain Python int.
    """
    def block_len(column):
        # pd.concat replaces Series.append, which no longer exists in pandas 2.x.
        # int() detaches the result from narrow unsigned dtypes so `+ 1` cannot wrap.
        return int(pd.concat([train[column], test[column]]).max()) + 1

    offset_stats = {}
    offset_stats['users_len'] = block_len('user')    # original inline note: 6040
    offset_stats['movie_len'] = block_len('movie')   # original inline note: 3952
    offset_stats['genre_len'] = len([col for col in train.columns if 'genre' in col])
    if 'occupation' in train.columns:
        offset_stats['occupation_len'] = block_len('occupation')
        offset_stats['zip_len'] = block_len('zip')
    return offset_stats

# Functions that convert a dataset into the format produced by features_extractor
def train2format(data, features_extractor, offset_lens, train='train',
                 with_normalization=False,
                 with_user_features=False,
                 with_rated_films=False):
    """Write ``data`` to the file ``train``, one ``"<label> <features>"`` line per row.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to export; must contain a ``rating`` column.
    features_extractor : callable
        Called as ``features_extractor(row, offset_lens, with_normalization,
        with_user_features, with_rated_films)`` and expected to return the
        feature part of the line as a string.
    offset_lens : dict
        Passed through to the extractor unchanged.
    train : str
        Output path (overwritten).
    """
    # The context manager closes the file even when the extractor raises mid-way.
    with open(train, 'w') as writer_train:
        for _, row in tqdm(data.iterrows(), total=data.shape[0], miniters=1000):
            features = features_extractor(
                row, offset_lens, with_normalization,
                with_user_features, with_rated_films)

            label = str(int(row['rating']))
            writer_train.write('{0} {1}\n'.format(label, features))

def test2format(data, features_extractor, offset_lens,
                x_test_output='test', y_test_output='ytest',
                with_normalization=False,
                with_user_features=False,
                with_rated_films=False):
    """Write the test rows in the same line format plus a separate labels file.

    ``x_test_output`` receives ``"<label> <features>"`` lines and
    ``y_test_output`` receives one label per line, in the same row order.
    ``features_extractor`` is called as ``features_extractor(row, offset_lens,
    with_normalization, with_user_features, with_rated_films)``.
    Both files are overwritten.
    """
    # Both handles are released even if the extractor raises part-way through.
    with open(x_test_output, 'w') as writer_test, open(y_test_output, 'w') as writer_ytest:
        for _, row in tqdm(data.iterrows(), total=data.shape[0]):
            label = str(int(row['rating']))
            features = features_extractor(
                row, offset_lens, with_normalization,
                with_user_features, with_rated_films)

            writer_test.write('{0} {1}\n'.format(label, features))
            writer_ytest.write('%s\n' % label)

## Extractor
Собственно обработка одной строчки датасета

In [4]:
def fm_extractor(row, field_info, with_normalization=False, with_user_features=False, with_rated_films=False):
    """Render one dataset row as space-terminated ``index:value`` tokens.

    Block layout (each block starts where the previous one ends):
    user one-hot, movie one-hot, one ``monthes`` slot, genre indicators and,
    optionally, gender, age, occupation one-hot, zip one-hot and the list of
    movies rated by the user.

    Parameters
    ----------
    row : mapping
        Anything indexable by column name (a pandas row or a dict).
    field_info : dict
        Block widths: ``users_len``, ``movie_len``, ``genre_len`` and, for user
        features, ``occupation_len`` and ``zip_len``.
    with_normalization : bool
        If true, one-hot values are ``1 / block_width`` (2 significant digits)
        instead of ``1``.
    with_user_features : bool
        Emit the gender / age / occupation / zip blocks.
    with_rated_films : bool
        Emit one token per movie in ``row['ratedMovies']`` with value
        ``1 / len(ratedMovies)``.

    Returns
    -------
    str
        The tokens, each followed by a single space.
    """
    def one_hot_value(block_len):
        # Value written for an active one-hot slot of a block of this width.
        return '{0:.2}'.format(1 / block_len) if with_normalization else '1'

    tokens = []
    offset = 0

    # Ids are cast to int: iterating a purely numeric frame row by row yields
    # floats, which would otherwise be printed as indices like "2.0".
    tokens.append('{0}:{1} '.format(int(row['user']) + offset, one_hot_value(field_info['users_len'])))
    offset += field_info['users_len']

    tokens.append('{0}:{1} '.format(int(row['movie']) + offset, one_hot_value(field_info['movie_len'])))
    offset += field_info['movie_len']

    tokens.append('{0}:{1:.1} '.format(offset, float(row['monthes'])))
    offset += 1

    genre_len = field_info['genre_len']
    if genre_len:
        # Only computed when genres exist, so a zero-width block cannot divide by zero.
        genre_value = one_hot_value(genre_len)
        for genre_index in range(genre_len):
            if row['genre_{}'.format(genre_index)] == 1:
                tokens.append('{0}:{1} '.format(offset + genre_index, genre_value))
    offset += genre_len

    if with_user_features:
        tokens.append('{0}:{1} '.format(offset, int(row['gender'])))
        offset += 1
        tokens.append('{0}:{1:.1} '.format(offset, float(row['age'])))
        offset += 1

        tokens.append('{0}:{1} '.format(int(row['occupation']) + offset,
                                        one_hot_value(field_info['occupation_len'])))
        offset += field_info['occupation_len']

        tokens.append('{0}:{1} '.format(int(row['zip']) + offset, one_hot_value(field_info['zip_len'])))
        offset += field_info['zip_len']

    if with_rated_films:
        rated_movies = row['ratedMovies']
        # A missing history (e.g. NaN) has no length; treat it as empty.
        if not hasattr(rated_movies, '__len__'):
            rated_movies = []
        n_rated_movies = len(rated_movies)
        for movie_id in rated_movies:
            tokens.append('{0}:{1:.3} '.format(int(movie_id) + offset, 1 / n_rated_movies))

    return ''.join(tokens)

## Conversion between vw/fm and regression/classification
Хелперы для преобразования между vw / fm форматами

In [5]:
def fm2vw(infile, outfile):
    """Copy ``infile`` to ``outfile``, inserting ``|`` between the label and the features.

    Each input line is expected to be ``"<label> <features>"``; the output line
    is ``"<label> | <features>"``. The label is everything before the first
    space, so labels longer than one character are handled too.
    """
    # Context managers guarantee the output is flushed and both files are closed.
    with open(infile, 'r') as input_file, open(outfile, 'w') as out_file:
        for line in tqdm(input_file):
            label, _, features = line.rstrip('\n').partition(' ')
            out_file.write(label + ' | ' + features + '\n')

def reg2fm(infile, outfile):
    """Turn a regression file into a binary-classification file.

    The leading label of every line is replaced by ``1`` when it is greater
    than 3 and by ``0`` otherwise; the feature part is copied unchanged.
    The label is everything before the first space.
    """
    # Context managers guarantee the output is flushed and both files are closed.
    with open(infile, 'r') as input_file, open(outfile, 'w') as out_file:
        for line in tqdm(input_file):
            label, _, features = line.rstrip('\n').partition(' ')
            target = str(int(float(label) > 3))
            out_file.write(target + ' ' + features + '\n')

def reg2vw(infile, outfile):
    """Turn a regression file into a ``-1`` / ``1`` labelled file with a ``|`` separator.

    The leading label becomes ``1`` when it is greater than 3 and ``-1``
    otherwise; the output line is ``"<target> | <features>"``.
    The label is everything before the first space.
    """
    # Context managers guarantee the output is flushed and both files are closed.
    with open(infile, 'r') as input_file, open(outfile, 'w') as out_file:
        for line in tqdm(input_file):
            label, _, features = line.rstrip('\n').partition(' ')
            target = str((float(label) > 3) * 2 - 1)
            out_file.write(target + ' | ' + features + '\n')

## Inplace metrics

Функции, позволяющие оценить те или иные метрики, не загружая в память таблички

In [6]:
def get_rmse(ytest_input='ytest', pred_input='pred'):
    """Stream two files line by line and return the RMSE between them.

    Parameters
    ----------
    ytest_input : str
        Path to a file with one true value per line.
    pred_input : str
        Path to a file with one prediction per line, in the same order.

    Returns
    -------
    numpy.float64

    Raises
    ------
    ValueError
        If the files have a different number of lines, are both empty, or a
        line cannot be parsed as a float.
    """
    from itertools import zip_longest

    n, loss = 0, 0.0
    with open(ytest_input, 'r') as reader_ytest, open(pred_input, 'r') as reader_pred:
        # zip_longest (instead of zip) exposes a length mismatch rather than
        # silently scoring only the common prefix.
        for label, pred in zip_longest(reader_ytest, reader_pred):
            if label is None or pred is None:
                raise ValueError('{0} and {1} have a different number of lines'.format(
                    ytest_input, pred_input))
            n += 1
            loss += np.square(float(pred) - float(label))
    if n == 0:
        raise ValueError('no rows to evaluate in {0}'.format(ytest_input))
    return np.sqrt(loss / n)

def get_log_loss(ytest_input='ytest', pred_input='pred', eps=1e-15):
    """Stream two files line by line and return the mean binary log loss.

    Parameters
    ----------
    ytest_input : str
        Path to a file with one integer label per line (used as 0 / 1 in the
        cross-entropy formula).
    pred_input : str
        Path to a file with one predicted probability per line, same order.
    eps : float
        Predictions are clipped to ``[eps, 1 - eps]`` so that an exact 0 or 1
        does not produce ``log(0)`` and poison the sum with inf / NaN.

    Returns
    -------
    numpy.float64

    Raises
    ------
    ValueError
        If the files have a different number of lines, are both empty, or a
        line cannot be parsed.
    """
    from itertools import zip_longest

    n, loss = 0, 0.0
    with open(ytest_input, 'r') as reader_ytest, open(pred_input, 'r') as reader_pred:
        # zip_longest (instead of zip) exposes a length mismatch rather than
        # silently scoring only the common prefix.
        for label, pred in zip_longest(reader_ytest, reader_pred):
            if label is None or pred is None:
                raise ValueError('{0} and {1} have a different number of lines'.format(
                    ytest_input, pred_input))
            n += 1
            true_label = int(label)
            pred_score = min(max(float(pred), eps), 1 - eps)
            loss -= true_label * np.log(pred_score) + (1 - true_label) * np.log(1 - pred_score)
    if n == 0:
        raise ValueError('no rows to evaluate in {0}'.format(ytest_input))
    return loss / n

# Regression

Попробуем для начала на датасете ml-1m, используя нормализацию и все полученные фичи, кроме ratedMovies

Чуть позже же решим, делает ли нам нормализация категориальных признаков хорошо или нет

In [8]:
# Build the ml-1m train/test frames (25% of rows go to test) with every
# optional feature group enabled, then derive the block widths used to
# offset feature indices.
folder = 'SourceData/ml-1m/'
test_size = 0.25
train, test = build_movielens(folder, test_size)
field_info = get_offset_stats(train, test)
# Preview the first rows of the train frame.
train.head()

load ratings....
calculation of monthes....
load movies....
build genres ohe....
join dataframes....
load users info....
train/test split...
building rated movies history (on train)....
preprocessing done....


Unnamed: 0,user,movie,rating,monthes,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,...,genre_15,genre_16,genre_17,genre_18,genre_19,gender,age,occupation,zip,ratedMovies
1,2,1193,5,0.216187,0,0,0,0,0,0,...,0,0,0,0,0,1,1.0,16,2248,"[1193, 3105, 2321, 1962, 1207, 2028, 1246, 306..."
2,12,1193,4,0.216187,0,0,0,0,0,0,...,0,0,0,0,0,1,0.446533,12,1165,"[1193, 2804, 1198, 593, 1247, 1641, 1221, 111,..."
3,15,1193,4,0.216187,0,0,0,0,0,0,...,0,0,0,0,0,1,0.446533,7,904,"[1193, 3408, 3105, 2321, 527, 2762, 260, 2028,..."
4,17,1193,5,0.216187,0,0,0,0,0,0,...,0,0,0,0,0,1,0.893066,1,3187,"[1193, 595, 2321, 720, 1270, 527, 1097, 2762, ..."
5,18,1193,4,0.216187,0,0,0,0,0,0,...,0,0,0,0,0,0,0.321533,3,3227,"[1193, 1197, 919, 595, 2018, 2797, 527, 48, 10..."


In [8]:
# Export the normalized variant (one-hot values scaled by block width) with
# user features; rated-movies history is left off (with_rated_films default).
train2format(train, fm_extractor, field_info, 'Generated/train_1m_norm_user_reg.fm',
             with_normalization=True, with_user_features=True)
test2format(test, fm_extractor, field_info, 'Generated/test_1m_norm_user_reg.fm', 'Generated/ytest_1m_norm_user_reg', 
            with_normalization=True, with_user_features=True)

100%|██████████| 750157/750157 [07:11<00:00, 1738.67it/s]
100%|██████████| 250052/250052 [02:41<00:00, 1551.64it/s]


**Для начала посмотрим, как же пользоваться этой нашей тулзой**

In [51]:
# Print the command-line options of the local FM binary.
!./FM --help

Library for using factorization algorithm
Usage:
  Factorization machines [OPTION...]

  -l, --learning_rate arg       Learning rate value, default 0.1
      --bias                    Use bias or not
      --linear                  Use linear or not
      --pairwise arg            Two-way interactions order
  -r, --regularization_const arg
                                Regularization constant, default 0
  -i, --iterations arg          Number of iterations, default 100
  -m, --learning_method arg     Learning method (SGD, ALS), default SGD
  -g, --inplace arg             Storage (inplace, memory), default memory
  -t, --train_filename arg      Training file name
  -e, --test_filename arg       Testing file name
  -s, --task_type arg           Task type parameter
      --hash_size arg           Positive hash size if use hashing trick, else
                                -1, default -1
      --hash_random_seed arg    Random seed of hashing
  -h, --help               

Начнем жадно подбирать параметры для SGD и ALS независимо. Начнем со старого доброго **SGD**

In [23]:
# FM binary, SGD on the normalized files: learning rate 1e-4, 5 iterations,
# inplace storage. Wall-clock time of the run is printed afterwards.
t_start = time.time()
!./FM --bias true --linear true --task_type regression -t Generated/train_1m_norm_user_reg.fm -e Generated/test_1m_norm_user_reg.fm \
    --learning_rate 0.0001 --learning_method SGD --hash_size -1 -g inplace -i 5
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Preprocessing
Processed 750157 rows
Target from 1 to 5
Max feature index is 13476
Preprocessing
Processed 250052 rows
Target from 1 to 5
Max feature index is 13476
iter=0 Train=1.24986 Test=1.25241
iter=1 Train=1.24717 Test=1.24976
iter=2 Train=1.24731 Test=1.24994
iter=3 Train=1.2479 Test=1.25057
iter=4 Train=1.24863 Test=1.25135
Time:		215 seconds


In [20]:
# Same SGD run with the learning rate lowered to 1e-5.
t_start = time.time()
!./FM --bias true --linear true --task_type regression -t Generated/train_1m_norm_user_reg.fm -e Generated/test_1m_norm_user_reg.fm \
    --learning_rate 0.00001 --learning_method SGD --hash_size -1 -g inplace -i 5
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Preprocessing
Processed 750157 rows
Target from 1 to 5
Max feature index is 13476
Preprocessing
Processed 250052 rows
Target from 1 to 5
Max feature index is 13476
iter=0 Train=1.2283 Test=1.23035
iter=1 Train=1.19918 Test=1.20147
iter=2 Train=1.19037 Test=1.19275
iter=3 Train=1.18601 Test=1.18845
iter=4 Train=1.1833 Test=1.18577
Time:		255 seconds


In [30]:
# SGD, learning rate 1e-5, with --pairwise 0
# (presumably disables the two-way interaction term - confirm against the tool).
t_start = time.time()
!./FM --bias true --linear true --pairwise 0 --task_type regression -t Generated/train_1m_norm_user_reg.fm -e Generated/test_1m_norm_user_reg.fm \
    --learning_rate 0.00001 --learning_method SGD --hash_size -1 -g inplace -i 5
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Preprocessing
Processed 750157 rows
Target from 1 to 5
Max feature index is 13476
Preprocessing
Processed 250052 rows
Target from 1 to 5
Max feature index is 13476
iter=0 Train=1.22793 Test=1.22998
iter=1 Train=1.19955 Test=1.20183
iter=2 Train=1.19078 Test=1.19316
iter=3 Train=1.18639 Test=1.18882
iter=4 Train=1.18362 Test=1.18608
Time:		136 seconds


In [22]:
# SGD with the learning rate lowered further to 1e-6.
t_start = time.time()
!./FM --bias true --linear true --task_type regression -t Generated/train_1m_norm_user_reg.fm -e Generated/test_1m_norm_user_reg.fm \
    --learning_rate 0.000001 --learning_method SGD --hash_size -1 -g inplace -i 5
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Preprocessing
Processed 750157 rows
Target from 1 to 5
Max feature index is 13476
Preprocessing
Processed 250052 rows
Target from 1 to 5
Max feature index is 13476
iter=0 Train=1.57304 Test=1.57461
iter=1 Train=1.29849 Test=1.29996
iter=2 Train=1.2531 Test=1.25456
iter=3 Train=1.23133 Test=1.23285
iter=4 Train=1.21515 Test=1.21673
Time:		219 seconds


In [29]:
# SGD, learning rate 1e-6, with --pairwise 0.
t_start = time.time()
!./FM --bias true --linear true --pairwise 0 --task_type regression -t Generated/train_1m_norm_user_reg.fm -e Generated/test_1m_norm_user_reg.fm \
    --learning_rate 0.000001 --learning_method SGD --hash_size -1 -g inplace -i 5
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Preprocessing
Processed 750157 rows
Target from 1 to 5
Max feature index is 13476
Preprocessing
Processed 250052 rows
Target from 1 to 5
Max feature index is 13476
iter=0 Train=1.57427 Test=1.57586
iter=1 Train=1.29442 Test=1.29592
iter=2 Train=1.24919 Test=1.25068
iter=3 Train=1.22833 Test=1.22986
iter=4 Train=1.21293 Test=1.21453
Time:		127 seconds


In [26]:
# SGD, learning rate 1e-5, now with a regularization constant of 0.01.
t_start = time.time()
!./FM --bias true --linear true --task_type regression -t Generated/train_1m_norm_user_reg.fm -e Generated/test_1m_norm_user_reg.fm \
    --learning_rate 0.00001 --regularization_const 0.01 --learning_method SGD --hash_size -1 -g inplace -i 5
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Preprocessing
Processed 750157 rows
Target from 1 to 5
Max feature index is 13476
Preprocessing
Processed 250052 rows
Target from 1 to 5
Max feature index is 13476
iter=0 Train=1.23412 Test=1.23618
iter=1 Train=1.20378 Test=1.20607
iter=2 Train=1.19402 Test=1.19641
iter=3 Train=1.18942 Test=1.19184
iter=4 Train=1.18681 Test=1.18926
Time:		182 seconds


**Лучший SGD набирает порядка 1.18 RMSE на отложенной выборке**

In [24]:
# Switch the learning method to ALS on the normalized files (5 iterations,
# storage option left at the tool's default).
t_start = time.time()
!./FM --bias true --linear true --task_type regression -t Generated/train_1m_norm_user_reg.fm -e Generated/test_1m_norm_user_reg.fm \
    --learning_method ALS --hash_size -1 -i 5
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Preprocessing
Processed 750157 rows
Target from 1 to 5
Max feature index is 13476
Preprocessing
Processed 250052 rows
Target from 1 to 5
Max feature index is 13476
iter=0 Train=0.883127 Test=0.925434
iter=1 Train=0.863028 Test=0.923017
iter=2 Train=0.849879 Test=0.919942
iter=3 Train=0.840508 Test=0.915377
iter=4 Train=0.834058 Test=0.911848
Time:		859 seconds


In [28]:
# ALS with --pairwise 0.
t_start = time.time()
!./FM --bias true --linear true --pairwise 0 --task_type regression -t Generated/train_1m_norm_user_reg.fm -e Generated/test_1m_norm_user_reg.fm \
    --learning_method ALS --hash_size -1 -i 5
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Preprocessing
Processed 750157 rows
Target from 1 to 5
Max feature index is 13476
Preprocessing
Processed 250052 rows
Target from 1 to 5
Max feature index is 13476
iter=0 Train=0.904478 Test=0.920831
iter=1 Train=0.895432 Test=0.911426
iter=2 Train=0.894925 Test=0.910885
iter=3 Train=0.894861 Test=0.910814
iter=4 Train=0.894837 Test=0.910785
Time:		165 seconds


In [29]:
# ALS with --pairwise 0 and a regularization constant of 1e-5.
t_start = time.time()
!./FM --bias true --linear true --pairwise 0 --task_type regression -t Generated/train_1m_norm_user_reg.fm -e Generated/test_1m_norm_user_reg.fm \
    --learning_method ALS --regularization_const 0.00001 --hash_size -1 -i 5
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Preprocessing
Processed 750157 rows
Target from 1 to 5
Max feature index is 13476
Preprocessing
Processed 250052 rows
Target from 1 to 5
Max feature index is 13476
iter=0 Train=0.937126 Test=0.94576
iter=1 Train=0.935544 Test=0.943866
iter=2 Train=0.93519 Test=0.943468
iter=3 Train=0.934827 Test=0.943097
iter=4 Train=0.934496 Test=0.942765
Time:		161 seconds


**Лучший скор - 0.910**

**ALS результаты и вовсе внушительные, причем наличие парных взаимодействий на данном датасете на эффективность особо не влияет**

## VW
**Приведем наши датасеты к формату VW**

In [35]:
# Convert the normalized train/test files from the FM line format to the VW one.
fm2vw('Generated/train_1m_norm_user_reg.fm', 'Generated/train_1m_norm_user_reg.vw')
fm2vw('Generated/test_1m_norm_user_reg.fm', 'Generated/test_1m_norm_user_reg.vw')

750157it [00:01, 673301.96it/s]
250052it [00:00, 718377.71it/s]


**И снова, как мы любим, начнем жадно выбирать параметры.** Сначала для SGD

In [39]:
# VW with --sgd, squared loss, learning rate 0.01, 5 passes. The model is
# trained, applied to the test file, and scored with get_rmse; the final
# "Time" covers training + prediction + evaluation.
# NOTE(review): -b 14 gives 2**14 = 16384 hash slots while the FM tool reports
# a max feature index of 13476 - hash collisions look likely; confirm.
t_start = time.time()
!vw Generated/train_1m_norm_user_reg.vw -f VwHelpFiles/model --loss_function squared --sgd --learning_rate 0.01 -b 14 --passes 5 --cache_file VwHelpFiles/cache --quiet
print('Training done - {0} seconds'.format(int(time.time() - t_start)))
! vw -i VwHelpFiles/model -t Generated/test_1m_norm_user_reg.vw -r VwHelpFiles/pred --quiet
print('Start evaluation')
print('Validation RMSE:\t {}'.format(get_rmse('Generated/ytest_1m_norm_user_reg', 'VwHelpFiles/pred')))
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Training done - 4 seconds
Start evaluation
Validation RMSE:	 1.198904507129869
Time:		6 seconds


In [37]:
# VW --sgd with the learning rate lowered to 0.001 (5 passes).
t_start = time.time()
!vw Generated/train_1m_norm_user_reg.vw -f VwHelpFiles/model --loss_function squared --sgd --learning_rate 0.001 -b 14 --passes 5 --cache_file VwHelpFiles/cache --quiet
print('Training done - {0} seconds'.format(int(time.time() - t_start)))
! vw -i VwHelpFiles/model -t Generated/test_1m_norm_user_reg.vw -r VwHelpFiles/pred --quiet
print('Start evaluation')
print('Validation RMSE:\t {}'.format(get_rmse('Generated/ytest_1m_norm_user_reg', 'VwHelpFiles/pred')))
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Training done - 4 seconds
Start evaluation
Validation RMSE:	 1.1675452150871288
Time:		6 seconds


In [40]:
# VW --sgd with the learning rate lowered to 0.0001 (5 passes).
t_start = time.time()
!vw Generated/train_1m_norm_user_reg.vw -f VwHelpFiles/model --loss_function squared --sgd --learning_rate 0.0001 -b 14 --passes 5 --cache_file VwHelpFiles/cache --quiet
print('Training done - {0} seconds'.format(int(time.time() - t_start)))
! vw -i VwHelpFiles/model -t Generated/test_1m_norm_user_reg.vw -r VwHelpFiles/pred --quiet
print('Start evaluation')
print('Validation RMSE:\t {}'.format(get_rmse('Generated/ytest_1m_norm_user_reg', 'VwHelpFiles/pred')))
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Training done - 4 seconds
Start evaluation
Validation RMSE:	 1.5458927830958853
Time:		6 seconds


In [38]:
# Back to learning rate 0.001 with --sgd, now with 10 passes.
t_start = time.time()
!vw Generated/train_1m_norm_user_reg.vw -f VwHelpFiles/model --loss_function squared --sgd --learning_rate 0.001 -b 14 --passes 10 --cache_file VwHelpFiles/cache --quiet
print('Training done - {0} seconds'.format(int(time.time() - t_start)))
! vw -i VwHelpFiles/model -t Generated/test_1m_norm_user_reg.vw -r VwHelpFiles/pred --quiet
print('Start evaluation')
print('Validation RMSE:\t {}'.format(get_rmse('Generated/ytest_1m_norm_user_reg', 'VwHelpFiles/pred')))
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Training done - 8 seconds
Start evaluation
Validation RMSE:	 1.1483929069769112
Time:		10 seconds


In [43]:
# VW --sgd, learning rate 0.001, 20 passes.
t_start = time.time()
!vw Generated/train_1m_norm_user_reg.vw -f VwHelpFiles/model --loss_function squared --sgd --learning_rate 0.001 -b 14 --passes 20 --cache_file VwHelpFiles/cache --quiet
print('Training done - {0} seconds'.format(int(time.time() - t_start)))
! vw -i VwHelpFiles/model -t Generated/test_1m_norm_user_reg.vw -r VwHelpFiles/pred --quiet
print('Start evaluation')
print('Validation RMSE:\t {}'.format(get_rmse('Generated/ytest_1m_norm_user_reg', 'VwHelpFiles/pred')))
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Training done - 15 seconds
Start evaluation
Validation RMSE:	 1.1354925339260975
Time:		18 seconds


In [44]:
# VW --sgd, learning rate 0.001, 40 passes.
t_start = time.time()
!vw Generated/train_1m_norm_user_reg.vw -f VwHelpFiles/model --loss_function squared --sgd --learning_rate 0.001 -b 14 --passes 40 --cache_file VwHelpFiles/cache --quiet
print('Training done - {0} seconds'.format(int(time.time() - t_start)))
! vw -i VwHelpFiles/model -t Generated/test_1m_norm_user_reg.vw -r VwHelpFiles/pred --quiet
print('Start evaluation')
print('Validation RMSE:\t {}'.format(get_rmse('Generated/ytest_1m_norm_user_reg', 'VwHelpFiles/pred')))
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Training done - 31 seconds
Start evaluation
Validation RMSE:	 1.1269815304667365
Time:		33 seconds


In [55]:
# VW --sgd, learning rate 0.001, 80 passes.
t_start = time.time()
!vw Generated/train_1m_norm_user_reg.vw -f VwHelpFiles/model --loss_function squared --sgd --learning_rate 0.001 -b 14 --passes 80 --cache_file VwHelpFiles/cache --quiet
print('Training done - {0} seconds'.format(int(time.time() - t_start)))
! vw -i VwHelpFiles/model -t Generated/test_1m_norm_user_reg.vw -r VwHelpFiles/pred --quiet
print('Start evaluation')
print('Validation RMSE:\t {}'.format(get_rmse('Generated/ytest_1m_norm_user_reg', 'VwHelpFiles/pred')))
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Training done - 62 seconds
Start evaluation
Validation RMSE:	 1.1224261716600354
Time:		64 seconds


**Лучший результат на VW SGD - 1.1224**

Попробуем на хитрых дефолтных параметрах

In [46]:
# Same pipeline without the --sgd flag, i.e. VW's default update rule:
# learning rate 0.001, 5 passes.
t_start = time.time()
!vw Generated/train_1m_norm_user_reg.vw -f VwHelpFiles/model --loss_function squared --learning_rate 0.001 -b 14 --passes 5 --cache_file VwHelpFiles/cache --quiet
print('Training done - {0} seconds'.format(int(time.time() - t_start)))
! vw -i VwHelpFiles/model -t Generated/test_1m_norm_user_reg.vw -r VwHelpFiles/pred --quiet
print('Start evaluation')
print('Validation RMSE:\t {}'.format(get_rmse('Generated/ytest_1m_norm_user_reg', 'VwHelpFiles/pred')))
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Training done - 4 seconds
Start evaluation
Validation RMSE:	 1.397999386906915
Time:		6 seconds


In [48]:
# Default VW update rule (no --sgd), learning rate raised to 0.01, 5 passes.
t_start = time.time()
!vw Generated/train_1m_norm_user_reg.vw -f VwHelpFiles/model --loss_function squared --learning_rate 0.01 -b 14 --passes 5 --cache_file VwHelpFiles/cache --quiet
print('Training done - {0} seconds'.format(int(time.time() - t_start)))
! vw -i VwHelpFiles/model -t Generated/test_1m_norm_user_reg.vw -r VwHelpFiles/pred --quiet
print('Start evaluation')
print('Validation RMSE:\t {}'.format(get_rmse('Generated/ytest_1m_norm_user_reg', 'VwHelpFiles/pred')))
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Training done - 4 seconds
Start evaluation
Validation RMSE:	 1.068677025561811
Time:		6 seconds


In [56]:
# Default VW update rule (no --sgd), learning rate 0.01, 80 passes.
t_start = time.time()
!vw Generated/train_1m_norm_user_reg.vw -f VwHelpFiles/model --loss_function squared --learning_rate 0.01 -b 14 --passes 80 --cache_file VwHelpFiles/cache --quiet
print('Training done - {0} seconds'.format(int(time.time() - t_start)))
! vw -i VwHelpFiles/model -t Generated/test_1m_norm_user_reg.vw -r VwHelpFiles/pred --quiet
print('Start evaluation')
print('Validation RMSE:\t {}'.format(get_rmse('Generated/ytest_1m_norm_user_reg', 'VwHelpFiles/pred')))
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Training done - 70 seconds
Start evaluation
Validation RMSE:	 0.9439532525754946
Time:		72 seconds


**Лучший результат на VW - 0.944**

Напомню, что лучший скор нашей тулзы был 0.910

**И хотя какой-то хороший бейзлайн на VW получается довольно шустро, такого же качества, которое давал нам ALS, добиться на VW не удалось**


# Ненормированные ml-1m
Посмотрим как себя поведут утилиты на ненормированных признаках

In [9]:
# Export the same frames without normalization (one-hot values written as 1),
# still including the user features.
train2format(train, fm_extractor, field_info, 'Generated/train_1m_user_reg.fm',
             with_normalization=False, with_user_features=True)
test2format(test, fm_extractor, field_info, 'Generated/test_1m_user_reg.fm', 'Generated/ytest_1m_user_reg', 
            with_normalization=False, with_user_features=True)

100%|██████████| 750157/750157 [05:15<00:00, 2378.11it/s]
100%|██████████| 250052/250052 [01:51<00:00, 2241.35it/s]


In [10]:
# FM binary, SGD on the non-normalized files: learning rate 0.01, 5 iterations.
t_start = time.time()
!./FM --bias true --linear true --task_type regression -t Generated/train_1m_user_reg.fm -e Generated/test_1m_user_reg.fm \
    --learning_rate 0.01 --learning_method SGD --hash_size -1 -g inplace -i 5
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Preprocessing
Processed 750157 rows
Target from 1 to 5
Max feature index is 13476
Preprocessing
Processed 250052 rows
Target from 1 to 5
Max feature index is 13476
iter=0 Train=1.16565 Test=1.17666
iter=1 Train=1.12185 Test=1.13626
iter=2 Train=1.10682 Test=1.12384
iter=3 Train=1.10884 Test=1.12796
iter=4 Train=1.11811 Test=1.13904
Time:		215 seconds


In [18]:
# SGD, learning rate 0.01, regularization constant 0.1.
t_start = time.time()
!./FM --bias true --linear true --task_type regression -t Generated/train_1m_user_reg.fm -e Generated/test_1m_user_reg.fm  \
    --learning_rate 0.01 --learning_method SGD --regularization_const 0.1 --hash_size -1 -g inplace -i 5
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Preprocessing
Processed 750157 rows
Target from 1 to 5
Max feature index is 13476
Preprocessing
Processed 250052 rows
Target from 1 to 5
Max feature index is 13476
iter=0 Train=1.81831 Test=1.81998
iter=1 Train=1.80246 Test=1.80387
iter=2 Train=1.80235 Test=1.80342
iter=3 Train=1.80293 Test=1.80416
iter=4 Train=1.80428 Test=1.80517
Time:		213 seconds


In [20]:
# SGD, learning rate 0.01, regularization constant 0.01.
t_start = time.time()
!./FM --bias true --linear true --task_type regression -t Generated/train_1m_user_reg.fm -e Generated/test_1m_user_reg.fm \
    --learning_rate 0.01 --learning_method SGD --regularization_const 0.01 --hash_size -1 -g inplace -i 5
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Preprocessing
Processed 750157 rows
Target from 1 to 5
Max feature index is 13476
Preprocessing
Processed 250052 rows
Target from 1 to 5
Max feature index is 13476
iter=0 Train=1.07767 Test=1.08947
iter=1 Train=1.01272 Test=1.02826
iter=2 Train=0.987602 Test=1.00637
iter=3 Train=0.986864 Test=1.00808
iter=4 Train=0.997491 Test=1.02045
Time:		214 seconds


In [19]:
# SGD, learning rate 0.01, regularization constant 0.001.
t_start = time.time()
!./FM --bias true --linear true --task_type regression -t Generated/train_1m_user_reg.fm -e Generated/test_1m_user_reg.fm \
    --learning_rate 0.01 --learning_method SGD --regularization_const 0.001 --hash_size -1 -g inplace -i 5
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Preprocessing
Processed 750157 rows
Target from 1 to 5
Max feature index is 13476
Preprocessing
Processed 250052 rows
Target from 1 to 5
Max feature index is 13476
iter=0 Train=1.15296 Test=1.16405
iter=1 Train=1.10143 Test=1.11605
iter=2 Train=1.07779 Test=1.09532
iter=3 Train=1.07129 Test=1.09123
iter=4 Train=1.07314 Test=1.09513
Time:		219 seconds


In [21]:
# SGD, learning rate lowered to 0.001, regularization constant 0.001.
t_start = time.time()
!./FM --bias true --linear true --task_type regression -t Generated/train_1m_user_reg.fm -e Generated/test_1m_user_reg.fm \
    --learning_rate 0.001 --learning_method SGD --regularization_const 0.001 --hash_size -1 -g inplace -i 5
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Preprocessing
Processed 750157 rows
Target from 1 to 5
Max feature index is 13476
Preprocessing
Processed 250052 rows
Target from 1 to 5
Max feature index is 13476
iter=0 Train=1.2642 Test=1.26991
iter=1 Train=1.21977 Test=1.22678
iter=2 Train=1.18414 Test=1.19214
iter=3 Train=1.15348 Test=1.16238
iter=4 Train=1.12807 Test=1.13787
Time:		214 seconds


In [23]:
# SGD, learning rate 0.002, regularization constant 0.01, 10 iterations.
t_start = time.time()
!./FM --bias true --linear true --task_type regression -t Generated/train_1m_user_reg.fm -e Generated/test_1m_user_reg.fm \
    --learning_rate 0.002 --learning_method SGD --regularization_const 0.01 --hash_size -1 -g inplace -i 10
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Preprocessing
Processed 750157 rows
Target from 1 to 5
Max feature index is 13476
Preprocessing
Processed 250052 rows
Target from 1 to 5
Max feature index is 13476
iter=0 Train=1.19262 Test=1.19966
iter=1 Train=1.13433 Test=1.14319
iter=2 Train=1.09052 Test=1.10094
iter=3 Train=1.05602 Test=1.06792
iter=4 Train=1.02983 Test=1.04315
iter=5 Train=1.01051 Test=1.02516
iter=6 Train=0.996751 Test=1.01261
iter=7 Train=0.987356 Test=1.00432
iter=8 Train=0.981408 Test=0.999345
iter=9 Train=0.978387 Test=0.997183
Time:		389 seconds


**Лучший результат нашей тулзы на SGD показал 0.997 RMSE на отложенной выборке**

In [49]:
# ALS on the non-normalized files, memory storage, 5 iterations.
t_start = time.time()
!./FM --bias true --linear true --task_type regression -t Generated/train_1m_user_reg.fm -e Generated/test_1m_user_reg.fm \
    --learning_method ALS --hash_size -1 -g memory -i 5
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Preprocessing
Processed 750157 rows
Target from 1 to 5
Max feature index is 13476
Preprocessing
Processed 250052 rows
Target from 1 to 5
Max feature index is 13476
iter=0 Train=0.864562 Test=0.917794
iter=1 Train=0.836503 Test=0.904523
iter=2 Train=0.822554 Test=0.896032
iter=3 Train=0.815551 Test=0.891903
iter=4 Train=0.811589 Test=0.889514
Time:		781 seconds


In [51]:
# ALS with --pairwise 0 on the non-normalized files.
t_start = time.time()
!./FM --bias true --linear true --pairwise 0 --task_type regression -t Generated/train_1m_user_reg.fm -e Generated/test_1m_user_reg.fm \
    --learning_method ALS --hash_size -1 -g memory -i 5
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Preprocessing
Processed 750157 rows
Target from 1 to 5
Max feature index is 13476
Preprocessing
Processed 250052 rows
Target from 1 to 5
Max feature index is 13476
iter=0 Train=0.904478 Test=0.920831
iter=1 Train=0.895432 Test=0.911426
iter=2 Train=0.894925 Test=0.910885
iter=3 Train=0.894861 Test=0.910814
iter=4 Train=0.894837 Test=0.910785
Time:		164 seconds


In [52]:
t_start = time.time()
!./FM --bias true --linear true --pairwise 0 --task_type regression -t Generated/train_1m_user_reg.fm -e Generated/test_1m_user_reg.fm \
    --learning_method ALS --regularization_const 0.01 --hash_size -1 -g memory -i 5
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Preprocessing
Processed 750157 rows
Target from 1 to 5
Max feature index is 13476
Preprocessing
Processed 250052 rows
Target from 1 to 5
Max feature index is 13476
iter=0 Train=0.904478 Test=0.920826
iter=1 Train=0.895433 Test=0.911422
iter=2 Train=0.894925 Test=0.910881
iter=3 Train=0.894861 Test=0.910809
iter=4 Train=0.894837 Test=0.91078
Time:		157 seconds


In [53]:
t_start = time.time()
!./FM --bias true --linear true --pairwise 0 --task_type regression -t Generated/train_1m_user_reg.fm -e Generated/test_1m_user_reg.fm \
    --learning_method ALS --regularization_const 0.001 --hash_size -1 -g memory -i 5
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Preprocessing
Processed 750157 rows
Target from 1 to 5
Max feature index is 13476
Preprocessing
Processed 250052 rows
Target from 1 to 5
Max feature index is 13476
iter=0 Train=0.904478 Test=0.92083
iter=1 Train=0.895432 Test=0.911426
iter=2 Train=0.894925 Test=0.910885
iter=3 Train=0.894861 Test=0.910813
iter=4 Train=0.894837 Test=0.910784
Time:		160 seconds


In [57]:
t_start = time.time()
!./FM --bias true --linear true --pairwise 0 --task_type regression -t Generated/train_1m_user_reg.fm -e Generated/test_1m_user_reg.fm \
    --learning_method ALS --regularization_const 0.001 --hash_size -1 -g memory -i 2
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Preprocessing
Processed 750157 rows
Target from 1 to 5
Max feature index is 13476
Preprocessing
Processed 250052 rows
Target from 1 to 5
Max feature index is 13476
iter=0 Train=0.904478 Test=0.92083
iter=1 Train=0.895432 Test=0.911426
Time:		75 seconds


In [56]:
t_start = time.time()
!./FM --bias true --linear true --task_type regression -t Generated/train_1m_user_reg.fm -e Generated/test_1m_user_reg.fm \
    --learning_method ALS --hash_size -1 -g memory -i 10
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Preprocessing
Processed 750157 rows
Target from 1 to 5
Max feature index is 13476
Preprocessing
Processed 250052 rows
Target from 1 to 5
Max feature index is 13476
iter=0 Train=0.864562 Test=0.917794
iter=1 Train=0.836503 Test=0.904523
iter=2 Train=0.822554 Test=0.896032
iter=3 Train=0.815551 Test=0.891903
iter=4 Train=0.811589 Test=0.889514
iter=5 Train=0.809086 Test=0.88802
iter=6 Train=0.807374 Test=0.887033
iter=7 Train=0.806132 Test=0.886277
iter=8 Train=0.805194 Test=0.885726
iter=9 Train=0.804467 Test=0.885303
Time:		1844 seconds


**Итого лучший ALS с 10 проходами и 4 компонентами на парные взаимодействия показал 0.885 RMSE, обучаясь порядка 30 минут**

**Результат 0.91 же мог быть достигнут и без парных взаимодействий за 2 прохода достаточно шустро**

## Vowpal wabbit

In [30]:
# Convert the FM-format train and test files into their Vowpal Wabbit
# counterparts, one (source, destination) pair at a time.
for fm_path, vw_path in [
    ('Generated/train_1m_user_reg.fm', 'Generated/train_1m_user_reg.vw'),
    ('Generated/test_1m_user_reg.fm', 'Generated/test_1m_user_reg.vw'),
]:
    fm2vw(fm_path, vw_path)

750157it [00:00, 915839.16it/s]
250052it [00:00, 799685.63it/s]


In [39]:
t_start = time.time()
!vw Generated/train_1m_user_reg.vw -f VwHelpFiles/model --loss_function squared --sgd --learning_rate 0.001 -b 14 --passes 5 --cache_file VwHelpFiles/cache --quiet
print('Training done - {0} seconds'.format(int(time.time() - t_start)))
! vw -i VwHelpFiles/model -t Generated/test_1m_user_reg.vw -r VwHelpFiles/pred --quiet
print('Start evaluation')
print('Validation RMSE:\t {}'.format(get_rmse('Generated/ytest_1m_user_reg', 'VwHelpFiles/pred')))
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Training done - 4 seconds
Start evaluation
Validation RMSE:	 1.1573913484777147
Time:		6 seconds


In [40]:
t_start = time.time()
!vw Generated/train_1m_user_reg.vw -f VwHelpFiles/model --loss_function squared --sgd --learning_rate 0.001 -b 14 --passes 10 --cache_file VwHelpFiles/cache --quiet
print('Training done - {0} seconds'.format(int(time.time() - t_start)))
! vw -i VwHelpFiles/model -t Generated/test_1m_user_reg.vw -r VwHelpFiles/pred --quiet
print('Start evaluation')
print('Validation RMSE:\t {}'.format(get_rmse('Generated/ytest_1m_user_reg', 'VwHelpFiles/pred')))
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Training done - 8 seconds
Start evaluation
Validation RMSE:	 1.139532181507997
Time:		10 seconds


In [41]:
t_start = time.time()
!vw Generated/train_1m_user_reg.vw -f VwHelpFiles/model --loss_function squared --sgd --learning_rate 0.001 -b 14 --passes 20 --cache_file VwHelpFiles/cache --quiet
print('Training done - {0} seconds'.format(int(time.time() - t_start)))
! vw -i VwHelpFiles/model -t Generated/test_1m_user_reg.vw -r VwHelpFiles/pred --quiet
print('Start evaluation')
print('Validation RMSE:\t {}'.format(get_rmse('Generated/ytest_1m_user_reg', 'VwHelpFiles/pred')))
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Training done - 16 seconds
Start evaluation
Validation RMSE:	 1.1276584044115077
Time:		19 seconds


In [43]:
t_start = time.time()
!vw Generated/train_1m_user_reg.vw -f VwHelpFiles/model --loss_function squared --sgd --learning_rate 0.001 -b 14 --passes 80 --cache_file VwHelpFiles/cache --quiet
print('Training done - {0} seconds'.format(int(time.time() - t_start)))
! vw -i VwHelpFiles/model -t Generated/test_1m_user_reg.vw -r VwHelpFiles/pred --quiet
print('Start evaluation')
print('Validation RMSE:\t {}'.format(get_rmse('Generated/ytest_1m_user_reg', 'VwHelpFiles/pred')))
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Training done - 69 seconds
Start evaluation
Validation RMSE:	 1.1148919874990038
Time:		71 seconds


In [46]:
t_start = time.time()
!vw Generated/train_1m_user_reg.vw -f VwHelpFiles/model --loss_function squared --sgd --learning_rate 0.002 -b 14 --passes 80 --cache_file VwHelpFiles/cache --quiet
print('Training done - {0} seconds'.format(int(time.time() - t_start)))
! vw -i VwHelpFiles/model -t Generated/test_1m_user_reg.vw -r VwHelpFiles/pred --quiet
print('Start evaluation')
print('Validation RMSE:\t {}'.format(get_rmse('Generated/ytest_1m_user_reg', 'VwHelpFiles/pred')))
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Training done - 43 seconds
Start evaluation
Validation RMSE:	 1.109155811552204
Time:		45 seconds


In [78]:
t_start = time.time()
!vw Generated/train_1m_user_reg.vw -f VwHelpFiles/model --loss_function squared  --l2 0.1 -b 14 --passes 20 --cache_file VwHelpFiles/cache --quiet
print('Training done - {0} seconds'.format(int(time.time() - t_start)))
! vw -i VwHelpFiles/model -t Generated/test_1m_user_reg.vw -r VwHelpFiles/pred --quiet
print('Start evaluation')
print('Validation RMSE:\t {}'.format(get_rmse('Generated/ytest_1m_user_reg', 'VwHelpFiles/pred')))
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Training done - 4 seconds
Start evaluation
Validation RMSE:	 3.207281084486174
Time:		6 seconds


**Лучший VW дал 1.109, что серьезно уступает SGD-режиму FM**

**ALS-режим же на данном датасете вообще из другой лиги**



# Ненормированный ml-10M100K

Сегодня мы многое поняли — например, то, что нормировка не бог весть какая хорошая идея.

Так что в дальнейшем будем рассматривать ненормированные датасеты.

Более того, информация о пользователях нам уже не дана, так что эти фичи тоже исключаются.

In [89]:
# Build train/test frames for ml-10M100K. Only the genre columns are requested;
# user info and rated-movies history are switched off for this dataset.
folder = 'SourceData/ml-10M100K/'
test_size = 0.25  # forwarded to build_movielens for its train/test split
feature_flags = dict(with_genres=True, with_users_info=False, with_rated_movies=False)
train, test = build_movielens(folder, test_size, **feature_flags)
train.head()

load ratings....
calculation of monthes....
load movies....
build genres ohe....
join dataframes....
users info skipped
train/test split...
rated movies history skipped
preprocessing done....


Unnamed: 0,user,movie,rating,monthes,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,...,genre_14,genre_15,genre_16,genre_17,genre_18,genre_19,genre_20,genre_21,genre_22,genre_23
1,139,122,3,0.41748,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,215,122,4,0.708984,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
5,217,122,3,0.12085,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
6,281,122,3,0.12085,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
7,326,122,3,0.109863,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0


In [None]:
train2format(train, fm_extractor, field_info, 'Generated/train_10m_reg.fm',)
test2format(
    test, fm_extractor, field_info,
    'Generated/test_10m_reg.fm',
    'Generated/ytest_10m_reg'
)
!head -n 3 Generated/train_10m_reg.fm

  0%|          | 22000/7500041 [00:05<31:36, 3943.44it/s] 