In [1]:
import numpy as np
import pandas as pd

import gc
import os

from tqdm import tqdm_notebook, tqdm
import time

from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score, mean_squared_error

import matplotlib.pylab as plt
%matplotlib inline

# Compact dtypes passed to pd.read_csv for the id/rating columns of the MovieLens files.
dtypes = dict(
    user='uint32',
    movie='uint16',
    rating='uint8',
)

# `display`/`HTML` are part of the public IPython.display API; importing `display`
# from IPython.core.display is deprecated.
from IPython.display import display, HTML
# Widen the notebook container so wide frames and long shell commands stay readable.
display(HTML("<style>.container { width:95% !important; }</style>"))

# Datasets preparation helpers
### Building movielens dataset

**Основные паки признаков:**
- OHE юзеров и фильмов
- Время (число месяцев, прошедшее от первой записи)
- OHE жанров фильмов
- Признаки пользователей (только для ml-1m)
- Факт оценивания фильма (OHE)

In [2]:
def build_movielens(folder, test_size, with_genres=True, with_users_info=True, with_rated_movies=True):
    """Load a MovieLens dump and build the train/test feature frames.

    Parameters
    ----------
    folder : str
        Directory holding `ratings.dat` and, optionally, `movies.dat` / `users.dat`
        ('::'-separated, no header). A trailing separator is accepted but not required.
    test_size : float
        Fraction of rows sampled (without replacement, fixed seed 42) into the test frame.
    with_genres : bool
        Join bag-of-words genre indicator columns `genre_<i>` when `movies.dat` exists.
    with_users_info : bool
        Join `gender`, `age`, `occupation`, `zip` columns when `users.dat` exists.
    with_rated_movies : bool
        Add a `ratedMovies` column: the list of movies each user rated in the TRAIN part.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Users that appear only in `test` get NaN in `ratedMovies`.
    """
    print('load ratings....')
    ratings = pd.read_csv(os.path.join(folder, 'ratings.dat'), sep='::', header=None, engine='python',
                          names=['user', 'movie', 'rating', 'timestamp'], dtype=dtypes)

    print('calculation of monthes....')
    ratings['timestamp'] = pd.to_datetime(ratings.timestamp, unit='s')
    # Number of 28-day periods elapsed since the earliest rating, scaled by its maximum.
    months = (ratings.timestamp - ratings.timestamp.min()).dt.days // 28
    max_month = months.max()
    if max_month > 0:
        # Skipped when every rating falls in the first period: 0 / 0 would yield NaN.
        months = months / max_month
    ratings['monthes'] = months.astype('float16')
    # `axis` must be given by keyword (or via `columns=`) on current pandas.
    dataset = ratings.drop(columns='timestamp')
    del ratings
    gc.collect()

    movies_path = os.path.join(folder, 'movies.dat')
    if with_genres and os.path.exists(movies_path):
        print('load movies....')
        movies = pd.read_csv(movies_path, sep='::', engine='python',
                             names=['movie', 'title', 'genres'], usecols=['movie', 'genres'],
                             header=None, dtype=dtypes)

        print('build genres ohe....')
        # Genres arrive as 'A|B|C'; turn them into space-separated tokens for CountVectorizer.
        genre_matrix = CountVectorizer().fit_transform(movies.genres.map(lambda x: x.replace('|', ' ')))
        colnames = ['genre_{}'.format(col) for col in range(genre_matrix.shape[1])]
        genre_frame = pd.DataFrame(genre_matrix.toarray().astype('uint8'), columns=colnames)
        movies = pd.concat([movies[['movie']], genre_frame], axis=1)
        del genre_matrix, genre_frame
        gc.collect()

        print('join dataframes....')
        dataset = pd.merge(dataset, movies, on='movie', how='inner')
        del movies
        gc.collect()
    else:
        print('genres skipped')

    users_path = os.path.join(folder, 'users.dat')
    if with_users_info and os.path.exists(users_path):
        print('load users info....')
        users = pd.read_csv(users_path, sep='::', header=None,
                            names=['user', 'gender', 'age', 'occupation', 'zip'], engine='python')
        users['age'] = (users.age / users.age.max()).astype('float16')
        users['gender'] = (users.gender == 'M').astype('int8')
        users['occupation'] = users.occupation.astype('int8')
        # Replace each distinct zip code by its position among the sorted unique values.
        users['zip'] = np.unique(users.zip.values, return_inverse=True)[1].astype('int16')
        dataset = pd.merge(dataset, users, on='user', how='left')
        del users
        gc.collect()
    else:
        print('users info skipped')

    # Fixed seed so that the split is the same on every run.
    np.random.seed(42)
    print('train/test split...')
    test_indexes = np.random.choice(dataset.index, int(test_size * dataset.shape[0]), replace=False)
    test = dataset.loc[test_indexes]
    train = dataset.drop(test_indexes)
    del dataset
    gc.collect()

    if with_rated_movies:
        print('building rated movies history (on train)....')
        # History is built from train only, so test ratings do not leak into the features.
        rated_movies = train.groupby('user')['movie'].agg(list)
        train.loc[:, 'ratedMovies'] = train.user.map(rated_movies)
        test.loc[:, 'ratedMovies'] = test.user.map(rated_movies)
        del rated_movies
        gc.collect()
    else:
        print('rated movies history skipped')

    print('preprocessing done....')
    return train, test

## Helpers

Используются для приведения полученного датасета в заданный формат

In [2]:
def get_offset_stats(train, test):
    """Compute the size of every categorical field, used to offset feature indices.

    The size of an id-like field is `max id over train and test + 1`; `genre_len` is the
    number of train columns whose name contains 'genre'. `occupation_len` and `zip_len`
    are present only when the frames carry an `occupation` column.

    Returns
    -------
    dict mapping '<field>_len' to a plain Python int.
    """
    def field_len(column):
        # pd.concat replaces Series.append, which is no longer available in pandas 2.x.
        # int() keeps the result out of the narrow column dtype (e.g. int8).
        return int(pd.concat([train[column], test[column]]).max()) + 1

    offset_stats = {
        'users_len': field_len('user'),
        'movie_len': field_len('movie'),
        'genre_len': len([col for col in train.columns if 'genre' in col]),
    }
    if 'occupation' in train.columns:
        offset_stats['occupation_len'] = field_len('occupation')
        offset_stats['zip_len'] = field_len('zip')
    return offset_stats

# Functions that dump a dataset in the text format produced by `features_extractor`.
def train2format(data, features_extractor, offset_lens, train='train',
                 with_normalization=False,
                 with_user_features=False,
                 with_rated_films=False):
    """Write `data` row by row to the file `train` as '<rating> <features>' lines.

    `features_extractor(row, offset_lens, with_normalization, with_user_features,
    with_rated_films)` must return the feature string for one row; the label is the
    integer value of the row's `rating`.
    """
    # The context manager closes (and flushes) the file even if the extractor raises.
    with open(train, 'w') as writer_train:
        for _, row in tqdm(data.iterrows(), total=data.shape[0], miniters=1000):
            features = features_extractor(
                row, offset_lens, with_normalization,
                with_user_features, with_rated_films)

            label = str(int(row['rating']))
            writer_train.write('{0} {1}\n'.format(label, features))

def test2format(data, features_extractor, offset_lens, 
                x_test_output='test', y_test_output='ytest', 
                with_normalization=False,
                with_user_features=False,
                with_rated_films=False):
    
    writer_test = open(x_test_output, 'w')
    writer_ytest = open(y_test_output, 'w')
    for row in tqdm(data.iterrows(), total=data.shape[0]):
        label = str(int(row[1]['rating']))
        features = features_extractor(
            row[1], offset_lens, with_normalization,
            with_user_features, with_rated_films)
        
        output_line = '{0} {1}\n'.format(label, features)
        writer_test.write(output_line)
        writer_ytest.write('%s\n' % label) 
    
    writer_test.close()
    writer_ytest.close()

## Extractor
Собственно обработка одной строчки датасета

In [3]:
def fm_extractor(row, field_info, with_normalization=False, with_user_features=False, with_rated_films=False):
    """Encode one dataset row as a space-terminated string of 'index:value ' features.

    Index layout (each field starts where the previous one ends):
    user one-hot | movie one-hot | 1 slot for `monthes` | genre indicators |
    [gender, age, occupation one-hot, zip one-hot] | [rated-movies bag].

    Parameters
    ----------
    row : mapping (e.g. a Series from `DataFrame.iterrows`) with the dataset columns.
    field_info : dict with the '<field>_len' sizes (see get_offset_stats).
    with_normalization : bool
        Write 1 / field_len (2 significant digits) instead of 1 for categorical features.
    with_user_features : bool
        Append gender, age, occupation and zip features.
    with_rated_films : bool
        Append one feature per movie in `row['ratedMovies']`, each weighted 1 / n_rated.
        A missing history (NaN for users without train ratings) adds no features.
    """
    def one_hot_value(len_key):
        # Evaluated lazily, so a field of size 0 that emits no features never divides by zero.
        return '{0:.2}'.format(1 / field_info[len_key]) if with_normalization else '1'

    parts = []
    offset = 0
    # int(): iterrows upcasts ids to float for all-numeric frames, which would print '2.0:1'.
    parts.append('{0}:{1}'.format(int(row['user']) + offset, one_hot_value('users_len')))

    offset += field_info['users_len']
    parts.append('{0}:{1}'.format(int(row['movie']) + offset, one_hot_value('movie_len')))

    offset += field_info['movie_len']
    parts.append('{0}:{1:.1}'.format(offset, float(row['monthes'])))

    offset += 1
    genre_len = field_info['genre_len']
    if genre_len:
        genre_value = one_hot_value('genre_len')
        for genre_index in range(genre_len):
            if row['genre_{}'.format(genre_index)] == 1:
                parts.append('{0}:{1}'.format(offset + genre_index, genre_value))

    offset += genre_len
    if with_user_features:
        parts.append('{0}:{1}'.format(offset, int(row['gender'])))
        offset += 1
        parts.append('{0}:{1:.1}'.format(offset, float(row['age'])))
        offset += 1

        parts.append('{0}:{1}'.format(int(row['occupation']) + offset, one_hot_value('occupation_len')))
        offset += field_info['occupation_len']

        parts.append('{0}:{1}'.format(int(row['zip']) + offset, one_hot_value('zip_len')))
        offset += field_info['zip_len']

    if with_rated_films:
        rated = row['ratedMovies']
        # A user absent from train has NaN here instead of a list: emit nothing for them.
        if hasattr(rated, '__len__') and len(rated) > 0:
            weight = '{0:.3}'.format(1 / len(rated))
            for movie_id in rated:
                parts.append('{0}:{1}'.format(int(movie_id) + offset, weight))

    # Every feature, including the last one, is followed by a single space.
    return ' '.join(parts) + ' '

## Conversion between vw/fm and regression/classification
Хелперы для преобразования между vw / fm форматами

In [4]:
def fm2vw(infile, outfile):    
    """Copy `infile` to `outfile`, inserting ' |' right after the label of every line.

    The label is everything before the first space, so multi-character labels
    (e.g. '10' or '-1') are kept intact.
    """
    # Context managers guarantee the output is flushed and closed before it is consumed.
    with open(infile, 'r') as input_file, open(outfile, 'w') as out_file:
        for line in tqdm(input_file):
            label, sep, features = line.rstrip('\n').partition(' ')
            out_file.write(label + ' |' + sep + features + '\n')

def reg2fm(infile, outfile):    
    """Turn a regression file into a binary one: label becomes 1 if rating > 3 else 0.

    The label is everything before the first space; the features are copied unchanged.
    """
    # Context managers guarantee the output is flushed and closed before it is consumed.
    with open(infile, 'r') as input_file, open(outfile, 'w') as out_file:
        for line in tqdm(input_file):
            label, sep, features = line.rstrip('\n').partition(' ')
            target = str(int(float(label) > 3))
            out_file.write(target + sep + features + '\n')

def reg2vw(infile, outfile):    
    """Turn a regression file into a binary .vw file: label becomes 1 if rating > 3 else -1.

    The label is everything before the first space; ' |' is inserted after the new label
    and the features are copied unchanged.
    """
    # Context managers guarantee the output is flushed and closed before it is consumed.
    with open(infile, 'r') as input_file, open(outfile, 'w') as out_file:
        for line in tqdm(input_file):
            label, sep, features = line.rstrip('\n').partition(' ')
            target = str((float(label) > 3) * 2 - 1)
            out_file.write(target + ' |' + sep + features + '\n')

## Inplace metrics

Функции, позволяющие оценить те или иные метрики, не загружая в память таблички

In [5]:
def get_rmse(ytest_input='ytest', pred_input='pred'):
    """Stream two text files (one number per line) and return their RMSE.

    The files are read in lockstep; if they differ in length, the extra lines of the
    longer one are ignored. Raises ZeroDivisionError when no line pair is read.
    """
    n, loss = 0, 0
    # `with` closes both files even when a line cannot be parsed as a float.
    with open(ytest_input, 'r') as reader_ytest, open(pred_input, 'r') as reader_pred:
        for label, pred in zip(reader_ytest, reader_pred):
            n += 1
            loss += np.square(float(pred) - float(label))
    return np.sqrt(loss / n)

def get_log_loss(ytest_input='ytest', pred_input='pred'):
    """Stream a labels file and a probabilities file and return the mean log loss.

    Labels are parsed as integers and predictions as floats, one per line. Predictions
    are clipped to [1e-15, 1 - 1e-15] so that a prediction of exactly 0 or 1 cannot
    produce `0 * -inf = nan` or an infinite loss. If the files differ in length, the
    extra lines of the longer one are ignored. Raises ZeroDivisionError when no line
    pair is read.
    """
    eps = 1e-15
    n, loss = 0, 0
    # `with` closes both files even when a line cannot be parsed.
    with open(ytest_input, 'r') as reader_ytest, open(pred_input, 'r') as reader_pred:
        for label, pred in zip(reader_ytest, reader_pred):
            n += 1
            true_label = int(label)
            pred_score = min(max(float(pred), eps), 1 - eps)
            loss -= true_label * np.log(pred_score) + (1 - true_label) * np.log(1 - pred_score)
    return loss / n

# Regression

Попробуем для начала на датасете ml-1m, используя нормализацию и все полученные фичи, кроме ratedMovies

Тут мы и решим, делает ли нам нормализация категориальных признаков хорошо или нет

In [5]:
# Build the ml-1m train/test frames (test_size is the fraction of rows held out) and the
# per-field sizes that fm_extractor receives as `field_info`; then preview the train frame.
folder = 'SourceData/ml-1m/'
test_size = 0.25
train, test = build_movielens(folder, test_size)
field_info = get_offset_stats(train, test)
train.head()

load ratings....
calculation of monthes....
load movies....
build genres ohe....
join dataframes....
load users info....
train/test split...
building rated movies history (on train)....
preprocessing done....


Unnamed: 0,user,movie,rating,monthes,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,...,genre_15,genre_16,genre_17,genre_18,genre_19,gender,age,occupation,zip,ratedMovies
1,2,1193,5,0.216187,0,0,0,0,0,0,...,0,0,0,0,0,1,1.0,16,2248,"[1193, 3105, 2321, 1962, 1207, 2028, 1246, 306..."
2,12,1193,4,0.216187,0,0,0,0,0,0,...,0,0,0,0,0,1,0.446533,12,1165,"[1193, 2804, 1198, 593, 1247, 1641, 1221, 111,..."
3,15,1193,4,0.216187,0,0,0,0,0,0,...,0,0,0,0,0,1,0.446533,7,904,"[1193, 3408, 3105, 2321, 527, 2762, 260, 2028,..."
4,17,1193,5,0.216187,0,0,0,0,0,0,...,0,0,0,0,0,1,0.893066,1,3187,"[1193, 595, 2321, 720, 1270, 527, 1097, 2762, ..."
5,18,1193,4,0.216187,0,0,0,0,0,0,...,0,0,0,0,0,0,0.321533,3,3227,"[1193, 1197, 919, 595, 2018, 2797, 527, 48, 10..."


In [20]:
# Export the test frame (features + labels file) with normalization, rated-movies and user features.
# NOTE(review): the train export below is commented out, so the following cells rely on
# Generated/train_1m_norm_rated_user_reg.fm already existing on disk - re-enable it for a clean re-run.
# train2format(train, fm_extractor, field_info, 'Generated/train_1m_norm_rated_user_reg.fm',
#              with_normalization=True, with_rated_films=True, with_user_features=True)
test2format(test, fm_extractor, field_info, 'Generated/test_1m_norm_rated_user_reg.fm', 'Generated/ytest_1m_norm_rated_user_reg', 
            with_normalization=True, with_rated_films=True, with_user_features=True)

100%|██████████| 250052/250052 [05:54<00:00, 704.69it/s]


In [12]:
# Convert the exported .fm train/test files into .vw files (fm2vw inserts ' |' after the label).
fm2vw('Generated/train_1m_norm_rated_user_reg.fm', 'Generated/train_1m_norm_rated_user_reg.vw')
fm2vw('Generated/test_1m_norm_rated_user_reg.fm', 'Generated/test_1m_norm_rated_user_reg.vw')

750157it [00:19, 38361.29it/s]
250052it [00:07, 31341.53it/s]


In [23]:
# Baseline: train VW on the .vw train file (squared loss, --sgd, learning rate 0.01, -b 14, 1 pass),
# save the model to VwHelpFiles/model, score the test file with -t and write predictions via -r
# to VwHelpFiles/pred, then compute RMSE against the exported labels file.
!vw Generated/train_1m_norm_rated_user_reg.vw -f VwHelpFiles/model --loss_function squared --sgd --learning_rate 0.01 -b 14 --passes 1 --cache_file VwHelpFiles/cache --quiet
! vw -i VwHelpFiles/model -t Generated/test_1m_norm_rated_user_reg.vw -r VwHelpFiles/pred --quiet
print('Validation RMSE: {}'.format(get_rmse('Generated/ytest_1m_norm_rated_user_reg', 'VwHelpFiles/pred')))

159783it [00:01, 153300.51it/s]

Validation RMSE: 1.2146800533131197





In [27]:
# Same setup as the baseline run, but with --passes 5 instead of 1.
!vw Generated/train_1m_norm_rated_user_reg.vw -f VwHelpFiles/model --loss_function squared --sgd --learning_rate 0.01 -b 14 --passes 5 --cache_file VwHelpFiles/cache --quiet
print('Training done')
! vw -i VwHelpFiles/model -t Generated/test_1m_norm_rated_user_reg.vw -r VwHelpFiles/pred --quiet
print('Start evaluation')
print('Validation RMSE: {}'.format(get_rmse('Generated/ytest_1m_norm_rated_user_reg', 'VwHelpFiles/pred')))

Training done
Start evaluation
Validation RMSE: 1.1977893616749653


In [28]:
# 5 passes, learning rate 0.01, with -b raised from 14 to 16.
!vw Generated/train_1m_norm_rated_user_reg.vw -f VwHelpFiles/model --loss_function squared --sgd --learning_rate 0.01 -b 16 --passes 5 --cache_file VwHelpFiles/cache --quiet
print('Training done')
! vw -i VwHelpFiles/model -t Generated/test_1m_norm_rated_user_reg.vw -r VwHelpFiles/pred --quiet
print('Start evaluation')
print('Validation RMSE: {}'.format(get_rmse('Generated/ytest_1m_norm_rated_user_reg', 'VwHelpFiles/pred')))

Training done
Start evaluation
Validation RMSE: 1.2131628474324354


In [29]:
# 5 passes, learning rate 0.01, with -b lowered from 14 to 12.
!vw Generated/train_1m_norm_rated_user_reg.vw -f VwHelpFiles/model --loss_function squared --sgd --learning_rate 0.01 -b 12 --passes 5 --cache_file VwHelpFiles/cache --quiet
print('Training done')
! vw -i VwHelpFiles/model -t Generated/test_1m_norm_rated_user_reg.vw -r VwHelpFiles/pred --quiet
print('Start evaluation')
print('Validation RMSE: {}'.format(get_rmse('Generated/ytest_1m_norm_rated_user_reg', 'VwHelpFiles/pred')))

Training done
Start evaluation
Validation RMSE: 1.1978477294952559


In [30]:
# 5 passes, -b 14, with the learning rate raised from 0.01 to 0.1.
!vw Generated/train_1m_norm_rated_user_reg.vw -f VwHelpFiles/model --loss_function squared --sgd --learning_rate 0.1 -b 14 --passes 5 --cache_file VwHelpFiles/cache --quiet
print('Training done')
! vw -i VwHelpFiles/model -t Generated/test_1m_norm_rated_user_reg.vw -r VwHelpFiles/pred --quiet
print('Start evaluation')
print('Validation RMSE: {}'.format(get_rmse('Generated/ytest_1m_norm_rated_user_reg', 'VwHelpFiles/pred')))

Training done
Start evaluation
Validation RMSE: 1.2541670200048343


In [31]:
# 5 passes, -b 14, with the learning rate lowered to 0.001.
!vw Generated/train_1m_norm_rated_user_reg.vw -f VwHelpFiles/model --loss_function squared --sgd --learning_rate 0.001 -b 14 --passes 5 --cache_file VwHelpFiles/cache --quiet
print('Training done')
! vw -i VwHelpFiles/model -t Generated/test_1m_norm_rated_user_reg.vw -r VwHelpFiles/pred --quiet
print('Start evaluation')
print('Validation RMSE: {}'.format(get_rmse('Generated/ytest_1m_norm_rated_user_reg', 'VwHelpFiles/pred')))

Training done
Start evaluation
Validation RMSE: 1.166235625365969


In [36]:
# Learning rate 0.001, -b 14, 10 passes; t_start is used to report the wall-clock time
# after training and again after evaluation.
t_start = time.time()
!vw Generated/train_1m_norm_rated_user_reg.vw -f VwHelpFiles/model --loss_function squared --sgd --learning_rate 0.001 -b 14 --passes 10 --cache_file VwHelpFiles/cache --quiet
print('Training done - {0} seconds'.format(int(time.time() - t_start)))
! vw -i VwHelpFiles/model -t Generated/test_1m_norm_rated_user_reg.vw -r VwHelpFiles/pred --quiet
print('Start evaluation')
print('Validation RMSE:\t {}'.format(get_rmse('Generated/ytest_1m_norm_rated_user_reg', 'VwHelpFiles/pred')))
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Training done - 67 seconds
Start evaluation
Validation RMSE: 1.147067811731794
Time:	94


In [37]:
# Learning rate 0.001, -b 14, 20 passes, with wall-clock timing.
t_start = time.time()
!vw Generated/train_1m_norm_rated_user_reg.vw -f VwHelpFiles/model --loss_function squared --sgd --learning_rate 0.001 -b 14 --passes 20 --cache_file VwHelpFiles/cache --quiet
print('Training done - {0} seconds'.format(int(time.time() - t_start)))
! vw -i VwHelpFiles/model -t Generated/test_1m_norm_rated_user_reg.vw -r VwHelpFiles/pred --quiet
print('Start evaluation')
print('Validation RMSE:\t {}'.format(get_rmse('Generated/ytest_1m_norm_rated_user_reg', 'VwHelpFiles/pred')))
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Training done - 119 seconds
Start evaluation
Validation RMSE:	 1.1341530769025163
Time:		137 seconds


In [38]:
t_start = time.time()
# Learning rate 0.001, -b 14, 40 passes.
# NOTE(review): this cell reads t_start, which is assigned on the line just above it in the notebook.
!vw Generated/train_1m_norm_rated_user_reg.vw -f VwHelpFiles/model --loss_function squared --sgd --learning_rate 0.001 -b 14 --passes 40 --cache_file VwHelpFiles/cache --quiet
print('Training done - {0} seconds'.format(int(time.time() - t_start)))
! vw -i VwHelpFiles/model -t Generated/test_1m_norm_rated_user_reg.vw -r VwHelpFiles/pred --quiet
print('Start evaluation')
print('Validation RMSE:\t {}'.format(get_rmse('Generated/ytest_1m_norm_rated_user_reg', 'VwHelpFiles/pred')))
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Training done - 234 seconds
Start evaluation
Validation RMSE:	 1.1256564403392062
Time:		254 seconds


**Лучшая SGD VW-модель на данных признаках дала 1.125 RMSE**

In [6]:
# Same 40-pass configuration but without the --sgd flag, so VW falls back to its default update rule.
t_start = time.time()
!vw Generated/train_1m_norm_rated_user_reg.vw -f VwHelpFiles/model --loss_function squared --learning_rate 0.001 -b 14 --passes 40 --cache_file VwHelpFiles/cache --quiet
print('Training done - {0} seconds'.format(int(time.time() - t_start)))
! vw -i VwHelpFiles/model -t Generated/test_1m_norm_rated_user_reg.vw -r VwHelpFiles/pred --quiet
print('Start evaluation')
print('Validation RMSE:\t {}'.format(get_rmse('Generated/ytest_1m_norm_rated_user_reg', 'VwHelpFiles/pred')))
print('Time:\t\t{} seconds'.format(int(time.time() - t_start)))

Training done - 214 seconds
Start evaluation
Validation RMSE:	 1.0815592029921577
Time:		230 seconds


**Лучшая VW-модель на данных признаках дала 1.081 RMSE**

In [6]:
# Run the local ./FM binary on the .fm train/test files as a regression task with bias and
# linear terms enabled (SGD, learning rate 0.01).
# NOTE(review): the recorded output shows this run was interrupted (^C) during preprocessing.
t_start = time.time()
!./FM --bias true --linear true --task_type regression -t Generated/train_1m_norm_rated_user_reg.fm -e Generated/test_1m_norm_rated_user_reg.fm --learning_rate 0.01 --learning_method SGD

Preprocessing
^C


In [None]:
# Same ./FM run with `--pairwise 4` added (presumably the pairwise-interaction rank - TODO confirm
# against the binary's help). No output is recorded for this cell.
t_start = time.time()
!./FM --bias true --linear true --pairwise 4 --task_type regression -t Generated/train_1m_norm_rated_user_reg.fm -e Generated/test_1m_norm_rated_user_reg.fm --learning_rate 0.01 --learning_method SGD

In [3]:
# Rebuild the train/test frames from the ml-10M100K folder; this overwrites `folder`, `train`
# and `test` from the ml-1m section. field_info is not recomputed in this cell.
folder = 'SourceData/ml-10M100K/'
test_size = 0.25
train, test = build_movielens(folder, test_size)
train.head()