In [1]:
import numpy as np
import pandas as pd

from tqdm import tqdm_notebook

from scipy.sparse import csr_matrix, csc_matrix, coo_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression, Ridge
from fastFM.als import FMClassification, FMRegression

from sklearn.metrics import roc_auc_score, mean_squared_error

import gc

import matplotlib.pylab as plt
%matplotlib inline

In [2]:
folder = 'ml-10M100K'
dtypes = {
    'user':   'uint32',
    'movie':  'uint16',
    'rating': 'uint8'
}
test_size = 0.25

In [3]:
def build_dataset_10m(folder, test_size):
    print('load ratings....')
    ratings = pd.read_csv(folder + '/ratings.dat', sep='::', header=None, engine='python',
                          names=['user', 'movie', 'rating', 'timestamp'], dtype=dtypes)

    print('build monthes....')
    ratings['timestamp'] = pd.to_datetime(ratings.timestamp, unit='s')
    min_date = ratings.timestamp.min()
    ratings['monthes'] = (ratings.timestamp - min_date).dt.days // 28
    ratings.monthes = ratings.monthes.astype('uint8')
    ratings.drop('timestamp', 1, inplace=True)
    
    print('load movies....')
    movies = pd.read_csv(folder + '/movies.dat', sep='::', engine='python',
                         names=['movie', 'title', 'genres'], usecols=['movie', 'genres'], header=None, dtype=dtypes)

    print('build genres ohe....')
    sparse_genres = CountVectorizer().fit_transform(movies.genres.map(lambda x: x.replace('|', ' ')))
    colnames = ['genre_{}'.format(col) for col in range(sparse_genres.shape[1])]
    sparse_genres = pd.DataFrame(sparse_genres.todense().astype('uint8'), columns=colnames)
    movies = pd.concat([movies[['movie']], sparse_genres], axis=1)
    del(sparse_genres); gc.collect()
    
    print('join dataframes....')
    dataset = pd.merge(ratings, movies, on='movie', how='inner')
    del(ratings, movies); gc.collect()
    
    np.random.seed(42)
    print('build train/test split...')
    test_indexes = np.random.choice(dataset.index, int(test_size * dataset.shape[0]), replace=False)
    test = dataset.loc[test_indexes]
    train = dataset.drop(test_indexes)
    del(dataset); gc.collect();
    
    print('building rated movies....')
    rated_movies = train.groupby('user')['movie'].agg(lambda x: list(x))
    train.loc[:, 'ratedMovies'] = train.user.map(rated_movies)
    test.loc[:, 'ratedMovies'] = test.user.map(rated_movies)
    del(rated_movies); gc.collect()
    print('adding rated movies done')
    return train, test

# train, test = build_dataset_10m(folder, test_size)

In [6]:
folder = 'ml-1m'
def build_dataset_1m(folder, test_size):
    print('load ratings....')
    ratings = pd.read_csv(folder + '/ratings.dat', sep='::', header=None, engine='python',
                          names=['user', 'movie', 'rating', 'timestamp'], dtype=dtypes)

    print('build monthes....')
    ratings['timestamp'] = pd.to_datetime(ratings.timestamp, unit='s')
    min_date = ratings.timestamp.min()
    ratings['monthes'] = (ratings.timestamp - min_date).dt.days // 28
    ratings.monthes = ratings.monthes.astype('uint8')
    ratings.drop('timestamp', 1, inplace=True)
    
    print('load movies....')
    movies = pd.read_csv(folder + '/movies.dat', sep='::', engine='python',
                         names=['movie', 'title', 'genres'], usecols=['movie', 'genres'], header=None, dtype=dtypes)

    print('build genres ohe....')
    sparse_genres = CountVectorizer().fit_transform(movies.genres.map(lambda x: x.replace('|', ' ')))
    colnames = ['genre_{}'.format(col) for col in range(sparse_genres.shape[1])]
    sparse_genres = pd.DataFrame(sparse_genres.todense().astype('uint8'), columns=colnames)
    movies = pd.concat([movies[['movie']], sparse_genres], axis=1)
    del(sparse_genres); gc.collect()
    
    print('load users....')
    users = pd.read_csv(folder + '/users.dat', sep='::', header=None, names=['user', 'gender', 'age', 'occupation', 'zip'], engine='python')
    users.zip = np.unique(users.zip.values, return_inverse=True)[1]
    
    print('join dataframes....')
    dataset = pd.merge(ratings, movies, on='movie', how='inner')
    dataset = pd.merge(dataset, users, on='user', how='left')
    del(ratings, movies); gc.collect()
    
    np.random.seed(42)
    print('build train/test split...')
    test_indexes = np.random.choice(dataset.index, int(test_size * dataset.shape[0]), replace=False)
    test = dataset.loc[test_indexes]
    train = dataset.drop(test_indexes)
    del(dataset); gc.collect();
    
    print('building rated movies....')
    rated_movies = train.groupby('user')['movie'].agg(lambda x: list(x))
    train.loc[:, 'ratedMovies'] = train.user.map(rated_movies)
    test.loc[:, 'ratedMovies'] = test.user.map(rated_movies)
    del(rated_movies); gc.collect()
    print('preprocessing done....')
    return train, test

train, test = build_dataset_1m(folder, test_size)

load ratings....
build monthes....
load movies....
build genres ohe....
load users....
join dataframes....
build train/test split...
building rated movies....
preprocessing done....


## Data transform

In [7]:
users_len = train.user.append(test.user).nunique()#6040
movies_len = train.movie.append(test.movie).max() #3952
genre_len = len([col for col in train.columns if 'genre' in col])
occupation_len = train.occupation.append(test.occupation).nunique()
zip_len = train.zip.append(test.zip).nunique()
train.head()

Unnamed: 0,user,movie,rating,monthes,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,...,genre_15,genre_16,genre_17,genre_18,genre_19,gender,age,occupation,zip,ratedMovies
1,2,1193,5,8,0,0,0,0,0,0,...,0,0,0,0,0,M,56,16,2248,"[1193, 3105, 2321, 1962, 1207, 2028, 1246, 306..."
2,12,1193,4,8,0,0,0,0,0,0,...,0,0,0,0,0,M,25,12,1165,"[1193, 2804, 1198, 593, 1247, 1641, 1221, 111,..."
3,15,1193,4,8,0,0,0,0,0,0,...,0,0,0,0,0,M,25,7,904,"[1193, 3408, 3105, 2321, 527, 2762, 260, 2028,..."
4,17,1193,5,8,0,0,0,0,0,0,...,0,0,0,0,0,M,50,1,3187,"[1193, 595, 2321, 720, 1270, 527, 1097, 2762, ..."
5,18,1193,4,8,0,0,0,0,0,0,...,0,0,0,0,0,F,18,3,3227,"[1193, 1197, 919, 595, 2018, 2797, 527, 48, 10..."


In [8]:
from tqdm import tqdm

def train2format(data, features_extractor, train_output='train', 
                 with_user_features=False, with_rated_films=False):
    writer_train = open(train_output, 'w')
    for row in tqdm(data.iterrows(), total=data.shape[0], miniters=1000):
        label = str(int(row[1]['rating']))
        features = features_extractor(row[1], with_user_features, with_rated_films)
        output_line = '{0} {1}\n'.format(label, features)
        writer_train.write(output_line)            
    writer_train.close()

def test2format(data, features_extractor, x_test_output='test', y_test_output='ytest', 
                with_user_features=False, with_rated_films=False):
    writer_test = open(x_test_output, 'w')
    writer_ytest = open(y_test_output, 'w')
    for row in tqdm(data.iterrows(), total=data.shape[0]):
        label = str(int(row[1]['rating']))
        features = features_extractor(row[1], with_user_features, with_rated_films)
        
        output_line = '{0} {1}\n'.format(label, features)
        writer_test.write(output_line)
        writer_ytest.write('%s\n' % label) 
    
    writer_test.close()
    writer_ytest.close()

In [9]:
def fm_extractor(row, with_user_features=False, with_rated_films=False):
    offset = 0
    output_line = '{}:1 '.format(row['user'] + offset)
    
    offset += users_len
    output_line += '{}:1 '.format(row['movie'] + offset)
    
    offset += movies_len
    output_line += '{0}:{1} '.format(offset, row['monthes'])
    offset += 1
    
    for genre_index in range(genre_len):
        if row['genre_{}'.format(genre_index)] == 1:
            output_line += '{0}:1 '.format(offset + genre_index)
    offset += genre_len
    
    if with_user_features:
        if row['gender']=='F':
            output_line += '{}:1 '.format(offset)
        output_line += '{0}:{1} '.format(offset + 1, row['age'])
        offset += 2
        output_line += '{}:1 '.format(row['occupation'] + offset)
        offset += occupation_len
        output_line += '{}:1'.format(row['zip'] + offset)
    
    if with_rated_films:
        n_rated_movies = len(row['ratedMovies'])
        for movie_id in row['ratedMovies']:
            output_line += 'rated_{0}:{1:.3} '.format(movie_id + offset, 1 / n_rated_movies)
        
    return output_line

def vw_extractor(row, with_user_features=False, with_rated_films=False):
    output_line = '|ohe '
    output_line += 'user_{} '.format(row['user'])
    output_line += 'movie_{} '.format(row['movie'])
    output_line += '|d {} '.format(row['monthes']) 
    
    output_line += '|g '    
    for genre_index in range(genre_len):
        if row['genre_{}'.format(genre_index)] == 1:
            output_line += 'genre_{0} '.format(genre_index)
    
    if with_user_features:
        output_line += '|u '
        if row['gender']=='F':
            output_line += 'female '
        output_line += 'age:{} '.format(row['age'])
        output_line += 'occupation_{} '.format(row['occupation'])
        output_line += 'zip_{}'.format(row['zip'])
      
    if with_rated_films:
        output_line += ' |m '
        # checking for float nan
        if row['ratedMovies'] == row['ratedMovies']:
            n_rated_movies = len(row['ratedMovies'])
            for movie in row['ratedMovies']:
                output_line += 'rated_{0}:{1:.3} '.format(movie, 1 / n_rated_movies)
    return output_line

In [19]:
print('vw format')
for row in tqdm(train.head().iterrows(), total=train.head().shape[0], miniters=1000):
    print(vw_extractor(row[1], with_user_features=True))

print('\nfm format')
for row in tqdm(train.head().iterrows(), total=train.head().shape[0], miniters=1000):
    print(fm_extractor(row[1], with_user_features=True))

100%|██████████| 5/5 [00:00<00:00, 1155.46it/s]
100%|██████████| 5/5 [00:00<00:00, 1875.64it/s]

vw format
|ohe user_2 movie_1193 |d 8 |g genre_7 |u age:56 occupation_16 zip_2248
|ohe user_12 movie_1193 |d 8 |g genre_7 |u age:25 occupation_12 zip_1165
|ohe user_15 movie_1193 |d 8 |g genre_7 |u age:25 occupation_7 zip_904
|ohe user_17 movie_1193 |d 8 |g genre_7 |u age:50 occupation_1 zip_3187
|ohe user_18 movie_1193 |d 8 |g genre_7 |u female age:18 occupation_3 zip_3227

fm format
2:1 7233:1 9992:8 10000:1 10014:56 10031:1 12284:1
12:1 7233:1 9992:8 10000:1 10014:25 10027:1 11201:1
15:1 7233:1 9992:8 10000:1 10014:25 10022:1 10940:1
17:1 7233:1 9992:8 10000:1 10014:50 10016:1 13223:1
18:1 7233:1 9992:8 10000:1 10013:1 10014:18 10018:1 13263:1





# Regression problem

### VW

In [18]:
train2format(train, vw_extractor, 'train_vw', with_user_features=True)
test2format(test, vw_extractor, 'test_vw', with_user_features=True)

100%|██████████| 750157/750157 [03:37<00:00, 3454.51it/s]
100%|██████████| 250052/250052 [01:15<00:00, 3307.70it/s]


In [20]:
!head -n 3 train_vw

5 |ohe user_2 movie_1193 |d 8 |g genre_7 |u age:56 occupation_16 zip_2248
4 |ohe user_12 movie_1193 |d 8 |g genre_7 |u age:25 occupation_12 zip_1165
4 |ohe user_15 movie_1193 |d 8 |g genre_7 |u age:25 occupation_7 zip_904


In [36]:
! vw -d train_vw --loss_function squared -f model --sgd --learning_rate 0.01 -b 14 --passes 10 --cache_file cache
! vw -i model -t test_vw -r pred --quiet

final_regressor = model
Num weight bits = 14
learning rate = 0.01
initial_t = 1
power_t = 0.5
decay_learning_rate = 1
using cache_file = cache
ignoring text input in favor of cache input
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
25.000000 25.000000            1            1.0   5.0000   0.0000        5
19.408629 13.817259            2            2.0   4.0000   0.2828        5
17.694937 15.981244            4            4.0   5.0000   0.5963        5
15.274976 12.855016            8            8.0   5.0000   0.7870        5
12.569074 9.863172           16           16.0   5.0000   1.5531        5
9.429982 6.290891           32           32.0   3.0000   2.1728        5
6.723782 4.017583           64           64.0   4.0000   2.8678        5
4.600717 2.477652          128          128.0   5.0000   2.8351        5
3.209022 1.817326          256          256.0   5.0000   3

### FMs

В лог выводятся значения функции потерь на трейне и тесте соответственно на каждой эпохе

In [25]:
train2format(train, fm_extractor, 'train_fm', with_user_features=True)
test2format(test, fm_extractor, 'test_fm', with_user_features=True)

100%|██████████| 750157/750157 [03:52<00:00, 3229.51it/s]
100%|██████████| 250052/250052 [01:18<00:00, 3168.71it/s]


In [26]:
!head -n 3 train_fm

5 2:1 7233:1 9992:8 10000:1 10014:56 10031:1 12284:1
4 12:1 7233:1 9992:8 10000:1 10014:25 10027:1 11201:1
4 15:1 7233:1 9992:8 10000:1 10014:25 10022:1 10940:1


In [28]:
!./FM -t train_fm -e test_fm -o out -s regression -l 0.02

750157 750157
250052 250052
2
0 2.76859
0 2.77157
1 2.7783
1 2.78137
2 1.80095
2 1.80344
3 2.51389
3 2.52172
4 2.62882
4 2.63484
5 1.78706
5 1.79047
6 1.82475
6 1.8297
7 1.76371
7 1.76766
8 1.83583
8 1.83907
9 1.79054
9 1.79164
10 2.75035
10 2.75427
11 1.83894
11 1.85117
^C


## Check old train

Вышеприведенные фичи в стадии разработки: скор на них почему-то намного ниже

Так что в дополнение приведу пример работы программы на стабильных фичах

train.svmlite / test.svmlite

In [31]:
!head -n 3 ../veryownFM/data/train.svmlite

5 5245:1 8280:1 9994:1 12183:1 13462:1 
4 4260:1 8253:1 9994:3 10000:1 10361:1 13455:1 13456:1 13459:1 13470:1 
2 481:1 7245:1 9993:1 9994:2 10008:1 11893:1 13459:1 


In [1]:
!./FM -t train.svmlite -e test.svmlite -o out -s regression -l 0.002

750156 750156
250053 250053
2
0 0.936084
0 0.944985
1 0.920997
1 0.931663
2 0.914946
2 0.926583
3 0.911716
3 0.923997
4 0.909751
4 0.9225
5 0.908455
5 0.921564
6 0.907548
6 0.920942
7 0.906884
7 0.92051
8 0.906381
8 0.920199
9 0.905989
9 0.919969
10 0.905676
10 0.919796
11 0.905421
11 0.919661
12 0.905209
12 0.919553
13 0.905029
13 0.919466
14 0.904873
14 0.919391
15 0.904737
15 0.919328
16 0.904618
16 0.919274
^C


### VW SGD

Для чистоты эксперимента попробую то же самое на vw с SGD

In [2]:
! vw -d train.svmlite --loss_function squared -f model --sgd --learning_rate 0.01 -b 14 --passes 10 --cache_file cache

final_regressor = model
Num weight bits = 14
learning rate = 0.01
initial_t = 1
power_t = 0.5
decay_learning_rate = 1
using cache_file = cache
ignoring text input in favor of cache input
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
25.000000 25.000000            1            1.0   5.0000   0.0000        5
19.408629 13.817259            2            2.0   4.0000   0.2828        5
17.694937 15.981244            4            4.0   5.0000   0.5963        5
15.274976 12.855016            8            8.0   5.0000   0.7870        5
12.569074 9.863172           16           16.0   5.0000   1.5531        5
9.429982 6.290891           32           32.0   3.0000   2.1728        5
6.723782 4.017583           64           64.0   4.0000   2.8678        5
4.600717 2.477652          128          128.0   5.0000   2.8351        5
3.209022 1.817326          256          256.0   5.0000   3

**Как видим наша FM значительно выигрывает у vw.**

Попробуем vw с хитрыми дефолтными параметрами

### VW default

In [9]:
! vw -d train.svmlite --loss_function squared -f model -b 14 --passes 5 --cache_file cache

final_regressor = model
Num weight bits = 14
learning rate = 0.5
initial_t = 0
power_t = 0.5
decay_learning_rate = 1
using cache_file = cache
ignoring text input in favor of cache input
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
25.000000 25.000000            1            1.0   5.0000   0.0000        5
17.615468 10.230935            2            2.0   4.0000   0.8014        5
13.655942 9.696417            4            4.0   5.0000   1.5734        5
10.017926 6.379910            8            8.0   5.0000   1.9050        5
6.144310 2.270693           16           16.0   5.0000   3.4611        5
3.655404 1.166499           32           32.0   3.0000   4.2105        5
2.022526 0.389649           64           64.0   4.0000   4.4035        5
1.286314 0.550102          128          128.0   5.0000   4.3095        5
1.013301 0.740287          256          256.0   5.0000   4.421

**Тут уже за счет всевозможных трюков для линейных моделей выигрывает vw**

# Classification problem

Попробуем на том же датасете, считая оценки 4 и 5 - положительными, а прочие отрицательными

In [9]:
train.rating = (train.rating > 3).astype('uint8')
test.rating = (test.rating > 3).astype('uint8')
train.head()

Unnamed: 0,user,movie,rating,monthes,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,...,genre_15,genre_16,genre_17,genre_18,genre_19,gender,age,occupation,zip,ratedMovies
1,2,1193,1,8,0,0,0,0,0,0,...,0,0,0,0,0,M,56,16,2248,"[1193, 3105, 2321, 1962, 1207, 2028, 1246, 306..."
2,12,1193,1,8,0,0,0,0,0,0,...,0,0,0,0,0,M,25,12,1165,"[1193, 2804, 1198, 593, 1247, 1641, 1221, 111,..."
3,15,1193,1,8,0,0,0,0,0,0,...,0,0,0,0,0,M,25,7,904,"[1193, 3408, 3105, 2321, 527, 2762, 260, 2028,..."
4,17,1193,1,8,0,0,0,0,0,0,...,0,0,0,0,0,M,50,1,3187,"[1193, 595, 2321, 720, 1270, 527, 1097, 2762, ..."
5,18,1193,1,8,0,0,0,0,0,0,...,0,0,0,0,0,F,18,3,3227,"[1193, 1197, 919, 595, 2018, 2797, 527, 48, 10..."


In [14]:
train2format(train, fm_extractor, 'train_fm_class', with_user_features=True)
test2format(test, fm_extractor, 'test_fm_class', 'ytest_fm_class', with_user_features=True)

100%|██████████| 750157/750157 [03:59<00:00, 3137.02it/s]
100%|██████████| 250052/250052 [01:20<00:00, 3106.19it/s]
