In [1]:
import numpy as np
import pandas as pd

from tqdm import tqdm_notebook

from scipy.sparse import csr_matrix, csc_matrix, coo_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression, Ridge
from fastFM.als import FMClassification, FMRegression

from sklearn.metrics import roc_auc_score, mean_squared_error

import gc

import matplotlib.pylab as plt
%matplotlib inline

In [2]:
folder = 'ml-10M100K'
dtypes = {
    'user':   'uint32',
    'movie':  'uint16',
    'rating': 'uint8'
}
test_size = 0.25

In [3]:
def build_dataset_10m(folder, test_size):
    print('load ratings....')
    ratings = pd.read_csv(folder + '/ratings.dat', sep='::', header=None, engine='python',
                          names=['user', 'movie', 'rating', 'timestamp'], dtype=dtypes)

    print('build monthes....')
    ratings['timestamp'] = pd.to_datetime(ratings.timestamp, unit='s')
    min_date = ratings.timestamp.min()
    ratings['monthes'] = (ratings.timestamp - min_date).dt.days // 28
    ratings.monthes = ratings.monthes.astype('uint8')
    ratings.drop('timestamp', 1, inplace=True)
    
    print('load movies....')
    movies = pd.read_csv(folder + '/movies.dat', sep='::', engine='python',
                         names=['movie', 'title', 'genres'], usecols=['movie', 'genres'], header=None, dtype=dtypes)

    print('build genres ohe....')
    sparse_genres = CountVectorizer().fit_transform(movies.genres.map(lambda x: x.replace('|', ' ')))
    colnames = ['genre_{}'.format(col) for col in range(sparse_genres.shape[1])]
    sparse_genres = pd.DataFrame(sparse_genres.todense().astype('uint8'), columns=colnames)
    movies = pd.concat([movies[['movie']], sparse_genres], axis=1)
    del(sparse_genres); gc.collect()
    
    print('join dataframes....')
    dataset = pd.merge(ratings, movies, on='movie', how='inner')
    del(ratings, movies); gc.collect()
    
    np.random.seed(42)
    print('build train/test split...')
    test_indexes = np.random.choice(dataset.index, int(test_size * dataset.shape[0]), replace=False)
    test = dataset.loc[test_indexes]
    train = dataset.drop(test_indexes)
    del(dataset); gc.collect();
    
    print('building rated movies....')
    rated_movies = train.groupby('user')['movie'].agg(lambda x: list(x))
    train.loc[:, 'ratedMovies'] = train.user.map(rated_movies)
    test.loc[:, 'ratedMovies'] = test.user.map(rated_movies)
    del(rated_movies); gc.collect()
    print('adding rated movies done')
    return train, test

# train, test = build_dataset_10m(folder, test_size)

In [42]:
folder = 'ml-1m'
def build_dataset_1m(folder, test_size):
    print('load ratings....')
    ratings = pd.read_csv(folder + '/ratings.dat', sep='::', header=None, engine='python',
                          names=['user', 'movie', 'rating', 'timestamp'], dtype=dtypes)

    print('build monthes....')
    ratings['timestamp'] = pd.to_datetime(ratings.timestamp, unit='s')
    min_date = ratings.timestamp.min()
    ratings['monthes'] = (ratings.timestamp - min_date).dt.days // 28
    ratings.monthes = ratings.monthes.astype('uint8')
    ratings.drop('timestamp', 1, inplace=True)
    
    print('load movies....')
    movies = pd.read_csv(folder + '/movies.dat', sep='::', engine='python',
                         names=['movie', 'title', 'genres'], usecols=['movie', 'genres'], header=None, dtype=dtypes)

    print('build genres ohe....')
    sparse_genres = CountVectorizer().fit_transform(movies.genres.map(lambda x: x.replace('|', ' ')))
    colnames = ['genre_{}'.format(col) for col in range(sparse_genres.shape[1])]
    sparse_genres = pd.DataFrame(sparse_genres.todense().astype('uint8'), columns=colnames)
    movies = pd.concat([movies[['movie']], sparse_genres], axis=1)
    del(sparse_genres); gc.collect()
    
    print('load users....')
    users = pd.read_csv(folder + '/users.dat', sep='::', header=None, names=['user', 'gender', 'age', 'occupation', 'zip'], engine='python')
    users.zip = np.unique(users.zip.values, return_inverse=True)[1]
    
    print('join dataframes....')
    dataset = pd.merge(ratings, movies, on='movie', how='inner')
    dataset = pd.merge(dataset, users, on='user', how='left')
    del(ratings, movies); gc.collect()
    
    np.random.seed(42)
    print('build train/test split...')
    test_indexes = np.random.choice(dataset.index, int(test_size * dataset.shape[0]), replace=False)
    test = dataset.loc[test_indexes]
    train = dataset.drop(test_indexes)
    del(dataset); gc.collect();
    
    print('building rated movies....')
    rated_movies = train.groupby('user')['movie'].agg(lambda x: list(x))
    train.loc[:, 'ratedMovies'] = train.user.map(rated_movies)
    test.loc[:, 'ratedMovies'] = test.user.map(rated_movies)
    del(rated_movies); gc.collect()
    print('adding rated movies done')
    return train, test

train, test = build_dataset_1m(folder, test_size)

load ratings....
build monthes....
load movies....
build genres ohe....
load users....
join dataframes....
build train/test split...
building rated movies....
adding rated movies done


## Data transform

In [43]:
users_len = train.user.append(test.user).nunique()#6040
movies_len = train.movie.append(test.movie).nunique() #3952
genre_len = len([col for col in train.columns if 'genre' in col])
occupation_len = train.occupation.append(test.occupation).nunique()
zip_len = train.zip.append(test.zip).nunique()
train.head()

Unnamed: 0,user,movie,rating,monthes,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,...,genre_15,genre_16,genre_17,genre_18,genre_19,gender,age,occupation,zip,ratedMovies
1,2,1193,5,8,0,0,0,0,0,0,...,0,0,0,0,0,M,56,16,2248,"[1193, 3105, 2321, 1962, 1207, 2028, 1246, 306..."
2,12,1193,4,8,0,0,0,0,0,0,...,0,0,0,0,0,M,25,12,1165,"[1193, 2804, 1198, 593, 1247, 1641, 1221, 111,..."
3,15,1193,4,8,0,0,0,0,0,0,...,0,0,0,0,0,M,25,7,904,"[1193, 3408, 3105, 2321, 527, 2762, 260, 2028,..."
4,17,1193,5,8,0,0,0,0,0,0,...,0,0,0,0,0,M,50,1,3187,"[1193, 595, 2321, 720, 1270, 527, 1097, 2762, ..."
5,18,1193,4,8,0,0,0,0,0,0,...,0,0,0,0,0,F,18,3,3227,"[1193, 1197, 919, 595, 2018, 2797, 527, 48, 10..."


In [44]:
from tqdm import tqdm

def train2format(data, features_extractor, train_output='train', 
                 with_user_features=False, with_rated_films=False):
    writer_train = open(train_output, 'w')
    for row in tqdm(data.iterrows(), total=data.shape[0], miniters=1000):
        label = str(int(row[1]['rating']))
        features = features_extractor(row[1], with_user_features, with_rated_films)
        output_line = '{0} {1}\n'.format(label, features)
        writer_train.write(output_line)            
    writer_train.close()

def test2format(data, features_extractor, x_test_output='test', y_test_output='ytest', 
                with_user_features=False, with_rated_films=False):
    writer_test = open(x_test_output, 'w')
    writer_ytest = open(y_test_output, 'w')
    for row in tqdm(data.iterrows(), total=data.shape[0]):
        label = str(int(row[1]['rating']))
        features = features_extractor(row[1], with_user_features, with_rated_films)
        
        output_line = '{0} {1}\n'.format(label, features)
        writer_test.write(output_line)
        writer_ytest.write('%s\n' % label) 
    
    writer_test.close()
    writer_ytest.close()

In [50]:
def fm_extractor(row, with_user_features=False, with_rated_films=False):
    offset = 0
    output_line = '{}:1 '.format(row['user'] + offset)
    
    offset += users_len
    output_line += '{}:1 '.format(row['movie'] + offset)
    
    offset += movies_len
    output_line += '{0}:{1} '.format(offset, row['monthes'])
    offset += 1
    
    for genre_index in range(genre_len):
        if row['genre_{}'.format(genre_index)] == 1:
            output_line += '{0}:1 '.format(offset + genre_index)
    offset += genre_len
    
    if with_user_features:
        if row['gender']=='F':
            output_line += '{}:1 '.format(offset)
        output_line += '{0}:{1} '.format(offset + 1, row['age'])
        offset += 2
        output_line += '{}:1 '.format(row['occupation'] + offset)
        offset += occupation_len
        output_line += '{}:1'.format(row['zip'] + offset)
    
    if with_rated_films:
        n_rated_movies = len(row['ratedMovies'])
        for movie_id in row['ratedMovies']:
            output_line += 'rated_{0}:{1:.3} '.format(movie_id + offset, 1 / n_rated_movies)
        
    return output_line

def vw_extractor(row, with_user_features=False, with_rated_films=False):
    output_line = '|ohe '
    output_line += 'user_{} '.format(row['user'])
    output_line += 'movie_{} '.format(row['movie'])
    output_line += '|d {} '.format(row['monthes']) 
    
    output_line += '|g '    
    for genre_index in range(genre_len):
        if row['genre_{}'.format(genre_index)] == 1:
            output_line += 'genre_{0} '.format(genre_index)
    
    if with_user_features:
        output_line += '|u '
        if row['gender']=='F':
            output_line += 'female '
        output_line += 'age:{} '.format(row['age'])
        output_line += 'occupation_{} '.format(row['occupation'])
        output_line += 'zip_{}'.format(row['zip'])
      
    if with_rated_films:
        output_line += ' |m '
        # checking for float nan
        if row['ratedMovies'] == row['ratedMovies']:
            n_rated_movies = len(row['ratedMovies'])
            for movie in row['ratedMovies']:
                output_line += 'rated_{0}:{1:.3} '.format(movie, 1 / n_rated_movies)
    return output_line

In [8]:
print('vw format')
for row in tqdm(train.head().iterrows(), total=train.head().shape[0], miniters=1000):
    print(vw_extractor(row[1], with_user_features=True))

print('\nfm format')
for row in tqdm(train.head().iterrows(), total=train.head().shape[0], miniters=1000):
    print(fm_extractor(row[1]))

100%|██████████| 5/5 [00:00<00:00, 2449.37it/s]
100%|██████████| 5/5 [00:00<00:00, 2321.66it/s]

vw format
|ohe user_2 movie_1193 |d 8 |g genre_7 |u age:56 occupation_16 zip_70072
|ohe user_12 movie_1193 |d 8 |g genre_7 |u age:25 occupation_12 zip_32793
|ohe user_15 movie_1193 |d 8 |g genre_7 |u age:25 occupation_7 zip_22903
|ohe user_17 movie_1193 |d 8 |g genre_7 |u age:50 occupation_1 zip_95350
|ohe user_18 movie_1193 |d 8 |g genre_7 |u female age:18 occupation_3 zip_95825

fm format
2:1 7233:1 9746:8 9754:1 
12:1 7233:1 9746:8 9754:1 
15:1 7233:1 9746:8 9754:1 
17:1 7233:1 9746:8 9754:1 
18:1 7233:1 9746:8 9754:1 





## Regression problem

### VW

In [17]:
train2format(train, vw_extractor, 'train_vw', with_user_features=True)
test2format(test, vw_extractor, 'test_vw', with_user_features=True)

100%|██████████| 750157/750157 [03:33<00:00, 3516.14it/s]
100%|██████████| 250052/250052 [01:16<00:00, 3266.77it/s]


In [29]:
!head -n 3 train_vw

5 |ohe user_2 movie_1193 |d 8 |g genre_7 |u age:56 occupation_16 zip_70072
4 |ohe user_12 movie_1193 |d 8 |g genre_7 |u age:25 occupation_12 zip_32793
4 |ohe user_15 movie_1193 |d 8 |g genre_7 |u age:25 occupation_7 zip_22903


In [28]:
! vw -d train_vw --loss_function squared -f model --sgd --learning_rate 0.01 -b 14 --passes 1 --cache_file cache
! vw -i model -t test_vw -r pred --quiet

final_regressor = model
Num weight bits = 14
learning rate = 0.01
initial_t = 1
power_t = 0.5
using cache_file = cache
ignoring text input in favor of cache input
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
25.000000 25.000000            1            1.0   5.0000   0.0000        5
19.408629 13.817259            2            2.0   4.0000   0.2828        5
17.694937 15.981244            4            4.0   5.0000   0.5963        5
15.274976 12.855016            8            8.0   5.0000   0.7870        5
12.199007 9.123038           16           16.0   5.0000   1.5333        5
9.601720 7.004432           32           32.0   4.0000   2.1662        5
6.944009 4.286297           64           64.0   4.0000   2.8929        5
4.554644 2.165280          128          128.0   4.0000   2.7970        5
3.225699 1.896753          256          256.0   5.0000   4.1511        5
2.020579 

### FMs

In [52]:
train2format(train, fm_extractor, 'train_fm', with_user_features=True)
test2format(test, fm_extractor, 'test_fm', with_user_features=True)

100%|██████████| 750157/750157 [03:33<00:00, 3509.55it/s]
100%|██████████| 250052/250052 [01:12<00:00, 3425.75it/s]


In [56]:
!head -n 3 train_fm

5 2:1 7233:1 9746:8 9754:1 9768:56 9785:1 12038:1
4 12:1 7233:1 9746:8 9754:1 9768:25 9781:1 10955:1
4 15:1 7233:1 9746:8 9754:1 9768:25 9776:1 10694:1


In [57]:
!./FM -t train_fm -e test_fm -o out -s regression -l 0.1

750157 750157
250052 250052
2
0 2.79311
0 2.79525
1 1.80567
1 1.80812
2 2.7844
2 2.78769
3 1.76325
3 1.76661
4 2.74408
4 2.74905
5 2.75097
5 2.75473
6 1.79351
6 1.79794
7 1.80538
7 1.8092
8 1.84171
8 1.84945
^C


## Check old train

In [31]:
!head -n 3 ../veryownFM/data/train.svmlite

5 5245:1 8280:1 9994:1 12183:1 13462:1 
4 4260:1 8253:1 9994:3 10000:1 10361:1 13455:1 13456:1 13459:1 13470:1 
2 481:1 7245:1 9993:1 9994:2 10008:1 11893:1 13459:1 


In [59]:
!./FM -t ../veryownFM/data/train.svmlite -e ../veryownFM/data/test.svmlite -o out -s regression -l 0.02

750156 750156
250053 250053
2
0 0.967365
0 0.981407
1 0.966545
1 0.981351
2 0.966431
2 0.981534
3 0.966404
3 0.981657
^C
