# Generating data for NFM

In this notebook, the MovieLens 10M dataset is preprocessed so that it can be fed to libFM's conversion script, which generates the data in libFM's format.

In [10]:
# Fetch and unpack MovieLens 10M unless the extracted folder is already present.
# The imports live in this cell because it is the first code cell of the notebook:
# on a fresh kernel nothing has been imported yet when it runs.
import os
import urllib.request
import zipfile

if not os.path.exists('./ml-10M100K/'):
    # urlretrieve / ZipFile raise on failure, so the success message below is
    # only printed when the archive was really downloaded and extracted.
    archive_path, _ = urllib.request.urlretrieve(
        'https://files.grouplens.org/datasets/movielens/ml-10m.zip', 'ml-10m.zip')
    with zipfile.ZipFile(archive_path) as archive:
        archive.extractall('.')
    # The archive is no longer needed once its contents are on disk.
    os.remove(archive_path)
    print('Successfully downloaded')
else:
    print('Data already downloaded')

Data already downloaded


In [5]:
import pandas as pd
import os

In [7]:
# Input folder (extracted MovieLens archive) and output folder for the
# '::'-separated split files written later in the notebook.
PATH_TO_LOAD = os.path.abspath('./ml-10M100K/')
PATH_TO_SAVE = os.path.abspath(os.path.join(PATH_TO_LOAD, 'libfmFirst'))

# Create the output folder on first run only.
if not os.path.exists(os.path.join(PATH_TO_LOAD, 'libfmFirst')):
    os.mkdir(os.path.join(PATH_TO_LOAD, 'libfmFirst'))

# ratings.dat has no header row and uses the two-character separator '::',
# which requires the Python parsing engine.
ratings_file = os.path.join(PATH_TO_LOAD, 'ratings.dat')
column_names = ['UserID', 'MovieID', 'Rating', 'Timestamp']
df = pd.read_csv(
    ratings_file,
    sep='::',
    encoding='utf8',
    engine='python',
    header=None,
    names=column_names,
)
df.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,122,5.0,838985046
1,1,185,5.0,838983525
2,1,231,5.0,838983392
3,1,292,5.0,838983421
4,1,316,5.0,838983392


In [10]:
# Drop the timestamp and turn the rating into a binary target:
# +1 for ratings >= 3, -1 otherwise.
# The comparison is vectorised (no per-row Python lambda), and everything is
# derived in one chain from the frame that still has 'Timestamp'. Re-running
# this cell therefore fails on the missing column instead of silently
# re-thresholding the already binarised labels (which would all become -1).
df = (
    df.drop(columns='Timestamp')
      .assign(Rating=lambda frame: frame['Rating'].ge(3).map({True: 1, False: -1}))
)
df.head()

Unnamed: 0,UserID,MovieID,Rating
0,1,122,1
1,1,185,1
2,1,231,1
3,1,292,1
4,1,316,1


In [11]:
import numpy as np

In [12]:
# Reproducible 80% / 20% split: shuffle all row positions with a fixed seed,
# then cut the shuffled positions once at the 80% mark.
np.random.seed(42)
n_rows = df.shape[0]
n_train = round(n_rows * 0.8)

idx = np.random.permutation(n_rows)
idx_train, idx_test = idx[:n_train], idx[n_train:]

df_train, df_test = df.iloc[idx_train], df.iloc[idx_test]

df.shape, df_train.shape, df_test.shape

((10000054, 3), (8000043, 3), (2000011, 3))

In [13]:
# Split the held-out rows in half: the first half of the shuffled positions
# stays as the test set, the second half becomes the validation set.
# NOTE: df_test is overwritten below, so re-running this cell on its own would
# halve the test set again; re-run the previous split cell first.
np.random.seed(42)
n_holdout = df_test.shape[0]
n_test = round(n_holdout * 0.5)

idx = np.random.permutation(n_holdout)
idx_test, idx_valid = idx[:n_test], idx[n_test:]

df_valid = df_test.iloc[idx_valid]
df_test = df_test.iloc[idx_test]

df_test.shape, df_valid.shape

((1000006, 3), (1000005, 3))

In [17]:
def save_as_triples(frame, name):
    """Write `frame` to PATH_TO_SAVE/`name` as one '::'-joined line per row.

    Every column is converted to text and the columns are concatenated with
    '::' using vectorised string addition, which avoids calling a Python
    function once per row. The file is written without header and index.

    Returns the Series of joined lines that was written.
    """
    text = frame.astype(str)
    lines = text.iloc[:, 0]
    for position in range(1, text.shape[1]):
        lines = lines + '::' + text.iloc[:, position]
    lines.to_csv(os.path.join(PATH_TO_SAVE, name), header=False, index=False, encoding='utf8')
    return lines


# Each split name is rebound to its serialised form (a Series of text lines).
df_train = save_as_triples(df_train, 'train')
df_test = save_as_triples(df_test, 'test')
df_valid = save_as_triples(df_valid, 'valid')

In [18]:
# Preview the first serialised training lines; at this point df_train holds
# the '::'-joined strings that were written to disk, not the original columns.
df_train.head()

123313      958::1270::-1
5639406     40310::364::1
9678274    69359::2916::1
1137334      8467::296::1
241194      1883::1356::1
dtype: object

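Then, to generate the `libfm` files, run the `triple_format_to_libfm.pl` script from the [official libFM library](https://github.com/srendle/libfm) in your terminal (assuming that the saved data is located at the relative path _./MovieLensDataset/ml-10M100K/libfmFirst_). In the command below, `-target 2` selects the third column (`Rating`, counting from zero) as the target, and `-separator "::"` matches the separator used when the files were saved: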

`$ ./triple_format_to_libfm.pl -in ./MovieLensDataset/ml-10M100K/libfmFirst/train,./MovieLensDataset/ml-10M100K/libfmFirst/test,./MovieLensDataset/ml-10M100K/libfmFirst/valid -target 2 -separator "::"`