# Generating data for NFM in the format suitable for libFM library

In this notebook we will directly generate data - MovieLens 10M - in `libfm` format including the contextual features: _tags_ and _genre_.

## Imports

In [2]:
import pandas as pd
import numpy as np
import os
import re

# Preprocessing `Tags`
## Data loading

In [3]:
if not os.path.exists('./ml-10M100K/'):
    !wget http://files.grouplens.org/datasets/movielens/ml-10m.zip
    !unzip ml-10m.zip
    !rm ml-10m.zip
    print('Successfully downloaded')
else:
    print('Data already downloaded')

Data already downloaded


In [2]:
PATH_TO_LOAD = os.path.abspath('./ml-10M100K/')

if not os.path.exists(os.path.join(PATH_TO_LOAD, 'libfmSecond')):
    os.mkdir(os.path.join(PATH_TO_LOAD, 'libfmSecond'))

PATH_TO_SAVE = os.path.abspath(os.path.join(PATH_TO_LOAD, 'libfmSecond'))

tags = pd.read_csv(os.path.join(PATH_TO_LOAD, 'tags.dat'), sep='::', encoding='utf8', engine='python', 
                 header=None, names=['UserID', 'MovieID', 'Tag', 'Timestamp'])
tags.head()

Unnamed: 0,UserID,MovieID,Tag,Timestamp
0,15,4973,excellent!,1215184630
1,20,1747,politics,1188263867
2,20,1747,satire,1188263867
3,20,2424,chick flick 212,1188263835
4,20,2424,hanks,1188263835


In [3]:
print('Shape of our tags dataset: {}'.format(tags.shape))
print('Number of unique tags : {}'.format(tags["Tag"].nunique()))
print('Number of null values in dataset:')
print(tags.isna().sum())

Shape of our tags dataset: (95580, 4)
Number of unique tags : 16528
Number of null values in dataset:
UserID        0
MovieID       0
Tag          16
Timestamp     0
dtype: int64


In [4]:
tags = tags.drop(columns='Timestamp')

## Data cleaning
Dropping rows that have null values and preprocessing the tags

In [5]:
tags = tags.drop(tags[tags.isna().sum(axis=1)>0].index)
tags['Tag'] = tags['Tag'].apply(lambda x: re.sub(r'[^a-z0-9]', '', x.strip().lower()))

Dropping rows that have very vague tags. I assume that tags with lengths less than 2 don't have any specific meaning.

In [6]:
tags = tags.drop(tags[tags['Tag'].str.len()<2].index)

Dropping all tags that are only mentioned once

In [7]:
tmp = tags.Tag.value_counts()
tmp = tmp[tmp==1]
tags = tags.drop(tags[tags.Tag.isin(tmp.index)].index)

del tmp

---
## Creating dictionary of tags

In [8]:
idx_tag = {i: x for i,x in enumerate(tags.Tag.value_counts().index)}
tag_idx = {x: i for i,x in enumerate(tags.Tag.value_counts().index)}

In [9]:
data = {'MovieID': [], 'Tags': []}
for movie in tags.MovieID.unique():
    tmp = tags[tags.MovieID==movie]
    tmp_tags = '|'.join(set([str(tag_idx[tag]) for tag in tmp.Tag]))
    data['MovieID'].append(movie)
    data['Tags'].append(tmp_tags)

In [10]:
tags = pd.DataFrame(data=data)

del data, tmp, tmp_tags

tags.head()

Unnamed: 0,MovieID,Tags
0,4973,1749|324|3780|1669|3320|6169|374|920|27|65|655...
1,1747,4067|369|104|4632|172|44|228|1576|185|2817|422...
2,2424,58|121|4294|6111|2296|3895|45|98|60|477|3578|2...
3,2947,306|10|255|77|339|2777|135|32|277|116|831|5|52...
4,3033,107|1693|44|36|374|1324|437|1086|1864|4883|159...


In [11]:
tags.MovieID.min(), tags.MovieID.max(),

(1, 65130)

# Preprocessing `Genre`
## Data loading

In [12]:
genres = pd.read_csv(os.path.join(PATH, 'movies.dat'), sep='::', encoding='utf8', engine='python', 
                 header=None, names=['MovieID', 'Title', 'Genre'])
genres.head()

Unnamed: 0,MovieID,Title,Genre
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [13]:
genres.isna().sum()

MovieID    0
Title      0
Genre      0
dtype: int64

## Data cleaning

In [14]:
u_genre = set()
for g in genres.Genre:
    u_genre = u_genre.union(set(g.split('|')))
print(u_genre)

{'War', 'IMAX', 'Sci-Fi', 'Comedy', 'Western', 'Drama', 'Thriller', 'Musical', 'Film-Noir', 'Romance', 'Animation', 'Fantasy', 'Children', 'Adventure', 'Mystery', '(no genres listed)', 'Horror', 'Documentary', 'Action', 'Crime'}


In [15]:
genres = genres.drop(genres[genres.Genre == '(no genres listed)'].index)

u_genre = set()
for g in genres.Genre:
    u_genre = u_genre.union(set(g.split('|')))
print(u_genre)

{'War', 'IMAX', 'Sci-Fi', 'Comedy', 'Western', 'Drama', 'Thriller', 'Musical', 'Film-Noir', 'Romance', 'Animation', 'Fantasy', 'Children', 'Adventure', 'Mystery', 'Horror', 'Documentary', 'Action', 'Crime'}


---
## Creating dictionary of genre

In [16]:
idx_genre = {i: x for i,x in enumerate(u_genre)}
genre_idx = {x: i for i,x in enumerate(u_genre)}

del u_genre

In [17]:
tmp = []
for genre in genres.Genre:
    l = '|'.join([str(genre_idx[g]) for g in genre.split('|')])
    tmp.append(l)
genres['Genre'] = pd.Series(data = tmp, index=genres.index)

del tmp

In [18]:
genres = genres.drop(columns='Title')
genres.head()

Unnamed: 0,MovieID,Genre
0,1,13|10|12|3|11
1,2,13|12|11
2,3,3|9
3,4,3|5|9
4,5,3


In [19]:
genres.MovieID.min(), genres.MovieID.max()

(1, 65133)

# Preprocessing `Ratings`
## Data loading

In [20]:
ratings = pd.read_csv(os.path.join(PATH, 'ratings.dat'), sep='::', encoding='utf8', engine='python', 
                 header=None, names=['UserID', 'MovieID', 'Rating', 'Timestamp'])
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,122,5.0,838985046
1,1,185,5.0,838983525
2,1,231,5.0,838983392
3,1,292,5.0,838983421
4,1,316,5.0,838983392


## Data cleaning

In [21]:
ratings = ratings.drop(columns='Timestamp')
ratings['Rating'] = ratings['Rating'].apply(lambda x: 1 if x >= 3 else -1)
ratings.head()

Unnamed: 0,UserID,MovieID,Rating
0,1,122,1
1,1,185,1
2,1,231,1
3,1,292,1
4,1,316,1


In [22]:
ratings.UserID.min(), ratings.UserID.max(), ratings.MovieID.min(), ratings.MovieID.max(), 

(1, 71567, 1, 65133)

## Merging data and cleaning

In [23]:
df = pd.merge(ratings, genres, how='left', on=['MovieID'])

del ratings, genres

df.head()

Unnamed: 0,UserID,MovieID,Rating,Genre
0,1,122,1,3|9
1,1,185,1,17|18|6
2,1,231,1,3
3,1,292,1,17|5|2|6
4,1,316,1,17|13|2


In [24]:
df = pd.merge(df, tags, how='left', on=['MovieID'])[['Rating', 'UserID', 'MovieID', 'Genre', 'Tags']]

del tags

df.head()

Unnamed: 0,Rating,UserID,MovieID,Genre,Tags
0,1,1,122,3|9,21|22|2348
1,1,1,185,17|18|6,521|2110|46|1584|963|4017|116|513|6129|5|2646
2,1,1,231,3,107|1028|162|465|374|300|66|556|1597|247|127|2...
3,1,1,292,17|5|2|6,2064|203|2283|44|84|35|230|1303|2187|1318
4,1,1,316,17|13|2,4897|44|6068|1829|27|541|5569|1231|883|35|23|2...


In [25]:
df.isna().sum()

Rating          0
UserID          0
MovieID         0
Genre           7
Tags       328692
dtype: int64

## Final steps
Defining column numbers

In [26]:
idx = 0
user_map_idx = {}
column_map_idx = {}
for i in df.UserID.unique():
    column_map_idx[idx] = ('UserID', i)
    user_map_idx[i] = idx
    idx += 1
movie_map_idx = {}
for i in df.MovieID.unique():
    column_map_idx[idx] = ('MovieID', i)
    movie_map_idx[i] = idx
    idx += 1
tag_map_idx = {}
for i in idx_tag.keys():
    column_map_idx[idx] = ('TagID', i)
    tag_map_idx[i] = idx
    idx += 1
genre_map_idx = {}
for i in idx_genre.keys():
    column_map_idx[idx] = ('GenreID', i)
    genre_map_idx[i] = idx
    idx += 1

So, we have the following list of data:
* `df` - DataFrame that holds all of our data
* `column_map_idx` - mapping of index column to data type and its ID
* `user_map_idx` - mapping of ID of user to index column
* `movie_map_idx` - mapping of ID of movie to index column
* `tag_map_idx` - mapping of ID of tag to index column
* `genre_map_idx` - mapping of ID of genre to index column
* `tag_idx` - mapping of tag name to its ID
* `genre_idx` - mapping of genre name to its ID
* `idx_tag` - mapping of tag ID to its name
* `idx_genre` - mapping of genre ID to its name

## Train, test, validation split

In [27]:
np.random.seed(42)
idx = np.random.permutation(df.shape[0])
idx_train = idx[:round(df.shape[0]*0.8)]
idx_test = idx[round(df.shape[0]*0.8):]

df_train = df.iloc[idx_train]
df_test = df.iloc[idx_test]

df.shape, df_train.shape, df_test.shape

((10000054, 5), (8000043, 5), (2000011, 5))

In [28]:
np.random.seed(42)
idx = np.random.permutation(df_test.shape[0])
idx_valid = idx[round(df_test.shape[0]*0.5):]
idx_test = idx[:round(df_test.shape[0]*0.5)]

df_valid = df_test.iloc[idx_valid]
df_test = df_test.iloc[idx_test]

df_test.shape, df_valid.shape

((1000006, 5), (1000005, 5))

In [43]:
def writeLibfm(df, f):
    for r in df.iterrows():
        r = r[1]
        s = ''
        s += str(r['Rating']) + ' '
        s += str(user_map_idx[r['UserID']]) + ':1 '
        s += str(movie_map_idx[r['MovieID']]) + ':1 '
        if not pd.isna(r['Genre']):
            for g in r['Genre'].split('|'):
                s += str(genre_map_idx[int(g)]) + ':1 '
        if not pd.isna(r['Tags']):
            for t in r['Tags'].split('|'):
                s += str(tag_map_idx[int(t)]) + ':1 '
        s = s.strip() + '\n'
        f.write(s)    

In [46]:
with open(os.path.join(PATH_TO_SAVE, 'movieLens.train.libfm'), 'w', encoding='utf8') as f:
    writeLibfm(df_train, f)

with open(os.path.join(PATH_TO_SAVE, 'movieLens.test.libfm'), 'w', encoding='utf8') as f:
    writeLibfm(df_test, f)
    
with open(os.path.join(PATH_TO_SAVE, 'movieLens.validation.libfm'), 'w', encoding='utf8') as f:
    writeLibfm(df_valid, f)

Saved 3 files should be OK for feeding the NFM machine.