In [1]:
!pip install numpy
!pip install pandas
!pip install matplotlib

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Data Preprocessing

In [3]:
data = pd.read_csv("../data/raw/ml-100k/u.data", sep = "\t", names = ["u_id", "i_id", "rating", "timestamp"])
data.head()

Unnamed: 0,u_id,i_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
# Timestamp does not seem relevant in context of movie recommendation
data = data.drop("timestamp", axis=1)
data.head()

Unnamed: 0,u_id,i_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [5]:
genres = pd.read_csv("../data/raw/ml-100k/u.genre", sep = "|", names=["genre", "id"])
genres_list = genres['genre'].to_list()
cols = ["id", "title", "release_date", "video_release_date", "imdb_url"]
cols = cols + genres_list
cols

['id',
 'title',
 'release_date',
 'video_release_date',
 'imdb_url',
 'unknown',
 'Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

In [6]:
items = pd.read_csv("../data/raw/ml-100k/u.item", encoding = "ISO-8859-1", sep = "|", names = cols)
items = items.rename(columns={"id": "i_id"})
items.head()

Unnamed: 0,i_id,title,release_date,video_release_date,imdb_url,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [7]:
items = items.drop(["release_date", "video_release_date", "imdb_url"], axis=1)
items.head()

Unnamed: 0,i_id,title,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [8]:
#user id, age, gender, occupation, zip code.
user = pd.read_csv("../data/raw/ml-100k/u.user", sep = "|", names = ["u_id", "age", "gender", "occupation", "zip_code"])
user

Unnamed: 0,u_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
...,...,...,...,...,...
938,939,26,F,student,33319
939,940,32,M,administrator,02215
940,941,20,M,student,97229
941,942,48,F,librarian,78209


In [9]:
user = user.drop("zip_code", axis=1)
user = pd.get_dummies(user)
user.set_index("u_id")
user.head()

Unnamed: 0,u_id,age,gender_F,gender_M,occupation_administrator,occupation_artist,occupation_doctor,occupation_educator,occupation_engineer,occupation_entertainment,...,occupation_marketing,occupation_none,occupation_other,occupation_programmer,occupation_retired,occupation_salesman,occupation_scientist,occupation_student,occupation_technician,occupation_writer
0,1,24,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
1,2,53,True,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
2,3,23,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
3,4,24,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
4,5,33,True,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False


In [10]:
ua_base = pd.read_csv("../data/raw/ml-100k/ua.base", sep = "\t", names = ["u_id", "i_id", "rating", "timestamp"])
ua_base = ua_base.drop("timestamp", axis=1)
ua_base

Unnamed: 0,u_id,i_id,rating
0,1,1,5
1,1,2,3
2,1,3,4
3,1,4,3
4,1,5,3
...,...,...,...
90565,943,1047,2
90566,943,1074,4
90567,943,1188,3
90568,943,1228,3


In [11]:
ua_test = pd.read_csv("../data/raw/ml-100k/ua.test", sep = "\t", names = ["u_id", "i_id", "rating", "timestamp"])
ua_test = ua_test.drop("timestamp", axis=1)
ua_test

Unnamed: 0,u_id,i_id,rating
0,1,20,4
1,1,33,4
2,1,61,4
3,1,117,3
4,1,155,2
...,...,...,...
9425,943,232,4
9426,943,356,4
9427,943,570,1
9428,943,808,4


# Models

## funk-svd

In [12]:
!pip install git+https://github.com/gbolmier/funk-svd

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting git+https://github.com/gbolmier/funk-svd
  Cloning https://github.com/gbolmier/funk-svd to c:\users\root\appdata\local\temp\pip-req-build-cftfksm9
  Resolved https://github.com/gbolmier/funk-svd to commit fecc38ea1c2859ef6a6d9af0b7f953e1b693764e
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'


  Running command git clone --filter=blob:none --quiet https://github.com/gbolmier/funk-svd 'C:\Users\root\AppData\Local\Temp\pip-req-build-cftfksm9'


In [13]:
import multiprocessing as mp
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

from funk_svd.dataset import fetch_ml_ratings
from funk_svd.utils import _timer
from funk_svd import SVD

In [14]:
svd = SVD(lr=0.001, reg=0.005, n_epochs=100, n_factors=15,
          early_stopping=False, shuffle=False, min_rating=1, max_rating=5)

svd.fit(X=ua_base, X_val=ua_test)

Preprocessing data...

Preprocessing data...

Epoch 1/100  | val_loss: 1.18 - val_rmse: 1.09 - val_mae: 0.91 - took 1.0 sec
Epoch 2/100  | val_loss: 1.13 - val_rmse: 1.06 - val_mae: 0.88 - took 0.0 sec
Epoch 3/100  | val_loss: 1.10 - val_rmse: 1.05 - val_mae: 0.86 - took 0.0 sec
Epoch 4/100  | val_loss: 1.07 - val_rmse: 1.04 - val_mae: 0.84 - took 0.0 sec
Epoch 5/100  | val_loss: 1.05 - val_rmse: 1.03 - val_mae: 0.83 - took 0.0 sec
Epoch 6/100  | val_loss: 1.04 - val_rmse: 1.02 - val_mae: 0.82 - took 0.0 sec
Epoch 7/100  | val_loss: 1.03 - val_rmse: 1.01 - val_mae: 0.82 - took 0.0 sec
Epoch 8/100  | val_loss: 1.02 - val_rmse: 1.01 - val_mae: 0.81 - took 0.0 sec
Epoch 9/100  | val_loss: 1.01 - val_rmse: 1.00 - val_mae: 0.81 - took 0.0 sec
Epoch 10/100 | val_loss: 1.00 - val_rmse: 1.00 - val_mae: 0.80 - took 0.0 sec
Epoch 11/100 | val_loss: 1.00 - val_rmse: 1.00 - val_mae: 0.80 - took 0.0 sec
Epoch 12/100 | val_loss: 0.99 - val_rmse: 1.00 - val_mae: 0.80 - took 0.0 sec
Epoch 13/100 | val

<funk_svd.svd.SVD at 0x25889549940>

In [15]:
user = 1

df_for_user = data.loc[data['u_id'] == user]

film_ids = items['i_id'].to_list()
film_ids = [id for id in film_ids if id not in df_for_user["i_id"].to_list()]

full_df_for_user = pd.DataFrame({
    'u_id': [user] * len(film_ids),
    'i_id': film_ids
})

In [16]:
pred = svd.predict(df_for_user)
df_for_user['pred'] = pred

n = 5
# Best n films that user watched that are recommended
best_watched_ids = df_for_user.sort_values(by='pred', ascending = False).head(n)['i_id'].to_list()
best_watched_films = items[items['i_id'].isin(best_watched_ids)]
best_watched_films

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_for_user['pred'] = pred


Unnamed: 0,i_id,title,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
8,9,Dead Man Walking (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11,12,"Usual Suspects, The (1995)",0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
49,50,Star Wars (1977),0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,1,0,1,0
133,134,Citizen Kane (1941),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
167,168,Monty Python and the Holy Grail (1974),0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
pred = svd.predict(full_df_for_user)
full_df_for_user['pred'] = pred

best_ids = full_df_for_user.sort_values(by='pred', ascending = False).head(n)[['i_id', 'pred']]
best_ids.head(n)

Unnamed: 0,i_id,pred
238,511,4.691626
45,318,4.668463
210,483,4.586712
12,285,4.562561
330,603,4.552981


In [18]:
best_films = pd.merge(items, best_ids, how = 'inner', on = ['i_id'])[['title', 'pred']]
best_films

Unnamed: 0,title,pred
0,Secrets & Lies (1996),4.562561
1,Schindler's List (1993),4.668463
2,Casablanca (1942),4.586712
3,Lawrence of Arabia (1962),4.691626
4,Rear Window (1954),4.552981
