In [1]:
import numpy as np
import pandas as pd
import os
import sys
import pickle
import time
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from importlib import reload
%matplotlib inline
# Use the public IPython.display module: importing `display` / `clear_output`
# from IPython.core.display is deprecated and emits a DeprecationWarning.
from IPython.display import display, HTML, clear_output
# Widen the notebook container to 80% of the browser window.
display(HTML("<style>.container { width:80% !important; }</style>"))

In [2]:
# Raise pandas' display limits so wide/long frames and long strings are
# rendered in full instead of being truncated.
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_colwidth', 999)

## Import PMF class

In [3]:
# Make the sibling ../src directory importable (it is appended only once),
# then remove the temporary name so it does not linger in the namespace.
src_dir = os.path.join(os.getcwd(), '..', 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)
del src_dir

In [4]:
from MovieRecommender import PMF

## Load ratings data

In [5]:
# Read the filtered ratings table from ../data and discard the 'timestamp'
# column, leaving the other columns untouched.
cwd = os.getcwd()
ratings_csv = os.path.join(cwd, "..", "data", "ratings_filtered.csv")
ratings = pd.read_csv(ratings_csv).drop('timestamp', axis=1)

In [6]:
# Preview the first rows to confirm which columns remain after dropping 'timestamp'.
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,5,2,3.0
2,13,2,3.0
3,29,2,3.0
4,34,2,3.0


## Define Ryan's userid

In [7]:
# Give Ryan an id one above the largest userId present in `ratings`, so it
# cannot collide with any existing user.
max_userId = ratings['userId'].max()
ryan_id = max_userId + 1
print('Ryan userid: ' + str(ryan_id))

Ryan userid: 138494


## Load Ryan's ratings profile and union it with the ratings data

In [8]:
# Load Ryan's personal ratings profile.
# NOTE(review): this path is relative to the current working directory, unlike
# the other CSV loads which are resolved under ../data — confirm the file
# lives next to the notebook.
ryan = pd.read_csv('Ryans_Movie_Ratings_complete.csv')

In [9]:
# Show the first 10 rows of Ryan's profile to see its columns and how
# unrated titles are represented.
ryan.head(10)

Unnamed: 0,rating,title
0,,EDtv (1999)
1,,I'll Be Home For Christmas (1998)
2,,Hollow Man (2000)
3,,Primary Colors (1998)
4,,Not Without My Daughter (1991)
5,,Urbania (2000)
6,4.0,Whip It (2009)
7,,Chaos (2005)
8,,Thank You for Smoking (2006)
9,,Dobermann (1997)


In [10]:
# Read the movie metadata table from ../data.
cwd = os.getcwd()
movies_csv = os.path.join(cwd, "..", "data", "movies.csv")
movies = pd.read_csv(movies_csv)

In [11]:
# Preview the movie metadata; its 'movieId' and 'title' columns are what the
# later merge with Ryan's profile relies on.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [12]:
# Size of Ryan's profile before rows without a rating are removed.
ryan.shape

(4055, 2)

In [13]:
# Normalise blank strings and the literal text 'NaN' to real missing values,
# keep only the rows that actually carry a rating, then tabulate what is left.
ryan = (
    ryan
    .replace(to_replace=['', ' ', 'NaN'], value=np.nan)
    .dropna(subset=['rating'])
)
ryan.value_counts()

In [14]:
# Size of Ryan's profile once rows with a missing rating have been dropped.
ryan.shape

(372, 2)

In [15]:
# Attach a movieId to each rated title by joining on the 'title' string.
# This uses pandas' default inner join, so a title with no match in `movies`
# would be dropped, and a title that maps to several movieIds yields several
# rows (checked in the cells that follow).
ryan = pd.merge(ryan, movies[['movieId', 'title']], on='title')

In [16]:
# Re-check the row count: a count above the pre-merge one means some title
# matched more than one movieId.
ryan.shape

(373, 3)

In [17]:
# Inspect the merged frame, which now carries a movieId next to each rated title.
ryan.head()

Unnamed: 0,rating,title,movieId
0,4,Whip It (2009),71518
1,5,Daybreakers (2010),73268
2,3,Austin Powers in Goldmember (2002),5481
3,3,Final Destination 3 (2006),43679
4,3,"Fast and the Furious, The (2001)",4369


In [18]:
# List every row whose title occurs more than once; keep=False flags all
# copies rather than only the later ones.
ryan[ryan.duplicated(subset=['title'], keep=False)]

Unnamed: 0,rating,title,movieId
64,4,Aladdin (1992),588
65,4,Aladdin (1992),114240


In [19]:
# Discard the row for movieId 114240 — the second match for the duplicated
# title found above — so that title maps to a single movieId.
ryan = ryan[ryan['movieId'] != 114240]

In [20]:
# Row count after removing the duplicate-title row.
ryan.shape

(372, 3)

In [21]:
# The title has served its purpose (looking up movieId); remove the column.
ryan = ryan.drop('title', axis=1)

In [22]:
# Confirm that only the rating and movieId columns remain.
ryan.head()

Unnamed: 0,rating,movieId
0,4,71518
1,5,73268
2,3,5481
3,3,43679
4,3,4369


In [23]:
# Column count should be one lower now that 'title' is gone; row count unchanged.
ryan.shape

(372, 2)

In [24]:
# Tag every row with Ryan's userId and order the columns as
# userId, movieId, rating.
ryan = ryan.assign(userId=ryan_id)[['userId', 'movieId', 'rating']]

In [25]:
# Preview Ryan's ratings in their final userId / movieId / rating layout.
ryan.head()

Unnamed: 0,userId,movieId,rating
0,138494,71518,4
1,138494,73268,5
2,138494,5481,3
3,138494,43679,3
4,138494,4369,3


In [26]:
# Count rows whose movieId appears more than once (keep=False marks every
# copy); 0 means each movie is rated exactly once.
ryan.duplicated(subset=['movieId'], keep=False).sum()

0

### Make sure Ryan's movieIds are in the training set

In [32]:
# Count how many of Ryan's rated movies also occur in the ratings data; the
# result should equal the number of rows in `ryan`.
# Series.isin accepts the Series directly, so the movieId column does not
# need to be materialised as a Python list first.
ryan['movieId'].isin(ratings['movieId']).sum()

372

### Combine Ryan's ratings with the ratings data

In [33]:
# Expected row count of the combined data: existing ratings plus Ryan's.
ratings.shape[0] + ryan.shape[0]

19706653

In [34]:
# Stack Ryan's ratings underneath the existing ratings.
# ignore_index=True gives the result a fresh 0..n-1 index; without it Ryan's
# rows would keep their own labels, which collide with labels already used
# by `ratings` and make label-based lookups on the combined frame ambiguous.
ratings_combined = pd.concat([ratings, ryan], ignore_index=True)

In [35]:
# The row count should equal the sum computed before the concatenation.
ratings_combined.shape

(19706653, 3)

In [36]:
# Preview the combined frame to confirm the column layout is unchanged.
ratings_combined.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,5,2,3.0
2,13,2,3.0
3,29,2,3.0
4,34,2,3.0


In [37]:
# Count missing values per column; any non-zero count would indicate that the
# two frames' columns did not line up during the concatenation.
ratings_combined.isnull().sum()

userId     0
movieId    0
rating     0
dtype: int64

## Fit PMF using all data, including Ryan's ratings

In [38]:
# Instantiate the project's PMF model (imported from MovieRecommender).
# NOTE(review): the meaning of rank / lamd / sig2 is defined in that module —
# presumably latent dimension, regularisation strength and noise variance;
# confirm against the class before tuning.
pmf = PMF(rank=10, lamd=7.5, sig2=0.5, verbose=True)

In [39]:
# Fit the model on the combined ratings (everyone's ratings plus Ryan's).
# This is a long-running cell: the recorded log shows roughly six minutes per
# 1000 of 138494 iterations, and the recorded run ends in KeyboardInterrupt.
pmf.fit(ratings_combined)

force_refresh is set to: True
persisting user mapping
persisting movie mapping
building omega | 2017-08-25 12:53:29.127686
building omega_u | 2017-08-25 12:53:53.385816
iteration 0 of 138494 | 2017-08-25 12:53:53.385816
iteration 1000 of 138494 | 2017-08-25 12:59:49.058750
iteration 2000 of 138494 | 2017-08-25 13:05:42.412880
iteration 3000 of 138494 | 2017-08-25 13:11:43.260539
iteration 4000 of 138494 | 2017-08-25 13:17:28.794088
iteration 5000 of 138494 | 2017-08-25 13:23:11.647975


KeyboardInterrupt: 

## Persist model

In [26]:
# Serialise the model object to disk. The `with` block guarantees the file is
# flushed and closed even if pickling fails; the original passed a bare
# open() handle that was never closed.
# NOTE(review): the recorded fit run above ends in KeyboardInterrupt, so make
# sure `pmf` holds a fully trained model before relying on this file.
with open('ryan_pmf_model_trained.pkl', "wb") as model_file:
    pickle.dump(pmf, model_file)