# Acquire Grouplens movies Dataset
#### from grouplens websites [here](https://grouplens.org/datasets/movielens/latest/)

In [1]:
import urllib.request
urllib.request.urlretrieve("https://files.grouplens.org/datasets/movielens/ml-latest.zip", "dataset.zip") # download file from url
!unzip dataset.zip # extract files from zipfile

Archive:  dataset.zip
   creating: ml-latest/
  inflating: ml-latest/tags.csv      
  inflating: ml-latest/links.csv     
  inflating: ml-latest/README.txt    
  inflating: ml-latest/ratings.csv   
  inflating: ml-latest/genome-tags.csv  
  inflating: ml-latest/genome-scores.csv  
  inflating: ml-latest/movies.csv    


#### Import necessary packages

In [2]:
import pandas as pd
import numpy as np
from surprise import Reader, Dataset, SVD
dataset_path = "ml-latest/"
import warnings; warnings.simplefilter('ignore')

In [3]:
reader = Reader() # for Data structure
ratings = pd.read_csv(dataset_path+'ratings.csv', usecols=[ 'userId', "movieId", "rating" ])
movies = pd.read_csv( dataset_path+"/movies.csv" ).set_index("movieId")
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,110,4.0
2,1,158,4.0
3,1,260,4.5
4,1,356,5.0


In [4]:
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader) 

#### Model Selection 

In [5]:
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import KNNBasic

parameters ={
        "n_factors":20, "n_epochs":20,"reg_all":0.005,"random_state":11
    }
# for i in [ 0.005, 0.006, 0.007, 0.008, 0.009, 0.01 ]:
#     print(i)
#     parameters["reg_all"] = i

svd = SVD(**parameters)
print(cross_validate(svd, data,measures=['RMSE'], n_jobs = 1)['test_rmse'].mean())


0.7901703773759757


In [6]:
svd = SVD(**parameters) # prediction model 
trainset = data.build_full_trainset() 
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7a79cdc29d50>

In [7]:
import json
# movie dataset id to trained inner movie dataset index store
with open( "movie_id_to_vector_id.json", "w" ) as f:
    json.dump(trainset._raw2inner_id_items, f)

# trained movies vector for prediction store
np.save( "movie_vectors.npy", svd.qi.astype(np.float32) )

# top movies based number of rating by
top_movies = pd.read_csv( "ml-latest/ratings.csv", usecols=[ "userId", "movieId" ] )
top_movies = top_movies.groupby( "movieId" ).count(  )
top_movies = top_movies.sort_values("userId", ascending=False).head( 100 ).index.values
np.save( "top_movies.npy", top_movies.astype(np.int32) )

In [8]:
from scipy.spatial.distance import cosine

import numpy as np
import pandas as pd
import json

max_display_movies = 10
movies = pd.read_csv( "ml-latest/movies.csv" ).set_index( "movieId" )
vectors = np.load( "movie_vectors.npy" )
mapping_id = None
with open( "movie_id_to_vector_id.json", "r" ) as f:
    mapping_id = { int(k):int(v) for k,v in json.load( f ).items()}
top_movies = np.load( "top_movies.npy" )[:max_display_movies]

reverse_mapping_id = { v:k for k, v in mapping_id.items() }


def get_cosine_distance( target ):
    distances = []
    for x in vectors:
        distances.append( cosine( target, x ) )
    return np.array( distances )

def get_list( viewed ):
    if len(viewed) == 0:
        return top_movies.tolist()
    cummalate = np.zeros( vectors.shape[0], dtype = np.float32 )
    viewed = [ mapping_id[x] for x in viewed ]
    for target in viewed:
        cummalate += get_cosine_distance( vectors[target]  )
    viewed = set(viewed)
    return [reverse_mapping_id[x] for x in np.argsort( cummalate ) if x not in viewed ][:max_display_movies]

In [9]:
[movies.loc[x] for x in get_list( [ 1, 6377 ] )]

[title                           Monsters, Inc. (2001)
 genres    Adventure|Animation|Children|Comedy|Fantasy
 Name: 4886, dtype: object,
 title                              Toy Story 2 (1999)
 genres    Adventure|Animation|Children|Comedy|Fantasy
 Name: 3114, dtype: object,
 title           Ratatouille (2007)
 genres    Animation|Children|Drama
 Name: 50872, dtype: object,
 title                                   Toy Story 3 (2010)
 genres    Adventure|Animation|Children|Comedy|Fantasy|IMAX
 Name: 78499, dtype: object,
 title                    Bug's Life, A (1998)
 genres    Adventure|Animation|Children|Comedy
 Name: 2355, dtype: object,
 title                              Up (2009)
 genres    Adventure|Animation|Children|Drama
 Name: 68954, dtype: object,
 title                        Incredibles, The (2004)
 genres    Action|Adventure|Animation|Children|Comedy
 Name: 8961, dtype: object,
 title                  Partly Cloudy (2009)
 genres    Animation|Children|Comedy|Fantasy
 Name

In [10]:
import os, shutil
os.remove( "dataset.zip" )
shutil.rmtree( "ml-latest" )