# Movie Recommender

### Importing libraries

In [14]:
import pandas
from collections import defaultdict
from surprise import accuracy
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split

### Reading data
The data used is the [MovieLens 100K dataset](https://grouplens.org/datasets/movielens/100k/).

In [26]:
# Locations of the raw data files, relative to the notebook.
users_data_path = 'data/u.user'
items_data_path = 'data/u.item'
ratings_data_path = 'data/u.data'

# Parsing options handed to pandas.read_csv below.
DATA_SEPARATOR = '|'
DATA_ENCODING = 'latin-1'

# Column names are supplied explicitly through `names=` when reading.
ratings_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
items_cols = [
    'movie id', 'movie title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

# The items file is read with the '|' separator, the ratings file with tabs.
items = pandas.read_csv(
    items_data_path,
    sep=DATA_SEPARATOR,
    names=items_cols,
    encoding=DATA_ENCODING,
)
ratings = pandas.read_csv(
    ratings_data_path,
    sep='\t',
    names=ratings_cols,
    encoding=DATA_ENCODING,
)
ratings.head()

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


### Observing data

In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column of `ratings`.
ratings.describe()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
count,100000.0,100000.0,100000.0,100000.0
mean,462.48475,425.53013,3.52986,883528900.0
std,266.61442,330.798356,1.125674,5343856.0
min,1.0,1.0,1.0,874724700.0
25%,254.0,175.0,3.0,879448700.0
50%,447.0,322.0,4.0,882826900.0
75%,682.0,631.0,4.0,888260000.0
max,943.0,1682.0,5.0,893286600.0


### Transforming data to surprise's format

In [17]:
# Only the user, item and rating columns are handed to surprise; the timestamp is left out.
surprise_columns = ['user_id', 'movie_id', 'rating']
ratings_for_surprise = ratings[surprise_columns]

# Ratings are declared to lie on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))
rating_data = Dataset.load_from_df(ratings_for_surprise, reader)

### Splitting data into train and test sets

In [18]:
# Fixed seed so that Restart & Run All reproduces the same 75% / 25% split.
SPLIT_RANDOM_STATE = 42

trainset, testset = train_test_split(
    rating_data,
    test_size=.25,
    random_state=SPLIT_RANDOM_STATE,
)

### Fitting the model

In [19]:
# Fixed seed so that the fitted model (and therefore the recommendations
# shown further down) is the same on every run.
SVD_RANDOM_STATE = 42

algo = SVD(random_state=SVD_RANDOM_STATE)

# Left as the last expression so the fitted estimator is echoed as cell output.
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f2f435c9cf8>

### Trying it out
Here we can see which movies are recommended to each user.

In [20]:
# Per the Surprise documentation, the anti-testset holds the (user, item)
# pairs that have no rating in the trainset, i.e. the recommendation candidates.
# It gets its own name so the held-out `testset` produced by the train/test
# split is not overwritten.
anti_testset = trainset.build_anti_testset()
predictions = algo.test(anti_testset)

In [21]:
def get_top_n(predictions, n=10):
    '''Group predictions by user and keep each user's n best-rated items.

    Args:
        predictions(iterable of 5-tuples): Each entry unpacks as
            (user id, item id, true rating, estimated rating, details),
            e.g. the output of an algorithm's test method.
        n(int): How many recommendations to keep per user. Default is 10.

    Returns:
    A defaultdict(list) keyed by user id. Each value is a list of
        (item id, estimated rating) tuples, ordered from the highest to the
        lowest estimate and cut to at most n entries.
    '''

    # Collect every (item, estimate) pair under the user it belongs to.
    per_user = defaultdict(list)
    for user_id, item_id, _true_rating, estimate, _details in predictions:
        per_user[user_id].append((item_id, estimate))

    # Rank each user's candidates by estimate, best first, and truncate.
    for user_id in list(per_user):
        ranked = sorted(per_user[user_id], key=lambda pair: pair[1], reverse=True)
        per_user[user_id] = ranked[:n]

    return per_user


In [44]:
# Raw id of the user whose recommendations are displayed, and how many to show.
TARGET_USER_ID = 1
TOP_N = 10

top_n = get_top_n(predictions, n=TOP_N)
user_predictions_ids_df = pandas.DataFrame(
    top_n[TARGET_USER_ID],
    columns=['movie_id', 'rating estimation'],
)

# Left-join against `items` to attach the title and release date of every
# recommended movie, then keep only the columns worth displaying.
user_predictions = pandas.merge(
    user_predictions_ids_df,
    items,
    left_on="movie_id",
    right_on="movie id",
    how="left",
)[['movie_id', 'movie title', 'release date', 'rating estimation']]
user_predictions

Unnamed: 0,movie_id,movie title,release date,rating estimation
0,408,"Close Shave, A (1995)",28-Apr-1996,5.0
1,357,One Flew Over the Cuckoo's Nest (1975),01-Jan-1975,5.0
2,515,"Boot, Das (1981)",04-Apr-1997,4.833348
3,285,Secrets & Lies (1996),04-Oct-1996,4.705396
4,178,12 Angry Men (1957),01-Jan-1957,4.701404
5,300,Air Force One (1997),01-Jan-1997,4.687977
6,302,L.A. Confidential (1997),01-Jan-1997,4.67375
7,124,Lone Star (1996),21-Jun-1996,4.650673
8,603,Rear Window (1954),01-Jan-1954,4.583964
9,168,Monty Python and the Holy Grail (1974),01-Jan-1974,4.550565


### Evaluation

In [23]:
# Cross-validate a fresh SVD model with cv=2 and tabulate the returned results.
cv_results = cross_validate(SVD(), rating_data, cv=2)
pandas.DataFrame(data=cv_results)

Unnamed: 0,fit_time,test_mae,test_rmse,test_time
0,2.939722,0.760164,0.959587,0.322586
1,2.827434,0.752039,0.953218,0.325236
