# Movie Recommender

### Importing libraries

In [85]:
import pandas
from collections import defaultdict
from surprise import accuracy
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split

### Reading data
The data used is the [MovieLens 100K dataset](https://grouplens.org/datasets/movielens/100k/).

In [48]:
# Parsing settings shared by the raw data files.
DATA_ENCODING = 'latin-1'
DATA_SEPARATOR = '|'

# Locations of the three raw files.
ratings_data_path = 'data/u.data'
users_data_path = 'data/u.user'
items_data_path = 'data/u.item'

# Column names are supplied explicitly through `names=` when reading each file.
ratings_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
users_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
items_cols = [
    # Descriptive fields.
    'movie id', 'movie title', 'release date', 'video release date', 'IMDb URL',
    # Genre columns.
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

users = pandas.read_csv(
    users_data_path,
    sep=DATA_SEPARATOR,
    names=users_cols,
    encoding=DATA_ENCODING,
)
items = pandas.read_csv(
    items_data_path,
    sep=DATA_SEPARATOR,
    names=items_cols,
    encoding=DATA_ENCODING,
)
# The ratings file is read with a tab separator rather than DATA_SEPARATOR.
ratings = pandas.read_csv(
    ratings_data_path,
    sep='\t',
    names=ratings_cols,
    encoding=DATA_ENCODING,
)
# Last expression of the cell: displays the ratings frame in the notebook.
ratings

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
5,298,474,4,884182806
6,115,265,2,881171488
7,253,465,5,891628467
8,305,451,3,886324817
9,6,86,3,883603013


### Observing data

### Transforming data to Surprise's format

In [55]:
# Ratings are declared to lie on a 1-to-5 scale.
reader = Reader(rating_scale=(1, 5))

# Hand Surprise only the user, item and rating columns, in that order;
# the timestamp column is left out.
rating_data = Dataset.load_from_df(
    ratings[['user_id', 'movie_id', 'rating']],
    reader,
)

{'fit_time': (2.635237216949463, 2.623155117034912),
 'test_mae': array([0.75810748, 0.75655998]),
 'test_rmse': array([0.9595054 , 0.95724903]),
 'test_time': (0.366255521774292, 0.3633718490600586)}

### Splitting data to train and test sets

In [57]:
# Hold out 25% of the ratings as a test set and train on the rest.
# `rating_data` is the Surprise dataset built from the ratings frame above.
trainset, testset = train_test_split(rating_data, test_size=.25)

### Fitting the model

In [92]:
# Create an SVD model; no arguments are passed, so the library defaults apply.
algo = SVD()

# Train on the training split. Kept as the cell's last expression so the
# notebook displays the value returned by fit().
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f58f74acc18>

### Trying it out
Here we can see which movies are recommended for each user.

In [87]:
def get_top_n(predictions, n=10):
    """Group predictions by user and keep each user's n best-rated items.

    Args:
        predictions: Iterable of 5-field prediction records unpacking as
            (user id, item id, true rating, estimated rating, details),
            as returned by the test method of an algorithm.
        n(int): Maximum number of recommendations kept per user. Default
            is 10.

    Returns:
        A defaultdict(list) mapping each user (raw) id to a list of
        (raw item id, rating estimation) tuples, ordered from the highest
        estimate to the lowest and cut to at most n entries.
    """
    # Collect every (item, estimate) pair under the user it belongs to.
    per_user = defaultdict(list)
    for prediction in predictions:
        uid, iid, _true_r, est, _details = prediction
        per_user[uid].append((iid, est))

    # Rank each user's candidates by estimate (best first) and keep the
    # first n. Only existing keys are reassigned, so iterating is safe.
    for uid in per_user:
        ranked = sorted(per_user[uid], key=lambda pair: pair[1], reverse=True)
        per_user[uid] = ranked[:n]

    return per_user

# Build the anti-testset from the training set and score every entry in it
# with the fitted model.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

# Keep the ten highest estimates per user.
top_n = get_top_n(predictions, n=10)

# One line per user: the user id followed by the recommended item ids.
for uid, user_ratings in top_n.items():
    print(uid, [pair[0] for pair in user_ratings])

184 [408, 59, 654, 178, 474, 302, 60, 169, 519, 12]
393 [114, 50, 318, 408, 178, 483, 59, 190, 1039, 174]
811 [511, 313, 357, 318, 474, 483, 408, 12, 169, 178]
404 [64, 318, 483, 174, 520, 12, 657, 515, 265, 651]
830 [169, 12, 318, 272, 64, 408, 114, 357, 923, 515]
21 [603, 357, 483, 178, 12, 64, 318, 23, 187, 480]
711 [654, 657, 285, 100, 178, 603, 114, 498, 127, 302]
286 [178, 251, 12, 515, 318, 69, 320, 479, 638, 963]
934 [318, 408, 511, 484, 114, 357, 12, 483, 79, 479]
305 [515, 513, 923, 657, 114, 124, 528, 498, 529, 1194]
298 [12, 64, 603, 169, 408, 57, 181, 114, 657, 190]
686 [408, 169, 488, 657, 174, 511, 190, 285, 480, 498]
919 [603, 100, 23, 98, 408, 483, 12, 606, 480, 64]
144 [64, 427, 318, 511, 168, 657, 513, 272, 408, 515]
798 [408, 318, 96, 114, 64, 430, 479, 169, 923, 483]
757 [169, 318, 408, 114, 483, 427, 603, 12, 178, 480]
409 [408, 169, 50, 64, 313, 646, 655, 963, 272, 316]
233 [657, 427, 648, 408, 178, 190, 480, 98, 513, 96]
309 [408, 12, 114, 174, 173, 318, 22, 169

### Evaluation

In [91]:
# 2-fold cross-validation of a freshly constructed SVD on the full rating
# dataset. Kept as the cell's last expression so the notebook displays the
# returned results.
cross_validate(SVD(), rating_data, cv=2)

{'fit_time': (2.755220890045166, 2.693763256072998),
 'test_mae': array([0.75820279, 0.75583506]),
 'test_rmse': array([0.95973016, 0.95635902]),
 'test_time': (0.350313663482666, 0.3062303066253662)}