# Movies Recommender System

![](http://labs.criteo.com/wp-content/uploads/2017/08/CustomersWhoBought3.jpg)

In [1]:
%matplotlib inline
import surprise
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

import warnings; warnings.simplefilter('ignore')

In [2]:
md = pd.read_csv('input/movies_metadata.csv')
md.head()
md.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [3]:
ratings = pd.read_csv('input/ratings_small.csv')
ratings = ratings.drop(['timestamp'], axis = 1)
ratings.head()




Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [4]:
qualified = pd.read_csv('qualified_movies.csv')
qualified = qualified.set_index('Unnamed: 0')
qualified.head()

Unnamed: 0_level_0,title,year,vote_count,vote_average,popularity,genres,wr
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
15480,Inception,2010,14075,8,29.108149,"['Action', 'Thriller', 'Science Fiction', 'Mys...",7.917588
12481,The Dark Knight,2008,12269,8,123.167259,"['Drama', 'Action', 'Crime', 'Thriller']",7.905871
22879,Interstellar,2014,11187,8,32.213481,"['Adventure', 'Drama', 'Science Fiction']",7.897107
2843,Fight Club,1999,9678,8,63.869599,['Drama'],7.881753
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.070725,"['Adventure', 'Fantasy', 'Action']",7.871787


In [12]:
print(qualified.index.values)

[15480 12481 22879  2843  4863   292   314  7000   351  5814   256  1225
   834  1154    46 24860   359 18465 22841   586 11354   522 23673   289
  4099  3030  5481  1213  1057  2211  1163  1178    49  1170  2216  4135
  1152 32144  1201   109  1176  1159  1161  1165  9698  2884 10309  9430
  5857  5878  5833 41128   877   732   896   876  1166  1184   926 40251
  5553  1910  3342   883  1132  1236 19901 25465 43190 14551 17818 26564
 20051 23753 26553 18252  2458 12588 37863 19971  1901 26555  1639 10122
 26567 30051  6390  4766 13724 26558 22131 30315 22058 13605 15472 14825
 31865 12704 13746 42170  6232 24455 23359  4756 17437 25534  7725 26562
 23675  5678 19735 14557 23053 26568 21592 10554 40598 16128 21025 11927
 30401  3456 24121 24351 22110 13893     0 11008  8234 17217  1919 41489
 11662  6725 33356 23465 36253   475 25390 24482 26566 19731  1167 15017
 40882 21161 15348  1604 23561 19726 22059 23437 24241 13643  1171 16066
 10839 23692 11567 20922 41492 11316 17588 12595 28

### Top Movies

In [5]:
import re


def get_user_ratings():
    movie_id, rating_int = [],[]
    for index, row in qualified.iterrows():

        movie_title = row['title']
        #TODO fix words
        # viable responses: y, n, m (yes, no, maybe - option for having not seen movie) 
        invalid_input = True
        while invalid_input:
            prompt = 'Rate the following movie on a scale from 0 to 5, if you have seen it - decimals are allowed. Then press enter. If you have not seen the movie, press Enter/Return. Press q if you would like to quit:\n' + movie_title
            user_rating = input(prompt)
            if user_rating.lower() == 'q':
                invalid_input = False
                return movie_id, rating_int
            elif (re.match(r'^-?\d+(?:\.\d+)?$', user_rating) is not None) and (float(user_rating) <= 5 and float(user_rating)>=0):
 
                movie_id.append(index)
                rating_int.append(user_rating)
                invalid_input = False
            else:
                print('ERROR: invalid input')
    
    return movie_id, rating_int

In [7]:
uid = input("If you already have used this service, input your unique user ID and press Enter. If not, type 'n'.")

if uid == 'n':
    uid = max(ratings['userId']) + 1
    print("Your unique user ID is: ", uid)
    
    movie_id, rating_int = get_user_ratings()
    

    ratings = ratings.append(pd.DataFrame({'userId': [uid] * len(movie_id),'movieId':movie_id, 'rating':rating_int}))


else: 
    uid = int(uid)
    #TODO - DO SOMETHING DIFF - ASK FOR RATINGS FOR NA ROWS


check = ratings[ratings['userId'] == uid]
check.tail()
    

If you already have used this service, input your unique user ID and press Enter. If not, type 'n'.n
Your unique user ID is:  673
Rate the following movie on a scale from 0 to 5, if you have seen it - decimals are allowed. Then press enter. If you have not seen the movie, press Enter/Return. Press q if you would like to quit:
Inception1
Rate the following movie on a scale from 0 to 5, if you have seen it - decimals are allowed. Then press enter. If you have not seen the movie, press Enter/Return. Press q if you would like to quit:
The Dark Knight1
Rate the following movie on a scale from 0 to 5, if you have seen it - decimals are allowed. Then press enter. If you have not seen the movie, press Enter/Return. Press q if you would like to quit:
Interstellar1
Rate the following movie on a scale from 0 to 5, if you have seen it - decimals are allowed. Then press enter. If you have not seen the movie, press Enter/Return. Press q if you would like to quit:
Fight Club1
Rate the following movie

Unnamed: 0,movieId,rating,userId
1,12481,1,673
2,22879,1,673
3,2843,1,673
4,4863,1,673
5,292,1,673


In [8]:
# save updated csv
ratings.to_csv('updated_ratings.csv')

In [9]:
#SVD Fitting
group_userId = [672, 673]#, 672]

In [10]:
reader = Reader()
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'],cv=5)
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x13471b400>

In [None]:
# TODOO - DELEETE AFTER OR JUST SAVE OFF AS RATINGS AND REPLACE ONCE CONFIDENT WE WONT FUICK IT UP
ratings = pd.read_csv('updated_ratings.csv')
ratings.head()

In [14]:
movie_ids = set(qualified.index.values.tolist())
print(len(movie_ids))
for uid in group_userId:
    #TODO - make rows for all movies in qualified for each of the users in the group (if they already exist, dont overwrite thoough!)
    user_rated_movies = ratings.loc[lambda ratings: ratings['userId']==uid]['movieId'].values.tolist()
    #print(user_rated_movies)
    #print(uid, set(movie_ids))
    #print(movie_ids)
    print(uid, len(user_rated_movies))

    dummy_row_idx = list(set(movie_ids).difference(set(user_rated_movies)))
     
    #print(dummy_row_idx)
    print(len(dummy_row_idx))
    
    dummy_row_uid = [uid]*len(dummy_row_idx)
    dummy_row_rating = [np.nan]*len(dummy_row_idx)
    ratings = ratings.append(pd.DataFrame({'userId':dummy_row_uid,'movieId':dummy_row_idx, 'rating':dummy_row_rating}))

250
672 6
244
673 6
244


In [None]:
#md[md['title']=='The Dark Knight']

In [None]:
# titles = ['The Dark Knight']#'Inception', , 'Interstellar']
# for ti in titles:
#     print(md.loc[lambda md: md['title']==ti])

In [15]:
ratings['est'] = [np.nan]*len(ratings)
for uid in group_userId:
    #print([svd.predict(uid, movie_id).est for movie_id in movie_ids])
    print(len(ratings.loc[lambda ratings: ratings['userId']==uid,['est']]),len([svd.predict(uid, movie_id).est for movie_id in movie_ids]))
    #ratings[ratings['userId']==uid].sort_values(by='movieId', ascending=True)['est']
    ratings.loc[lambda ratings: ratings['userId']==uid,['est']] = [svd.predict(uid, movie_id).est for movie_id in list(movie_ids)]

print(ratings[ratings['userId'].isin(group_userId)])

250 250
250 250
     movieId rating  userId       est
0      15480      5     672  3.881921
1      12481      5     672  3.881921
2      22879      4     672  3.881921
3       2843      3     672  3.881921
4       4863      5     672  3.881921
5        292      4     672  3.881921
0      15480      1     673  2.560574
1      12481      1     673  2.560574
2      22879      1     673  2.560574
3       2843      1     673  2.560574
4       4863      1     673  2.560574
5        292      1     673  2.560574
0          0    NaN     672  3.881921
1      23553    NaN     672  3.881921
2       4099    NaN     672  3.637955
3      19971    NaN     672  3.881921
4      23557    NaN     672  3.881921
5      19460    NaN     672  3.881921
6      23555    NaN     672  3.881921
7      23561    NaN     672  3.881921
8        522    NaN     672  3.682777
9      23053    NaN     672  3.881921
10     12815    NaN     672  3.881921
11      2576    NaN     672  3.881921
12     41489    NaN     672  4.073

In [16]:
#Ratings that we care about
movie_recs = ratings[ratings['userId'].isin(group_userId)].groupby(["movieId"])['est'].mean().to_frame()

movie_recs = movie_recs.sort_values(by=['est'],ascending=False)

movie_recs['title'] = [qualified.loc[movie_id]['title'] for movie_id in movie_recs.index.tolist()]
print(movie_recs)

              est                                              title
movieId                                                             
926      4.083002                              It's a Wonderful Life
13966    3.932721                                         District 9
475      3.875512                                      Jurassic Park
22718    3.858179                                     The Lego Movie
24121    3.819024                                    The Maze Runner
3030     3.772508                                     The Green Mile
2997     3.767566                                        Toy Story 2
43190    3.691352                                   Band of Brothers
2216     3.650689                                 American History X
10554    3.520772                Harry Potter and the Goblet of Fire
22841    3.479845                           The Grand Budapest Hotel
13060    3.443895                                Slumdog Millionaire
3456     3.441494                 