# Recomendation system
## Content based filtering

In [1]:
import pandas as pd
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split

In [2]:
ratings = pd.read_csv('data/ratings.csv', low_memory=False)
ratings = ratings.drop('timestamp', axis=1)
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,110,1.0
1,1,147,4.5
2,1,858,5.0
3,1,1221,5.0
4,1,1246,5.0


In [3]:
movie_metadata = pd.read_csv("data/movies_metadata.csv", low_memory=False)
movie_metadata = movie_metadata[['id', 'title']]
movie_metadata = movie_metadata.drop([19730, 29503, 35587])
movie_metadata =movie_metadata[pd.to_numeric(movie_metadata['id'], errors='coerce').notnull()]
movie_metadata=movie_metadata.rename(columns = {'id':'movieId'})
movie_metadata['movieId'] = movie_metadata['movieId'].astype(int)
movie_metadata.head()

Unnamed: 0,movieId,title
0,862,Toy Story
1,8844,Jumanji
2,15602,Grumpier Old Men
3,31357,Waiting to Exhale
4,11862,Father of the Bride Part II


In [4]:
movie_data = ratings.merge(movie_metadata, how='left', on='movieId')
movie_data.head()

Unnamed: 0,userId,movieId,rating,title
0,1,110,1.0,Three Colors: Red
1,1,147,4.5,The 400 Blows
2,1,858,5.0,Sleepless in Seattle
3,1,1221,5.0,
4,1,1246,5.0,Rocky Balboa


In [5]:
df_movies_cnt = pd.DataFrame(ratings.groupby('movieId').size(), columns=['count'])
popular_movies = list(set(df_movies_cnt.query('count >= 150').index))
movies_filter = ratings.movieId.isin(popular_movies).values

df_users_cnt = pd.DataFrame(ratings.groupby('userId').size(),columns=['count'])
active_users = list(set(df_users_cnt.query('count >= 15').index))
users_filter = ratings.userId.isin(active_users).values

df_ratings_filtered = ratings[movies_filter & users_filter]
df_ratings_filtered.head()

Unnamed: 0,userId,movieId,rating
0,1,110,1.0
1,1,147,4.5
2,1,858,5.0
3,1,1221,5.0
4,1,1246,5.0


In [6]:
reader = Reader()
ratings_by_users = Dataset.load_from_df(df_ratings_filtered[['userId', 'movieId', 'rating']], reader)
train_df, test_df = train_test_split(ratings_by_users, test_size=.2)
test_df

[(119834, 74, 3.0),
 (83353, 76210, 4.0),
 (126278, 785, 4.0),
 (144369, 111, 5.0),
 (194307, 3693, 3.0),
 (39204, 5388, 4.0),
 (143650, 595, 2.0),
 (124756, 350, 4.0),
 (211020, 3350, 4.5),
 (205016, 88140, 4.0),
 (154011, 2110, 3.0),
 (238653, 955, 5.0),
 (224977, 832, 3.0),
 (104387, 2001, 4.0),
 (38580, 2194, 3.0),
 (67557, 73321, 4.5),
 (57329, 4896, 4.0),
 (3452, 5899, 4.0),
 (99369, 1356, 4.0),
 (89821, 1059, 5.0),
 (133703, 1208, 5.0),
 (39601, 2716, 4.0),
 (6357, 6863, 4.5),
 (8039, 153, 4.0),
 (3125, 1653, 4.0),
 (168554, 3578, 3.5),
 (226855, 378, 4.0),
 (207416, 6886, 5.0),
 (190553, 3753, 4.0),
 (163719, 3552, 2.0),
 (96247, 3052, 4.5),
 (165651, 1784, 5.0),
 (86659, 45720, 4.0),
 (267495, 1387, 2.5),
 (24455, 3698, 0.5),
 (33182, 116897, 4.0),
 (176959, 367, 4.0),
 (259047, 2321, 4.0),
 (53249, 442, 3.0),
 (149494, 26082, 4.5),
 (249997, 1175, 4.0),
 (222305, 3107, 3.0),
 (205361, 610, 3.0),
 (179760, 3076, 4.0),
 (252513, 1580, 4.0),
 (248380, 64957, 5.0),
 (23626, 185, 

In [7]:
svd_model = SVD()
svd_model_trained = svd_model.fit(train_df)

In [8]:
def get_recommendations(user_id):
    pred_series= []
    users_ratings = ratings[ratings['userId'] == user_id]
    for movie_id, name in zip(movie_metadata.index, movie_metadata['title']):
        rating_real = ratings.query(f'movieId == {movie_id}')['rating'].values[0] if movie_id in users_ratings['movieId'].values else 0
        rating_pred = svd_model_trained.predict(user_id, movie_id, rating_real, verbose=False)
        pred_series.append([movie_id, name, rating_pred.est, rating_real])

    # print the results
    df_recommendations = pd.DataFrame(pred_series, columns=['movieId', 'title', 'predicted_rating', 'actual_rating'])
    return df_recommendations.sort_values(by='predicted_rating', ascending=False).head(10)

In [9]:
get_recommendations(678)

Unnamed: 0,movieId,title,predicted_rating,actual_rating
318,318,Swimming with Sharks,4.420327,0.0
527,527,Serial Mom,4.231642,0.0
1704,1704,Wild Things,4.220411,0.0
50,50,Guardian Angel,4.218498,0.0
858,858,Bogus,4.215867,0.0
2329,2329,Hilary and Jackie,4.214371,0.0
2324,2324,Wilde,4.213289,0.0
1221,1221,Unforgiven,4.211962,0.0
44552,44555,The Dancer,4.210034,0.0
5618,5618,Heartbeeps,4.209434,0.0
