# Recomendation system
## Content based filtering

In [1]:
import pandas as pd
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from evaluation import calculate_mse_and_rmse

In [2]:
ratings = pd.read_csv('data/ratings.csv', low_memory=False)
ratings = ratings.drop('timestamp', axis=1)
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,110,1.0
1,1,147,4.5
2,1,858,5.0
3,1,1221,5.0
4,1,1246,5.0


In [3]:
movie_metadata = pd.read_csv("data/movies_metadata.csv", low_memory=False)
movie_metadata = movie_metadata[['id', 'title']]
movie_metadata = movie_metadata.drop([19730, 29503, 35587])
movie_metadata = movie_metadata[pd.to_numeric(movie_metadata['id'], errors='coerce').notnull()]
movie_metadata = movie_metadata.rename(columns={'id': 'movieId'})
movie_metadata['movieId'] = movie_metadata['movieId'].astype(int)
movie_metadata.head()

Unnamed: 0,movieId,title
0,862,Toy Story
1,8844,Jumanji
2,15602,Grumpier Old Men
3,31357,Waiting to Exhale
4,11862,Father of the Bride Part II


In [4]:
movie_data = ratings.merge(movie_metadata, how='left', on='movieId')
movie_data.head()

Unnamed: 0,userId,movieId,rating,title
0,1,110,1.0,Three Colors: Red
1,1,147,4.5,The 400 Blows
2,1,858,5.0,Sleepless in Seattle
3,1,1221,5.0,
4,1,1246,5.0,Rocky Balboa


In [5]:
df_movies_cnt = pd.DataFrame(ratings.groupby('movieId').size(), columns=['count'])
popular_movies = list(set(df_movies_cnt.query('count >= 0').index))
movies_filter = ratings.movieId.isin(popular_movies).values

df_users_cnt = pd.DataFrame(ratings.groupby('userId').size(), columns=['count'])
active_users = list(set(df_users_cnt.query('count >= 0').index))
users_filter = ratings.userId.isin(active_users).values

df_ratings_filtered = ratings[movies_filter & users_filter]
df_ratings_filtered.head()

Unnamed: 0,userId,movieId,rating
0,1,110,1.0
1,1,147,4.5
2,1,858,5.0
3,1,1221,5.0
4,1,1246,5.0


In [6]:
reader = Reader()
ratings_by_users = Dataset.load_from_df(df_ratings_filtered[['userId', 'movieId', 'rating']], reader)
train_df, test_df = train_test_split(ratings_by_users, test_size=.2)
test_df.head()

[(143991, 4175, 0.5),
 (253617, 3736, 3.0),
 (15038, 8207, 4.0),
 (72471, 8665, 4.0),
 (144707, 348, 3.0),
 (89604, 3173, 1.0),
 (96504, 7361, 3.5),
 (199144, 3, 3.0),
 (69255, 2841, 4.5),
 (234214, 3489, 3.0),
 (181806, 1682, 4.0),
 (130967, 165, 3.0),
 (102105, 1162, 1.0),
 (121746, 3513, 3.5),
 (217652, 106487, 3.5),
 (64475, 586, 4.0),
 (257017, 1923, 3.5),
 (72503, 726, 4.0),
 (30954, 4866, 4.0),
 (169314, 46976, 4.5),
 (35296, 5445, 3.0),
 (185144, 318, 4.0),
 (117435, 3529, 1.5),
 (114777, 3895, 3.0),
 (79858, 953, 2.0),
 (255449, 163, 4.0),
 (116793, 1797, 4.0),
 (187293, 6874, 3.0),
 (120046, 6377, 3.5),
 (15115, 288, 4.5),
 (150162, 1, 5.0),
 (18660, 2324, 4.0),
 (249658, 356, 4.0),
 (267143, 4223, 4.0),
 (236348, 4020, 4.0),
 (234003, 7090, 5.0),
 (203644, 4641, 4.0),
 (52260, 64197, 4.0),
 (63647, 95441, 3.0),
 (205556, 2137, 4.0),
 (244817, 104, 4.0),
 (140910, 508, 4.0),
 (18985, 33794, 4.0),
 (180262, 1876, 4.0),
 (161539, 187, 4.0),
 (121924, 2713, 1.0),
 (268484, 1057,

In [7]:
svd_model = SVD()
svd_model_trained = svd_model.fit(train_df)

In [8]:
def get_recommendations(user_id):
    pred_series = []
    users_ratings = ratings[ratings['userId'] == user_id]
    for movie_id, name in zip(movie_metadata.index, movie_metadata['title']):
        rating_real = ratings.query(f'movieId == {movie_id}')['rating'].values[0] if movie_id in users_ratings[
            'movieId'].values else 0
        rating_pred = svd_model_trained.predict(user_id, movie_id, rating_real, verbose=False)
        pred_series.append([movie_id, name, rating_pred.est, rating_real])

    df_recommendations = pd.DataFrame(pred_series, columns=['movieId', 'title', 'predicted_rating', 'actual_rating'])
    return df_recommendations.sort_values(by='predicted_rating', ascending=False).head(10)

In [9]:
get_recommendations(222)

Unnamed: 0,movieId,title,predicted_rating,actual_rating
2398,2398,The Ballad of Narayama,4.417505,0.0
3072,3072,Supernova,4.411415,0.0
2424,2424,The Dancemaker,4.382715,0.0
783,783,She's the One,4.290113,2.0
62,62,Don't Be a Menace to South Central While Drink...,4.27222,0.0
1688,1688,As Good as It Gets,4.228205,0.0
1035,1035,Shall We Dance,4.222253,0.0
4664,4664,Father Goose,4.206937,0.0
1907,1907,Babes in Toyland,4.203389,4.0
6753,6753,Lipstick,4.195691,0.0


In [10]:
test_df = [{'userId': obj[0], 'movieId': obj[1], 'rating': obj[2]} for obj in test_df]
test_df = pd.DataFrame(test_df)
test_df.head()

Unnamed: 0,userId,movieId,rating
0,143991,4175,0.5
1,253617,3736,3.0
2,15038,8207,4.0
3,72471,8665,4.0
4,144707,348,3.0


In [11]:
data = []
ratings_test = pd.merge(pd.DataFrame(test_df), movie_metadata[['movieId']], left_on='movieId', right_on='movieId', how='inner')

for index, row in ratings_test.iterrows():
    user_id = row['userId']
    movie_id = row['movieId']
    rating = row['rating']
    predicted = svd_model_trained.predict(user_id, movie_id, rating, verbose=False).est
    data.append((rating, predicted))

In [12]:
mse, rmse = calculate_mse_and_rmse(data)
print("MSE:", mse)
print("RMSE:", rmse)

MSE: 0.6453978130303856
RMSE: 0.803366549608823
