# Recomendation system
## Content based filtering

In [1]:
import pandas as pd
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from evaluation import calculate_mse_and_rmse

In [2]:
ratings = pd.read_csv('data/ratings.csv', low_memory=False)
ratings = ratings.drop('timestamp', axis=1)
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,110,1.0
1,1,147,4.5
2,1,858,5.0
3,1,1221,5.0
4,1,1246,5.0


In [3]:
movie_metadata = pd.read_csv("data/movies_metadata.csv", low_memory=False)
movie_metadata = movie_metadata[['id', 'title']]
movie_metadata = movie_metadata.drop([19730, 29503, 35587])
movie_metadata = movie_metadata[pd.to_numeric(movie_metadata['id'], errors='coerce').notnull()]
movie_metadata = movie_metadata.rename(columns={'id': 'movieId'})
movie_metadata['movieId'] = movie_metadata['movieId'].astype(int)
movie_metadata.head()

Unnamed: 0,movieId,title
0,862,Toy Story
1,8844,Jumanji
2,15602,Grumpier Old Men
3,31357,Waiting to Exhale
4,11862,Father of the Bride Part II


In [4]:
movie_data = ratings.merge(movie_metadata, how='left', on='movieId')
movie_data.head()

Unnamed: 0,userId,movieId,rating,title
0,1,110,1.0,Three Colors: Red
1,1,147,4.5,The 400 Blows
2,1,858,5.0,Sleepless in Seattle
3,1,1221,5.0,
4,1,1246,5.0,Rocky Balboa


In [5]:
df_movies_cnt = pd.DataFrame(ratings.groupby('movieId').size(), columns=['count'])
popular_movies = list(set(df_movies_cnt.query('count >= 15').index))
movies_filter = ratings.movieId.isin(popular_movies).values

df_users_cnt = pd.DataFrame(ratings.groupby('userId').size(), columns=['count'])
active_users = list(set(df_users_cnt.query('count >= 5').index))
users_filter = ratings.userId.isin(active_users).values

df_ratings_filtered = ratings[movies_filter & users_filter]
df_ratings_filtered.head()

Unnamed: 0,userId,movieId,rating
0,1,110,1.0
1,1,147,4.5
2,1,858,5.0
3,1,1221,5.0
4,1,1246,5.0


In [6]:
reader = Reader()
ratings_by_users = Dataset.load_from_df(df_ratings_filtered[['userId', 'movieId', 'rating']], reader)
train_df, test_df = train_test_split(ratings_by_users, test_size=.2)
test_df

[(192300, 5292, 3.5),
 (227233, 109487, 4.0),
 (255023, 39, 2.5),
 (65645, 282, 3.0),
 (270684, 2414, 3.0),
 (72694, 1214, 4.0),
 (182803, 5367, 2.5),
 (246564, 47, 4.0),
 (30485, 80906, 2.0),
 (124674, 3077, 5.0),
 (37068, 1307, 4.0),
 (27799, 3147, 3.5),
 (19708, 3916, 4.5),
 (112108, 551, 4.0),
 (83888, 910, 4.5),
 (244115, 24, 3.5),
 (62930, 2496, 2.5),
 (256688, 2046, 2.0),
 (55847, 535, 4.0),
 (111255, 1250, 4.0),
 (141841, 71838, 2.5),
 (8659, 4281, 3.0),
 (21766, 2132, 5.0),
 (53227, 502, 1.0),
 (98787, 113, 2.5),
 (119449, 3821, 1.5),
 (26315, 1037, 2.0),
 (135559, 1321, 3.0),
 (115269, 1, 3.0),
 (26929, 50872, 3.5),
 (95549, 58627, 4.0),
 (11212, 3018, 3.0),
 (97887, 356, 4.0),
 (203706, 4857, 3.0),
 (26053, 1721, 3.0),
 (124450, 413, 2.5),
 (79640, 1645, 4.0),
 (8233, 2948, 4.0),
 (126986, 143245, 4.0),
 (214587, 5222, 4.0),
 (146395, 93297, 5.0),
 (24858, 3, 3.0),
 (175298, 2924, 3.5),
 (152185, 165, 4.0),
 (237200, 3101, 5.0),
 (232629, 1207, 5.0),
 (237132, 69306, 4.0),
 

In [7]:
svd_model = SVD()
svd_model_trained = svd_model.fit(train_df)

In [8]:
def get_recommendations(user_id):
    pred_series = []
    users_ratings = ratings[ratings['userId'] == user_id]
    for movie_id, name in zip(movie_metadata.index, movie_metadata['title']):
        rating_real = ratings.query(f'movieId == {movie_id}')['rating'].values[0] if movie_id in users_ratings[
            'movieId'].values else 0
        rating_pred = svd_model_trained.predict(user_id, movie_id, rating_real, verbose=False)
        pred_series.append([movie_id, name, rating_pred.est, rating_real])

    df_recommendations = pd.DataFrame(pred_series, columns=['movieId', 'title', 'predicted_rating', 'actual_rating'])
    return df_recommendations.sort_values(by='predicted_rating', ascending=False).head(10)

In [9]:
get_recommendations(222)

Unnamed: 0,movieId,title,predicted_rating,actual_rating
1721,1721,The Big One,4.661155,0.0
62,62,Don't Be a Menace to South Central While Drink...,4.444837,0.0
587,587,Snow White and the Seven Dwarfs,4.439476,0.0
3247,3247,The Devil's Brigade,4.378068,0.0
2431,2431,Relax... It's Just Sex,4.335528,0.0
2,2,Grumpier Old Men,4.291734,0.0
500,500,North,4.28584,0.0
2012,2012,Children of the Corn,4.241431,0.0
2424,2424,The Dancemaker,4.24119,0.0
168,168,Jeffrey,4.181322,0.0


In [10]:
test_df = [{'userId': obj[0], 'movieId': obj[1], 'rating': obj[2]} for obj in test_df]
test_df = pd.DataFrame(test_df)
test_df.head()

Unnamed: 0,userId,movieId,rating
0,192300,5292,3.5
1,227233,109487,4.0
2,255023,39,2.5
3,65645,282,3.0
4,270684,2414,3.0


In [11]:
data = []
ratings_test = pd.merge(pd.DataFrame(test_df), movie_metadata[['movieId']], left_on='movieId', right_on='movieId', how='inner')

for index, row in ratings_test.iterrows():
    user_id = row['userId']
    movie_id = row['movieId']
    rating = row['rating']
    predicted = svd_model_trained.predict(user_id, movie_id, rating, verbose=False).est
    data.append((rating, predicted))

In [12]:
mse, rmse = calculate_mse_and_rmse(data)
print("MSE:", mse)
print("RMSE:", rmse)

MSE: 0.6461272233805871
RMSE: 0.8038203924886375
