# Hybrid recomendation system

In [41]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from evaluation import  calculate_mse_and_rmse
from converter import to_df

In [42]:
ratings = pd.read_csv('data/ratings.csv', low_memory=False)
ratings = ratings.drop('timestamp', axis=1)
reader = Reader()
ratings_by_users = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
ratings_train, ratings_test = train_test_split(ratings_by_users, test_size=0.2, random_state=42)

In [50]:
ratings = to_df(ratings_train)
test_df = [{'userId': obj[0], 'movieId': obj[1], 'rating': obj[2]} for obj in ratings_test]
test_df = pd.DataFrame(test_df)

In [44]:
movie_metadata = pd.read_csv("data/movie_metadata_soup.csv", low_memory=False)
movie_metadata.head()

Unnamed: 0.1,Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,tagline,title,video,vote_average,vote_count,cast,crew,keywords,director,soup
0,0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"['animation', 'comedy', 'family']",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,...,,Toy Story,False,7.7,5415.0,"['tomhanks', 'timallen', 'donrickles']","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","['jealousy', 'toy', 'boy']",johnlasseter,jealousy toy boy tomhanks timallen donrickles ...
1,1,False,,65000000,"['adventure', 'fantasy', 'family']",,8844,tt0113497,en,Jumanji,...,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"['robinwilliams', 'jonathanhyde', 'kirstendunst']","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","['boardgame', 'disappearance', ""basedonchildre...",joejohnston,boardgame disappearance basedonchildren'sbook ...
2,2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"['romance', 'comedy']",,15602,tt0113228,en,Grumpier Old Men,...,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,"['waltermatthau', 'jacklemmon', 'ann-margret']","[{'credit_id': '52fe466a9251416c75077a89', 'de...","['fishing', 'bestfriend', 'duringcreditsstinger']",howarddeutch,fishing bestfriend duringcreditsstinger walter...
3,3,False,,16000000,"['comedy', 'drama', 'romance']",,31357,tt0114885,en,Waiting to Exhale,...,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,"['whitneyhouston', 'angelabassett', 'lorettade...","[{'credit_id': '52fe44779251416c91011acb', 'de...","['basedonnovel', 'interracialrelationship', 's...",forestwhitaker,basedonnovel interracialrelationship singlemot...
4,4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,['comedy'],,11862,tt0113041,en,Father of the Bride Part II,...,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,"['stevemartin', 'dianekeaton', 'martinshort']","[{'credit_id': '52fe44959251416c75039ed7', 'de...","['baby', 'midlifecrisis', 'confidence']",charlesshyer,baby midlifecrisis confidence stevemartin dian...


In [45]:
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(movie_metadata['soup'])
count_matrix.shape

(46488, 73881)

In [46]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)
cosine_sim.shape

(46488, 46488)

In [47]:
indices = pd.Series(movie_metadata.index, index=movie_metadata['id'])
indices = indices.sort_values(ascending=False)

In [48]:
def estimate_content_based(user_id, movies):
    user_ratings = pd.merge(ratings[ratings['userId'] == user_id],
                            movie_metadata[['id', 'title']],
                            left_on='movieId',
                            right_on='id',
                            how='inner')
    user_ratings = user_ratings.drop('id', axis=1)
    user_ratings['movieId'] = user_ratings['movieId'].astype(int)

    reviewed_movie_ids = set(user_ratings['movieId'])
    pred_series = []
    for movie_id in movies['id']:
        if movie_id in reviewed_movie_ids:
            continue
        simTotal = weightedSum = 0
        sim_scores_row = list(enumerate(cosine_sim[indices[movie_id]]))
        for reviewed_id in reviewed_movie_ids:
            idx = indices[reviewed_id]
            if len(sim_scores_row) < 46488:
                continue
            if type(idx) != np.int64:
                continue
            sim_score = sim_scores_row[idx]
            simTotal += float(sim_score[1])
            weightedSum += float(sim_score[1]) * float(user_ratings[user_ratings['movieId'] == reviewed_id]['rating'])
        predictedRating = weightedSum / simTotal if simTotal != 0 else 0
        pred_series.append((movie_id, predictedRating))

    pred_series = sorted(pred_series, key=lambda x: x[1], reverse=True)[:10]
    return pred_series

In [ ]:
ratings_test = pd.merge(test_df, movie_metadata[['id']], left_on='movieId', right_on='id', how='inner')
ratings_test.head()

In [52]:
data = []

for index, row in ratings_test.iterrows():
    user_id = row['userId']
    movie_id = row['movieId']
    rating = row['rating']
    predicted = estimate_content_based(user_id, movie_metadata[movie_metadata['id'] == movie_id])[0][1]
    data.append((rating, predicted))

KeyboardInterrupt: 

In [53]:
mse, rmse = calculate_mse_and_rmse(data)
print("MSE:", mse)
print("RMSE:", rmse)

MSE: 1.3904892365443002
RMSE: 1.1791900765119676


In [56]:
svd_model = SVD()
svd_model_trained = svd_model.fit(ratings_train)

In [57]:
data = []
for index, row in ratings_test.iterrows():
    user_id = row['userId']
    movie_id = row['movieId']
    rating = row['rating']
    predicted = svd_model_trained.predict(user_id, movie_id, rating, verbose=False).est
    data.append((rating, predicted))

In [58]:
mse, rmse = calculate_mse_and_rmse(data)
print("MSE:", mse)
print("RMSE:", rmse)

MSE: 0.6468395498227477
RMSE: 0.8042633584981649


In [59]:
def predict_switching(user_id, movie_id, rating):
    num_user_ratings = len(ratings[ratings['userId'] == user_id])
    num_movie_ratings = len(ratings[ratings['movieId'] == movie_id])
    if num_user_ratings > 5 and num_movie_ratings > 50:
        return svd_model_trained.predict(user_id, movie_id, rating, verbose=False).est
    else:
        return estimate_content_based(user_id, movie_metadata[movie_metadata['id'] == movie_id])[0][1]

data = []
for index, row in ratings_test.iterrows():
    user_id = row['userId']
    movie_id = row['movieId']
    rating = row['rating']
    predicted = predict_switching(user_id, movie_id, rating)
    data.append((rating, predicted))


KeyboardInterrupt: 

In [60]:
mse, rmse = calculate_mse_and_rmse(data)
print("MSE:", mse)
print("RMSE:", rmse)

MSE: 0.6688055771183093
RMSE: 0.8178053418254917


In [61]:
def predict_weighted(user_id, movie_id, rating):
    prediction_cf= svd_model_trained.predict(user_id, movie_id, rating, verbose=False).est
    prediction_cn = estimate_content_based(user_id, movie_metadata[movie_metadata['id'] == movie_id])[0][1]
    prediction = 0.5 * prediction_cf + 0.5 * prediction_cn
    return prediction

data = []
for index, row in ratings_test.iterrows():
    user_id = row['userId']
    movie_id = row['movieId']
    rating = row['rating']
    predicted = predict_switching(user_id, movie_id, rating)
    data.append((rating, predicted))


KeyboardInterrupt



In [62]:
mse, rmse = calculate_mse_and_rmse(data)
print("MSE:", mse)
print("RMSE:", rmse)

MSE: 0.6077014267682725
RMSE: 0.7795520680289884
