# Hybrid recomendation system

In [37]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from evaluation import calculate_mse_and_rmse
from converter import to_df

In [38]:
ratings = pd.read_csv('data/ratings_small.csv', low_memory=False)
ratings = ratings.drop('timestamp', axis=1)
reader = Reader()
ratings_by_users = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
ratings_train, ratings_test = train_test_split(ratings_by_users, test_size=0.2, random_state=42)

In [39]:
ratings = to_df(ratings_train)
test_df = [{'userId': obj[0], 'movieId': obj[1], 'rating': obj[2]} for obj in ratings_test]
test_df = pd.DataFrame(test_df)

In [40]:
movie_metadata = pd.read_csv("data/movie_metadata_soup.csv", low_memory=False)
movie_metadata.head()

Unnamed: 0.1,Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,tagline,title,video,vote_average,vote_count,cast,crew,keywords,director,soup
0,0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"['animation', 'comedy', 'family']",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,...,,Toy Story,False,7.7,5415.0,"['tomhanks', 'timallen', 'donrickles']","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","['jealousy', 'toy', 'boy']",johnlasseter,jealousy toy boy tomhanks timallen donrickles ...
1,1,False,,65000000,"['adventure', 'fantasy', 'family']",,8844,tt0113497,en,Jumanji,...,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"['robinwilliams', 'jonathanhyde', 'kirstendunst']","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","['boardgame', 'disappearance', ""basedonchildre...",joejohnston,boardgame disappearance basedonchildren'sbook ...
2,2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"['romance', 'comedy']",,15602,tt0113228,en,Grumpier Old Men,...,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,"['waltermatthau', 'jacklemmon', 'ann-margret']","[{'credit_id': '52fe466a9251416c75077a89', 'de...","['fishing', 'bestfriend', 'duringcreditsstinger']",howarddeutch,fishing bestfriend duringcreditsstinger walter...
3,3,False,,16000000,"['comedy', 'drama', 'romance']",,31357,tt0114885,en,Waiting to Exhale,...,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,"['whitneyhouston', 'angelabassett', 'lorettade...","[{'credit_id': '52fe44779251416c91011acb', 'de...","['basedonnovel', 'interracialrelationship', 's...",forestwhitaker,basedonnovel interracialrelationship singlemot...
4,4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,['comedy'],,11862,tt0113041,en,Father of the Bride Part II,...,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,"['stevemartin', 'dianekeaton', 'martinshort']","[{'credit_id': '52fe44959251416c75039ed7', 'de...","['baby', 'midlifecrisis', 'confidence']",charlesshyer,baby midlifecrisis confidence stevemartin dian...


In [41]:
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(movie_metadata['soup'])
count_matrix.shape

(46488, 73881)

In [42]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)
cosine_sim.shape

(46488, 46488)

In [43]:
indices = pd.Series(movie_metadata.index, index=movie_metadata['id'])
indices = indices.sort_values(ascending=False)

In [33]:
def estimate_content_based(user_id, movies):
    user_ratings = pd.merge(ratings[ratings['userId'] == user_id],
                            movie_metadata[['id', 'title']],
                            left_on='movieId',
                            right_on='id',
                            how='inner')
    user_ratings = user_ratings.drop('id', axis=1)
    user_ratings['movieId'] = user_ratings['movieId'].astype(int)

    reviewed_movie_ids = set(user_ratings['movieId'])
    pred_series = []
    for movie_id in movies['id']:
        if movie_id in reviewed_movie_ids:
            continue
        simTotal = weightedSum = 0
        sim_scores_row = list(enumerate(cosine_sim[indices[movie_id]]))
        for reviewed_id in reviewed_movie_ids:
            idx = indices[reviewed_id]
            if len(sim_scores_row) < 46488:
                continue
            if type(idx) != np.int64:
                continue
            sim_score = sim_scores_row[idx]
            simTotal += float(sim_score[1])
            weightedSum += float(sim_score[1]) * float(user_ratings[user_ratings['movieId'] == reviewed_id]['rating'])
        predictedRating = weightedSum / simTotal if simTotal != 0 else 0
        pred_series.append((movie_id, predictedRating))

    pred_series = sorted(pred_series, key=lambda x: x[1], reverse=True)[:10]
    return pred_series

In [44]:
ratings_test = pd.merge(test_df, movie_metadata[['id']], left_on='movieId', right_on='id', how='inner')
ratings_test.head()

Unnamed: 0,userId,movieId,rating,id
0,664,3081,4.0,3081
1,472,3081,4.0,3081
2,461,3081,2.5,3081
3,212,3081,4.0,3081
4,656,3081,4.0,3081


In [45]:
svd_model = SVD()
svd_model_trained = svd_model.fit(ratings_train)

In [36]:
def predict_switching(user_id, movie_id, rating):
    num_user_ratings = len(ratings[ratings['userId'] == user_id])
    num_movie_ratings = len(ratings[ratings['movieId'] == movie_id])
    if num_user_ratings > 5 and num_movie_ratings > 30:
        return svd_model_trained.predict(user_id, movie_id, rating, verbose=False).est
    else:
        return estimate_content_based(user_id, movie_metadata[movie_metadata['id'] == movie_id])[0][1]


data = []
for index, row in ratings_test.iterrows():
    user_id = row['userId']
    movie_id = row['movieId']
    rating = row['rating']
    predicted = predict_switching(user_id, movie_id, rating)
    data.append((rating, predicted))


KeyboardInterrupt: 

In [12]:
mse, rmse = calculate_mse_and_rmse(data)
print("MSE:", mse)
print("RMSE:", rmse)

MSE: 0.9363272887437245
RMSE: 0.9676400615640738


In [15]:
def predict_weighted(user_id, movie_id, rating):
    prediction_cf = svd_model_trained.predict(user_id, movie_id, rating, verbose=False).est
    prediction_cn = estimate_content_based(user_id, movie_metadata[movie_metadata['id'] == movie_id])[0][1]
    prediction = 0.6 * prediction_cf + 0.4 * prediction_cn
    return prediction


data = []
for index, row in ratings_test.iterrows():
    user_id = row['userId']
    movie_id = row['movieId']
    rating = row['rating']
    predicted = predict_switching(user_id, movie_id, rating)
    data.append((rating, predicted))

In [22]:
mse, rmse = calculate_mse_and_rmse(data)
print("MSE:", mse)
print("RMSE:", rmse)

NameError: name 'data' is not defined

In [24]:
for x in range(0, 21):
    data = []
    coef1 = x * 5 / 100
    coef2 = 1 - coef1
    for index, row in ratings_test.iterrows():
        user_id = row['userId']
        movie_id = row['movieId']
        rating = row['rating']
        prediction_cf = svd_model_trained.predict(user_id, movie_id, rating, verbose=False).est
        prediction_cn = estimate_content_based(user_id, movie_metadata[movie_metadata['id'] == movie_id])[0][1]
        prediction = coef1 * prediction_cf + coef2 * prediction_cn
        data.append((rating, prediction))
    mse, rmse = calculate_mse_and_rmse(data)
    del data
    print("Coefficient:", coef1)
    print("MSE:", mse)
    print("RMSE:", rmse)

Coefficient: 0.0
MSE: 1.133211195319508
RMSE: 1.064523928955807
Coefficient: 0.05
MSE: 1.1000802167359647
RMSE: 1.0488470893013742
Coefficient: 0.1
MSE: 1.0687029697955646
RMSE: 1.0337809099589548
Coefficient: 0.15
MSE: 1.0390794544982984
RMSE: 1.0193524682357415
Coefficient: 0.2
MSE: 1.0112096708441796
RMSE: 1.0055892157557078
Coefficient: 0.25
MSE: 0.985093618833202
RMSE: 0.9925188254301286
Coefficient: 0.3
MSE: 0.9607312984653646
RMSE: 0.9801690152546981
Coefficient: 0.35
MSE: 0.9381227097406685
RMSE: 0.9685673490990022
Coefficient: 0.4
MSE: 0.9172678526591115
RMSE: 0.9577410154416023
Coefficient: 0.45
MSE: 0.898166727220696
RMSE: 0.94771658591622
Coefficient: 0.5
MSE: 0.8808193334254255
RMSE: 0.9385197565450742
Coefficient: 0.55
MSE: 0.8652256712732921
RMSE: 0.9301750756031318
Coefficient: 0.6
MSE: 0.8513857407642995
RMSE: 0.9227056631257334
Coefficient: 0.65
MSE: 0.8392995418984506
RMSE: 0.916132928072368
Coefficient: 0.7
MSE: 0.8289670746757478
RMSE: 0.9104762900129513
Coefficien

In [25]:
for x in range(0, 21):
    data = []
    coef1 = x * 5
    coef2 = x * 10
    for index, row in ratings_test.iterrows():
        user_id = row['userId']
        movie_id = row['movieId']
        rating = row['rating']
        num_user_ratings = len(ratings[ratings['userId'] == user_id])
        num_movie_ratings = len(ratings[ratings['movieId'] == movie_id])
        if num_user_ratings > coef1 and num_movie_ratings > coef2:
            prediction = svd_model_trained.predict(user_id, movie_id, rating, verbose=False).est
        else:
            prediction = estimate_content_based(user_id, movie_metadata[movie_metadata['id'] == movie_id])[0][1]
        data.append((rating, prediction))
    mse, rmse = calculate_mse_and_rmse(data)
    del data
    print("Coefficient1:", coef1)
    print("Coefficient2:", coef2)
    print("MSE:", mse)
    print("RMSE:", rmse)

Coefficient1: 0
Coefficient2: 0
MSE: 0.8192163749385658
RMSE: 0.9051057258345933
Coefficient1: 5
Coefficient2: 10
MSE: 0.8683959333913128
RMSE: 0.9318776386368077
Coefficient1: 10
Coefficient2: 20
MSE: 0.9015369851167367
RMSE: 0.949493014780381
Coefficient1: 15
Coefficient2: 30
MSE: 0.940388748980133
RMSE: 0.9697364327383668
Coefficient1: 20
Coefficient2: 40
MSE: 0.9982928750190972
RMSE: 0.9991460729138144
Coefficient1: 25
Coefficient2: 50
MSE: 1.0187169376271015
RMSE: 1.009315083423953
Coefficient1: 30
Coefficient2: 60
MSE: 1.0402195898066988
RMSE: 1.0199115597965829
Coefficient1: 35
Coefficient2: 70
MSE: 1.0561119064738105
RMSE: 1.027673054270574
Coefficient1: 40
Coefficient2: 80
MSE: 1.0717821946933574
RMSE: 1.0352691411866566
Coefficient1: 45
Coefficient2: 90
MSE: 1.090215180210747
RMSE: 1.0441336984365301
Coefficient1: 50
Coefficient2: 100
MSE: 1.0976285633650382
RMSE: 1.0476777001373268
Coefficient1: 55
Coefficient2: 110
MSE: 1.1041370445473284
RMSE: 1.0507792558607771
Coefficien

In [46]:
def estimate_augment(user_id, movies):
    user_ratings = pd.merge(ratings[ratings['userId'] == user_id],
                            movie_metadata[['id', 'title']],
                            left_on='movieId',
                            right_on='id',
                            how='inner')
    user_ratings = user_ratings.drop('id', axis=1)
    user_ratings['movieId'] = user_ratings['movieId'].astype(int)

    reviewed_movie_ids = set(user_ratings['movieId'])
    pred_series = []
    for movie_id in movies['id']:
        if movie_id in reviewed_movie_ids:
            continue
        simTotal = weightedSum = 0
        sim_scores_row = list(enumerate(cosine_sim[indices[movie_id]]))
        for reviewed_id in reviewed_movie_ids:
            idx = indices[reviewed_id]
            if len(sim_scores_row) < 46488:
                continue
            if type(idx) != np.int64:
                continue
            sim_score = sim_scores_row[idx]
            simTotal += float(sim_score[1])
            weightedSum += float(sim_score[1]) * float(user_ratings[user_ratings['movieId'] == reviewed_id]['rating'])
        predictedRating = weightedSum / simTotal if simTotal != 0 else 0
        predictedRating = (predictedRating + svd_model_trained.predict(user_id, movie_id, rating, verbose=False).est) / 2
        pred_series.append((movie_id, predictedRating))

    pred_series = sorted(pred_series, key=lambda x: x[1], reverse=True)[:10]
    return pred_series

In [47]:
data = []
ratings_test = pd.merge(ratings_test, movie_metadata[['id']], left_on='movieId', right_on='id', how='inner')

for index, row in ratings_test.iterrows():
    user_id = row['userId']
    movie_id = row['movieId']
    rating = row['rating']
    predicted = estimate_augment(user_id, movie_metadata[movie_metadata['id'] == movie_id])[0][1]
    data.append((rating, predicted))

In [48]:
mse, rmse = calculate_mse_and_rmse(data)
print("MSE:", mse)
print("RMSE:", rmse)

MSE: 0.8862322669774335
RMSE: 0.9413991007948932


In [ ]:
user_ids = ratings_test['userId'].unique()
ratings_test = pd.merge(ratings_test, movie_metadata[['movieId']], left_on='movieId', right_on='movieId', how='inner')
total, hit = 0, 0
for user_id in user_ids:
    first_row = get_recommendations(user_id).iloc[0]
    predicted_rating = first_row['predicted_rating']
    total = total + 1
    if predicted_rating > 3.5:
        hit = hit + 1
print("HitRatio: ", hit / total)
print("Hit:", hit)
print("Total: ", total)