In [2]:
from surprise import Dataset, Reader, SVD, KNNBasic
from surprise.model_selection import train_test_split
import pandas as pd

In [3]:
movies = pd.read_csv('./podaci/movie.csv')
ratings = pd.read_csv('./podaci/ratings_small.csv')

In [4]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

In [5]:
# Podijela podataka na skup za treniranje i skup za testiranje
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [6]:
# Inicijalizacija i treniranje SVD modela
svd_model = SVD()
svd_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1dcaed74eb0>

In [7]:
# Inicijalizacija i treniranje KNN modela
knn_model = KNNBasic(sim_options={'user_based': True})  # user_based postavljen na True za sustav preporuke temeljen na korisnicima
knn_model.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x1dcaed75c90>

In [8]:
# Funkcija za preporuku filmova na temelju treniranog SVD modela
def recommend_movies_svd_surprise(user_id, model, top_n=10):
    # Dohvaćanje filmova koje korisnik nije ocijenio
    user_rated_movies = set(ratings[ratings['userId'] == user_id]['movieId'])
    all_movies = set(ratings['movieId'])
    unrated_movies = list(all_movies - user_rated_movies)

    # Generiranje predikcija za sve neocijenjene filmove
    predictions = [model.predict(user_id, movie_id) for movie_id in unrated_movies]

    # Sortiranje predikcija prema ocjenama
    sorted_predictions = sorted(predictions, key=lambda x: x.est, reverse=True)

    # Stvaranje DataFrame-a s preporučenim filmovima
    recommended_movies_info = pd.DataFrame(columns=['movieId', 'title', 'genres', 'est'])

    for prediction in sorted_predictions[:top_n]:
        movie_info = movies[movies['movieId'] == prediction.iid][['movieId', 'title', 'genres']]
        movie_info['est'] = prediction.est
        recommended_movies_info = pd.concat([recommended_movies_info, movie_info], ignore_index=True)

    return recommended_movies_info[['movieId', 'title', 'genres', 'est']]

In [9]:
# Funkcija za preporuku filmova na temelju treniranog KNN modela
def recommend_movies_knn_surprise(user_id, model, top_n=10):
    # Dohvaćanje filmova koje korisnik nije ocijenio
    user_rated_movies = set(ratings[ratings['userId'] == user_id]['movieId'])
    all_movies = set(ratings['movieId'])
    unrated_movies = list(all_movies - user_rated_movies)

    # Generiranje preporuka za korisnika
    recommendations = model.get_neighbors(user_id, k=top_n)

    # Prikazivanje informacija o preporučenim filmovima
    recommended_movies_info = movies[movies['movieId'].isin(recommendations)][['movieId', 'title', 'genres']]

    return recommended_movies_info

In [10]:
# Primjer korištenja
user_id_to_recommend = 600  # Postavite željeni ID korisnika

In [35]:
recommendations_svd_surprise = recommend_movies_svd_surprise(user_id_to_recommend, svd_model)
recommendations_data_svd = pd.DataFrame(recommendations_svd_surprise)

  recommended_movies_info = pd.concat([recommended_movies_info, movie_info], ignore_index=True)


In [36]:
recommendations_knn_surprise = recommend_movies_knn_surprise(user_id_to_recommend, knn_model)
recommendations_data_knn = pd.DataFrame(recommendations_knn_surprise)

In [37]:
# Ispis preporučenih filmova
def display_recommendations(recommendations_data):
  for _, row in recommendations_data.iterrows():
      movie_id = int(row['movieId'])
      title = row['title']
      genres = row['genres']
      print(f"ID: {movie_id}, Naslov: {title} ({genres})")

In [38]:
display_recommendations(recommendations_data_svd)

ID: 6016, Naslov: City of God (Cidade de Deus) (2002) (Action|Adventure|Crime|Drama|Thriller)
ID: 1221, Naslov: Godfather: Part II, The (1974) (Crime|Drama)
ID: 2542, Naslov: Lock, Stock & Two Smoking Barrels (1998) (Comedy|Crime|Thriller)
ID: 318, Naslov: Shawshank Redemption, The (1994) (Crime|Drama)
ID: 858, Naslov: Godfather, The (1972) (Crime|Drama)
ID: 1212, Naslov: Third Man, The (1949) (Film-Noir|Mystery|Thriller)
ID: 50, Naslov: Usual Suspects, The (1995) (Crime|Mystery|Thriller)
ID: 1213, Naslov: Goodfellas (1990) (Crime|Drama)
ID: 2329, Naslov: American History X (1998) (Crime|Drama)
ID: 922, Naslov: Sunset Blvd. (a.k.a. Sunset Boulevard) (1950) (Drama|Film-Noir|Romance)


In [39]:
display_recommendations(recommendations_data_knn)

ID: 6, Naslov: Heat (1995) (Action|Crime|Thriller)
ID: 68, Naslov: French Twist (Gazon maudit) (1995) (Comedy|Romance)
ID: 108, Naslov: Catwalk (1996) (Documentary)
ID: 136, Naslov: From the Journals of Jean Seberg (1995) (Documentary)
ID: 186, Naslov: Nine Months (1995) (Comedy|Romance)
ID: 203, Naslov: To Wong Foo, Thanks for Everything! Julie Newmar (1995) (Comedy)
ID: 217, Naslov: Babysitter, The (1995) (Drama|Thriller)
ID: 263, Naslov: Ladybird Ladybird (1994) (Drama)
ID: 274, Naslov: Man of the House (1995) (Comedy)
ID: 279, Naslov: My Family (1995) (Drama)


In [43]:
from sklearn.metrics import mean_squared_error
from surprise.model_selection import cross_validate
import numpy as np

In [44]:
# Funkcija za preporuku filmova na temelju treniranog modela
def recommend_and_evaluate(model, user_id, top_n=10):
    # Preporuka
    recommendations = []
    if isinstance(model, SVD):
        recommendations = recommend_movies_svd_surprise(user_id, model, top_n)
    elif isinstance(model, KNNBasic):
        recommendations = recommend_movies_knn_surprise(user_id, model, top_n)
    else:
        raise ValueError("Nepodržani model")

    # Evaluacija unakrsnom validacijom
    results = cross_validate(model, data, measures=['MAE', 'RMSE'], cv=5, verbose=False)

    # Ispis rezultata unakrsne validacije
    print(f'Rezultati unakrsne validacije za model {model.__class__.__name__}:')
    print(f"MAE: {np.mean(results['test_mae']):.4f}")
    print(f"RMSE: {np.mean(results['test_rmse']):.4f}")

    # Evaluacija preporuka
    predictions = [model.predict(user_id, movie_id).est for movie_id in recommendations['movieId']]
    true_ratings = [
        float(ratings.loc[(ratings['userId'] == user_id) & (ratings['movieId'] == movie_id), 'rating'].iloc[0])
        if ratings[(ratings['userId'] == user_id) & (ratings['movieId'] == movie_id)].shape[0] > 0
        else 0  # Ako nema ocjene, postavi na 0
        for movie_id in recommendations['movieId']
    ]

    # Ispis evaluacije preporuka
    mse = mean_squared_error(true_ratings, predictions)
    print(f'MSE za korisnika {user_id}: {mse:.4f}')

In [45]:
# Unakrsna validacija za SVD model
recommend_and_evaluate(svd_model, user_id_to_recommend)

  recommended_movies_info = pd.concat([recommended_movies_info, movie_info], ignore_index=True)


Rezultati unakrsne validacije za model SVD:
MAE: 0.6912
RMSE: 0.8977
MSE za korisnika 600: 19.4272


In [46]:
# Unakrsna validacija za KNN model
recommend_and_evaluate(knn_model, user_id_to_recommend)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Rezultati unakrsne validacije za model KNNBasic:
MAE: 0.7438
RMSE: 0.9680
MSE za korisnika 600: 9.7546


In [47]:
from surprise import KNNWithMeans
from surprise import accuracy

In [48]:
movie_df = pd.read_csv('./podaci/movie.csv')

# k Nearest Neighbour
similarity = {
    "name": "cosine",
    "user_based": False,  # item-based similarity
}
algo_KNN = KNNWithMeans(sim_options = similarity)

# SVD
algo_SVD = SVD()

In [49]:
movie_rating_set = pd.crosstab(index = ratings.userId, columns = ratings.movieId, values = ratings.rating, aggfunc = np.mean)

  movie_rating_set = pd.crosstab(index = ratings.userId, columns = ratings.movieId, values = ratings.rating, aggfunc = np.mean)


In [50]:
# load df into Surprise Reader object
reader = Reader(rating_scale = (0,5))
rating_df = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

In [51]:
cross_validate_KNN = cross_validate(algo_KNN, rating_df, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9318  0.9302  0.9201  0.9243  0.9281  0.9269  0.0042  
MAE (testset)     0.7146  0.7127  0.7052  0.7068  0.7111  0.7101  0.0035  
Fit time          5.32    5.25    6.10    5.60    5.64    5.58    0.30    
Test time         4.22    4.23    4.59    4.47    4.34    4.37    0.14    


In [52]:
cross_validate_SVD = cross_validate(algo_SVD, rating_df, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8962  0.9143  0.8875  0.8872  0.9019  0.8974  0.0101  
MAE (testset)     0.6903  0.7029  0.6854  0.6833  0.6948  0.6913  0.0070  
Fit time          0.89    0.89    0.96    0.85    0.83    0.88    0.05    
Test time         0.13    0.18    0.12    0.10    0.09    0.13    0.03    


In [53]:
# define train test function
def train_test_algo(algo, label):
    training_set, testing_set = train_test_split(rating_df, test_size = 0.2)
    algo.fit(training_set)
    test_output = algo.test(testing_set)
    test_df = pd.DataFrame(test_output)
    
    print("RMSE -",label, accuracy.rmse(test_output, verbose = False))
    print("MAE -", label, accuracy.mae(test_output, verbose=False))
    print("MSE -", label, accuracy.mse(test_output, verbose=False))
    
    return test_df

In [54]:
train_test_KNN = train_test_algo(algo_KNN, "algo_KNN")
print(train_test_KNN.head())

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE - algo_KNN 0.9306864728287642
MAE - algo_KNN 0.7122355378981291
MSE - algo_KNN 0.866177310706446
   uid    iid  r_ui       est                                    details
0  547  39307   2.5  3.829056  {'actual_k': 40, 'was_impossible': False}
1  452   3809   3.0  2.725402  {'actual_k': 40, 'was_impossible': False}
2  102   1036   4.0  4.199759  {'actual_k': 40, 'was_impossible': False}
3  184    364   5.0  3.871193  {'actual_k': 38, 'was_impossible': False}
4  472   7070   3.0  4.645791  {'actual_k': 40, 'was_impossible': False}


In [55]:
train_test_SVD = train_test_algo(algo_SVD, "algo_SVD")
print(train_test_SVD.head())

RMSE - algo_SVD 0.8912553725652782
MAE - algo_SVD 0.6878460760914464
MSE - algo_SVD 0.7943361391264729
   uid   iid  r_ui       est                    details
0   30  2611   5.0  3.455180  {'was_impossible': False}
1  268  1517   3.5  3.761342  {'was_impossible': False}
2  640  1476   3.0  3.773878  {'was_impossible': False}
3  547  1282   5.0  3.720801  {'was_impossible': False}
4   56  3079   4.0  3.715933  {'was_impossible': False}


In [56]:
def prediction(algo, users_K):
    pred_list = []
    for userId in range(1,users_K):
        for movieId in range(1,9067):
            rating = algo.predict(userId, movieId).est
            pred_list.append([userId, movieId, rating])
    pred_df = pd.DataFrame(pred_list, columns = ['userId', 'movieId', 'rating'])
    return pred_df

In [57]:
def top_recommendations(pred_df, top_N):
    recommended_movie = pd.merge(pred_df, movie_df, how='inner', left_on='movieId', right_on='movieId')
    sorted_df = recommended_movie.groupby(('userId'), as_index = False).apply(lambda x: x.sort_values(['rating'], ascending = False)).reset_index(drop=True)
    top_recommended_movies = sorted_df.groupby('userId').head(top_N)
    return sorted_df, top_recommended_movies

In [62]:
pred_SVD = prediction(algo_SVD, 5)
recommended_movies_SVD, top_recommended_movies_SVD = top_recommendations(pred_SVD, 5)

In [63]:
top_recommended_movies_SVD

Unnamed: 0,userId,movieId,rating,title,genres
0,1,318,3.743975,"Shawshank Redemption, The (1994)",Crime|Drama
1,1,50,3.630215,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
2,1,4235,3.617579,Amores Perros (Love's a Bitch) (2000),Drama|Thriller
3,1,969,3.587613,"African Queen, The (1951)",Adventure|Comedy|Romance|War
4,1,1198,3.584458,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure
8337,2,1136,4.590324,Monty Python and the Holy Grail (1975),Adventure|Comedy|Fantasy
8338,2,1221,4.537162,"Godfather: Part II, The (1974)",Crime|Drama
8339,2,969,4.517903,"African Queen, The (1951)",Adventure|Comedy|Romance|War
8340,2,4235,4.516731,Amores Perros (Love's a Bitch) (2000),Drama|Thriller
8341,2,858,4.504115,"Godfather, The (1972)",Crime|Drama


In [64]:
pred_KNN = prediction(algo_KNN, 5)
recommended_movies_KNN, top_recommended_movies_KNN = top_recommendations(pred_KNN, 5)

In [65]:
top_recommended_movies_KNN

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1819,5.0,Storefront Hitchcock (1997),Documentary|Musical
1,1,6769,5.0,Mambo Italiano (2003),Comedy
2,1,8699,5.0,Dancing in September (2000),Drama
3,1,6369,5.0,Friends and Family (2001),Comedy
4,1,5229,5.0,I Think I Do (1997),Comedy
8337,2,3757,5.0,Asylum (1972),Horror
8338,2,4189,5.0,"Greatest Story Ever Told, The (1965)",Drama
8339,2,2627,5.0,Endurance (1999),Documentary|Drama
8340,2,2636,5.0,"Mummy's Ghost, The (1944)",Horror
8341,2,2647,5.0,House of Frankenstein (1944),Horror
