In [53]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


In [54]:
movies = pd.read_csv('./podaci/movie.csv')

movies['title'] = movies['title'].apply(lambda x: x if pd.isna(x) else (str(x).split('(')[0].strip()))

# Pretvaranje NaN vrednosti u prazan string
movies['genres'] = movies['genres'].fillna('')

# Kombinovanje relevantnih informacija u zasebnu kolonu
movies['combined'] = movies['title'] + ' ' + movies['genres']

In [55]:
# Inicijalizacija TF-IDF vektora
tfidf_vectorizer = TfidfVectorizer(stop_words='english', analyzer='word', ngram_range=(1,2))
tfidf_matrix = tfidf_vectorizer.fit_transform(movies['combined'])

In [56]:
# Računanje sličnosti između filmova pomoću kosinusne sličnosti
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [57]:
# Funkcija za generisanje preporuka
def get_recommendations(movie_title, cosine_similarities=cosine_similarities):
    movie_index = movies.index[movies['title'] == movie_title].tolist()[0]
    similar_movies = list(enumerate(cosine_similarities[movie_index]))
    similar_movies = sorted(similar_movies, key=lambda x: x[1], reverse=True)
    similar_movies = similar_movies[1:11]  # Prvih 10 sličnih filmova (ignorišee se sam film)

    recommended_movies = [(movies.iloc[i[0]]['title'], movies.iloc[i[0]]['genres']) for i in similar_movies]

    return recommended_movies

In [58]:
# Primjer: Generisanje preporuka za film "Contact"
movie_title = "Contact"
print(movies[movies.title.eq(movie_title)])

      movieId    title        genres              combined
1532     1584  Contact  Drama|Sci-Fi  Contact Drama|Sci-Fi


In [59]:
recommendations = get_recommendations(movie_title)

print(f"Preporuke za film '{movie_title}':")
for i, (title, genres) in enumerate(recommendations):
    print(f"{i + 1}. {title} ({genres})")

Preporuke za film 'Contact':
1. V (Drama|Sci-Fi)
2. It's Me, It's Me (Comedy|Drama|Sci-Fi)
3. Love (Drama|Sci-Fi)
4. Last Night (Drama|Sci-Fi)
5. Day After, The (Drama|Sci-Fi)
6. Everything I Can See From Here (Adventure|Animation|Drama|Sci-Fi)
7. Day, The (Drama|Sci-Fi|Thriller)
8. Making Contact (Fantasy|Horror|Sci-Fi)
9. Beyond the Stars (Drama|Sci-Fi)
10. Face of Another, The (Drama|Sci-Fi)


In [32]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [33]:
movies = pd.read_csv('./podaci/movie.csv')
ratings = pd.read_csv('./podaci/ratings_small.csv')

In [34]:
# Odabir ID-a korisnika
user_id = 1

In [35]:
movies['title'] = movies['title'].apply(lambda x: x if pd.isna(x) else (str(x).split('(')[0].strip()))
user_ratings = ratings[ratings['userId'] == user_id]

In [36]:
rated_movies = pd.merge(user_ratings, movies, on='movieId')

# Filtriranje filmova koje je korisnik ocijenio
user_rated_movies = rated_movies[(rated_movies['userId'] == user_id) & (rated_movies['rating'] > 3)]

In [37]:
# Ako korisnik nema ocjene, koriste se svi žanrovi iz dataseta
if user_rated_movies.empty:
    user_genres = '|'.join(movies['genres'].unique())
else:
    user_genres = '|'.join(user_rated_movies['genres'].unique())

In [38]:
# Razdvajanje žanrova u pojedinačne riječi
user_genres_list = user_genres.split('|')

# Kombinovanje preferiranih žanrova i ocjena za korisnika
movies['genres'] = movies['genres'].astype(str)
movies['genres'] = movies['genres'].apply(lambda x: x.split('|'))
movies['genres'] = movies['genres'].apply(lambda x: ' '.join(x))
movies['user_combined'] = movies['genres']

In [39]:
# for index, row in user_rated_movies.iterrows():
#     # Odbacivanje filmova sa niskim ocenama (npr ocjene ispod 3.5)
#     if row['rating'] > 3:
#       movies.at[row['movieId'], 'user_combined'] += f" {row['rating']}"

# movies.to_csv('./podaci/movie_3.csv', index=False)

In [40]:
# Inicijalizacija TF-IDF vektora
tfidf_vectorizer = TfidfVectorizer(stop_words='english', analyzer='word')
tfidf_matrix = tfidf_vectorizer.fit_transform(movies['user_combined'])

In [41]:
feature_names = tfidf_vectorizer.get_feature_names_out()
print("Riječnik sa indeksima:")
print(feature_names, sep=", ")

Riječnik sa indeksima:
['action' 'adventure' 'animation' 'children' 'comedy' 'crime'
 'documentary' 'drama' 'fantasy' 'fi' 'film' 'genres' 'horror' 'imax'
 'listed' 'musical' 'mystery' 'noir' 'romance' 'sci' 'thriller' 'war'
 'western']


In [42]:
# Računanje sličnosti između filmova pomoću kosinusne sličnosti
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [43]:
# Funkcija za generiranje preporuka
def get_movie_recommendations(user_id):
    # Filmovi koje je korisnik već ocijenio
    user_movies_ratings = user_rated_movies['title'].tolist()
    print(f"Visoko ocijenjeni filmovi korisnika ID {user_id}:")
    print(user_movies_ratings, sep=", ")

    recommended_movies = []

    # Računanje srednjih vrijednosti ocjena po žanru za korisnika
    genre_avg_ratings = user_rated_movies.groupby('genres')['rating'].mean()

    # Iteriranje kroz ocijenjene filmove korisnika
    for movie_title in user_movies_ratings:
        movie_index = movies.index[movies['title'] == movie_title].tolist()[0]
        sim_scores = list(enumerate(cosine_similarities[movie_index]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:11]  # Top 10 sličnih filmova (prvi će biti sam film)

        # Računanje ponderisane sličnosti srednjom vrijednosti ocjene po žanru
        weighted_sim_scores = []
        for i in range(len(sim_scores)):
            genre = movies['genres'].iloc[sim_scores[i][0]]
            genre_avg_rating = genre_avg_ratings.get(genre, 0)
            weighted_similarity = sim_scores[i][1] * genre_avg_rating
            weighted_sim_scores.append((sim_scores[i][0], weighted_similarity))

        weighted_sim_scores = sorted(weighted_sim_scores, key=lambda x: x[1], reverse=True)

        # Dodavanje preporučenih filmova koji nisu ocijenjeni
        for i in range(len(weighted_sim_scores)):
            recommended_movie_title = movies['title'].iloc[weighted_sim_scores[i][0]]
            recommended_movie_genres = movies['genres'].iloc[sim_scores[i][0]]
            if recommended_movie_title not in user_rated_movies:
                recommended_movies.append((recommended_movie_title, recommended_movie_genres))
    
    return recommended_movies[:10]  # Vracanje prvih 10 preporučenih filmova

In [44]:
# Generiranje preporuka za određenog korisnika
recommended_movies = get_movie_recommendations(user_id)

print(f"Preporuke za korisnika sa ID {user_id}:")
for i, (title, genres) in enumerate(recommended_movies):
    print(f"{i + 1}. {title} ({genres})")

Visoko ocijenjeni filmovi korisnika ID 1:
['Cinema Paradiso', 'Dracula', 'French Connection, The', 'Tron']
Preporuke za korisnika sa ID 1:
1. Othello (Drama)
2. Dangerous Minds (Drama)
3. Cry, the Beloved Country (Drama)
4. Restoration (Drama)
5. Georgia (Drama)
6. Home for the Holidays (Drama)
7. Mr. Holland's Opus (Drama)
8. Two Bits (Drama)
9. Journey of August King, The (Drama)
10. Margaret's Museum (Drama)
