In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Učitavanje MovieLens dataset-a
movies = pd.read_csv('./podaci/movie.csv')

# Pretvaranje NaN vrednosti u prazan string
movies['genres'] = movies['genres'].fillna('')

# Kombinovanje relevantnih informacija u jedinstveni tekstualni sadržaj
movies['combined'] = movies['title'] + ' ' + movies['genres']

# Inicijalizacija TF-IDF vektora
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(movies['combined'])

# Računanje sličnosti između filmova pomoću kosinusne sličnosti
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

# Funkcija za generisanje preporuka
def get_recommendations(movie_title, cosine_similarities=cosine_similarities):
    movie_index = movies.index[movies['title'] == movie_title].tolist()[0]
    similar_movies = list(enumerate(cosine_similarities[movie_index]))
    similar_movies = sorted(similar_movies, key=lambda x: x[1], reverse=True)
    similar_movies = similar_movies[1:11]  # Prvih 10 sličnih filmova (ignorišemo sam film)

    recommended_movies = [movies.iloc[i[0]]['title'] for i in similar_movies]

    return recommended_movies

# Primer: Generisanje preporuka za film "Toy Story (1995)"
movie_title = "Toy Story (1995)"
recommendations = get_recommendations(movie_title)

print(f"Preporuke za film '{movie_title}':")
for i, title in enumerate(recommendations):
    print(f"{i + 1}. {title}")


Preporuke za film 'Toy Story (1995)':
1. Toy Story 2 (1999)
2. Toy Story 3 (2010)
3. Toy Story of Terror (2013)
4. Toy Story That Time Forgot (2014)
5. Toy Story Toons: Small Fry (2011)
6. Toy Story Toons: Hawaiian Vacation (2011)
7. Tin Toy (1988)
8. Toy, The (1982)
9. Toy Story Toons: Partysaurus Rex (2012)
10. Christmas Toy, The (1986)


In [35]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# Učitavanje MovieLens dataset-a
movies = pd.read_csv('./podaci/movie.csv')
ratings = pd.read_csv('./podaci/ratings_small.csv')

# Koristi regularni izraz da izdvoji samo naziv filma
movies['title'] = movies['title'].apply(lambda x: x if pd.isna(x) else (str(x).split('(')[0].strip()))

# Povezivanje ocena sa informacijama o filmovima
rated_movies = pd.merge(ratings, movies, on='movieId')

# Pretpostavljamo da korisnik ima neki ID, npr. userID = 1
user_id = 1

# Filtriranje filmova koje je korisnik ocenio
user_rated_movies = rated_movies[rated_movies['userId'] == user_id]

# Ako korisnik nema ocene, možemo koristiti sve žanrove iz dataset-a
if user_rated_movies.empty:
    user_genres = '|'.join(movies['genres'].unique())
else:
    user_genres = '|'.join(user_rated_movies['genres'].unique())

# Razdvajanje žanrova u pojedinačne reči
user_genres_list = user_genres.split('|')

# Kombinovanje preferiranih žanrova i ocena za korisnika
movies['genres'] = movies['genres'].astype(str)
movies['genres'] = movies['genres'].apply(lambda x: x.split('|'))
movies['genres'] = movies['genres'].apply(lambda x: ' '.join(x))
movies['user_combined'] = movies['genres']

for index, row in user_rated_movies.iterrows():
    # Odbacivanje filmova sa niskim ocenama (na primer, ocena ispod 3.5)
    if row['rating'] >= 3.5:
      movies.at[row['movieId'], 'user_combined'] += f" {row['rating']}"

# movies.to_csv('./podaci/movie_3.csv', index=False)
      
# Inicijalizacija TF-IDF vektora
tfidf_vectorizer = TfidfVectorizer(stop_words='english', analyzer='word')
tfidf_matrix = tfidf_vectorizer.fit_transform(movies['user_combined'])

# feature_names = tfidf_vectorizer.get_feature_names_out()
# print("Rečnik sa indeksima:")
# for i in range(len(feature_names)):
#     print(f"{i}: {feature_names[i]}")

# Računanje sličnosti između filmova pomoću kosinusne sličnosti
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

# Funkcija za generisanje preporuka za korisnika
# def get_user_recommendations(user_id, cosine_similarities=cosine_similarities):
#     # Filtriranje filmova koje korisnik nije ocenio
#     user_not_rated_movies = movies[~movies['movieId'].isin(user_rated_movies['movieId'])]
    
#     # Za svaki film koji korisnik nije ocenio, računamo sličnost sa ocenjenim filmovima
#     user_similarities = []
#     for index, row in user_not_rated_movies.iterrows():
#         movie_index = row.name  # Koristimo indeks reda umesto 'movieId' kako bismo izbegli problem sa neslaganjem indeksa
#         similarity_sum = cosine_similarities[movie_index, :].sum()
#         user_similarities.append((movie_index, similarity_sum))

#     # Sortiramo filmove prema sličnosti
#     user_similarities = sorted(user_similarities, key=lambda x: x[1], reverse=True)
    
#     # Prikazujemo preporuke
#     recommended_movies = [movies.loc[i]['title'] for i, _ in user_similarities[:10]]
    
#     return recommended_movies
def get_user_recommendations(user_id, cosine_similarities=cosine_similarities):
    # Filtriranje filmova koje korisnik nije ocenio
    user_not_rated_movies = movies[~movies['movieId'].isin(user_rated_movies['movieId'])]
    
    # Za svaki film koji korisnik nije ocenio, računamo sličnost sa ocenjenim filmovima
    user_similarities = []
    for index, row in user_not_rated_movies.iterrows():
        movie_index = row.name  # Koristimo indeks reda umesto 'movieId' kako bismo izbegli problem sa neslaganjem indeksa
        similarity_sum = cosine_similarities[movie_index, :].sum()
        user_similarities.append((movie_index, similarity_sum))

    # Sortiramo filmove prema sličnosti
    user_similarities = sorted(user_similarities, key=lambda x: x[1], reverse=True)
    return user_similarities

# Primer: Generisanje preporuka za korisnika sa ID 1
user_recommendations = get_user_recommendations(user_id)
user_rated_movies_list = user_rated_movies['movieId'].tolist()

# Dodatno filtriranje preporučenih filmova
filtered_recommendations = []
for movie_id, similarity in user_recommendations:
    # Provera da li je korisnik ocenio film
    if movie_id not in user_rated_movies_list:
        if movie_id in movies['movieId'].values:
            # Filtriranje preporučenih filmova na osnovu sličnosti i visokih ocena
            similar_genres = movies.loc[movies['movieId'] == movie_id, 'genres'].values[0]
            user_high_rated_genres = user_rated_movies[user_rated_movies['rating'] >= 3.5]['movieId'].apply(lambda x: movies.loc[movies['movieId'] == x, 'genres'].values[0])
            
            # Provera da li postoji sličnost žanrova i visoka ocena u korisnikovim ocenama
            if any(genre in similar_genres for genre in user_high_rated_genres):
                filtered_recommendations.append((movie_id, similarity))
# Prikazivanje rezultata
# print(filtered_recommendations)
recommended_movies = [movies.loc[i]['title'] for i, _ in filtered_recommendations[:10]]

print(f"Preporuke za korisnika sa ID {user_id}:")
for i, title in enumerate(recommended_movies):
    print(f"{i + 1}. {title}")

Preporuke za korisnika sa ID 1:
1. Big Bully
2. Antonia's Line
3. Last Summer in the Hamptons
4. In the Bleak Midwinter
5. Nueba Yol
6. Blue in the Face
7. Jeffrey
8. Ed Wood
9. Madness of King George, The
10. Roommates
