In [None]:

from sklearn.neighbors import NearestNeighbors
import pandas as pd
genre_df = pd.read_csv("./data/genre_data.csv", index_col=0)
movies = pd.read_json("./data/movie_data.json")


model_knn = NearestNeighbors(metric="cosine", algorithm="brute", n_neighbors=10)
model_knn.fit(genre_df.values)

import joblib

# Save MultiLabelBinarizer
joblib.dump(model_knn, "./pickles/rec_knn_model.joblib")

# print("Movies similar to:", movies.iloc[movie_index]["movie_title"])
# for i in range(1, len(distances.flatten())):
#     idx = indices.flatten()[i]
#     print(f"{i} -  index : {movies.iloc[idx].index} {movies.iloc[idx]['movie_title']} (distance={distances.flatten()[i]:.2f})")


['rec_knn_model.joblib']

In [285]:
selected_movie_index = 100
# movies.iloc[selected_movie_index]["genres"] = movies.loc[selected_movie_index]["genres"].split("|")
movies.iloc[selected_movie_index]
#print(movies.dtypes)

movie_title                                The Fast and the Furious 
genres                                     [Action, Crime, Thriller]
director_name                                              Rob Cohen
duration                                                       106.0
language                                                     English
country                                                          USA
title_year                                                    2001.0
imdb_score                                                       6.7
movie_imdb_link    http://www.imdb.com/title/tt0232500/?ref_=fn_t...
Name: 100, dtype: object

## a - Cosine Similarity

In [292]:
import joblib
import ast 

mlb = joblib.load("mlb_genre_encoder.joblib")

# Example: Predict for a new movie
target_genres =["Action"]# movies.iloc[selected_movie_index]["genres"]
print(target_genres)
new_movie_vec = mlb.transform([target_genres])
genre_df = pd.DataFrame(new_movie_vec, columns=mlb.classes_)
genre_df

['Action']


Unnamed: 0,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,...,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [297]:

# Pick a movie to recommend similar ones
distances, indices = model_knn.kneighbors(new_movie_vec, n_neighbors=10)

for i in range(1, len(distances.flatten())):
    idx = indices.flatten()[i]
    sim_score = 1 - distances.flatten()[i]  # cosine similarity = 1 - distance
    print(f"Recommended: {movies.iloc[idx]['movie_title']} Genres : {movies.iloc[idx]['genres']} | Similarity: {sim_score:.2f}")

Recommended: Diamond Ruff  Genres : ['Action'] | Similarity: 1.00
Recommended: Out of Inferno  Genres : ['Action'] | Similarity: 1.00
Recommended: Code of Honor  Genres : ['Action'] | Similarity: 1.00
Recommended: Ong-bak 2  Genres : ['Action'] | Similarity: 1.00
Recommended: The Man with the Iron Fists  Genres : ['Action'] | Similarity: 1.00
Recommended: Reign of Assassins  Genres : ['Action'] | Similarity: 1.00
Recommended: Excessive Force  Genres : ['Action'] | Similarity: 1.00
Recommended: Batman Returns  Genres : ['Action'] | Similarity: 1.00
Recommended: Kickboxer: Vengeance  Genres : ['Action'] | Similarity: 1.00


## b - Jaccard Similarity on Genres

Measure overlap of genres between target and recommended movies:

1.0 → all genres match

0.0 → no overlap

In [298]:
def jaccard(genres1, genres2):
    set1, set2 = set(genres1), set(genres2)
    return len(set1 & set2) / len(set1 | set2)

for i in range(1, len(indices.flatten())):
    idx = indices.flatten()[i]
    rec_genres = movies.iloc[idx]["genres"]
    score = jaccard(target_genres, rec_genres)
    print(f"Recommended: {movies.iloc[idx]['movie_title']} Genres : {movies.iloc[idx]['genres']} | Jaccard similarity: {score:.2f}")


Recommended: Diamond Ruff  Genres : ['Action'] | Jaccard similarity: 1.00
Recommended: Out of Inferno  Genres : ['Action'] | Jaccard similarity: 1.00
Recommended: Code of Honor  Genres : ['Action'] | Jaccard similarity: 1.00
Recommended: Ong-bak 2  Genres : ['Action'] | Jaccard similarity: 1.00
Recommended: The Man with the Iron Fists  Genres : ['Action'] | Jaccard similarity: 1.00
Recommended: Reign of Assassins  Genres : ['Action'] | Jaccard similarity: 1.00
Recommended: Excessive Force  Genres : ['Action'] | Jaccard similarity: 1.00
Recommended: Batman Returns  Genres : ['Action'] | Jaccard similarity: 1.00
Recommended: Kickboxer: Vengeance  Genres : ['Action'] | Jaccard similarity: 1.00


In [317]:
import joblib
import pandas as pd

knn_model = joblib.load('./pickles/rec_knn_model.joblib')
mlb_model = joblib.load('./pickles/mlb_genre_encoder.joblib')

movies_df =  pd.read_json("./data/movie_data.json")

# kneighbors
def get_recommendations(genres: list, k=6):
    genres_vec = mlb_model.transform([genres])
    distances, indices = model_knn.kneighbors(genres_vec, n_neighbors=k)
    rec_movies = []

    for i in range(1, len(distances.flatten())):
        idx = indices.flatten()[i]
        sim_score = 1 - distances.flatten()[i]  # cosine similarity = 1 - distance
        current_movie = movies_df.loc[idx].to_dict()
        current_movie["similarity"] = sim_score * 100
        rec_movies.append(current_movie)

        print(current_movie)

    return rec_movies, len(rec_movies)



ok, l = get_recommendations(["Action"])
print(l, ok)


{'movie_title': 'The Mudge Boy\xa0', 'genres': ['Crime', 'Drama', 'Romance'], 'director_name': 'Michael Burke', 'duration': 94.0, 'language': 'English', 'country': 'USA', 'title_year': 2003.0, 'imdb_score': 7.2, 'movie_imdb_link': 'http://www.imdb.com/title/tt0339419/?ref_=fn_tt_tt_1', 'similarity': np.float64(100.0)}
{'movie_title': 'A Cinderella Story\xa0', 'genres': ['Comedy', 'Family', 'Romance'], 'director_name': 'Mark Rosman', 'duration': 95.0, 'language': 'English', 'country': 'USA', 'title_year': 2004.0, 'imdb_score': 5.9, 'movie_imdb_link': 'http://www.imdb.com/title/tt0356470/?ref_=fn_tt_tt_1', 'similarity': np.float64(100.0)}
{'movie_title': 'Modern Problems\xa0', 'genres': ['Comedy', 'Fantasy', 'Sci-Fi'], 'director_name': 'Ken Shapiro', 'duration': 93.0, 'language': 'English', 'country': 'USA', 'title_year': 1981.0, 'imdb_score': 5.0, 'movie_imdb_link': 'http://www.imdb.com/title/tt0082763/?ref_=fn_tt_tt_1', 'similarity': np.float64(100.0)}
{'movie_title': 'Brazil\xa0', 'ge