### Lib


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Input, Dense, Flatten, Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError


### import data

In [2]:
movies_df = pd.read_csv(r'C:\Users\DELL\Desktop\ITI\R_S\reco project\data\movies.csv')
ratings_df = pd.read_csv(r'C:\Users\DELL\Desktop\ITI\R_S\reco project\data\ratings.csv')


### data preprocessing

In [3]:
merged_df = pd.merge(ratings_df, movies_df, on='movieId')


In [4]:
genres_split = merged_df['genres'].str.get_dummies(sep='|')
df = pd.concat([merged_df, genres_split], axis=1).drop('genres', axis=1)


In [5]:
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()
df['userId'] = user_encoder.fit_transform(df['userId'])
df['movieId'] = movie_encoder.fit_transform(df['movieId'])


In [6]:
df.drop(['timestamp', 'title'], axis=1, inplace=True)


In [7]:
df

Unnamed: 0,userId,movieId,rating,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,4.0,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,4,0,4.0,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,6,0,4.5,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,14,0,2.5,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,16,0,4.5,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,609,9307,2.5,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
100832,609,9312,4.5,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
100833,609,9324,3.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
100834,609,9371,3.5,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0


### user-item sim

In [8]:
user_item_matrix = df.pivot(index='userId', columns='movieId', values='rating').fillna(0)
user_item_matrix


movieId,0,1,2,3,4,5,6,7,8,9,...,9714,9715,9716,9717,9718,9719,9720,9721,9722,9723
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,2.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
606,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,2.5,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
genre_columns = df.columns[5:]
movie_genres = df[['movieId'] + list(genre_columns)].drop_duplicates().set_index('movieId')


In [10]:
genre_similarity = cosine_similarity(movie_genres)
genre_similarity_df = pd.DataFrame(genre_similarity, index=movie_genres.index, columns=movie_genres.index)


###  recommender fun

In [11]:

def recommend_movies(movie_name, N=5):
    movie_id = movies_df[movies_df['title'] == movie_name]['movieId'].values[0]
    movie_id_encoded = movie_encoder.transform([movie_id])[0]
    
    similarity_scores = genre_similarity_df[movie_id_encoded]
    
    similar_movie_ids = similarity_scores.sort_values(ascending=False).index[1:N+1]
    similar_movie_ids_original = movie_encoder.inverse_transform(similar_movie_ids)
    
    recommended_movies = movies_df[movies_df['movieId'].isin(similar_movie_ids_original)]['title']
    
    return recommended_movies.tolist()

print(recommend_movies("Toy Story (1995)", 10))


['Antz (1998)', 'Toy Story 2 (1999)', "Emperor's New Groove, The (2000)", 'Monsters, Inc. (2001)', "Twelve Tasks of Asterix, The (Les douze travaux d'Astérix) (1976)", 'TMNT (Teenage Mutant Ninja Turtles) (2007)', 'Tale of Despereaux, The (2008)', 'Turbo (2013)', 'The Lego Movie (2014)', 'The Good Dinosaur (2015)']


### saveing the model

In [17]:
genre_similarity_df.to_csv(r'C:\Users\DELL\Desktop\ITI\R_S\reco project\data\genre_similarity.csv', index=True, header=True)


In [14]:
genre_similarity_df

movieId,0,2,5,43,46,62,89,97,124,130,...,9136,9138,9157,9213,9274,9307,9312,9324,9371,9372
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.000000,0.316228,0.000000,0.000000,0.000000,0.258199,0.447214,0.0,0.000000,0.316228,...,0.447214,0.0,0.258199,0.000000,0.316228,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.316228,1.000000,0.000000,0.000000,0.000000,0.408248,0.707107,0.0,0.408248,0.500000,...,0.000000,0.0,0.408248,0.000000,0.500000,0.000000,0.000000,0.000000,0.000000,0.000000
5,0.000000,0.000000,1.000000,0.500000,0.816497,0.408248,0.353553,0.0,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.500000,0.707107,0.500000,0.500000,0.500000,0.000000
43,0.000000,0.000000,0.500000,1.000000,0.816497,0.408248,0.000000,0.0,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.500000,0.707107,0.000000,0.500000,0.500000,0.000000
46,0.000000,0.000000,0.816497,0.816497,1.000000,0.333333,0.288675,0.0,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.408248,0.577350,0.408248,0.408248,0.408248,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9307,0.000000,0.000000,0.707107,0.707107,0.577350,0.577350,0.000000,0.0,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.707107,1.000000,0.000000,0.707107,0.707107,0.000000
9312,0.000000,0.000000,0.500000,0.000000,0.408248,0.000000,0.353553,0.5,0.408248,0.000000,...,0.000000,0.0,0.408248,0.000000,0.000000,0.000000,1.000000,0.500000,0.000000,0.000000
9324,0.000000,0.000000,0.500000,0.500000,0.408248,0.408248,0.000000,0.5,0.408248,0.000000,...,0.000000,0.0,0.408248,0.000000,0.500000,0.707107,0.500000,1.000000,0.500000,0.000000
9371,0.000000,0.000000,0.500000,0.500000,0.408248,0.816497,0.000000,0.0,0.000000,0.000000,...,0.000000,0.0,0.408248,0.707107,0.500000,0.707107,0.000000,0.500000,1.000000,0.707107


In [12]:
import pickle as pkl
pkl.dump(genre_similarity_df, open(r'C:\Users\DELL\Desktop\ITI\R_S\reco project\data\genre_similarity.pkl', 'wb'))


In [15]:
sim=pkl.load(open(r'C:\Users\DELL\Desktop\ITI\R_S\reco project\data\genre_similarity.pkl', 'rb'))
sim

movieId,0,2,5,43,46,62,89,97,124,130,...,9136,9138,9157,9213,9274,9307,9312,9324,9371,9372
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.000000,0.316228,0.000000,0.000000,0.000000,0.258199,0.447214,0.0,0.000000,0.316228,...,0.447214,0.0,0.258199,0.000000,0.316228,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.316228,1.000000,0.000000,0.000000,0.000000,0.408248,0.707107,0.0,0.408248,0.500000,...,0.000000,0.0,0.408248,0.000000,0.500000,0.000000,0.000000,0.000000,0.000000,0.000000
5,0.000000,0.000000,1.000000,0.500000,0.816497,0.408248,0.353553,0.0,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.500000,0.707107,0.500000,0.500000,0.500000,0.000000
43,0.000000,0.000000,0.500000,1.000000,0.816497,0.408248,0.000000,0.0,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.500000,0.707107,0.000000,0.500000,0.500000,0.000000
46,0.000000,0.000000,0.816497,0.816497,1.000000,0.333333,0.288675,0.0,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.408248,0.577350,0.408248,0.408248,0.408248,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9307,0.000000,0.000000,0.707107,0.707107,0.577350,0.577350,0.000000,0.0,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.707107,1.000000,0.000000,0.707107,0.707107,0.000000
9312,0.000000,0.000000,0.500000,0.000000,0.408248,0.000000,0.353553,0.5,0.408248,0.000000,...,0.000000,0.0,0.408248,0.000000,0.000000,0.000000,1.000000,0.500000,0.000000,0.000000
9324,0.000000,0.000000,0.500000,0.500000,0.408248,0.408248,0.000000,0.5,0.408248,0.000000,...,0.000000,0.0,0.408248,0.000000,0.500000,0.707107,0.500000,1.000000,0.500000,0.000000
9371,0.000000,0.000000,0.500000,0.500000,0.408248,0.816497,0.000000,0.0,0.000000,0.000000,...,0.000000,0.0,0.408248,0.707107,0.500000,0.707107,0.000000,0.500000,1.000000,0.707107


In [8]:
loaded_sparse_matrix = load_npz(r'C:\Users\DELL\Desktop\ITI\R_S\reco project\data\sparse_similarity_matrix.npz')
loaded_genre_similarity_df = pd.DataFrame(loaded_sparse_matrix.toarray(), index=genre_similarity_df.index, columns=genre_similarity_df.columns)
