# CONTENT RECOMMENDATION


In [None]:
import numpy as np
import pandas as pd
import os
import sys
import pickle
import time
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from importlib import reload
%matplotlib inline
from IPython.core.display import display, HTML, clear_output
display(HTML("<style>.container { width:80% !important; }</style>"))

In [None]:
# Base directory of the CSV files read by this notebook
# NOTE(review): the prefix looks like a Google Drive mount inside Colab — adjust when running elsewhere
path = "/content/drive/MyDrive/Colab Notebooks/data"

In [None]:
# Load the raw tag and rating tables from CSV
tags_con = pd.read_csv(f"{path}/tags.csv")
ratings_con = pd.read_csv(f"{path}/ratings.csv")

In [None]:
# Number of ratings each movie received, sorted from least- to most-rated,
# followed by summary statistics of those per-movie counts
df_value_counts = ratings_con['movieId'].value_counts().sort_values()
df_value_counts.describe()

count    59047.000000
mean       423.393144
std       2477.885821
min          1.000000
25%          2.000000
50%          6.000000
75%         36.000000
max      81491.000000
Name: count, dtype: float64

In [None]:
# Keep only the ids of movies that received strictly more than `rate_count` ratings
rate_count = 375
movieIds = df_value_counts.loc[df_value_counts.gt(rate_count)].index.tolist()

In [None]:
# Load the movie metadata table from CSV
movies_con = pd.read_csv(f"{path}/movies.csv")

In [None]:
# Restrict the movie table to the frequently rated movies listed in `movieIds`
movies_con = movies_con.loc[movies_con['movieId'].isin(movieIds)]
movies_con.shape

(6166, 3)

In [None]:
# Turn the genre field into space-separated tokens: "|" separators become spaces
# and hyphens are removed (a value like "Sci-Fi|Action" becomes "SciFi Action").
# regex=False forces a literal replacement on every pandas version ("|" is the
# regex alternation operator), and .assign() returns a new frame instead of
# writing into the filtered slice, which avoids SettingWithCopyWarning.
movies_con = movies_con.assign(
    genres=movies_con['genres']
    .str.replace("|", " ", regex=False)
    .str.replace("-", "", regex=False)
)

In [None]:
# Build one text document per movie: all of its tags joined by single spaces.
# Missing tags become empty strings and every tag is cast to str so that
# ' '.join only ever receives strings. No wrapper characters are added around
# the text, so the first and last tags stay clean tokens when the document is
# later split on whitespace.
tags_con = (
    tags_con.assign(tag=tags_con['tag'].fillna("").astype(str))
    .groupby('movieId')['tag']
    .agg(' '.join)
    .reset_index()
)

In [None]:
# Restrict the per-movie tag documents to the frequently rated movies in `movieIds`
tags_con = tags_con.loc[tags_con['movieId'].isin(movieIds)]
tags_con.shape

(6162, 2)

In [None]:
# Attach each movie's tag text to its metadata. The left join keeps movies that
# have no tags at all; their missing values are replaced by empty strings.
tags_con = movies_con.merge(tags_con, how="left", on="movieId").fillna("")

In [None]:
# A movie's document is its tag text followed by its genre tokens, separated by one space
tags_con['document'] = tags_con[['tag', 'genres']].agg(' '.join, axis=1)

In [None]:
# Replace any remaining missing values with empty strings
# NOTE(review): the frame was already filled right after the merge, so this looks redundant — confirm before removing
tags_con.fillna("", inplace=True)

In [None]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity


# Data preparation: one TaggedDocument per row of tags_con. The document text is
# split on whitespace and tagged with its running number i (its position in the column).
documents = [TaggedDocument(words=doc.split(), tags=[i]) for i, doc in enumerate(tags_con['document'])]

# Doc2Vec model configuration (nothing is trained yet at this point)
doc2vec_model = Doc2Vec(
    vector_size=100, # Dimensionality of each document vector
    window=5,        # Maximum distance between a word and the words used as its context
    min_count=1,     # Minimum frequency a word needs to be kept; 1 keeps every word
    workers=4,       # Number of worker threads used for training
    epochs=10        # Number of passes over the corpus
)

# Build the vocabulary from the tagged documents
doc2vec_model.build_vocab(documents)

# Train the model on the same documents, for the configured number of epochs
doc2vec_model.train(documents, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

# Document vectors
# Look up the learned vector of every document by the integer tag assigned above
doc_vectors = [doc2vec_model.dv[i] for i in range(len(documents))]

# One row per document, sharing the row index of tags_con
doc_vectors_df = pd.DataFrame(doc_vectors, index=tags_con.index)

# Pairwise cosine similarity between all document vectors
cosine_sim_matrix = cosine_similarity(doc_vectors_df)

# Label rows and columns with movie titles so similarities can be looked up by title
# NOTE(review): assumes titles are unique; a repeated title would give repeated labels — confirm
cosine_sim_df = pd.DataFrame(cosine_sim_matrix, index=tags_con['title'], columns=tags_con['title'])


In [None]:
# Peek at the top-left 3x3 corner of the title-by-title similarity matrix
cosine_sim_df.iloc[:3, :3]

title,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Toy Story (1995),1.0,0.375118,0.464958
Jumanji (1995),0.375118,1.0,0.308757
Grumpier Old Men (1995),0.464958,0.308757,1.0


In [None]:
def content_recommendations(movie_titles, num_recommendations, similarity_df=None):
    """Recommend the titles whose content is on average most similar to `movie_titles`.

    Parameters
    ----------
    movie_titles : iterable of str
        Seed titles. Titles that are not a column of the similarity matrix are
        ignored; a title listed more than once is counted once.
    num_recommendations : int
        Maximum number of titles to return.
    similarity_df : pandas.DataFrame, optional
        Square title-by-title similarity matrix (titles as both row and column
        labels). Defaults to the notebook-level ``cosine_sim_df``.

    Returns
    -------
    pandas.Series
        Mean similarity to the known seed titles, indexed by title, sorted from
        most to least similar, with the seed titles themselves removed. Empty
        when none of the seed titles is known.
    """
    if similarity_df is None:
        # Fall back to the matrix built earlier in the notebook
        similarity_df = cosine_sim_df

    # dict.fromkeys drops repeated titles while keeping their order
    known_titles = [title for title in dict.fromkeys(movie_titles) if title in similarity_df.columns]
    if not known_titles:
        return pd.Series(dtype=float)

    # Average the similarity columns of all seed titles in one vectorised step
    # instead of concatenating one Series per title inside a loop
    mean_similarity = similarity_df.loc[:, known_titles].mean(axis=1)

    # Collapse rows that share a title so every title appears once in the result
    mean_similarity = mean_similarity.groupby(level=0).mean()

    # The seed movies are never recommended back
    mean_similarity = mean_similarity[~mean_similarity.index.isin(movie_titles)]

    return mean_similarity.sort_values(ascending=False).head(num_recommendations)

# COLLABORATIVE EMBEDDING

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the rating and movie tables again from the same CSV files, under
# separate names for the collaborative-filtering part of the notebook
ratings_col = pd.read_csv(f"{path}/ratings.csv")
movies_col = pd.read_csv(f"{path}/movies.csv")

In [None]:
# Keep only the ratings that belong to the frequently rated movies in `movieIds`
ratings_col = ratings_col.loc[ratings_col['movieId'].isin(movieIds)]

In [None]:
!pip install annoy

Collecting annoy
  Downloading annoy-1.17.3.tar.gz (647 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.5/647.5 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25l[?25hdone
  Created wheel for annoy: filename=annoy-1.17.3-cp310-cp310-linux_x86_64.whl size=551929 sha256=0cfb8a4f83a4b76e637138e73816b5df0d3b60db0b86a1852e782e670bacbaef
  Stored in directory: /root/.cache/pip/wheels/64/8a/da/f714bcf46c5efdcfcac0559e63370c21abe961c48e3992465a
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.17.3


In [None]:
from scipy.sparse import csr_matrix
from annoy import AnnoyIndex

# User x movie rating table; a user/movie pair without a rating is stored as 0
user_movie_matrix = ratings_col.pivot(index='userId', columns='movieId', values='rating').fillna(0)

# Sparse (CSR) copy of the same table
user_movie_sparse = csr_matrix(user_movie_matrix)

# Every Annoy item is one user's rating vector, so the vector length is the number of movie columns
f = user_movie_sparse.shape[1]
annoy_index = AnnoyIndex(f, 'angular')  # Angular distance is used for cosine similarity

# Item number i is the i-th ROW of the matrix (a position, not the userId label)
for i in range(user_movie_sparse.shape[0]):
    annoy_index.add_item(i, user_movie_sparse.getrow(i).toarray()[0])

# Build the index with 10 trees
annoy_index.build(10)

True

In [None]:
# Remove leading/trailing whitespace from the titles so lookups by exact title match
movies_col['title'] = movies_col['title'].str.strip()

In [None]:
def get_movie_ids(movies, movie_titles):
    """Return the movieId of every row in `movies` whose title is in `movie_titles`.

    The ids come back in the row order of `movies`; titles that match no row
    contribute nothing.
    """
    return movies.loc[movies['title'].isin(movie_titles), 'movieId'].tolist()


def collaborative_recommendations(movie_titles, user_movie_matrix, annoy_index, movies, n_recommendations):
    """Recommend the movies rated highest by users similar to those who rated `movie_titles`.

    Parameters
    ----------
    movie_titles : iterable of str
        Seed titles. Titles unknown to `movies`, or whose movie has no column in
        `user_movie_matrix`, are ignored.
    user_movie_matrix : pandas.DataFrame
        Ratings with one row per user and one column per movieId; 0 means
        "not rated".
    annoy_index : object with ``get_nns_by_item(item, n)``
        Nearest-neighbour index whose item ``i`` is the ``i``-th ROW of
        `user_movie_matrix` (a position, not the userId label).
    movies : pandas.DataFrame
        Table with 'movieId' and 'title' columns.
    n_recommendations : int
        Maximum number of titles to return.

    Returns
    -------
    list of str
        Titles ordered from highest to lowest summed rating among the similar
        users, excluding the seed movies. Empty when no seed movie is known or
        nobody rated one.
    """
    movie_ids = get_movie_ids(movies, movie_titles)

    # Only movies that have a column in the rating matrix can be looked up
    rated_ids = [movie_id for movie_id in movie_ids if movie_id in user_movie_matrix.columns]
    if not rated_ids:
        return []

    # Row POSITIONS of the users who rated at least one seed movie. The index
    # was filled by position, so userId labels must not be passed to it.
    rated_any_seed = (user_movie_matrix[rated_ids] > 0).any(axis=1).to_numpy()
    seed_positions = np.flatnonzero(rated_any_seed)
    if seed_positions.size == 0:
        return []

    # Nearest neighbours of the first 10 seed users (in matrix row order), without repeats
    neighbour_positions = set()
    for position in seed_positions[:10]:
        neighbour_positions.update(annoy_index.get_nns_by_item(int(position), 10))

    # Score every movie by the summed rating of the neighbouring users; the
    # stable sort keeps the result deterministic when scores tie
    neighbour_ratings = user_movie_matrix.iloc[sorted(neighbour_positions)]
    ranked_scores = neighbour_ratings.sum().sort_values(ascending=False, kind='stable')

    # Translate ids to titles while keeping the ranking order; seed movies and
    # ids without a title are skipped before the top-n cut
    title_by_id = movies.drop_duplicates(subset='movieId').set_index('movieId')['title']
    seed_ids = set(movie_ids)
    ranked_ids = [
        movie_id for movie_id in ranked_scores.index
        if movie_id not in seed_ids and movie_id in title_by_id.index
    ]

    return [title_by_id[movie_id] for movie_id in ranked_ids[:n_recommendations]]

# HYBRID MOVIE PREDICTION

In [None]:
def hybrid_recommendations(movie_titles, user_movie_matrix, annoy_index, movies, num_recommendations):
    """Merge collaborative and content-based suggestions into one ranked table.

    Each recommender is asked for twice as many titles as requested. The two
    title lists are stacked and ordered by how often every title occurs in the
    stack, and the first `num_recommendations` titles are returned.

    Returns
    -------
    pandas.DataFrame
        A single column named 'recommended_movie'.
    """
    pool_size = num_recommendations*2

    # Candidates from the collaborative recommender (a list of titles)
    from_collaborative = collaborative_recommendations(
        movie_titles, user_movie_matrix, annoy_index, movies, pool_size
    )

    # Candidates from the content-based recommender (titles are the Series index)
    from_content = content_recommendations(movie_titles, pool_size)

    # value_counts() orders the titles by their number of occurrences in the stack
    stacked_titles = pd.concat([pd.Series(from_collaborative), pd.Series(from_content.index)])
    ranked_titles = stacked_titles.value_counts().index.tolist()

    return pd.DataFrame(ranked_titles[:num_recommendations], columns=['recommended_movie'])

In [None]:
# Example input: a list of seed movie titles to get recommendations for
sample_1= ["Indiana Jones and the Last Crusade (1989)",
        "Flight of the Phoenix (2004)",
        "Hercules (2014)",
        "Spectre (2015)",
        "Dead Presidents (1995)",
        "Seven (a.k.a. Se7en) (1995)" ,
        "Tsotsi (2005)"]

In [None]:
# Up to 10 hybrid recommendations for the titles in sample_1
recommendations = hybrid_recommendations(sample_1, user_movie_matrix, annoy_index, movies_col, 10)
recommendations

Unnamed: 0,recommended_movie
0,Smoke (1995)
1,M (1931)
2,Broken Arrow (1996)
3,Mighty Aphrodite (1995)
4,Fire Down Below (1997)
5,Wishmaster (1997)
6,Day & Night (2010)
7,Beverly Hills Cop III (1994)
8,Escape from Alcatraz (1979)
9,"Color of Paradise, The (Rang-e khoda) (1999)"


In [None]:
# Second example input with a single seed title ("deneme" is Turkish for "trial")
deneme= ["Interstellar (2014)"]


In [None]:
# Up to 10 hybrid recommendations for the single-title example
recommendations = hybrid_recommendations(deneme, user_movie_matrix, annoy_index, movies_col, 10)
recommendations

Unnamed: 0,recommended_movie
0,Once Upon a Time... When We Were Colored (1995)
1,Lord of Illusions (1995)
2,"Brief History of Time, A (1991)"
3,Contact (1997)
4,"Lake House, The (2006)"
5,"Jacket, The (2005)"
6,Deep Impact (1998)
7,I Origins (2014)
8,Arrival (2016)
9,Kate & Leopold (2001)
