# In [None]:
# Section 3
# Hybrid recommender: content-based recommendation combined with collaborative filtering.

# Start with the content-based recommendation step:
# obtain the ids of the movies that are related to the user's interests.

import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# User ratings; the code below reads its user_id, anime_id and rating columns.
ratings = pd.read_csv("/content/sample_data/rating.csv")

# Per-title metadata; the code below reads its anime_id, name, genre, type and rating columns.
metadata = pd.read_csv("/content/sample_data/anime.csv")


#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
# NOTE(review): tfidf is not referenced again in the visible code (a CountVectorizer
# is fitted on the soup instead) - confirm whether it is still needed.
tfidf = TfidfVectorizer(stop_words='english')

#Construct a reverse map from anime_id to the row label in metadata
# NOTE(review): this Series is rebuilt after metadata.reset_index() further down,
# so this first version is only live until then.
indices = pd.Series(metadata.index, index=metadata['anime_id']).drop_duplicates()


# Normalise text features: lower-case them and remove every space
def clean_data(x):
    """Return a lower-cased, space-free copy of ``x``.

    A string is normalised directly; a list is normalised element by
    element (each element must be a string). Any other value, such as
    ``None`` or a number, yields the empty string.
    """
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    if isinstance(x, list):
        return [item.replace(" ", "").lower() for item in x]
    # Missing or non-text value: contribute nothing to the feature.
    return ''

# Apply clean_data function to your features.
# Each listed column is overwritten in place with its normalised form;
# clean_data returns '' for any value that is neither a str nor a list.
# NOTE(review): if 'rating' is loaded as a numeric column, every value becomes ''
# here and contributes nothing to the soup - confirm this is intended.
features = ['name', 'genre', 'type', 'rating']

for feature in features:
    metadata[feature] = metadata[feature].apply(clean_data)


def create_soup(x):
    """Build the text "soup" for one row.

    ``x`` is indexed by the keys 'name', 'genre', 'type' and 'rating', whose
    values must be strings. The result is those four values, in that order,
    each preceded by a single space.
    """
    parts = (x['name'], x['genre'], x['type'], x['rating'])
    return ' ' + ' '.join(parts)

# Create a new soup feature
# (one string per row, assembled by create_soup from the cleaned columns)
metadata['soup'] = metadata.apply(create_soup, axis=1)


# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words term counts over the soup, with English stop words removed.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(metadata['soup'])

# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Every row is compared with every other row, so the result is square
# (number of titles x number of titles).
# NOTE(review): presumably returned as a dense array - check memory use on large catalogues.
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)


# Reset index of your main DataFrame and construct reverse mapping as before
# After reset_index the row labels are 0..len-1, matching row positions in cosine_sim2.
metadata = metadata.reset_index()
# anime_id -> row position
# NOTE(review): Series.drop_duplicates() removes repeated *values* (row positions),
# not repeated index labels - confirm that is the intent.
indices = pd.Series(metadata.index, index=metadata['anime_id']).drop_duplicates()
# cleaned name -> row position (not referenced again in the visible code)
indices_na = pd.Series(metadata.index, index=metadata['name']).drop_duplicates()

# Get the titles a user has rated, highest rating first
def get_movies(user_id, top_n=20):
    """Return the ids of the titles rated by ``user_id``, best-rated first.

    Reads the module-level ``ratings`` DataFrame (columns ``user_id``,
    ``anime_id`` and ``rating``).

    Args:
        user_id: Value matched against ``ratings['user_id']``.
        top_n: How many of the best-rated ids to return as the first
            element. Defaults to 20.

    Returns:
        A tuple ``(m_top, all_mov)``: ``all_mov`` is every ``anime_id`` the
        user rated, ordered by descending rating, and ``m_top`` is its first
        ``top_n`` entries. Both lists are empty when the user has no ratings.
    """
    user_rows = ratings.loc[ratings['user_id'] == user_id, ['anime_id', 'rating']]
    ranked = user_rows.sort_values(by='rating', ascending=False)

    all_mov = ranked['anime_id'].tolist()
    # Slicing creates a separate list, so callers may mutate either result freely.
    m_top = all_mov[:top_n]

    return m_top, all_mov


N = len(metadata['name'])

# Row position -> cleaned name, and row position -> anime_id.
name_dic = dict(enumerate(metadata["name"]))
id_dic = dict(enumerate(metadata["anime_id"]))


# Function that takes a list of anime ids as input and outputs the ids of the most similar titles
def get_recommendations(top_movies, cosine_sim=cosine_sim2, n_similar=40):
    """Collect the ids of the titles most similar to each id in ``top_movies``.

    Relies on the module-level ``indices`` (anime_id -> row position) and
    ``id_dic`` (row position -> anime_id) lookups.

    Args:
        top_movies: Iterable of anime ids to find neighbours for.
        cosine_sim: Square similarity matrix whose row/column order matches
            the row positions stored in ``indices``.
        n_similar: Number of neighbours gathered per input id. Defaults to 40.

    Returns:
        A flat list of anime ids, ``n_similar`` per recognised input id, in
        descending similarity order within each group. The list may contain
        repeats when different inputs share neighbours.
    """
    mov_id = []

    for anime_id in top_movies:
        # An id that has no row in `indices` cannot be compared with anything,
        # so it is skipped instead of aborting the whole run with a KeyError.
        if anime_id not in indices:
            continue

        idx = indices[anime_id]

        # Rank every row by similarity to this one; sorted() is stable, so ties
        # keep their original row order.
        ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)

        # Drop the query row by position rather than assuming it is ranked first:
        # a title with an identical soup and a lower row position would sort ahead of it.
        similar_rows = [row for row, _ in ranked if row != idx][:n_similar]

        mov_id.extend(id_dic[row] for row in similar_rows)

    return mov_id


# user_id = 3
# top_movies: the user's best-rated ids (first element returned by get_movies);
# all_mov: every id the user rated.
top_movies , all_mov = get_movies(3)

# Content-based neighbours of each of the user's top titles.
mov_id = get_recommendations(top_movies , cosine_sim2)

# Keep the user's own rated titles in the pool as well.
mov_id = mov_id + all_mov

# De-duplicate: the same id can be a neighbour of several top titles.
u_mov_id = set(mov_id)

# We now have the ids of the movies
# that are related to the user's interests.

print("Number of movie codes related to user interest :")
print(len(u_mov_id))
print("")

# Restrict the rating dataset to rows whose anime_id is in u_mov_id
new_rate = ratings.loc[ratings['anime_id'].isin(u_mov_id)]

print("dimension of new rating dataset :")
print(new_rate.shape)
print("dimension of old rating dataset :")
print(ratings.shape)


# Now apply collaborative filtering to the reduced dataset (new_rate)

# Re-read the metadata file: the `metadata` frame had its name column
# lower-cased and stripped of spaces above, while this copy is unmodified.
movies = pd.read_csv("/content/sample_data/anime.csv")

# Summary counts for the reduced dataset (not referenced again in the visible code).
n_ratings = len(new_rate)
n_movies = len(new_rate['anime_id'].unique())
n_users = len(new_rate['user_id'].unique())

# Number of rating rows per user in the reduced dataset.
user_freq = new_rate[['user_id', 'anime_id']].groupby('user_id').count().reset_index()
user_freq.columns = ['user_id', 'n_ratings']
# The result of head() is not assigned or printed, so this line has no effect
# when the file runs as a plain script.
user_freq.head()

# create user-item matrix using scipy csr matrix
from scipy.sparse import csr_matrix

def create_matrix(df):
    """Build a sparse item-by-user rating matrix plus id/index lookups.

    Args:
        df: DataFrame with ``user_id``, ``anime_id`` and ``rating`` columns.

    Returns:
        A tuple ``(X, user_mapper, movie_mapper, user_inv_mapper,
        movie_inv_mapper)``:

        * ``X`` - ``csr_matrix`` of shape (unique movies, unique users); the
          entry at (movie index, user index) holds the rating. Repeated
          (movie, user) pairs are summed by ``csr_matrix``.
        * ``user_mapper`` / ``movie_mapper`` - id -> index, where indices
          follow the sorted order of the unique ids.
        * ``user_inv_mapper`` / ``movie_inv_mapper`` - index -> id.
    """
    # One pass per column: return_inverse gives, for every row of df, the
    # position of its id within the sorted unique ids, which is exactly the
    # matrix index for that row.
    user_ids, user_index = np.unique(df["user_id"], return_inverse=True)
    movie_ids, movie_index = np.unique(df["anime_id"], return_inverse=True)

    N = len(user_ids)
    M = len(movie_ids)

    # Map ids to indices
    user_mapper = dict(zip(user_ids, range(N)))
    movie_mapper = dict(zip(movie_ids, range(M)))

    # Map indices to ids
    user_inv_mapper = dict(zip(range(N), user_ids))
    movie_inv_mapper = dict(zip(range(M), movie_ids))

    # Rows are movies and columns are users, so each row is one movie's rating vector.
    X = csr_matrix((df["rating"], (movie_index, user_index)), shape=(M, N))

    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

# Build the sparse rating matrix and the id <-> index lookups from the reduced
# rating set; these module-level names are read by find_similar_movies below.
X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_matrix(new_rate)


from sklearn.neighbors import NearestNeighbors

#Find similar movies using KNN
def find_similar_movies(movie_id, X, k, metric='cosine', show_distance=False):
    """Return the ``k`` nearest neighbours of ``movie_id`` among the rows of ``X``.

    Relies on the module-level ``movie_mapper`` (id -> row index) and
    ``movie_inv_mapper`` (row index -> id) lookups, which must describe the
    rows of ``X``.

    Args:
        movie_id: Id of the query title; must be a key of ``movie_mapper``.
        X: Matrix with one row per title.
        k: Number of neighbours wanted.
        metric: Distance metric handed to ``NearestNeighbors``.
        show_distance: When true, return ``(id, distance)`` pairs instead of
            bare ids.

    Returns:
        Up to ``k`` neighbour ids ordered from nearest to farthest, excluding
        ``movie_id`` itself; or a list of ``(id, distance)`` tuples in the
        same order when ``show_distance`` is true.

    Raises:
        KeyError: If ``movie_id`` is not present in ``movie_mapper``.
    """
    movie_ind = movie_mapper[movie_id]
    movie_vec = X[movie_ind].reshape(1, -1)

    # The query row is normally its own nearest neighbour, so ask for one
    # extra result, but never for more rows than X holds, which kneighbors
    # would reject.
    n_neighbors = min(k + 1, X.shape[0])
    kNN = NearestNeighbors(n_neighbors=n_neighbors, algorithm="brute", metric=metric)
    kNN.fit(X)

    # Always fetch distances and indices together so the unpacking below is
    # valid whatever show_distance is.
    distances, neighbour_inds = kNN.kneighbors(movie_vec, return_distance=True)

    # Remove the query by index instead of assuming it is the first hit:
    # rows at the same distance may be returned ahead of it.
    neighbours = [
        (movie_inv_mapper[int(ind)], float(dist))
        for ind, dist in zip(neighbour_inds[0], distances[0])
        if ind != movie_ind
    ][:k]

    if show_distance:
        return neighbours
    return [neighbour_id for neighbour_id, _ in neighbours]


# anime_id -> display title, taken from the unmodified `movies` frame.
movie_titles = dict(zip(movies['anime_id'], movies['name']))

print("")
print("for user id 3")
print("")

print("Suggested videos :")
print("")

rec_mov = []
# Ids that must not be recommended: everything the user already rated, plus
# whatever has been recommended so far. A set keeps each membership test O(1);
# rec_mov preserves the order in which recommendations were found.
_excluded_ids = set(all_mov)

for i in top_movies:
    similar_ids = find_similar_movies(i, X, k=4)
    for j in similar_ids:
        if j not in _excluded_ids:
            _excluded_ids.add(j)
            rec_mov.append(j)

# Only the first ten recommendations are shown.
ten_mov = rec_mov[0:10]

for i in ten_mov:
    print(movie_titles[i])
    print("")
