In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import random

In [4]:

# Load the three MovieLens tables used in this notebook.
# index_col=False: do not use the first CSV column as the DataFrame index.
movie = pd.read_csv(
    '../input/movielens-20m-dataset/movie.csv',
    index_col=False,
)
rating2 = pd.read_csv('../input/movielens-20m-dataset/rating.csv')
tag = pd.read_csv('../input/movielens-20m-dataset/tag.csv')


In [5]:
# Work on a reproducible 50% sample of the ratings (fixed random_state).
rating = rating2.sample(frac=0.5, random_state=4259)

# Distinct movieIds present in the sample. The list is only used for membership
# filtering, so de-duplicate before converting to avoid building a Python list
# with one entry per sampled rating.
movie_taken = rating['movieId'].unique().tolist()

In [6]:
# Preview the first rows of the movie table.
movie.head()

In [7]:
# Column dtypes and non-null counts for the movie table.
movie.info() #to check null value inputs

In [8]:
# Compare the number of distinct movieIds with the number of distinct titles
# (NaN is counted as a value, as with len(unique())).
distinct_ids = movie['movieId'].nunique(dropna=False)
distinct_titles = movie['title'].nunique(dropna=False)
print(distinct_ids, distinct_titles)

Analysing the Duplicated title entries

In [9]:
# Rows whose title already appeared earlier in the table.
movie.loc[movie['title'].duplicated()]

In [10]:
# Inspect every row for one of the duplicated titles.
# movie[movie['title']== 'Offside (2006)']
movie.loc[movie['title'].eq('Johnny Express (2014)')]

In [11]:
#dropping the duplicated movie title entries
# drop_duplicates returns a new frame; without assigning it back, `movie`
# would keep its duplicated titles. The first row of each title is kept.
movie = movie.drop_duplicates(subset='title', ignore_index=True)
movie

In [12]:
# Display the sampled ratings table.
rating

In [13]:
# The timestamp column is not used in this analysis; remove it.
rating = rating.drop(columns=['timestamp'])


In [14]:
# Count missing values per column of the ratings table.
rating.isna().sum()

In [15]:
# Summary statistics of the number of ratings each movie received.
rating['movieId'].value_counts().describe()

In [16]:
# Number of ratings per movie, with the count column given a descriptive name.
movie_rating_freq = (
    rating.groupby('movieId')['rating']
    .count()
    .reset_index()
    .rename(columns={'rating': 'number of rating'})
)

# Number of ratings per user.
user_rating_freq = rating.groupby('userId')['rating'].count().reset_index()

movie_rating_freq

In [17]:
# Movies with more than 200 ratings, and their ids.
movie_rating_freq1 = movie_rating_freq.loc[movie_rating_freq['number of rating'].gt(200)]
list_id = movie_rating_freq1['movieId'].tolist()

# Drop every rating that belongs to a movie outside that set.
# # rating=rating[rating['number of rating']>150]
# rating.drop_duplicates(['movieId','userId'],inplace=True)
rating = rating.loc[rating['movieId'].isin(list_id)]
rating.shape

In [18]:
# Attach the movie columns to each rating (default inner join on movieId).
rating = pd.merge(rating, movie, on='movieId')
rating

In [19]:
# Preview the first rows of the tag table.
tag.head()

In [20]:
# Keep only tags attached to movies that appear in the sampled ratings.
tag = tag.loc[tag['movieId'].isin(movie_taken)]

**Exploring the Data**

In [21]:
# Number of distinct users left after filtering.
total_user = rating.userId.nunique()
print("Total Number of Users", total_user)

In [22]:
# Number of distinct movies left after filtering.
total_movies = rating['movieId'].nunique()
print("Total Movie Titles", total_movies)

In [23]:
# Summary statistics of the rating values.
rating['rating'].describe()

In [24]:
# How often each rating value occurs.
rating['rating'].value_counts()


In [25]:
# Most frequent rating value(s) and the mean rating.
rating_values = rating['rating']
print("Most common rating to all the movies", rating_values.mode())
print("Average ratings goven to the data set", rating_values.mean())

In [26]:
# Bar chart of how many times each rating value was given.
plt.figure(figsize=(25,10))
# Pass the column by keyword: newer seaborn releases no longer accept the
# data vector as the first positional argument.
sns.countplot(x='rating', data=rating, palette='muted')
plt.xlabel('Ratings')
plt.ylabel('Total Rating Count')
plt.show()

In [27]:
# Summary statistics of the number of ratings given per user.
rating['userId'].value_counts().describe()


In [28]:
# Summary statistics of the number of ratings received per movie.
rating['movieId'].value_counts().describe()

In [29]:
# Frequency of each tag (value_counts sorts most frequent first); show the 10 rarest.
tag_counts = tag['tag'].value_counts()
tag_counts.tail(10)

In [30]:
# Bar chart of the 10 most frequent tags.
tag_counts.head(10).plot.bar(figsize=(10, 5))

**Collaborative Filtering**

In [31]:
# Encode userId and movieId as dense consecutive integers so they can be used
# as matrix indexes.
from sklearn import preprocessing

# One encoder per column: a single shared encoder is re-fitted by the second
# fit_transform call, which discards the userId mapping.
user_encoder = preprocessing.LabelEncoder()
movie_encoder = preprocessing.LabelEncoder()
le = movie_encoder  # `le` is kept as an alias; it used to end up fitted on movieId

# LabelEncoder produces 0-based codes. Shift them to start at 1 because the
# pivot matrix and top_cosine_similarity address rows/columns with `id - 1`;
# a code of 0 would otherwise wrap around to index -1.
# (To map back, subtract 1 before calling inverse_transform.)
rating['userId'] = user_encoder.fit_transform(rating['userId']) + 1
rating['movieId'] = movie_encoder.fit_transform(rating['movieId']) + 1

In [32]:
# rating.title.unique().tolist()

In [33]:
# How full would the movie x user matrix be?
possible_ratings = rating['movieId'].nunique() * rating['userId'].nunique()
actual_ratings = rating.shape[0]
print("Total ratings in ideal case, when each user rates every movie", possible_ratings)
print("Actual number of ratings", actual_ratings)
print("Percentage ratings", (actual_ratings / possible_ratings) * 100, "%")

Only about 1.4% of all possible user–movie ratings are present, so the rating matrix is very sparse; we therefore use a matrix factorization technique.


In [34]:
# Build the dense movie x user rating matrix.
# Row i / column j hold encoded movieId i+1 / userId j+1, matching the
# `movie_id - 1` lookup used by top_cosine_similarity.
# NOTE(review): this layout assumes the encoded ids start at 1; an id of 0
# is written to index -1, i.e. it wraps around to the last row/column.
n_movie_rows = np.max(rating.movieId.values)
n_user_cols = np.max(rating.userId.values)

# np.zeros instead of np.ndarray: np.ndarray leaves the buffer uninitialised,
# so unrated cells held arbitrary values. A float dtype instead of uint8 keeps
# fractional ratings (e.g. 3.5) instead of truncating them to integers.
ratings_mat = np.zeros((n_movie_rows, n_user_cols), dtype=np.float32)
ratings_mat[rating.movieId.values - 1, rating.userId.values - 1] = rating.rating.values

In [35]:
# (rows, columns) of the rating matrix: movies x users.
ratings_mat.shape

In [36]:
from scipy.sparse.linalg import svds

# Centre each movie (row) on its own mean rating.
normalised_mat = ratings_mat - np.mean(ratings_mat, axis=1, keepdims=True)

#Computing the Singular Value Decomposition (SVD)
# A is users x movies, scaled by sqrt(number of movie rows - 1).
A = normalised_mat.T / np.sqrt(ratings_mat.shape[0] - 1)
# U, S, V = np.linalg.svd(A)

# svds computes only 6 singular triplets unless k is given, which made the
# later `V.T[:, :k]` slice with k = 80 a no-op. Request the components
# explicitly; svds requires k < min(A.shape).
num_components = min(80, min(A.shape) - 1)
U, S, V = svds(A, k=num_components)

# svds does not return the components in descending order of singular value;
# sort them so that a leading slice selects the strongest components.
component_order = np.argsort(S)[::-1]
U, S, V = U[:, component_order], S[component_order], V[component_order, :]

In [39]:
#Rank all rows of `data` by cosine similarity to one movie and return the top N row indexes
def top_cosine_similarity(data, movie_id, top_n=10):
    """Return the indexes of the `top_n` rows of `data` most similar to a movie.

    Parameters
    ----------
    data : 2-D numpy array with one row per movie.
    movie_id : 1-based movie id; the movie's row is `movie_id - 1`.
    top_n : number of row indexes to return.

    Returns
    -------
    1-D array of 0-based row indexes ordered by decreasing cosine similarity.
    The query row itself is part of the ranking.
    """
    row = movie_id - 1  # ids are 1-based, rows are 0-based
    target = data[row, :]
    # Euclidean norm of every row.
    norms = np.sqrt(np.einsum('ij, ij -> i', data, data))
    scores = np.dot(target, data.T) / (norms[row] * norms)
    # Negate so that argsort's ascending order yields the highest scores first.
    return np.argsort(-scores)[:top_n]

# Function to print top N similar movies
def print_similar_movies(movie_data, movie_id, top_indexes):
    """Print the title and genres of the movies behind `top_indexes`.

    Parameters
    ----------
    movie_data : DataFrame with `movieId`, `title` and `genres` columns.
    movie_id : id of the movie the recommendations are for.
    top_indexes : numpy array of 0-based row indexes; each is converted to a
        movie id by adding 1.

    Raises
    ------
    IndexError if `movie_id` or a recommended id is absent from `movie_data`.
    """
    query_title = movie_data[movie_data.movieId == movie_id].title.values[0]
    print('\n Recommendations for {0}: \n'.format(query_title))
    for similar_id in top_indexes + 1:
        # Look the movie up once, in the frame that was passed in (the genres
        # were previously read from the global `rating` frame instead).
        match = movie_data[movie_data.movieId == similar_id]
        print("{0:<70}".format(match["title"].values[0]), match["genres"].values[0])

In [40]:
# k: number of components kept per movie; top_n: number of recommendations to print.
k = 80
top_n = 10
# One row per movie, built from the first k rows of V.
sliced = V[:k, :].T # representative data

In [41]:
# Print recommendations for a handful of example titles.
titles=["3 Idiots (2009)","Fight Club (1999)","Ratatouille (2007)","Primer (2004)","Life of Pi (2012)","Slumdog Millionaire (2008)","Darjeeling Limited, The (2007)","Lord of the Rings: The Fellowship of the Ring, The (2001)"]
for title in titles:
    matching_ids = rating.loc[rating.title == title, 'movieId'].unique()
    if len(matching_ids) == 0:
        # The title may have been removed by the earlier filtering steps;
        # report it and carry on instead of aborting the whole loop.
        print('\n No ratings found for {0}; skipping. \n'.format(title))
        continue
    # Index explicitly: int() on a 1-D array is deprecated in recent NumPy.
    movie_id = int(matching_ids[0])
    indexes = top_cosine_similarity(sliced, movie_id, top_n)
    print_similar_movies(rating, movie_id, indexes)

In [40]:
title="Fight Club (1999)"
# Encoded id of the title; index the array explicitly because int() on a 1-D
# array is deprecated in recent NumPy.
movie_id = int(rating.loc[rating.title == title, 'movieId'].unique()[0])
indexes = top_cosine_similarity(sliced, movie_id, top_n)
print_similar_movies(rating, movie_id, indexes)

In [41]:
title="Ratatouille (2007)"
# Encoded id of the title; index the array explicitly because int() on a 1-D
# array is deprecated in recent NumPy.
movie_id = int(rating.loc[rating.title == title, 'movieId'].unique()[0])
indexes = top_cosine_similarity(sliced, movie_id, top_n)
print_similar_movies(rating, movie_id, indexes)

In [42]:
# Show the encoded id(s) stored for this title (an empty array means it is not in `rating`).
title = "Bhaag Milka Bhaag (2013)"
rating.loc[rating.title == title, 'movieId'].unique()

# movie_id = int(rating[rating.title == title].movieId.unique()) # (getting an id from movies.dat)
# indexes = top_cosine_similarity(sliced, movie_id, top_n)
# print_similar_movies(rating, movie_id, indexes)

In [43]:
title="Life of Pi (2012)"
# Encoded id of the title; index the array explicitly because int() on a 1-D
# array is deprecated in recent NumPy.
movie_id = int(rating.loc[rating.title == title, 'movieId'].unique()[0])
indexes = top_cosine_similarity(sliced, movie_id, top_n)
print_similar_movies(rating, movie_id, indexes)

In [44]:
title="Primer (2004)"
# Encoded id of the title; index the array explicitly because int() on a 1-D
# array is deprecated in recent NumPy.
movie_id = int(rating.loc[rating.title == title, 'movieId'].unique()[0])
indexes = top_cosine_similarity(sliced, movie_id, top_n)
print_similar_movies(rating, movie_id, indexes)

In [45]:
# print("{0:>5}".format("huhj"),"uij")

In [None]:
# mat=pd.pivot_table(data=rating,columns='movieId',index='userId')

In [None]:
# mat=mat.fillna(0)
# # mat

In [None]:
# import re
# from scipy.sparse import csr_matrix
# import matplotlib.pyplot as plt
# import seaborn as sns
# from surprise import Reader, Dataset, SVD, NMF
# from surprise.model_selection.validation import cross_validate
# sns.set_style("darkgrid")

In [None]:
# reader = Reader()

# data = Dataset.load_from_df(rating, reader)
# svd = SVD()
# # Run 5-fold cross-validation and print results
# cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [None]:
# trainset = data.build_full_trainset()
# svd.fit(trainset)


In [None]:
# pred=svd.test(trainset.build_testset())

In [None]:
# svd.pred

In [None]:
# reader = Reader()

# data = Dataset.load_from_df(rating, reader)
# nmf = NMF()
# # Run 5-fold cross-validation and print results
# cross_validate(nmf, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [None]:
# rating['movieId'].apply(lambda x: svd.predict(1, x).est)

In [None]:
# def recommend(movie_title, min_count, mat):
#     print("For movie ({})".format(movie_title))
#     print("- Top 10 movies recommended based on Pearsons' R correlation - ")
#     i = int(movie[movie['title']==movie_title].movieId)
#     target = mat.loc[i]
# #     print(target)
#     similar_to_target = mat.corrwith(target,axis=1).sort_values(ascending=False).head(6)
#     print(similar_to_target)
# #     rating[rating['movieId'].isin()]
#     print(movie[movie['movieId'].isin(list(similar_to_target))]['title'])
# #     corr_target = pd.DataFrame(similar_to_target, columns = ['PearsonR'])
# #     corr_target.dropna(inplace = True)
# #     corr_target = corr_target.sort_values('PearsonR', ascending = False)
# #     corr_target.index = corr_target.index.map(int)
# #     corr_target = corr_target.join(df_title).join(df_movie_summary)[['PearsonR', 'Name', 'count', 'mean']]
# #     print(corr_target[corr_target['count']>min_count][:10].to_string(index=False))
# # #     return list()

In [None]:
# recommend('The Pirates (2014)',2,mat)
