<a href="https://colab.research.google.com/github/apps1990/Apps/blob/master/recommendation_system_python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Important imports

In [None]:
import numpy as np
import pandas as pd
import sklearn
# import matplotlib.pyplot as plt
# import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

### Import Datasets for "Movie Recommendation System"

In [None]:
# Load the user ratings table (userId, movieId, rating, timestamp) and preview it
ratings_url = "https://s3-us-west-2.amazonaws.com/recommender-tutorial/ratings.csv"
df_ur = pd.read_csv(ratings_url)
df_ur.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
# Load the movie metadata table (movieId, title, genres) and preview it
movies_url = "https://s3-us-west-2.amazonaws.com/recommender-tutorial/movies.csv"
df_mv = pd.read_csv(movies_url)
df_mv.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### Statistical Facts Checking

In [None]:
# Headline counts for the ratings table
n_ratings = df_ur.shape[0]
n_movies = df_ur['movieId'].unique().size
n_users = df_ur['userId'].unique().size

# Mean number of ratings per user and per movie, rounded for display
ratings_per_user = round(n_ratings/n_users, 2)
ratings_per_movie = round(n_ratings/n_movies, 2)

print(f"Number of ratings: {n_ratings}")
print(f"Number of unique movieId's: {n_movies}")
print(f"Number of unique users: {n_users}")
print(f"Average ratings per user: {ratings_per_user}")
print(f"Average ratings per movie: {ratings_per_movie}")


Number of ratings: 100836
Number of unique movieId's: 9724
Number of unique users: 610
Average ratings per user: 165.3
Average ratings per movie: 10.37


In [None]:
# Number of ratings submitted by each user (one row per userId)
user_freq = (
    df_ur.groupby('userId')['movieId']
    .count()
    .reset_index(name='n_ratings')
)
user_freq.head()

Unnamed: 0,userId,n_ratings
0,1,232
1,2,29
2,3,39
3,4,216
4,5,44


In [None]:
# Distribution of users by how many ratings they gave, largest bins first.
# NOTE(review): users whose count falls outside the 15-180 bin range appear not
# to be tallied at all — confirm that is intended.
rating_count_bins = [15,18,21,24,27,30,35,40,50,60,70,80,90,120,150,180]
(
    user_freq['n_ratings']
    .value_counts(bins=rating_count_bins)
    .reset_index()
    .sort_values('n_ratings', ascending=False)
)

Unnamed: 0,n_ratings,count
10,"(150.0, 180.0]",28
4,"(120.0, 150.0]",39
0,"(90.0, 120.0]",52
11,"(80.0, 90.0]",21
12,"(70.0, 80.0]",20
9,"(60.0, 70.0]",29
3,"(50.0, 60.0]",44
2,"(40.0, 50.0]",44
6,"(35.0, 40.0]",32
1,"(30.0, 35.0]",44


In [None]:
## Find lowest and highest rated movies:

# Average rating of each movie, indexed by movieId
mean_rating = df_ur[['movieId', 'rating']].groupby('movieId').mean()
mean_rating

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
1,3.920930
2,3.431818
3,3.259615
4,2.357143
5,3.071429
...,...
193581,4.000000
193583,3.500000
193585,3.500000
193587,3.500000


In [None]:
# movieId with the lowest mean rating, then its metadata row
lowest_rated = mean_rating['rating'].idxmin()
df_mv[df_mv['movieId'].eq(lowest_rated)]

Unnamed: 0,movieId,title,genres
2689,3604,Gypsy (1962),Musical


In [None]:
# movieId with the highest mean rating, then its metadata row
highest_rated = mean_rating['rating'].idxmax()
df_mv[df_mv['movieId'].eq(highest_rated)]

Unnamed: 0,movieId,title,genres
48,53,Lamerica (1994),Adventure|Drama


In [None]:
# Show every individual rating behind the highest-rated movie
df_ur.loc[df_ur['movieId'].eq(highest_rated)]

Unnamed: 0,userId,movieId,rating,timestamp
13368,85,53,5.0,889468268
96115,603,53,5.0,963180003


In [None]:
# Show every individual rating behind the lowest-rated movie
df_ur.loc[df_ur['movieId'].eq(lowest_rated)]

Unnamed: 0,userId,movieId,rating,timestamp
13633,89,3604,0.5,1520408880


In [None]:
## The extreme movies above rest on very few ratings, so a plain mean is unreliable.
## Collect the per-movie rating count and mean (the inputs a Bayesian average would need;
## the Bayesian average itself is not computed in this cell).
movie_stats = df_ur.groupby('movieId')['rating'].agg(['count', 'mean'])

In [None]:
# Display the per-movie rating count and mean
movie_stats

Unnamed: 0_level_0,count,mean
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,215,3.920930
2,110,3.431818
3,52,3.259615
4,7,2.357143
5,49,3.071429
...,...,...
193581,1,4.000000
193583,1,3.500000
193585,1,3.500000
193587,1,3.500000


### User Matrix Creation with Sparse Matrix Representation

In [None]:
# Now, we create user-item matrix using scipy csr matrix
from scipy.sparse import csr_matrix

def create_matrix(df):
    """Build a sparse movie-by-user rating matrix from a ratings table.

    Args:
        df: DataFrame with ``userId``, ``movieId`` and ``rating`` columns.

    Returns:
        A 5-tuple ``(X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper)``:

        * ``X`` - ``csr_matrix`` of shape ``(n_movies, n_users)`` holding the
          ratings; rows are movies and columns are users.
        * ``user_mapper`` / ``movie_mapper`` - dicts from ID to matrix index,
          with indices assigned in sorted ID order.
        * ``user_inv_mapper`` / ``movie_inv_mapper`` - dicts from matrix index
          back to ID.

    Raises:
        KeyError: if one of the required columns is missing from ``df``.
    """
    # np.unique returns the sorted distinct IDs and, with return_inverse=True,
    # each row's position within them, so every column is scanned only once
    # instead of being re-sorted for each mapper.
    user_ids, user_index = np.unique(df["userId"].to_numpy(), return_inverse=True)
    movie_ids, movie_index = np.unique(df["movieId"].to_numpy(), return_inverse=True)

    N = len(user_ids)
    M = len(movie_ids)

    # Map IDs to indices
    user_mapper = dict(zip(user_ids, range(N)))
    movie_mapper = dict(zip(movie_ids, range(M)))

    # Map indices to IDs
    user_inv_mapper = dict(zip(range(N), user_ids))
    movie_inv_mapper = dict(zip(range(M), movie_ids))

    # Movies index the rows and users the columns
    X = csr_matrix((df["rating"].to_numpy(), (movie_index, user_index)), shape=(M, N))

    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

In [None]:
# Build the sparse rating matrix and the ID <-> index lookup dicts from the ratings table
X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_matrix(df_ur)

In [None]:
# Peek at the matrix contents. NOTE(review): toarray() materialises the whole
# matrix as a dense array just for display — consider previewing a small slice instead.
X.toarray()

array([[4. , 0. , 0. , ..., 2.5, 3. , 5. ],
       [0. , 0. , 0. , ..., 2. , 0. , 0. ],
       [4. , 0. , 0. , ..., 2. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ]])

In [None]:
# Show the sparse matrix summary (shape, dtype, number of stored elements)
X

<9724x610 sparse matrix of type '<class 'numpy.float64'>'
	with 100836 stored elements in Compressed Sparse Row format>

Observe that the matrix `X` has shape 9724 x 610: each row corresponds to a unique movie (`movieId`) and each column to a unique user (`userId`).

### Rule Based Recommendation-1 : Finding Similar Movies Based on User Ratings

#### Function to generate similar movies with KNN methodology and 'cosine' similarity measure

In [None]:
from sklearn.neighbors import NearestNeighbors

In [None]:
"""
Find similar movies using KNN
"""
def find_similar_movies(movie_id, X, k, metric='cosine', show_distance=False):
    """Find the movies whose rating vectors are closest to a given movie's.

    Relies on the module-level ``movie_mapper`` (movieId -> row index) and
    ``movie_inv_mapper`` (row index -> movieId) dicts, which must describe the
    rows of ``X``.

    Args:
        movie_id: ID of the query movie; must be a key of ``movie_mapper``.
        X: Matrix with one row per movie.
        k: Number of similar movies to return.
        metric: Distance metric passed to ``NearestNeighbors``.
        show_distance: If True, return ``(movie_id, distance)`` pairs instead
            of bare movie IDs.

    Returns:
        A list of at most ``k`` movie IDs ordered from nearest to farthest, or
        a list of ``(movie_id, distance)`` tuples when ``show_distance`` is True.
        The query movie itself is never included.

    Raises:
        KeyError: if ``movie_id`` is not in ``movie_mapper``.
    """
    # Row of X that holds the query movie, reshaped to a single-sample 2-D input
    movie_ind = movie_mapper[movie_id]
    movie_vec = X[movie_ind].reshape(1, -1)

    # Request one extra neighbour because the query movie is itself a row of X,
    # but never more neighbours than there are rows to choose from.
    n_neighbors = min(k + 1, X.shape[0])

    kNN = NearestNeighbors(n_neighbors=n_neighbors, algorithm="brute", metric=metric)
    kNN.fit(X)

    # Always fetch distances so both return shapes share one code path
    distances, indices = kNN.kneighbors(movie_vec, return_distance=True)

    # Drop the query movie by index rather than by position: when several rows
    # are at distance 0 the query is not guaranteed to come back first.
    neighbours = [
        (movie_inv_mapper[int(ind)], float(dist))
        for ind, dist in zip(indices[0], distances[0])
        if ind != movie_ind
    ][:k]

    if show_distance:
        return neighbours
    return [neighbour_id for neighbour_id, _ in neighbours]

#### Generating Recommendations on Similar movies

In [None]:
# Lookup table from movieId to title
movie_titles = df_mv.set_index('movieId')['title'].to_dict()

movie_id = 73881  # 3 Idiots (2009)

similar_ids = find_similar_movies(movie_id, X, k=10)
movie_title = movie_titles[movie_id]

print(f"Since you watched {movie_title}")
for similar_id in similar_ids:
    print(movie_titles[similar_id])

Since you watched 3 Idiots (2009)
Vacations in Prostokvashino (1980)
Priklyucheniya Kapitana Vrungelya (1979)
Last Year's Snow Was Falling (1983)
Investigation Held by Kolobki (1986)
Karlson Returns (1970)
Immigrant, The (1917)
Winter in Prostokvashino (1984)
Cheburashka (1971)
Adventures of Mowgli: The Kidnapping (1968)
Short Film About Love, A (Krótki film o milosci) (1988)


#### Validating the Generated Recommendations

In [None]:
# Confirm the movieId used above by searching the titles
df_mv.loc[df_mv['title'].str.contains("3 Idiots")]

Unnamed: 0,movieId,title,genres
7243,73881,3 Idiots (2009),Comedy|Drama|Romance


In [None]:
## Validations: compare the seed movie's mean rating with some of its recommendations

rating_checks = [
    ("3 Idiots' average ratting", 73881),
    ("Avg ratting of first recommended movie", 172587),
    ("Avg ratting of second recommended movie", 172637),
    ("Avg ratting of last (i.e. k = 10th) recommended movie", 38159),
]
for label, checked_id in rating_checks:
    print(label, df_ur.loc[df_ur['movieId'] == checked_id, 'rating'].mean())


3 Idiots' average ratting 4.75
Avg ratting of first recommended movie 5.0
Avg ratting of second recommended movie 5.0
Avg ratting of last (i.e. k = 10th) recommended movie 4.5


### Rule Based Recommendation-2 : Recommendations based on user preferences

#### Function to recommend movies based on user preference

In [None]:
def recommend_movies_for_user(user_id, X, user_mapper, movie_mapper, movie_inv_mapper, k=10):
    """Print movies similar to the movie a user rated highest.

    Reads the module-level ``df_ur`` (ratings) and ``df_mv`` (movie metadata)
    frames and delegates the similarity search to ``find_similar_movies``.

    Args:
        user_id: ID of the user to recommend for.
        X: Movie-by-user rating matrix passed on to ``find_similar_movies``.
        user_mapper, movie_mapper, movie_inv_mapper: Accepted for interface
            compatibility; this function does not read them.
        k: Number of recommendations to request.

    Returns:
        None. The recommendations, or an explanatory message when the user or
        the movie title cannot be found, are printed.
    """
    # All ratings submitted by this user
    user_ratings = df_ur[df_ur['userId'] == user_id]

    # Unknown user: nothing to base a recommendation on
    if user_ratings.empty:
        print(f"User with ID {user_id} does not exist.")
        return

    # First movie carrying the user's top rating
    top_rating = user_ratings['rating'].max()
    movie_id = user_ratings.loc[user_ratings['rating'] == top_rating, 'movieId'].iloc[0]

    # Lookup table from movieId to title
    movie_titles = dict(zip(df_mv['movieId'], df_mv['title']))

    # Resolve the title before running the (comparatively expensive) neighbour
    # search, so a movie missing from the metadata is reported without fitting kNN.
    movie_title = movie_titles.get(movie_id)
    if movie_title is None:
        print(f"Movie with ID {movie_id} not found.")
        return

    similar_ids = find_similar_movies(movie_id, X, k)

    print(f"Since you watched {movie_title}, you might also like:")
    for similar_id in similar_ids:
        print(movie_titles.get(similar_id, "Movie not found"))

#### Generating Recommendations for a particular user

In [None]:
# Pick the user to generate recommendations for (replace with any userId in the ratings data)
user_id = 8
recommend_movies_for_user(
    user_id=user_id,
    X=X,
    user_mapper=user_mapper,
    movie_mapper=movie_mapper,
    movie_inv_mapper=movie_inv_mapper,
    k=10,
)


Since you watched Babe (1995), you might also like:
Lion King, The (1994)
Jurassic Park (1993)
Aladdin (1992)
Fugitive, The (1993)
Speed (1994)
Mrs. Doubtfire (1993)
Mask, The (1994)
Beauty and the Beast (1991)
Toy Story (1995)
Ghost (1990)


In [None]:
# Look up the movieId for "Babe (...)". The text is matched literally (regex=False):
# as a regular expression, "()" is an empty capture group, which makes pandas
# emit a match-groups UserWarning and does not match the parenthesis at all.
df_mv[df_mv['title'].str.contains("Babe (", regex=False)]

  df_mv[df_mv.title.str.contains("Babe ()")]


Unnamed: 0,movieId,title,genres
32,34,Babe (1995),Children|Drama


In [None]:
## Validations: compare the seed movie's mean rating with some of its recommendations

rating_checks = [
    ("Babe (1995) average ratting", 34),
    ("Avg ratting of first recommended movie", 364),
    ("Avg ratting of second recommended movie", 480),
    ("Avg ratting of last (i.e. k = 10th) recommended movie", 587),
]
for label, checked_id in rating_checks:
    print(label, df_ur.loc[df_ur['movieId'] == checked_id, 'rating'].mean())

Babe (1995) average ratting 3.65234375
Avg ratting of first recommended movie 3.941860465116279
Avg ratting of second recommended movie 3.75
Avg ratting of last (i.e. k = 10th) recommended movie 3.4347826086956523
