# Read the preprocessed Data

In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
df_prelim=pd.read_csv("../../Data/ml-latest-small/PreprocessedData_ml_latest_year_small.csv")
df_prelim.head()

Unnamed: 0,userId,movieId,rating,title,genres,year
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,5,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
2,7,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
3,15,1,2.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
4,17,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995


# Preprossing for CF

In [3]:
# Delete unnecessary columns
df=df_prelim.drop(['title','year','genres'],axis=1).drop_duplicates()

In [4]:
df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,5,1,4.0
2,7,1,4.5
3,15,1,2.5
4,17,1,4.5


In [8]:
df.shape

(100833, 3)

In [9]:
len(df.movieId.unique())

9719

In [10]:
len(df.userId.unique())

610

## User Input

In [5]:
def AskForUserInput_0():
    fav_movie=input("Enter your Favorite Movie: ")
    n=0
    
    movies=df["title"].replace("(\(.*?\))", "").str.strip().str.lower().unique() 
    #upper case dependency removed
    #year removed
    
    while fav_movie not in movies and n<5:
        print("The Movie ", fav_movie," does not exist in our database.")
        fav_movie = input("Please enter another favourite Movie: ").lower()
        n+=1
    
    return fav_movie

# Collaborative Filtering (CF)

The Collaborative Filtering Recommender is entirely based on the past behavior and not on the context.

## Simplest Model: Memory Based Approach, Item-Item Collaborative Filtering, N Neighbours

**This is NOT a Machine Learning Model**

> Memory Based Collaborative Filtering:
>
> Take a matrix of preferences for items by users using this matrix 
> to predict missing preferences and recommend items
> There are two types of memory based models:
>> 1) Item-Item Collaborative Filtering: “Users who liked this item also liked …”
>>
>> (i.e. starting from a given movie (or set of movies) we find similar movies based on other users’ preferences.)
>>
>> 2) User-Item Collaborative Filtering: “Users who are similar to you also liked …”
>>
>> (i.e. find users that have seen/rated similar content, and use their preferences to recommend new items)
>
> Memory Based Filtering Pros:
>> 1) Easy to implement
>>
>> 2) produce reasonable prediction quality
>
>Cons:
>> 1) It tends to recommend popular items.
>> 2) It suffers when new items that don’t have any ratings enter the system.

### 1. Create a Pivot Matrix 

We pivot the dataframe to have userId as rows and movieId as columns. 

In [7]:
df_movie_pivot = df.pivot(
    index='movieId',
    columns='userId',
    values='rating'
).fillna(0)
df_movie_pivot

ValueError: Index contains duplicate entries, cannot reshape

### 2.  Sparse matrix

A lot of values n the above matrix are zero, i.e. the data is extremely sparse.

> A sparse matrix is a matrix that is comprised of mostly zero values.

> Convert into an Array Matrix by using csr_matrix


In [None]:
from scipy.sparse import csr_matrix
feature_matrix=csr_matrix(df_movie_pivot.values)

### 3. Build the Model

In [None]:
from sklearn.neighbors import NearestNeighbors
#We only consider 10 nearest neighbours
n_neigh=10
model_knn_rating=NearestNeighbors(metric="cosine",n_neighbors=n_neigh)
model_knn_rating.fit(feature_matrix)

### 4. Create The Function to recommend movies

In [None]:
def recommend_movie_by_rating(query_title):
    
    #Find the movie ID w.r.t the the title
    query_index=df[df['title']==query_title]['movieId'].unique()[0]
    
    distances,indices=model_knn_rating.kneighbors(
        df_movie_pivot.iloc[query_index,:].values.reshape(1,-1)
    )
    
    MovieIds=[]
    CosDistance=[]
    ratedBy=[]
    
    for i in range(0,len(distances.flatten())):
        movieId=df_movie_pivot.index[indices.flatten()[i]]
        MovieIds.append(movieId)
    
        CosDistance.append(distances.flatten()[i])
        ratedBy.append(len(df[df['movieId']==movieId]['rating']))
        
    df_out=df[df.movieId.isin(MovieIds)].drop(["userId"],axis=1)
    
    df_out=df_out.groupby(['movieId','title',"year","genres"]).mean()
    df_out['average rating']=df_out['rating'].round(2)
    df_out=df_out.drop(['rating'],axis=1)
    df_out['number of ratings']=ratedBy
    df_out['Cosine Distance, ML Model']=CosDistance
    df_out['Cosine Distance, ML Model']=df_out['Cosine Distance, ML Model'].round(2)
    print("Because You like the movie",query_title,"we'd recommend you to watch:")
    display(df_out.head(10))

    #Results are automatically sorted as in the pivot table the indices are the movie ids

### 5. Test 

In [None]:
input_movie=AskForUserInput_0()
recommend_movie_by_rating(input_movie)

### 6. Save Your Model

In [None]:
import pickle
filename = 'Model_1_NearestNeighbours_ratingbased.sav'
pickle.dump(model_knn_rating, open(filename, 'wb'))