# MOVIE RECOMMENDER

## My goal is to build a function that takes the name of any movie as input and outputs a list of recommended movies similar to it.

### First, I import the necessary libraries for this project

In [2]:
import numpy as np
import pandas as pd

## Import the movie.csv dataset, which contains the title and genres of each movie, keyed by a unique movieId

In [3]:
# Movie catalogue: one row per movieId with its title and genres
mv = pd.read_csv("movie.csv")

In [4]:
mv['movieId'].nunique()

27278

## There are 27278 unique movies in this dataset

In [5]:
mv.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


## Import the rating.csv file, which contains the columns: userId, movieId, rating (given by that user) and timestamp

In [6]:
# Raw ratings: one row per (userId, movieId) rating event
ratings = pd.read_csv("rating.csv")

In [7]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [9]:
ratings.shape

(20000263, 4)

## There are in total 26744 movieIds and 138493 unique userids 

In [10]:
ratings['movieId'].nunique()

26744

In [11]:
ratings['userId'].nunique()

138493

## For the model to perform even better, I will take into account only those users who are frequent movie watchers and have rated more than 220 movies

In [12]:
# Boolean Series indexed by userId: True when the user has more than 220 rating rows
x = ratings['userId'].value_counts().gt(220)

In [13]:
# Keep only the userIds whose flag is True
critics = x.loc[x].index.tolist()

### Below is the number of users who have rated more than 220 movies.
### `critics` contains the userIds of all these users

In [14]:
len(critics)

23666

## With the operation below I modify the ratings dataset, keeping only those rows whose userId is in the list `critics`

In [15]:
# Restrict the ratings to rows written by the frequent raters
ratings = ratings.loc[ratings['userId'].isin(critics)]

In [16]:
ratings.shape

(11810519, 4)

In [17]:
ratings['movieId'].nunique()

26514

## Further, I would like to recommend only good movies to the users, hence in the ratings dataset I keep only those instances where a user has rated the movie greater than or equal to 3.6

In [18]:
# Keep only the rows whose rating is at least 3.6, then show the remaining size
ratings = ratings.loc[ratings['rating'].ge(3.6)]
ratings.shape

(5373155, 4)

## I create a dataset , 'new' , by merging the datasets ratings and mv(contains movie names). By this way each userId is mapped to the corresponding movie name which he/she has rated. 

In [19]:
# Inner join on movieId attaches title and genres to every rating row
new = ratings.merge(right=mv, how='inner', on='movieId')

In [20]:
new.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,7,11,4.0,2002-01-16 19:04:49,"American President, The (1995)",Comedy|Drama|Romance
1,54,11,5.0,2000-11-21 21:02:08,"American President, The (1995)",Comedy|Drama|Romance
2,58,11,4.5,2006-04-03 10:04:34,"American President, The (1995)",Comedy|Drama|Romance
3,91,11,4.0,2005-07-18 08:01:15,"American President, The (1995)",Comedy|Drama|Romance
4,156,11,5.0,2002-12-06 19:49:14,"American President, The (1995)",Comedy|Drama|Romance


In [21]:
new.shape

(5373155, 6)

## In those instances where any user has rated any movie more than once, I will keep only one and drop the remaining.

In [22]:
# One row per (userId, movieId) pair; the first occurrence is the one kept
new = new.drop_duplicates(subset=['userId', 'movieId'], keep='first')

In [23]:
new.shape

(5373155, 6)

In [24]:
new['movieId'].nunique()

20147

## I would like to recommend only the well-known movies out of the 20147 movies present in our dataframe. For this, I keep only those movies which have been rated more than 220 times.

In [25]:
# Boolean Series indexed by movieId: True when the movie has more than 220 rows in `new`
x = new['movieId'].value_counts().gt(220)


## Below I see that there are 3369 such movies, out of 20147 in total, that are rated more than 220 times. I store their movieIds in the list `frequent`.

In [26]:
x[x].shape

(3369,)

In [27]:
new['movieId'].nunique()

20147

In [28]:
new['userId'].nunique()

23665

In [29]:
# movieIds whose flag is True
frequent = x.loc[x].index.tolist()

In [30]:
# Drop every row whose movie is not in the frequently rated set
new = new.loc[new['movieId'].isin(frequent)]

In [31]:
new.shape

(4887204, 6)

## Here I make a pivot table with index as the movie title, columns as the userids, and values are the ratings given to any movie by a particular user 

In [32]:
# Rows are movie titles, columns are userIds, cells hold the rating.
# 'mean' is pivot_table's default aggregation, written out explicitly.
pivot = new.pivot_table(
    values='rating',
    index='title',
    columns='userId',
    aggfunc='mean',
)

In [33]:
pivot.head()

userId,7,11,14,24,31,54,58,69,82,91,...,138456,138457,138459,138464,138467,138472,138474,138477,138483,138493
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The (1989)",,,,,,,,,,,...,,,,,,,4.0,,,
(500) Days of Summer (2009),,,,,,,,,,,...,,,,,,,,,,
*batteries not included (1987),,5.0,,,,,,,,,...,,,,,,,,,,
...And Justice for All (1979),,,,,,4.0,,,,,...,,,,,,,,,,
10 Things I Hate About You (1999),,5.0,,,,,,,,,...,4.5,,,5.0,,,4.0,,,


## This pivot table has total 3369 rows(number of movies) and 23665 columns(number of users). 

In [34]:
pivot.shape

(3369, 23665)

## The pivot table contains many NaN (Not a Number) values, one for every movie a user has no rating for in `new`. For simplicity, and to avoid any ambiguity in the model, I replace all the NaN values with 0.

In [35]:
# Overwrite every missing rating with 0 directly in `pivot`
pivot.fillna(value=0, inplace=True)

In [36]:
from scipy.sparse import csr_matrix

## I convert the data in the pivot table into a sparse matrix so that it can be used to train the model.

In [37]:
# Compressed-sparse-row copy of the pivot table's values (same shape, rows = movies)
mat = csr_matrix(pivot.values)

In [38]:
mat.shape

(3369, 23665)

## The data is now ready to be fed into the model.

## Since this is unsupervised learning (there are no labels to predict), I use the class NearestNeighbors from sklearn.neighbors. It does not cluster the data; it stores the movie vectors so that the movies closest to any given movie can be looked up.

In [39]:
from sklearn.neighbors import NearestNeighbors

## I will use the brute-force algorithm: it computes the distance from the query movie to every other movie in the matrix, so the neighbours it returns are exact, and it accepts sparse input such as `mat`.

In [40]:
# Exhaustive neighbour search; every other setting is left at its default
nn = NearestNeighbors(algorithm="brute")

## Feeding the model with data and getting it trained over this data 

In [41]:
# Each row of `mat` (one movie) becomes one sample in the neighbour index
nn.fit(X=mat)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

## Since the model,'nn' is ready,  I make a function, recomend.


## Working of the function:
## The function takes a string input (the movie name from the user). It is compared with every movie title in the index of the pivot table using a case-insensitive substring match. At the first match, the movie title is printed as it appears in the pivot table, and its index position in the pivot table is found. The whole row at that index position is then passed into the model. The index positions of the nine nearest neighbours of that movie (including the movie itself) are stored in a 2D array, `s`. The titles at the positions in `s`, leaving out the queried movie itself, are printed from the index of the pivot table as the recommended movies.

## If the movie given by the user is not found among the movies in the pivot table, the system simply prints out:-'Movie not present in the system'. 

In [42]:
def recomend(movie, n_recommendations=8):
    """Print movies similar to the first title that contains ``movie``.

    The query is matched case-insensitively as a substring of the titles in
    ``pivot.index``; the first matching title (in index order) is used. Its
    row of ``pivot`` is passed to the fitted ``nn`` model and the titles of
    the nearest rows are printed.

    Parameters
    ----------
    movie : str
        Full or partial movie title. Surrounding whitespace is ignored.
    n_recommendations : int, default 8
        Maximum number of similar titles to print.

    Returns
    -------
    None
        Everything is printed. If the query is empty, is not a string, or
        matches no title, 'Movie not present in the system !' is printed.
    """
    # Text typed at a prompt often carries stray spaces; non-strings cannot match.
    query = movie.strip().lower() if isinstance(movie, str) else ''
    limit = max(int(n_recommendations), 0)
    if query:
        # enumerate gives the row position directly, so the index does not
        # have to be copied into a list and searched again.
        for ind, title in enumerate(pivot.index):
            if query not in title.lower():
                continue
            print('Movie:', title, '\n')
            # One extra neighbour is requested because the queried movie is
            # part of the result; the request is capped at the table size.
            k = min(limit + 1, len(pivot.index))
            # reshape(1, -1) infers the number of user columns instead of
            # hard-coding it.
            row = pivot.iloc[ind, :].values.reshape(1, -1)
            _, s = nn.kneighbors(row, n_neighbors=k)
            print("Movies You may like :\n")
            # Exclude the queried movie by position rather than assuming it
            # is always ranked first.
            similar = [i for i in s[0] if i != ind][:limit]
            for i in similar:
                print(pivot.index[i])
            return
    print('Movie not present in the system !')
            

## The model is ready to be used. By running the cell below, the user can input any movie of his/her choice and get recommendations of 8 similar movies.
## Here is a test case.

In [43]:
# Read a (partial) title from the user and print its recommendations
mov = input('Enter a movie name: ')
recomend(movie=mov)

Enter a movie name: iron man 2
Movie: Iron Man 2 (2010) 

Movies You may like :

Captain America: The First Avenger (2011)
Thor (2011)
Iron Man 3 (2013)
Thor: The Dark World (2013)
A-Team, The (2010)
Transformers: Revenge of the Fallen (2009)
Man of Steel (2013)
Amazing Spider-Man, The (2012)
