## Preprocessing

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

### Now let's load each DataFrame

In [46]:
# Read the ratings log and the movie catalogue from CSV files in the working
# directory, then preview the first catalogue rows.
ratings_df = pd.read_csv('ratings.csv')
movies_df = pd.read_csv('movies.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### The data frame above is a bit messy: the release year is embedded in the title. It is cleaner to move the year of each movie into its own column.

#### Let's also remove the year from the title column

In [3]:
# Pull the four-digit release year out of the "(YYYY)" tag in the title.
# The capture group sits inside the escaped parentheses, so one extract yields
# the bare digits; raw strings keep the regex escapes from being read as
# (invalid) Python string escapes.
movies_df['year'] = movies_df['title'].str.extract(r'\((\d{4})\)', expand=False)

# Remove the "(YYYY)" tag from the title and trim the leftover whitespace.
# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave every title unchanged.
movies_df['title'] = (
    movies_df['title']
    .str.replace(r'\(\d{4}\)', '', regex=True)
    .str.strip()
)
movies_df.head()

  movies_df['title'] = movies_df.title.str.replace('(\(\d\d\d\d\))', '')


Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


### We will need the genres of each movie, so to convert them into usable values, it is better to convert them into a list

In [4]:
# Turn the pipe-delimited genre string of every movie into a Python list.
movies_df['genres'] = movies_df['genres'].str.split('|')
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


### Now, using the numbers 1 and 0, we can show whether the desired movie has the corresponding genre or not. To do this, we use the following code.

In [5]:
# Build a copy of the catalogue with one 1.0/0.0 indicator column per genre.
moviesWithGenres_df = movies_df.copy()

# dict.fromkeys de-duplicates while keeping first-appearance order, so the
# indicator columns are created in the same order as a row-by-row scan would.
# Each column is filled in a single pass instead of cell-by-cell via iterrows,
# and no frame-wide fillna(0) is needed -- that call also replaced missing
# release years with the number 0.
for genre in dict.fromkeys(g for genres in movies_df['genres'] for g in genres):
    moviesWithGenres_df[genre] = movies_df['genres'].map(
        lambda genres, genre=genre: float(genre in genres)
    )

moviesWithGenres_df.head()

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Now that we have applied the required changes to the movie dataframe, let's look at the ratings dataframe. 

In [6]:
# Preview the first rows of the ratings table.
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


### In this dataframe, the timestamp column is not useful for us, so it is better to remove it from the dataframe.

In [7]:
# Drop the unused timestamp column. The axis is named with `columns=` because
# the positional form drop('timestamp', 1) is deprecated and is rejected by
# pandas >= 2.0.
ratings_df = ratings_df.drop(columns='timestamp')
ratings_df.head()

  ratings_df = ratings_df.drop('timestamp', 1)


Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


# Content-Based recommendation system

### Now we can create a content-based recommender system first by creating a hypothetical profile for the user to learn his interests.

In [19]:
# Hypothetical user profile: one record per rated movie (title and rating).
userInput = [
    {'title': title, 'rating': rating}
    for title, rating in [
        ('Grumpier Old Men', 4.5),
        ('Waiting to Exhale', 4),
        ('Jumanji', 2),
        ('Father of the Bride Part II', 5),
        ('Akira', 4.5),
    ]
]
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,title,rating
0,Grumpier Old Men,4.5
1,Waiting to Exhale,4.0
2,Jumanji,2.0
3,Father of the Bride Part II,5.0
4,Akira,4.5


### Now, by adding the ID of each movie to the dataframe, we will make it more useful.

In [20]:
# Catalogue rows whose title matches one of the movies the user rated.
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]

# Join on the shared column(s) to attach the user's rating to each catalogue row.
inputMovies = pd.merge(inputId, inputMovies)

# Keep only movieId, title and rating. `columns=` replaces the positional axis
# argument drop('genres', 1), which pandas >= 2.0 no longer accepts.
inputMovies = inputMovies.drop(columns=['genres', 'year'])

inputMovies

  inputMovies = inputMovies.drop('genres', 1).drop('year', 1)
  inputMovies = inputMovies.drop('genres', 1).drop('year', 1)


Unnamed: 0,movieId,title,rating
0,2,Jumanji,2.0
1,3,Grumpier Old Men,4.5
2,4,Waiting to Exhale,4.0
3,5,Father of the Bride Part II,5.0
4,1274,Akira,4.5


## Now we learn the user's interests by matching the movies that the user has already viewed and rated with the previous dataset and also checking the genres of those movies.

In [21]:
# Genre-indicator rows for exactly the movies the user rated.
userMovies = moviesWithGenres_df.loc[
    moviesWithGenres_df['movieId'].isin(inputMovies['movieId'])
]
userMovies

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
973,1274,Akira,"[Action, Adventure, Animation, Sci-Fi]",1988,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Now we drop the non-genre columns (movieId, title, genres, year) so that only the genre indicator columns remain.

In [22]:
# Renumber the rows 0..n-1 so they line up with the row labels of inputMovies.
userMovies = userMovies.reset_index(drop=True)

# Keep only the genre indicator columns. One drop(columns=[...]) call replaces
# the chain of positional-axis drops, which pandas >= 2.0 rejects.
userGenreTable = userMovies.drop(columns=['movieId', 'title', 'genres', 'year'])
userGenreTable

  userGenreTable = userMovies.drop('movieId', 1).drop('title', 1).drop('genres', 1).drop('year', 1)
  userGenreTable = userMovies.drop('movieId', 1).drop('title', 1).drop('genres', 1).drop('year', 1)
  userGenreTable = userMovies.drop('movieId', 1).drop('title', 1).drop('genres', 1).drop('year', 1)
  userGenreTable = userMovies.drop('movieId', 1).drop('title', 1).drop('genres', 1).drop('year', 1)


Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Now we're ready to start learning the input's preferences!
#### To do this, we assign a value to each genre based on user interest

In [23]:
# Display the user's ratings; these are the weights applied to the genre table below.
inputMovies['rating']

0    2.0
1    4.5
2    4.0
3    5.0
4    4.5
Name: rating, dtype: float64

In [24]:
# Weight every genre by the ratings of the movies that carry it:
# (genres x movies) . (ratings per movie) -> one score per genre.
# The dot product aligns the two operands on their row labels.
userProfile = userGenreTable.T.dot(inputMovies['rating'])

userProfile

Adventure              6.5
Animation              4.5
Children               2.0
Comedy                13.5
Fantasy                2.0
Romance                8.5
Drama                  4.0
Action                 4.5
Crime                  0.0
Thriller               0.0
Horror                 0.0
Mystery                0.0
Sci-Fi                 4.5
War                    0.0
Musical                0.0
Documentary            0.0
IMAX                   0.0
Western                0.0
Film-Noir              0.0
(no genres listed)     0.0
dtype: float64

### Now, based on the estimated values, we can suggest movies to the user.
Let's start by extracting the genre table from the original dataframe:

In [25]:
# Index the full catalogue by movieId and keep only the genre indicator columns.
# drop(columns=[...]) replaces the chain of positional-axis drops, which
# pandas >= 2.0 rejects.
genreTable = (
    moviesWithGenres_df
    .set_index(moviesWithGenres_df['movieId'])
    .drop(columns=['movieId', 'title', 'genres', 'year'])
)
genreTable.head()

  genreTable = genreTable.drop('movieId', 1).drop('title', 1).drop('genres', 1).drop('year', 1)
  genreTable = genreTable.drop('movieId', 1).drop('title', 1).drop('genres', 1).drop('year', 1)
  genreTable = genreTable.drop('movieId', 1).drop('title', 1).drop('genres', 1).drop('year', 1)
  genreTable = genreTable.drop('movieId', 1).drop('title', 1).drop('genres', 1).drop('year', 1)


Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# (number of movies, number of genre columns)
genreTable.shape

(9742, 20)

## Now that we know the user's interest in each genre, and have the genres of every movie in the dataset, we can suggest the top 20 movies for the user.

In [27]:
# Score each movie as the profile-weighted sum of its genres, normalised by the
# total profile weight.
recommendationTable_df = (
    genreTable.mul(userProfile).sum(axis=1).div(userProfile.sum())
)
recommendationTable_df.head()

movieId
1    0.57
2    0.21
3    0.44
4    0.52
5    0.27
dtype: float64

In [28]:
# Order the movies from highest to lowest score.
recommendationTable_df = recommendationTable_df.sort_values(ascending=False)

recommendationTable_df.head()

movieId
69644     0.79
47404     0.79
108540    0.78
1907      0.78
4956      0.74
dtype: float64

### Now we display the suggested movies in the form of a table.

In [29]:
# Look up the catalogue rows for the 20 best-scoring movieIds. The boolean mask
# returns them in catalogue order, not in score order.
movies_df[movies_df['movieId'].isin(recommendationTable_df.head(20).index)]

Unnamed: 0,movieId,title,genres,year
1390,1907,Mulan,"[Adventure, Animation, Children, Comedy, Drama...",1998
3194,4306,Shrek,"[Adventure, Animation, Children, Comedy, Fanta...",2001
3608,4956,"Stunt Man, The","[Action, Adventure, Comedy, Drama, Romance, Th...",1980
5160,8360,Shrek 2,"[Adventure, Animation, Children, Comedy, Music...",2004
5476,26236,"White Sun of the Desert, The (Beloe solntse pu...","[Action, Adventure, Comedy, Drama, Romance, War]",1970
5572,26776,Porco Rosso (Crimson Pig) (Kurenai no buta),"[Adventure, Animation, Comedy, Fantasy, Romance]",1992
6047,40339,Chicken Little,"[Action, Adventure, Animation, Children, Comed...",2005
6094,42015,Casanova,"[Action, Adventure, Comedy, Drama, Romance]",2005
6267,47404,Mind Game,"[Adventure, Animation, Comedy, Fantasy, Romanc...",2004
6455,52287,Meet the Robinsons,"[Action, Adventure, Animation, Children, Comed...",2007


# Advantages of using a content-based recommender system:
### 1. It is personalized according to the user's interests
### 2. It learns the user's interests accurately.
# Disadvantages of using a content-based recommender system
### 1. It does not consider other people's opinions about the movie.
### 2. Extracting data is not always intuitive

# Collaborative Filtering

In [116]:
# Reload both tables from disk so this section starts from the raw data again.
ratings_df = pd.read_csv('ratings.csv')

movies_df = pd.read_csv('movies.csv')

In [59]:
# Preview the first rows of the movie catalogue.
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [117]:
# Pull the four-digit release year out of the "(YYYY)" tag in the title.
# The capture group sits inside the escaped parentheses, so one extract yields
# the bare digits; raw strings keep the regex escapes intact.
movies_df['year'] = movies_df['title'].str.extract(r'\((\d{4})\)', expand=False)

# Remove the "(YYYY)" tag from the title and trim the leftover whitespace.
# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave every title unchanged.
movies_df['title'] = (
    movies_df['title']
    .str.replace(r'\(\d{4}\)', '', regex=True)
    .str.strip()
)

  movies_df['title'] = movies_df.title.str.replace('(\(\d\d\d\d\))', '')


### In this part, we pay attention to the similarity of people's tastes to each other, so the genre of movies is no longer applicable.

In [63]:
# Genres are not used in this section. `columns=` replaces the positional axis
# argument drop('genres', 1), which pandas >= 2.0 rejects.
movies_df = movies_df.drop(columns='genres')
movies_df.head()

  movies_df = movies_df.drop('genres', 1)


Unnamed: 0,movieId,title,year
0,1,Toy Story,
1,2,Jumanji,
2,3,Grumpier Old Men,
3,4,Waiting to Exhale,
4,5,Father of the Bride Part II,


### The movies dataframe is acceptable, we will go to the ratings data frame.

In [64]:
# Preview the first rows of the ratings table.
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


### The timestamp column is not useful for us, so we delete it

In [65]:
# Drop the unused timestamp column; `columns=` replaces the positional axis
# argument drop('timestamp', 1), which pandas >= 2.0 rejects.
ratings_df = ratings_df.drop(columns='timestamp')

  ratings_df = ratings_df.drop('timestamp', 1)


In [66]:
# Preview the ratings table after the timestamp column has been removed.
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


## To start this process, we first create a profile for a hypothetical user. This profile contains the movies that the user has already watched and rated; we can then use it to find users with similar tastes.

In [67]:
# Hypothetical user profile: one record per rated movie (title and rating).
userInput = [
    {'title': title, 'rating': rating}
    for title, rating in [
        ('Grumpier Old Men', 4.5),
        ('Waiting to Exhale', 4),
        ('Jumanji', 2),
        ('Father of the Bride Part II', 5),
        ('Akira', 4.5),
    ]
]
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,title,rating
0,Grumpier Old Men,4.5
1,Waiting to Exhale,4.0
2,Jumanji,2.0
3,Father of the Bride Part II,5.0
4,Akira,4.5


### Add movieId to input user

In [69]:
# Catalogue rows whose title matches one of the movies the user rated.
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]

# Join on the shared column(s) to attach the user's rating to each catalogue row.
inputMovies = pd.merge(inputId, inputMovies)

# The release year is not needed here. `columns=` replaces the positional axis
# argument drop('year', 1), which pandas >= 2.0 rejects.
inputMovies = inputMovies.drop(columns='year')

inputMovies

  inputMovies = inputMovies.drop('year', 1)


Unnamed: 0,movieId,title,rating
0,2,Jumanji,2.0
1,3,Grumpier Old Men,4.5
2,4,Waiting to Exhale,4.0
3,5,Father of the Bride Part II,5.0
4,1274,Akira,4.5


### Now we can find the people who watched the same movies by filtering the ratings on the input movie IDs.

In [70]:
# Every rating, by any user, of a movie the input user has also rated.
userSubset = ratings_df.loc[
    ratings_df['movieId'].isin(inputMovies['movieId'])
]
userSubset.head()

Unnamed: 0,userId,movieId,rating
1,1,3,4.0
560,6,2,4.0
561,6,3,5.0
562,6,4,3.0
563,6,5,5.0


We now group up the rows by user ID.

In [71]:
# Group by the scalar key 'userId' rather than the one-element list ['userId'].
# With a list key, pandas >= 2.0 yields 1-tuples such as (6,) as group names
# when the groups are iterated, which would later end up as the userId values
# and break both get_group(6) and the merge on userId.
userSubsetGroup = userSubset.groupby('userId')

In [89]:
# Inspect one group as an example: the shared-movie ratings of user 6.
userSubsetGroup.get_group(6)

Unnamed: 0,userId,movieId,rating
560,6,2,4.0
561,6,3,5.0
562,6,4,3.0
563,6,5,5.0


### Now we sort the users by how many of the input movies they have rated, so that users with the most movies in common come first.

In [90]:
# Turn the groupby into a list of (userId, ratings) pairs, largest overlap first.
userSubsetGroup = sorted(
    userSubsetGroup,
    key=lambda pair: pair[1].shape[0],
    reverse=True,
)

In [91]:
# Show the three users with the most movies in common with the input user.
userSubsetGroup[0:3]

[(6,
       userId  movieId  rating
  560       6        2     4.0
  561       6        3     5.0
  562       6        4     3.0
  563       6        5     5.0),
 (414,
         userId  movieId  rating
  62295     414        2     3.0
  62296     414        3     4.0
  62297     414        5     2.0
  62769     414     1274     4.0),
 (600,
         userId  movieId  rating
  95102     600        2     4.0
  95103     600        4     1.5
  95104     600        5     2.5
  95306     600     1274     3.5)]

## Calculate the degree of similarity

In [95]:
# Only the first 100 users (largest overlap) are compared against the input user.
userSubsetGroup = userSubsetGroup[:100]

In [96]:
from math import sqrt


def _pearson(xs, ys):
    """Return the Pearson correlation of two paired rating lists.

    The number of observations is taken from ``ys``. Returns 0.0 when either
    list has zero variance, because the coefficient is undefined there; this
    includes every pair with a single rating.
    """
    n = float(len(ys))
    sxx = sum(x ** 2 for x in xs) - pow(sum(xs), 2) / n
    syy = sum(y ** 2 for y in ys) - pow(sum(ys), 2) / n
    sxy = sum(x * y for x, y in zip(xs, ys)) - sum(xs) * sum(ys) / n
    if sxx != 0 and syy != 0:
        return sxy / sqrt(sxx * syy)
    return 0.0


# userId -> similarity between that user and the input user.
pearsonCorrelationDict = {}

# Sorting the input profile does not depend on the candidate user, so it is
# done once here instead of on every loop iteration.
inputMovies = inputMovies.sort_values(by='movieId')

for name, group in userSubsetGroup:
    # Both sides are ordered by movieId so the ratings pair up position by position.
    group = group.sort_values(by='movieId')

    # Input-user ratings restricted to the movies this candidate has rated.
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]

    tempRatingList = temp_df['rating'].tolist()
    tempGroupList = group['rating'].tolist()

    # NOTE(review): with only two movies in common the coefficient can only be
    # -1, 0 or +1, so high similarity scores may rest on very little evidence.
    pearsonCorrelationDict[name] = _pearson(tempRatingList, tempGroupList)

In [99]:
# Display the userId -> similarity mapping built by the loop above.
pearsonCorrelationDict

{6: 0.43023720500897183,
 414: 0.0,
 600: -0.5287333104889307,
 68: -0.9878291611472653,
 91: 0.49999999999999967,
 117: 0,
 169: 0.9878291611472599,
 288: 0.3592106040535495,
 321: -0.987829161147262,
 448: 0,
 470: 0,
 474: -0.9843241382880905,
 477: 0.0,
 480: 0.18898223650461252,
 483: -0.8485552916276654,
 501: 0.3592106040535507,
 590: -0.15554275420956382,
 599: 0.24019223070763077,
 608: 0.5000000000000004,
 19: 0,
 43: 0,
 51: -1.0,
 58: 1.0,
 84: 0,
 103: 0,
 107: -1.0,
 149: 0,
 150: 0,
 177: -1.0,
 217: -1.0,
 219: 0,
 226: 1.0,
 240: -1.0,
 269: -1.0,
 270: 0,
 274: 1.0,
 276: -1.0,
 294: -1.0,
 298: 1.0,
 307: 1.0,
 308: -1.0,
 318: 1.0,
 330: 1.0,
 337: 0,
 353: -1.0,
 411: -1.0,
 434: 1.0,
 456: 0,
 458: -1.0,
 489: -1.0,
 492: -1.0,
 561: -1.0,
 594: 0,
 604: -1.0,
 1: 0,
 8: 0,
 14: 0,
 18: 0,
 20: 0,
 21: 0,
 23: 0,
 27: 0,
 31: 0,
 32: 0,
 42: 0,
 44: 0,
 45: 0,
 50: 0,
 57: 0,
 62: 0,
 64: 0,
 66: 0,
 82: 0,
 93: 0,
 94: 0,
 95: 0,
 100: 0,
 102: 0,
 104: 0,
 105: 

In [100]:
# One row per user: the similarity score plus the userId as a regular column,
# with a fresh 0..n-1 row index.
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF = pearsonDF.assign(userId=pearsonDF.index).reset_index(drop=True)
pearsonDF.head()

Unnamed: 0,similarityIndex,userId
0,0.430237,6
1,0.0,414
2,-0.528733,600
3,-0.987829,68
4,0.5,91


## Top 50 users by similarity

In [102]:
# Keep the 50 users with the highest similarity to the input user.
topUsers = pearsonDF.sort_values(by='similarityIndex', ascending=False).head(50)
topUsers.head()

Unnamed: 0,similarityIndex,userId
46,1.0,434
31,1.0,226
35,1.0,274
38,1.0,298
39,1.0,307


### Weighing the ratings of selected users

In [103]:
# Attach every rating made by each of the selected users.
topUsersRating = pd.merge(topUsers, ratings_df, on='userId', how='inner')
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating
0,1.0,434,1,4.0
1,1.0,434,2,2.5
2,1.0,434,6,4.0
3,1.0,434,10,3.5
4,1.0,434,32,3.5


## Estimating how much our user will like each movie, by combining the other users' ratings with their similarity to our user

In [104]:
# Scale each rating by how similar its author is to the input user.
topUsersRating['weightedRating'] = topUsersRating['similarityIndex'].mul(
    topUsersRating['rating']
)
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,1.0,434,1,4.0,4.0
1,1.0,434,2,2.5,2.5
2,1.0,434,6,4.0,4.0
3,1.0,434,10,3.5,3.5
4,1.0,434,32,3.5,3.5


In [105]:
# Per movie: total similarity of the users who rated it, and total weighted rating.
tempTopUsersRating = (
    topUsersRating
    .groupby('movieId')[['similarityIndex', 'weightedRating']]
    .sum()
    .rename(columns={
        'similarityIndex': 'sum_similarityIndex',
        'weightedRating': 'sum_weightedRating',
    })
)
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,8.776214,32.099202
2,10.565662,28.135746
3,7.565662,26.655971
4,0.430237,1.290712
5,3.136488,12.886385


In [106]:
# Similarity-weighted average rating per movie, indexed by movieId, with the
# movieId repeated as an ordinary column.
recommendation_df = (
    tempTopUsersRating['sum_weightedRating'] / tempTopUsersRating['sum_similarityIndex']
).to_frame('weighted average recommendation score')
recommendation_df['movieId'] = tempTopUsersRating.index
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.657523,1
2,2.662942,2
3,3.523283,3
4,3.0,4
5,4.10854,5


### In this section, we will arrange the top 20 movies.

In [112]:
# Rank the movies from highest to lowest weighted-average score and show the top 20.
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
recommendation_df.head(20)

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
78836,5.0,78836
117531,5.0,117531
3837,5.0,3837
7381,5.0,7381
914,5.0,914
6993,5.0,6993
4256,5.0,4256
7318,5.0,7318
6345,5.0,6345
72171,5.0,72171


In [118]:
# Look up the titles of the 10 best-ranked movieIds. The boolean mask returns
# them in catalogue order, not in ranking order.
movies_df[movies_df['movieId'].isin(recommendation_df['movieId'].head(10))]

Unnamed: 0,movieId,title,genres,year
696,914,My Fair Lady,Comedy|Drama|Musical|Romance,1964
2869,3837,Phantasm II,Action|Fantasy|Horror|Sci-Fi|Thriller,1988
3164,4256,"Center of the World, The",Drama,2001
4344,6345,"Chorus Line, A",Comedy|Drama|Musical,1985
4683,6993,Hannah and Her Sisters,Comedy|Drama|Romance,1986
4885,7318,"Passion of the Christ, The",Drama,2004
4924,7381,"Whole Ten Yards, The",Action|Comedy|Crime,2004
7177,72171,Black Dynamite,Action|Comedy,2009
7364,78836,Enter the Void,Drama,2009
8591,117531,Watermark,Documentary,2014
