# Content-Based Model:

Importing needed packages:

In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Reading the files:

In [38]:
movies = pd.read_csv('movies.csv')
movies.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


<h2> Cleaning the Data </h2>

At first we must cleaning the data from the title of films (the year of the film) using **RegEx**:

In [39]:
movies['title'] = movies.title.str.replace('(\(\d\d\d\d\))', '')
movies['title'] = movies['title'].apply(lambda x: x.strip())
movies['genres'] = movies.genres.str.split('|')
movies.head()

  movies['title'] = movies.title.str.replace('(\(\d\d\d\d\))', '')


Unnamed: 0,movieId,title,genres
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji,"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men,"[Comedy, Romance]"
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II,[Comedy]


In [40]:
ratings = pd.read_csv('ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [41]:
# Deleting the "timestamp" column:
ratings = ratings.drop('timestamp', 1)
ratings.head()

  ratings = ratings.drop('timestamp', 1)


Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


<h3> Normalizing the Dataframe Matrix </h3>

I have to using Encoding Technique to convert the list of **Geners** into a Matrix:

In [42]:
movies_genres = movies.copy()

# Setting "1.0" for genres of the film
for i, row in movies.iterrows():
    for genre in row['genres']:
        movies_genres.at[i, genre] = 1

# 0 for "nan" values
movies_genres = movies_genres.fillna(0)
movies_genres.head()

Unnamed: 0,movieId,title,genres,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",0.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


These are some movies that I really had watched them before:

In [43]:
myFilms = [
        {'title':'Murder in the First', 'rating':3.5},
        {'title':'Once Were Warriors', 'rating':4},
        {'title':'Don Juan DeMarco', 'rating':4},
        {'title':'Clear and Present Danger', 'rating':2},
        {'title':'Forrest Gump', 'rating':5},
        {'title':'Public Enemies', 'rating':3},
        {'title':'Harry Potter and the Half-Blood Prince', 'rating':4},
        {'title':'Sherlock Holmes', 'rating':4.5},
        {'title':'Ghost Rider: Spirit of Vengeance', 'rating':3},
        {'title':'Resident Evil: Retribution', 'rating':3.5},
        {'title':'Lincoln', 'rating':4},
        {'title':'Hobbit: An Unexpected Journey, The', 'rating':4},
        {'title':'Jack Reacher', 'rating':2},
        {'title':'Batman: The Dark Knight Returns, Part 2', 'rating':4.5},
        {'title':'Creed', 'rating':3.5},
        {'title':"Daddy's Home 2", 'rating':2.5},
        {'title':'The Death of Stalin', 'rating':3},
        {'title':'Mulholland Dr', 'rating':4.5},
    ]

# convert it to a readable dataframe
inputMovies = pd.DataFrame(myFilms)
inputMovies

Unnamed: 0,title,rating
0,Murder in the First,3.5
1,Once Were Warriors,4.0
2,Don Juan DeMarco,4.0
3,Clear and Present Danger,2.0
4,Forrest Gump,5.0
5,Public Enemies,3.0
6,Harry Potter and the Half-Blood Prince,4.0
7,Sherlock Holmes,4.5
8,Ghost Rider: Spirit of Vengeance,3.0
9,Resident Evil: Retribution,3.5


Now I have to extract the **movie id's** of the InputMovies from "movies" dataframe. Acctually we are extracting the movies from "movies datafram" which are existing in "inputMovies datafram":

In [44]:
# "tolist()" method will convert the output into a list object
Id = movies[movies['title'].isin(inputMovies['title'].tolist())]
inputMovies = pd.merge(Id, inputMovies)
# We won't use data from "genre" column, so I will drop this column.
inputMovies = inputMovies.drop('genres', 1)
inputMovies

  inputMovies = inputMovies.drop('genres', 1)


Unnamed: 0,movieId,title,rating
0,224,Don Juan DeMarco,4.0
1,280,Murder in the First,3.5
2,290,Once Were Warriors,4.0
3,349,Clear and Present Danger,2.0
4,356,Forrest Gump,5.0
5,69640,Public Enemies,3.0
6,69844,Harry Potter and the Half-Blood Prince,4.0
7,73017,Sherlock Holmes,4.5
8,92938,Ghost Rider: Spirit of Vengeance,3.0
9,96691,Resident Evil: Retribution,3.5


In [45]:
user_movies = movies_genres[movies_genres['movieId'].isin(inputMovies['movieId'].tolist())]
user_movies

Unnamed: 0,movieId,title,genres,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
191,224,Don Juan DeMarco,"[Comedy, Drama, Romance]",0.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
242,280,Murder in the First,"[Drama, Thriller]",0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
251,290,Once Were Warriors,"[Crime, Drama]",0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
307,349,Clear and Present Danger,"[Action, Crime, Drama, Thriller]",0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
314,356,Forrest Gump,"[Comedy, Drama, Romance, War]",0.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7068,69640,Public Enemies,"[Crime, Drama, Thriller]",0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7078,69844,Harry Potter and the Half-Blood Prince,"[Adventure, Fantasy, Mystery, Romance, IMAX]",1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7214,73017,Sherlock Holmes,"[Action, Crime, Mystery, Thriller]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7824,92938,Ghost Rider: Spirit of Vengeance,"[Action, Fantasy, Thriller]",0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7982,96691,Resident Evil: Retribution,"[Action, Horror, Sci-Fi, IMAX]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [46]:
# Giving new index number to the rows for normalizing the index numbers
user_movies = user_movies.reset_index(drop=True)
genre_table = user_movies.drop('movieId', 1).drop('title', 1).drop('genres', 1)
genre_table

  genre_table = user_movies.drop('movieId', 1).drop('title', 1).drop('genres', 1)
  genre_table = user_movies.drop('movieId', 1).drop('title', 1).drop('genres', 1)
  genre_table = user_movies.drop('movieId', 1).drop('title', 1).drop('genres', 1)


Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [47]:
print(inputMovies['rating'])

0     4.0
1     3.5
2     4.0
3     2.0
4     5.0
5     3.0
6     4.0
7     4.5
8     3.0
9     3.5
10    4.0
11    4.0
12    2.0
13    4.5
14    3.5
15    3.0
16    2.5
Name: rating, dtype: float64


For calculating the preferences, I should multiply my ratings (as a vector) to Genres Matrix: (using **dot()** method)

In [48]:
userProfile = genre_table.transpose().dot(inputMovies['rating'])
userProfile

Adventure              8.0
Animation              4.5
Children               0.0
Comedy                14.5
Fantasy               11.0
Romance               13.0
Drama                 31.5
Action                19.5
Crime                 15.5
Thriller              18.0
Horror                 3.5
Mystery                8.5
Sci-Fi                 3.5
War                    9.0
Musical                0.0
Documentary            0.0
IMAX                  11.5
Western                0.0
Film-Noir              0.0
(no genres listed)     0.0
dtype: float64

I have the points of each genre (User Profile). It is the time to make a recommendation:

In [49]:
genreTable = movies_genres.set_index(movies_genres['movieId'])
genreTable = genreTable.drop('movieId', 1).drop('title', 1).drop('genres', 1)
genreTable.head()

  genreTable = genreTable.drop('movieId', 1).drop('title', 1).drop('genres', 1)
  genreTable = genreTable.drop('movieId', 1).drop('title', 1).drop('genres', 1)
  genreTable = genreTable.drop('movieId', 1).drop('title', 1).drop('genres', 1)


Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
genreTable.shape

(9742, 20)

In [51]:
# This is the algorithm of calculating the recommendation point
recommendation_table = ((genreTable*userProfile).sum(axis=1))/(userProfile.sum())
print(recommendation_table.head(10))

movieId
1     0.221574
2     0.110787
3     0.160350
4     0.344023
5     0.084548
6     0.309038
7     0.160350
8     0.046647
9     0.113703
10    0.265306
dtype: float64


In [52]:
recommendation_table = recommendation_table.sort_values(ascending=False)
print(recommendation_table.head())

movieId
81132    0.693878
4719     0.679300
79132    0.629738
459      0.615160
4956     0.609329
dtype: float64


In [53]:
# Represent it in movies dataframe
movies.loc[movies['movieId'].isin(recommendation_table.head(50).keys())]

Unnamed: 0,movieId,title,genres
19,20,Money Train,"[Action, Comedy, Crime, Drama, Thriller]"
118,145,Bad Boys,"[Action, Comedy, Crime, Drama, Thriller]"
167,198,Strange Days,"[Action, Crime, Drama, Mystery, Sci-Fi, Thriller]"
400,459,"Getaway, The","[Action, Adventure, Crime, Drama, Romance, Thr..."
454,519,RoboCop 3,"[Action, Crime, Drama, Sci-Fi, Thriller]"
963,1264,Diva,"[Action, Drama, Mystery, Romance, Thriller]"
1103,1432,Metro,"[Action, Comedy, Crime, Drama, Thriller]"
1330,1799,Suicide Kings,"[Comedy, Crime, Drama, Mystery, Thriller]"
1394,1912,Out of Sight,"[Comedy, Crime, Drama, Romance, Thriller]"
1526,2058,"Negotiator, The","[Action, Crime, Drama, Mystery, Thriller]"


<h1> Collaborative Filtering Model </h1>

In **Collaborative System**, we are looking for the **similarity** beetween users, So the genres are not important for our Model and I will omit them from my dataset: 

<h4> reading the data: </h4>

fortunately all of the datasets have been cleaned!

In [54]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji,"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men,"[Comedy, Romance]"
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II,[Comedy]


In [55]:
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


## User-Based:

For User-Based in Collaborative Recommendation System, we won't need **genre column**. Since we are going to consider the similarity between users, we won't use the features of the movie (like genre).

In [56]:
#Dropping genres column
movies = movies.drop('genres', 1)
movies.head()

  movies = movies.drop('genres', 1)


Unnamed: 0,movieId,title
0,1,Toy Story
1,2,Jumanji
2,3,Grumpier Old Men
3,4,Waiting to Exhale
4,5,Father of the Bride Part II


In **Collaborative Filtering "User-Based"**, we are going to analyse what other user's have recommended to making a similarity algorithm between users. We try to find similar prefrences as the user's and recommend new items to the user.

One of the methods that can be used in this situation is **Pearson Correlation Function**.

These are the stages that ends to a User Based Recommendation System:

* 1. Making a user and adding the movie's name that he/she has watched
* 2. Find other users in the dataset that have watched some of the movies that the input user watched
* 3. Calculate the "Similarity Score" between input user and other users Based on user's ratings
* 4. Recommend the movies which have the highest Similarity Score to the user.

<img src="https://user-images.githubusercontent.com/46146748/63115930-5f6c1900-bf66-11e9-894f-ecde5ec531b0.png" width=600px>

In [57]:
userFilms = [
        {'title':'Murder in the First', 'rating':3.5},
        {'title':'Once Were Warriors', 'rating':4},
        {'title':'Don Juan DeMarco', 'rating':4},
        {'title':'Clear and Present Danger', 'rating':2},
        {'title':'Forrest Gump', 'rating':5},
        {'title':'Public Enemies', 'rating':3},
        {'title':'Harry Potter and the Half-Blood Prince', 'rating':4},
        {'title':'Sherlock Holmes', 'rating':4.5},
        {'title':'Ghost Rider: Spirit of Vengeance', 'rating':3},
        {'title':'Resident Evil: Retribution', 'rating':3.5},
        {'title':'Lincoln', 'rating':4},
        {'title':'Hobbit: An Unexpected Journey, The', 'rating':4},
        {'title':'Jack Reacher', 'rating':2},
        {'title':'Batman: The Dark Knight Returns, Part 2', 'rating':4.5},
        {'title':'Creed', 'rating':3.5},
        {'title':"Daddy's Home 2", 'rating':2.5},
        {'title':'The Death of Stalin', 'rating':3},
        {'title':'Mulholland Dr', 'rating':4.5},
    ]

# convert it to a readable dataframe
inputMovies = pd.DataFrame(userFilms)
inputMovies

Unnamed: 0,title,rating
0,Murder in the First,3.5
1,Once Were Warriors,4.0
2,Don Juan DeMarco,4.0
3,Clear and Present Danger,2.0
4,Forrest Gump,5.0
5,Public Enemies,3.0
6,Harry Potter and the Half-Blood Prince,4.0
7,Sherlock Holmes,4.5
8,Ghost Rider: Spirit of Vengeance,3.0
9,Resident Evil: Retribution,3.5


Now we have to extracting the movie id's of the "InputMovies" from "movies" dataset:

In [58]:
# Filtering the movies name and merge it with dataframe
Id = movies[movies['title'].isin(inputMovies['title'].tolist())]
inputMovies = pd.merge(Id, inputMovies)
inputMovies

Unnamed: 0,movieId,title,rating
0,224,Don Juan DeMarco,4.0
1,280,Murder in the First,3.5
2,290,Once Were Warriors,4.0
3,349,Clear and Present Danger,2.0
4,356,Forrest Gump,5.0
5,69640,Public Enemies,3.0
6,69844,Harry Potter and the Half-Blood Prince,4.0
7,73017,Sherlock Holmes,4.5
8,92938,Ghost Rider: Spirit of Vengeance,3.0
9,96691,Resident Evil: Retribution,3.5


Let's make a subset of users that have wathced these films:

In [60]:
userSub = ratings[ratings['movieId'].isin(inputMovies['movieId'].tolist())]
userSub.head(10)

Unnamed: 0,userId,movieId,rating
19,1,349,4.0
20,1,356,4.0
532,5,290,5.0
538,5,349,3.0
654,6,224,3.0
718,6,349,5.0
725,6,356,5.0
880,7,356,5.0
1053,8,356,3.0
1120,10,356,3.5


Now I'm gonna group the usres by their id's:

In [66]:
user_Group = userSub.groupby(['userId'])
user_Group.get_group(6)

Unnamed: 0,userId,movieId,rating
654,6,224,3.0
718,6,349,5.0
725,6,356,5.0


**Sorting** the groups by the number of movies they have in commen with input user:

In [73]:
user_Group = sorted(user_Group, key=lambda x: len(x[1]), reverse=True)
print(user_Group[0:3])

[(414,        userId  movieId  rating
62394     414      224     4.0
62418     414      280     4.0
62421     414      290     5.0
62449     414      349     3.0
62454     414      356     5.0
64765     414    69640     3.0
64767     414    69844     3.5
64786     414    73017     4.0
64877     414    99112     3.5), (380,        userId  movieId  rating
56931     380      356     5.0
57830     380    69844     4.0
57844     380    73017     3.0
57912     380    92938     2.0
57946     380    98809     5.0
57948     380    99112     4.0
57950     380    99813     5.0
58087     380   179119     4.0), (448,        userId  movieId  rating
68702     448      349     4.0
68705     448      356     3.0
69817     448    69640     3.0
69843     448    73017     4.0
70087     448    98154     3.5
70095     448    98809     2.5
70101     448    99112     2.5)]


It's the time to compare the users to input user and find the similarity pattern with **Pearson Correlation Coeffcient**.

In [74]:
user_Group = user_Group[0:300]

In [76]:
from math import sqrt

pcd = {} # pcd stands for Pearson Correlation Dictionary

for name, group in user_Group:
    group = group.sort_values(by='movieId')
    inputMovies = inputMovies.sort_values(by='movieId')
    
    Ratings = len(group)
    tempdf = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]
    tempRatingList = tempdf['rating'].tolist()
    tempGroupList = group['rating'].tolist()
    
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(Ratings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(Ratings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(Ratings)
    

    if Sxx != 0 and Syy != 0:
        pcd[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pcd[name] = 0


In [78]:
print(pcd.items())

dict_items([(414, 0.7362926760338593), (380, 0.4008918628686366), (448, 0.05502437333491087), (177, 0.8777908163823711), (249, 0.9185586535436921), (18, 0.6621221919717307), (212, 0), (279, 0), (298, 0), (318, 0.6407116072277441), (365, 0.8126360553720013), (596, 0.829156197588853), (599, 0.30151134457776363), (602, -0.8675276172357121), (610, -0.3100868364730211), (28, 0.6622661785325219), (58, 0.3133397807202561), (62, 0.9045340337332909), (68, -0.674199862463242), (105, -0.2075143391598224), (117, 0.808290376865476), (232, -0.1690308509457033), (292, 0.5703518254720301), (305, 0.9864400504156211), (328, -0.5222329678670935), (381, 0.7324670207647144), (382, 0), (411, 0.2927700218845599), (466, 0.8528028654224417), (474, 0.2), (483, 0.8528028654224417), (489, 0.8783100656536799), (534, 0.2688664289689325), (561, 0.8783100656536799), (6, -0.18898223650461307), (10, 0.8660254037844355), (21, 0.24019223070763082), (52, 0.8660254037844387), (57, 0), (84, 0.7559289460184573), (103, 0.8660

In [80]:
pearson = pd.DataFrame.from_dict(pcd, orient='index')
pearson.columns = ['Similarity']
pearson['userId'] = pearson.index
pearson.index = range(len(pearson))
pearson.head(10)

Unnamed: 0,Similarity,userId
0,0.736293,414
1,0.400892,380
2,0.055024,448
3,0.877791,177
4,0.918559,249
5,0.662122,18
6,0.0,212
7,0.0,279
8,0.0,298
9,0.640712,318


Let's sort the most similarity score in dataset:

In [84]:
topuser = pearson.sort_values(by='Similarity', ascending=False)[0:50]
topuser.head()

Unnamed: 0,Similarity,userId
181,1.0,577
187,1.0,600
110,1.0,181
109,1.0,179
108,1.0,166


In [85]:
topUsersRating = topuser.merge(ratings, left_on='userId', right_on='userId', how='inner')
topUsersRating.head()

Unnamed: 0,Similarity,userId,movieId,rating
0,1.0,577,6,4.0
1,1.0,577,39,3.0
2,1.0,577,88,3.0
3,1.0,577,110,4.0
4,1.0,577,141,3.0


Calculating the similarity rating of the movies. It will repreasent the rating of the candidate movies for user:

In [86]:
topUsersRating['Weighted Rating'] = topUsersRating['Similarity']*topUsersRating['rating']
topUsersRating.head()

Unnamed: 0,Similarity,userId,movieId,rating,Weighted Rating
0,1.0,577,6,4.0,4.0
1,1.0,577,39,3.0,3.0
2,1.0,577,88,3.0,3.0
3,1.0,577,110,4.0,4.0
4,1.0,577,141,3.0,3.0


In [88]:
# Calculating the Summation of the candidate movie's ratings
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['Similarity','Weighted Rating']]
tempTopUsersRating.columns = ['Sum Similarity','Sum Weighted Rating']
tempTopUsersRating.head()

Unnamed: 0_level_0,Sum Similarity,Sum Weighted Rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,23.0,88.5
2,14.0,53.0
3,9.0,32.5
4,1.0,1.5
5,8.0,22.0


In [90]:
# Making a new empty dataframe for Weighted Average Recommendation Score
recommendation = pd.DataFrame()
recommendation['Recommendation Score'] = tempTopUsersRating['Sum Weighted Rating']/tempTopUsersRating['Sum Similarity']
recommendation['movieId'] = tempTopUsersRating.index
recommendation.head()

Unnamed: 0_level_0,Recommendation Score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.847826,1
2,3.785714,2
3,3.611111,3
4,1.5,4
5,2.75,5


Now we should sort the id's by Recommendation Score to observe the top Movies:

In [91]:
recommendation = recommendation.sort_values(by='Recommendation Score', ascending=False)
recommendation.head(20)

Unnamed: 0_level_0,Recommendation Score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
3502,5.0,3502
1564,5.0,1564
51931,5.0,51931
3090,5.0,3090
932,5.0,932
933,5.0,933
70451,5.0,70451
1014,5.0,1014
8132,5.0,8132
112804,5.0,112804


Lets find the movie's name by locating the id's:

In [93]:
movies.loc[movies['movieId'].isin(recommendation.head(30)['movieId'].tolist())]

Unnamed: 0,movieId,title
90,102,Mr. Wrong
242,280,Murder in the First
421,484,Lassie
713,932,"Affair to Remember, An"
714,933,To Catch a Thief
772,1014,Pollyanna
781,1023,Winnie the Pooh and the Blustery Day
842,1105,Children of the Corn IV: The Gathering
853,1124,On Golden Pond
1175,1564,For Roseanna (Roseanna's Grave)
