In [4]:
import pandas as pd
import numpy as np 

In [5]:
#Reads in the movies dataframe
movies = pd.read_csv("movies.csv")

In [6]:
#Reads in the ratings dataframe, limiting to the first 30,000 just as a demo
ratings = pd.read_csv('ratings.csv')
ratings = ratings[:30000]
ratings.drop(['timestamp'], axis=1, inplace = True)

In [7]:
#Shows the dataset head from movies
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
#Shows the dataset head from ratings
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,307,3.5
1,1,481,3.5
2,1,1091,1.5
3,1,1257,4.5
4,1,1449,4.5


In [9]:
#Function used to display movie names instead of movie ids inside the ratings dataframe

def replace_name(x):
    return movies[movies['movieId']==x].title.values[0]

ratings.movieId = ratings.movieId.map(replace_name)

In [10]:
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,Three Colors: Blue (Trois couleurs: Bleu) (1993),3.5
1,1,Kalifornia (1993),3.5
2,1,Weekend at Bernie's (1989),1.5
3,1,Better Off Dead... (1985),4.5
4,1,Waiting for Guffman (1996),4.5


In [11]:
#Creates a pivot table to display all the reviews that have been given to different movies by different users
M = ratings.pivot_table(index =['userId'],columns=['movieId'],values ='rating')


In [12]:
M

movieId,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Salem's Lot (2004),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...And Justice for All (1979),10 (1979),10 Cloverfield Lane (2016),...,Zootopia (2016),"Zorro, the Gay Blade (1981)",Zulu (1964),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,4.0
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,4.0,,,,...,,,,,,,3.5,3.5,,3.0
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,,,,,,,,,,,...,,,,,,,,,,
282,,,,,,,,,,,...,,,,,,,,,,
283,,,,,,,,,,,...,,,,,,,,,,
284,,,,,,,,,,,...,,,,,,,,,,


In [13]:
#Pearson's r shows correlation between data sets or points 
#1 for postively correlated, -1 for negatively correlated, and a 0 for no correlation whatsoever

def pearson(s1, s2):
    s1_c = s1 - s1.mean()
    s2_c = s2 - s2.mean()
    return np.sum(s1_c * s2_c) / np.sqrt(np.sum(s1_c ** 2) * np.sum(s2_c ** 2)) 

In [14]:
#Recommendation function simulating based off of ratings from users

def get_recommendations(movie_name, M, n):
    
    reviews = []
    for title in M.columns:
        if title == movie_name:
            continue
        cor = pearson(M[movie_name], M[title])
        if np.isnan(cor):
            continue
        else:
            reviews.append((title, cor))
            
    reviews.sort(key=lambda tup: tup[1], reverse=True)
    return reviews[0:n]

In [15]:
#Asking for best 10 recommendations based off of a given movie 
best_recs = get_recommendations('Kalifornia (1993)', M, 10)
best_recs[:10]

  import sys


[('Lost Souls (2000)', 0.850657446010003),
 ("The Devil's Advocate (1997)", 0.652601518198835),
 ('Ernest Goes to Camp (1987)', 0.6446045883926708),
 ('28 Days (2000)', 0.5848269941318771),
 ('I Saw What You Did (1965)', 0.5848269941318771),
 ('Wild Things (1998)', 0.5281286864625693),
 ('So I Married an Axe Murderer (1993)', 0.5194467182444888),
 ('Unbreakable (2000)', 0.4327415784704615),
 ("Rosemary's Baby (1968)", 0.4320880340134802),
 ('Virgin Suicides, The (1999)', 0.431312268226673)]

In [16]:
#Asking for worst 10 recommendations based off of a given movie 
worst_recs = get_recommendations('Kalifornia (1993)', M, 6321)
worst_recs[-10:]

  import sys


[('Untamed Heart (1993)', -0.6446045883926708),
 ('Mannequin (1987)', -0.6574328161268042),
 ("Jacob's Ladder (1990)", -0.6753000449827979),
 ('Book of Shadows: Blair Witch 2 (2000)', -0.707223656409439),
 ('Dead Calm (1989)', -0.7602477813190633),
 ('Jaws: The Revenge (1987)', -0.7636059465697966),
 ('Billy Elliot (2000)', -0.7750534602557096),
 ('Innerspace (1987)', -0.7985982143983299),
 ("I'm Gonna Git You Sucka (1988)", -0.850657446010003),
 ('Return to Me (2000)', -0.850657446010003)]