In [85]:
import pandas as pd
import numpy as np

In [86]:
M = pd.read_csv('data/movie_reviews.csv', index_col='Name')

In [87]:
def pearson(s1, s2):
    """Take two pd.Series objects and return a pearson correlation."""
    s1_c = s1 - s1.mean()
    s2_c = s2 - s2.mean()
    return np.sum(s1_c * s2_c) / np.sqrt(np.sum(s1_c ** 2) * np.sum(s2_c ** 2))

In [88]:
def get_recs(movie_name, M, num):

    import numpy as np
    reviews = []
    for title in M.columns:
        if title == movie_name:
            continue
        cor = pearson(M[movie_name], M[title])
        if np.isnan(cor):
            continue
        else:
            reviews.append((title, cor))
    
    reviews.sort(key=lambda tup: tup[1], reverse=True)
    return reviews[:num]

### Question 1:  What movie is most Similar to 'The fault in our stars'


In [89]:
recs = get_recs('The Fault in Our Stars', M, 1)
print("Most similar to 'The fault in our stars' is '{}'".format(recs[0][0]))

Most similar to 'The fault in our stars' is 'Malificent'


### Question 2:  Which movie(s) would you most like to see?

In [90]:
# Movies I've seen
seen_movies = M.ix['Anupom Syam'].dropna()
# Movie I've rated the highest
seen_movies.sort(ascending=False)
# The name of the movie I've rated the highest
most_fav_movie = seen_movies.index[0]
# Find similar movies to my favorite movie
similar_movies = get_recs(most_fav_movie, M, 3)
# Remove movies I have already watched 
similar_movies = [movie for movie in similar_movies if movie[0] not in seen_movies.index]
print('Movies I would most like to see: ')
for movie in similar_movies:
    print(movie[0])

Movies I would most like to see: 
Divergent
Malificent




### Question 3: For all the movies you haven't seen, can you predict how you'd rate them using your class reviews?

In [91]:
import math
unseen_movies = [movie for movie, rating in M.ix['Anupom Syam'].iteritems() if math.isnan(rating)]
for unseen_movie in unseen_movies:
    similar_movies = get_recs(unseen_movie, M, 10)
    # Get all similar movies with corelation > 0.1
    similar_movies = [movie for movie in similar_movies if movie[1] > 0.1]
    # Get only the movies I watched
    similar_movies = [movie for movie in similar_movies if movie[0] in seen_movies.index]
    predicted_rating = seen_movies[similar_movies[0][0]] if similar_movies else float('nan')
    print("Predicted rating for movie '{}' is {}".format(unseen_movie, predicted_rating))

Predicted rating for movie 'American Sniper' is 4.0
Predicted rating for movie 'The Hunger Games: Mockingjay - Part 1' is 3.0
Predicted rating for movie 'The Lego Movie' is 3.0
Predicted rating for movie 'Malificent' is 3.0
Predicted rating for movie 'Divergent' is 5.0
Predicted rating for movie 'The Fault in Our Stars' is 4.0
Predicted rating for movie 'Unbroken' is nan
