In [1]:
import pandas as pd
import numpy as np


In [2]:
# Load the movie catalogue and the user ratings from the ml-latest-small directory.
movies, ratings = (
    pd.read_csv(csv_path)
    for csv_path in ('ml-latest-small/movies.csv', 'ml-latest-small/ratings.csv')
)

In [3]:
movies.sample(5)

Unnamed: 0,movieId,title,genres
1422,1835,City of Angels (1998),Drama|Fantasy|Romance
2367,2948,From Russia with Love (1963),Action|Adventure|Thriller
5034,7163,Paycheck (2003),Action|Sci-Fi|Thriller
1721,2166,Return to Paradise (1998),Crime|Drama|Romance|Thriller
2579,3207,"Snows of Kilimanjaro, The (1952)",Adventure


In [4]:
ratings.sample(5)

Unnamed: 0,userId,movieId,rating,timestamp
75139,522,367,3.0,1391353098
32426,234,5349,3.0,1213808437
69896,481,111235,3.5,1437105983
7059,41,6951,4.0,1093888293
50900,376,4432,4.0,1005530069


In [5]:
# Drop the rating timestamps; none of the cells shown here reference them.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: running it a second time no longer raises KeyError.
ratings = ratings.drop(columns=['timestamp'], errors='ignore')

In [6]:
# Drop the genres column; only movieId and title are used by the cells shown here.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: running it a second time no longer raises KeyError.
movies = movies.drop(columns=['genres'], errors='ignore')

In [7]:
# Peek at the catalogue row whose movieId is 251.
movies[movies['movieId'] == 251]

Unnamed: 0,movieId,title
223,251,"Hunted, The (1995)"


In [8]:
def find_movies_by_id(id):
    """Return the title of the movie whose ``movieId`` equals ``id``.

    The lookup is done against the module-level ``movies`` DataFrame; when
    several rows match, the title of the first one is returned.

    Raises:
        IndexError: if no row of ``movies`` has that ``movieId`` (positional
            access on an empty selection).
    """
    matches = movies['movieId'].isin([id])
    return movies.loc[matches, 'title'].iloc[0]

In [9]:
# Sanity check of the helper: look up and print the title stored for movieId 10.
x = find_movies_by_id(10)
print(x)

GoldenEye (1995)


In [11]:
# Reshape the long ratings table into a user x movie matrix: one row per userId,
# one column per movieId, cells filled from the rating column (pivot_table's
# default aggregation applies; user/movie pairs without a rating are NaN).
movies_matrix = ratings.pivot_table(values='rating', index='userId', columns='movieId')
movies_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,4.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,4.0,...,,,,,,,,,,
5,,,4.0,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,3.0,,,,,,,,,3.0,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,4.0,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,


**Pearson Correlation**
<br/>
<img src='https://camo.githubusercontent.com/71fc70ee82901340842dc3beb3f351a42ade5a08/68747470733a2f2f77696b696d656469612e6f72672f6170692f726573745f76312f6d656469612f6d6174682f72656e6465722f7376672f34333231393236356463326338323763623466356233346632653366623739376265643265383230'/>

In [12]:
def pearson(s1, s2):
    """Pearson correlation coefficient between two numeric vectors.

    Each input is centred on its own mean and the result is
    ``sum(s1_c * s2_c) / sqrt(sum(s1_c**2) * sum(s2_c**2))``.

    Args:
        s1: first vector of values (e.g. one column of the ratings matrix).
        s2: second vector of values, comparable element-wise with ``s1``.

    Returns:
        The correlation coefficient, or NaN when it is undefined because the
        denominator is zero (for example when one input is constant).

    NOTE(review): for pandas Series containing NaN the sums appear to skip
    missing entries, so the numerator only covers positions where both values
    are present while each mean and squared deviation uses every value of its
    own Series - confirm this is the intended weighting.
    """
    s1_c = s1 - s1.mean()
    s2_c = s2 - s2.mean()
    denominator = np.sqrt(np.sum(s1_c**2) * np.sum(s2_c**2))
    # A zero denominator would evaluate 0/0 and emit a RuntimeWarning; return
    # NaN explicitly so callers can filter with np.isnan without warning noise.
    if denominator == 0:
        return np.nan
    return np.sum(s1_c * s2_c) / denominator

In [13]:
def get_recommendation(movieID, matrix, num):
    """Rank the other movies in ``matrix`` by correlation with ``movieID``.

    Args:
        movieID: column label of the reference movie in ``matrix``.
        matrix: DataFrame with one column per movie; each column is passed to
            ``pearson`` together with the reference column.
        num: maximum number of results to return.

    Returns:
        A list of ``(movie, correlation)`` tuples sorted by correlation in
        descending order, truncated to ``num`` entries. Movies whose
        correlation is NaN are omitted.

    Raises:
        KeyError: if ``movieID`` is not a column of ``matrix``.
    """
    # The reference column is the same for every comparison; look it up once.
    target = matrix[movieID]
    reviews = []
    for movie in matrix.columns:
        if movie == movieID:
            continue
        cor = pearson(target, matrix[movie])
        if not np.isnan(cor):
            reviews.append((movie, cor))
    # Sort once after collecting every candidate instead of re-sorting on each
    # iteration; list.sort is stable, so ties keep their column order either way.
    reviews.sort(key=lambda tup: tup[1], reverse=True)
    return reviews[:num]

In [14]:
def recommendation(movieID, num):
    """Print up to ``num`` movies most correlated with ``movieID``.

    Rankings come from ``get_recommendation`` applied to the module-level
    ``movies_matrix``; titles are resolved with ``find_movies_by_id``.
    Nothing is returned - the results are written to stdout. A ``KeyError``
    raised anywhere in the process is reported as 'Movie not found'.
    """
    try:
        top_matches = get_recommendation(movieID, movies_matrix, num)
        if len(top_matches) == 0:
            print ('recommendation not found')
            return
        print('recommendation for ', find_movies_by_id(movieID))
        for other_id, score in top_matches:
            print('Title:', find_movies_by_id(other_id), 'Cor: ', round(score, 2))
    except KeyError:
        print ('Movie not found')

In [16]:
# Print the 10 movies whose ratings correlate most strongly with movieId 175.
# recommendation() only prints and has no return value, so `r` is always None.
r = recommendation(175, 10)

  after removing the cwd from sys.path.


recommendation for  Kids (1995)
Title: Nekromantik (1987) Cor:  0.62
Title: Problem Child (1990) Cor:  0.59
Title: Heaven's Prisoners (1996) Cor:  0.56
Title: And the Band Played On (1993) Cor:  0.54
Title: Ghoulies II (1987) Cor:  0.54
Title: Misérables, Les (1995) Cor:  0.53
Title: Roommates (1995) Cor:  0.53
Title: Gods Must Be Crazy II, The (1989) Cor:  0.53
Title: Feeling Minnesota (1996) Cor:  0.53
Title: Traveller (1997) Cor:  0.53
