In [1]:
import pandas as pd

# Load the first three fields of the tab-separated ratings file.
ratings_cols = ['user_id', 'movie_id', 'rating']
ratings = pd.read_csv(
    '../ml-100k/u.data',
    sep='\t',
    names=ratings_cols,
    usecols=[0, 1, 2],
    encoding="ISO-8859-1",
)

# Load the first two fields of the pipe-separated item file.
movie_cols = ['movie_id', 'title']
movies = pd.read_csv(
    '../ml-100k/u.item',
    sep='|',
    names=movie_cols,
    usecols=[0, 1],
    encoding="ISO-8859-1",
)

# Attach a title to every rating row. 'movie_id' is the only column the two
# frames share, so naming it explicitly matches the default join key.
ratings = movies.merge(ratings, on='movie_id')

In [2]:
# Preview the first five rows of the merged frame.
ratings.head(5)

Unnamed: 0,movie_id,title,user_id,rating
0,1,Toy Story (1995),308,4
1,1,Toy Story (1995),287,5
2,1,Toy Story (1995),148,4
3,1,Toy Story (1995),280,4
4,1,Toy Story (1995),66,3


In [6]:
# Reshape to one row per user_id and one column per title; cells hold the
# rating (pivot_table's default aggregation is the mean) and are NaN where
# the user did not rate that title.
movie_ratings = ratings.pivot_table(
    values='rating',
    index='user_id',
    columns='title',
)
movie_ratings.head(5)

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [19]:
# Pull out the per-user rating column for the target title.
# NOTE(review): the variable is named for Star Wars, but the column selected
# here is 'Young Guns (1988)'. The name is kept because later cells reference
# it; confirm which title is actually intended.
star_wars_ratings = movie_ratings.loc[:, 'Young Guns (1988)']
star_wars_ratings.head(5)

user_id
0    NaN
1    3.0
2    NaN
3    NaN
4    NaN
Name: Young Guns (1988), dtype: float64

In [20]:
# Correlate every title's rating column with the target column, then discard
# titles whose correlation came back as NaN.
similarToStarWars = movie_ratings.corrwith(star_wars_ratings).dropna()

# Single-column frame (the column is labelled 0) for a tabular preview.
movies_df = similarToStarWars.to_frame()
movies_df.head(10)


  c = cov(x, y, rowvar)
  c *= 1. / np.float64(fact)


Unnamed: 0_level_0,0
title,Unnamed: 1_level_1
101 Dalmatians (1996),0.119234
12 Angry Men (1957),0.068944
187 (1997),-0.5
2 Days in the Valley (1996),0.15622
"20,000 Leagues Under the Sea (1954)",-0.010894
2001: A Space Odyssey (1968),-0.174918
"39 Steps, The (1935)",-0.337691
8 1/2 (1963),-0.498527
8 Seconds (1994),1.0
Above the Rim (1994),0.816497


We need to get rid of movies that were rated by only a few people, so let's construct a new DataFrame that counts how many ratings exist for each movie and computes each movie's average rating.

In [21]:
import numpy as np

# Per-title statistics: number of ratings ('size') and average rating ('mean').
# String aliases are used instead of NumPy callables: passing np.mean to
# groupby().agg() is deprecated in recent pandas releases (it emits a
# FutureWarning), while the aliases dispatch to the built-in groupby
# implementations. The resulting column labels, ('rating', 'size') and
# ('rating', 'mean'), are the same as before.
movie_stats = ratings.groupby('title').agg({
    'rating': ['size', 'mean']
})
movie_stats.head()

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
'Til There Was You (1997),9,2.333333
1-900 (1994),5,2.6
101 Dalmatians (1996),109,2.908257
12 Angry Men (1957),125,4.344
187 (1997),41,3.02439


In [22]:
# Boolean mask over titles: True where a title has at least 100 ratings.
popular_movies = movie_stats['rating']['size'].ge(100)

# Show the 15 popular titles with the highest average rating.
(
    movie_stats.loc[popular_movies]
    .sort_values(by=[('rating', 'mean')], ascending=False)
    .head(15)
)

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
"Close Shave, A (1995)",112,4.491071
Schindler's List (1993),298,4.466443
"Wrong Trousers, The (1993)",118,4.466102
Casablanca (1942),243,4.45679
"Shawshank Redemption, The (1994)",283,4.44523
Rear Window (1954),209,4.38756
"Usual Suspects, The (1995)",267,4.385768
Star Wars (1977),584,4.359589
12 Angry Men (1957),125,4.344
Citizen Kane (1941),198,4.292929


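100 might still be too low, but these results look pretty good as far as "well-rated movies that people have heard of." Let's join this data with our original set of movies similar to the target movie (the narrative refers to Star Wars; note that the code above currently selects 'Young Guns (1988)'):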


In [23]:
# Keep only the popular titles and flatten the two-level column labels into
# plain tuples, ('rating', 'size') and ('rating', 'mean'). pandas 2.x refuses
# to join frames whose columns have a different number of levels (MergeError);
# older versions flattened the labels implicitly, so the resulting columns are
# the same as before.
popular_stats = movie_stats[popular_movies].copy()
popular_stats.columns = popular_stats.columns.to_flat_index()

# Left-join the similarity scores on the shared title index. Popular titles
# with no similarity score get NaN in the 'similarity' column.
recommendations = popular_stats.join(similarToStarWars.rename('similarity'))



In [24]:
# Preview the first five rows of the joined statistics and similarity table.
recommendations.head(5)

Unnamed: 0_level_0,"(rating, size)","(rating, mean)",similarity
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
101 Dalmatians (1996),109,2.908257,0.119234
12 Angry Men (1957),125,4.344,0.068944
2001: A Space Odyssey (1968),259,3.969112,-0.174918
Absolute Power (1997),127,3.370079,0.254324
"Abyss, The (1989)",151,3.589404,0.384703


In [25]:
# The 15 popular titles with the highest similarity score, best match first.
recommendations.sort_values(by='similarity', ascending=False).head(15)

Unnamed: 0_level_0,"(rating, size)","(rating, mean)",similarity
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Young Guns (1988),101,3.207921,1.0
Good Will Hunting (1997),198,4.262626,0.675082
"River Wild, The (1994)",146,3.143836,0.662424
"Frighteners, The (1996)",115,3.234783,0.629871
Seven Years in Tibet (1997),155,3.458065,0.599263
"Time to Kill, A (1996)",232,3.685345,0.597969
"Ghost and the Darkness, The (1996)",128,3.203125,0.585139
"Nightmare on Elm Street, A (1984)",111,3.171171,0.578651
"First Wives Club, The (1996)",160,3.01875,0.569725
Con Air (1997),137,3.459854,0.557001
