In [1]:
import pandas as pd
import numpy as np

In [2]:
movies_data = pd.read_csv('movies.csv')
movies_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
rating_data = pd.read_csv('ratings.csv')
rating_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [4]:
avg_rating = rating_data.groupby('movieId').mean()
avg_rating = avg_rating.drop(['userId','timestamp'], axis = 1)
avg_rating.columns = ['avg_rating']
avg_rating = avg_rating.reset_index()
avg_rating.head()

Unnamed: 0,movieId,avg_rating
0,1,3.87247
1,2,3.401869
2,3,3.161017
3,4,2.384615
4,5,3.267857


In [5]:
vote_count = rating_data.groupby('movieId').count()
vote_count = vote_count.drop(['userId','timestamp'], axis = 1)
vote_count.columns = ['Total.Vote.Count']
vote_count = vote_count.reset_index()
vote_count.head()

Unnamed: 0,movieId,Total.Vote.Count
0,1,247
1,2,107
2,3,59
3,4,13
4,5,56


In [6]:
data = movies_data.merge(avg_rating, on = 'movieId', how = 'outer')
data = data.merge(vote_count, on = 'movieId', how = 'outer')
data.head()

Unnamed: 0,movieId,title,genres,avg_rating,Total.Vote.Count
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.87247,247.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.401869,107.0
2,3,Grumpier Old Men (1995),Comedy|Romance,3.161017,59.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.384615,13.0
4,5,Father of the Bride Part II (1995),Comedy,3.267857,56.0


In [7]:
data['genres'].head()

0    Adventure|Animation|Children|Comedy|Fantasy
1                     Adventure|Children|Fantasy
2                                 Comedy|Romance
3                           Comedy|Drama|Romance
4                                         Comedy
Name: genres, dtype: object

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
data['genres'] = data['genres'].fillna('')

tfidf_matrix = tfidf.fit_transform(data['genres'])
tfidf_matrix.shape

(9125, 24)

In [9]:
from sklearn.metrics.pairwise import linear_kernel

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
print (cosine_sim)

[[ 1.          0.80403568  0.15641664 ...,  0.          0.26646851  0.        ]
 [ 0.80403568  1.          0.         ...,  0.          0.          0.        ]
 [ 0.15641664  0.          1.         ...,  0.          0.58699859  0.        ]
 ..., 
 [ 0.          0.          0.         ...,  1.          0.          1.        ]
 [ 0.26646851  0.          0.58699859 ...,  0.          1.          0.        ]
 [ 0.          0.          0.         ...,  1.          0.          1.        ]]


In [10]:
indices = pd.Series(data.index, index=data['title']).drop_duplicates()

In [11]:
def get_recommendations(title, cosine_sim = cosine_sim):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    movie_indices = [i[0] for i in sim_scores]
    return data['title'].iloc[movie_indices]

In [12]:
get_recommendations('Battleship (2012)')

6733                             Transformers (2007)
7756                         I Am Number Four (2011)
8039                               Battleship (2012)
8309                               Iron Man 3 (2013)
8385                                  Riddick (2013)
8405                                  Gravity (2013)
8560                 The Amazing Spider-Man 2 (2014)
8596                         Edge of Tomorrow (2014)
7887                                Contagion (2011)
4603                     Matrix Reloaded, The (2003)
4880                  Matrix Revolutions, The (2003)
6691                             Spider-Man 3 (2007)
7526                               Iron Man 2 (2010)
8284                    G.I. Joe: Retaliation (2013)
6840                              I Am Legend (2007)
7617                 Resident Evil: Afterlife (2010)
7904                               Real Steel (2011)
8346                                  Elysium (2013)
7108           Day the Earth Stood Still, The 

In [13]:
get_recommendations('G.I. Joe: Retaliation (2013)')

4880                       Matrix Revolutions, The (2003)
6691                                  Spider-Man 3 (2007)
7526                                    Iron Man 2 (2010)
8284                         G.I. Joe: Retaliation (2013)
4103    Star Wars: Episode II - Attack of the Clones (...
5485                                  Spider-Man 2 (2004)
6461                              Superman Returns (2006)
7218                                     Star Trek (2009)
7257           Transformers: Revenge of the Fallen (2009)
7412                                        Avatar (2009)
7692                                  Tron: Legacy (2010)
7890                                 Avengers, The (2012)
8018                                   John Carter (2012)
8089                       Amazing Spider-Man, The (2012)
8301                                      Oblivion (2013)
8318                       Star Trek Into Darkness (2013)
8331                                   After Earth (2013)
8343          

In [14]:
get_recommendations('Titanic (1997)')

24                            Leaving Las Vegas (1995)
27                                   Persuasion (1995)
33                                   Carrington (1995)
44                How to Make an American Quilt (1995)
47                        When Night Is Falling (1995)
69                                 Bed of Roses (1996)
77     Once Upon a Time... When We Were Colored (1995)
79                           Angels and Insects (1995)
97               Bridges of Madison County, The (1995)
117                           Frankie Starlight (1995)
121                       Up Close and Personal (1996)
158                                    Mad Love (1995)
169                         Scarlet Letter, The (1995)
178                               Total Eclipse (1995)
183                       Walk in the Clouds, A (1995)
189                              Before Sunrise (1995)
195                           Circle of Friends (1995)
221                            Immortal Beloved (1994)
242       

In [15]:
get_recommendations('Live Free or Die Hard (2007)')

412                         Hard Target (1993)
4361                        Extreme Ops (2002)
4765         Once Upon a Time in Mexico (2003)
5237                      Warriors, The (1979)
6244                      Into the Blue (2005)
6690                     Condemned, The (2007)
6730              Live Free or Die Hard (2007)
7095                      Transporter 3 (2008)
7778                Hobo with a Shotgun (2011)
4432       City of God (Cidade de Deus) (2002)
7563                     24: Redemption (2008)
4672    Charlie's Angels: Full Throttle (2003)
5631                   After the Sunset (2004)
7613                            Machete (2010)
8930                            Spectre (2015)
6720                         Nancy Drew (2007)
408                        Getaway, The (1994)
6722                        Death Proof (2007)
5                                  Heat (1995)
22                            Assassins (1995)
Name: title, dtype: object

In [16]:
import lime
import lime.lime_tabular

In [19]:
explainer = lime.lime_tabular.LimeTabularExplainer(data.values)

TypeError: can't multiply sequence by non-int of type 'float'