In [29]:
import pandas as pd
from scipy.spatial.distance import cosine
from sklearn.metrics import pairwise_distances
import sklearn.preprocessing as pp
import numpy as np
from sklearn.metrics import mean_squared_error

In [22]:
#Reading the four csv files in the MovieLens Downloaded data
user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_csv('../../../Downloads/ml-100k/u.user', sep='|', names=user_cols,encoding='latin-1')

rating_cols = ['user_id', 'movie_id','rating', 'timestamp']
ratings = pd.read_csv('../../../Downloads/ml-100k/u.data', sep='\t', names=rating_cols,encoding='latin-1')

movies_cols = ['movie_id','movie_title', 'release_date', 'video_release_date','IMDb_URL', 'unknown', 'Action', 
               'Adventure','Animation','Children', 'Comedy' , 'Crime', 'Documentary', 'Drama', 'Fantasy',
              'Film_Noir',  'Horror',  'Musical', 'Mystery', 'Romance', 'Sci-Fi','Thriller','War','Western']
movies = pd.read_csv('../../../Downloads/ml-100k/u.item', sep='|', names=movies_cols,encoding='latin-1')


In [23]:
print(users.shape)
users.head()
print(ratings.shape)
ratings.head()
print(movies.shape)
movies.head()

(943, 5)
(100000, 4)
(1682, 24)


Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children,...,Fantasy,Film_Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [24]:
#Splitting the data. Utilising the data split prior by the Movie Lens itself
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings_train = pd.read_csv('../../../Downloads/ml-100k/ua.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('../../../Downloads/ml-100k/ua.test', sep='\t', names=r_cols, encoding='latin-1')
ratings_train.shape, ratings_test.shape

((90570, 4), (9430, 4))

In [5]:
ratings_train.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [120]:
#Reshaping the ratings dataframe in the following format where the user and his preference are listed as below
#The rows are the users and the columns the movies
len(set(ratings_train['user_id']))
movies_train = ratings_train.pivot(index='user_id', columns='movie_id', values='rating')
movies_train = movies_train.fillna(0)
value = [0] * len(set(ratings_train['user_id']))
movies_train.insert(1581,'1582',value)
movies_train.insert(1652,'1653',value)
movies_train.head()


movies_test = ratings_test.pivot(index='user_id', columns='movie_id', values='rating')
movies_test_final = pd.DataFrame(index=range(1,944),columns=movies_train.columns)
movies_test_final = movies_test_final.fillna(0)
movies_test = movies_test.fillna(0)

for column in movies_test.columns:
    movies_test_final.ix[:,column] = movies_test.ix[:,column]
    
movies_test_final.head()

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [97]:
# --- Start Item Based Recommendations --- #
#uses cosine similarity
# Drop all columns that do not describe the type of item.
#Create a placeholder dataframe listing item vs. item

movies_train.columns = movies['movie_title']
movie_similarity = pd.DataFrame(index=movies_train.columns,columns=movies_train.columns)
movies_train.head()

movie_title,Toy Story (1995),GoldenEye (1995),Four Rooms (1995),Get Shorty (1995),Copycat (1995),Shanghai Triad (Yao a yao yao dao waipo qiao) (1995),Twelve Monkeys (1995),Babe (1995),Dead Man Walking (1995),Richard III (1995),...,Mirage (1995),Mamma Roma (1962),"Sunchaser, The (1996)","War at Home, The (1996)",Sweet Nothing (1995),Mat' i syn (1997),B. Monkey (1998),Sliding Doors (1998),You So Crazy (1994),Scream of Stone (Schrei aus Stein) (1991)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [80]:
##Using Sparse Matrices to calculate cosine similarity to conserve time
movies_train_matrix = pd.DataFrame.as_matrix(movies_train)
item_similarity = 1 - pairwise_distances(movies_train_matrix.T, metric='cosine')
item_similarity[:4,:4]

array([[ 1.        ,  0.40295926,  0.33326137,  0.45323958],
       [ 0.40295926,  1.        ,  0.2691851 ,  0.46563087],
       [ 0.33326137,  0.2691851 ,  1.        ,  0.29795624],
       [ 0.45323958,  0.46563087,  0.29795624,  1.        ]])

In [78]:
#With the similarity matrix filled out we can look for each items “neighbour” by looping through ‘movie_similarity’, 
#sorting each column in descending order,and grabbing the name of each of the top 10 movies.
# Create a placeholder items for closes neighbours to an item
item_similarity = pd.DataFrame(item_similarity)
movie_neighbours = pd.DataFrame(index=item_similarity.columns,columns=range(1,11))
movie_neighbours.head() 
# Loop through our similarity dataframe and fill in neighbouring item names
for i in range(0,len(movie_similarity.columns)):
    movie_neighbours.ix[i,:10] = item_similarity.ix[0:,i].sort_values(ascending=False)[:10].index

movie_neighbours.head()
#End of Item Based Filtering

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if __name__ == '__main__':


Unnamed: 0,1,2,3,4,5,6,7,8,9,10
0,0,49,180,120,173,404,236,221,99,150
1,1,160,232,575,402,225,549,61,28,567
2,2,762,409,249,41,234,239,66,32,474
3,3,95,173,55,194,402,215,201,185,78
4,4,217,53,664,218,52,233,37,225,97


In [79]:
#Making the names visible for Easy Interpretation
for i in range(0,len(movie_neighbours)):
    for j in range(1,len(movie_neighbours.columns)+1):
        movie_neighbours.ix[i,j] = movies['movie_title'][movie_neighbours.ix[i,j]]
movie_neighbours.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
0,Toy Story (1995),Star Wars (1977),Return of the Jedi (1983),Independence Day (ID4) (1996),Raiders of the Lost Ark (1981),Mission: Impossible (1996),Jerry Maguire (1996),Star Trek: First Contact (1996),Fargo (1996),Willy Wonka and the Chocolate Factory (1971)
1,GoldenEye (1995),Top Gun (1986),Under Siege (1992),Cliffhanger (1993),Batman (1989),Die Hard 2 (1990),Die Hard: With a Vengeance (1995),Stargate (1994),Batman Forever (1995),Speed (1994)
2,Four Rooms (1995),Happy Gilmore (1996),Kingpin (1996),"Fifth Element, The (1997)",Clerks (1994),Mars Attacks! (1996),Beavis and Butt-head Do America (1996),Ace Ventura: Pet Detective (1994),Desperado (1995),Trainspotting (1996)
3,Get Shorty (1995),Terminator 2: Judgment Day (1991),Raiders of the Lost Ark (1981),Pulp Fiction (1994),"Terminator, The (1984)",Batman (1989),When Harry Met Sally... (1989),Groundhog Day (1993),"Blues Brothers, The (1980)","Fugitive, The (1993)"
4,Copycat (1995),Cape Fear (1991),Outbreak (1995),Alien 3 (1992),"Nightmare on Elm Street, A (1984)",Natural Born Killers (1994),Jaws (1975),"Net, The (1995)",Die Hard 2 (1990),"Silence of the Lambs, The (1991)"


In [114]:
## User Based Collaborative Filtering. We need to now find the similarity #between users
user_similarity = 1 - pairwise_distances(movies_train_matrix, metric='cosine')
user_similarity[:4,:4]
#BY considering the top k users who are most similar to the input user (or, similarly, the top k items), 
#we can predict that a user u's rating for item i is given by the weighted sum of the top k user's 
#ratings for item i where the weighting is the cosine similarity between the each user and the input user u.

def predict_topk(ratings, similarity, kind='user', k=10):
    pred = np.zeros(ratings.shape)
    if kind == 'user':
        for i in range(ratings.shape[0]):
            top_k_users = [np.argsort(similarity[:,i])[:-k-1:-1]]
            for j in range(ratings.shape[1]):
                pred[i, j] = similarity[i, :][top_k_users].dot(ratings[:, j][top_k_users]) 
                pred[i, j] /= np.sum(np.abs(similarity[i, :][top_k_users]))
    if kind == 'item':
        for j in range(ratings.shape[1]):
            top_k_items = [np.argsort(similarity[:,j])[:-k-1:-1]]
            for i in range(ratings.shape[0]):
                pred[i, j] = similarity[j, :][top_k_items].dot(ratings[i, :][top_k_items].T) 
                pred[i, j] /= np.sum(np.abs(similarity[j, :][top_k_items]))        
    
    return pred


array([[ 3.86503223,  1.94130736,  1.25774252, ...,  0.        ,
         0.07322098,  0.08261652],
       [ 1.97240341,  0.        ,  0.32923866, ...,  0.04200808,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.06153293,
         0.        ,  0.        ],
       ..., 
       [ 3.00356787,  0.06311371,  0.3586461 , ...,  0.        ,
         0.        ,  0.        ],
       [ 2.26808745,  0.80915019,  0.22891586, ...,  0.        ,
         0.        ,  0.        ],
       [ 3.02692118,  2.55283954,  1.54825295, ...,  0.        ,
         0.07335196,  0.        ]])

In [121]:
##Evaluating the Recommendations
def get_rmse(pred, actual):
    # We take only the nonzero values
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(pred, actual)

user_pred = predict_topk(movies_train_matrix, user_similarity, kind='user', k=40)
print('User-based Collaborative Filtering RMSE: ' + str(get_rmse(user_pred, pd.DataFrame.as_matrix(movies_test_final))))

User-based Collaborative Filtering RMSE: 6.51090882164


In [122]:
item_pred = predict_topk(movies_train_matrix, item_similarity, kind='item', k=40)
print('Item-based Collaborative Filtering RMSE: ' + str(get_rmse(item_pred, pd.DataFrame.as_matrix(movies_test_final))))


Item-based Collaborative Filtering RMSE: 7.707950179
