In [1]:
#Load libraries
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [2]:
#Set the working directory
os.chdir("C:/Users/Aurangzeb Alam/Desktop/Projects/Recommendation-master/ml-100k")

In [3]:
#check the working directory
os.getcwd()

'C:\\Users\\Aurangzeb Alam\\Desktop\\Projects\\Recommendation-master\\ml-100k'

# Exploratory Data Analysis

In [4]:
#user column name is given in readme file so lets make its list and load the data.
users_columns = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('u.user', sep='|', names=users_columns,encoding='latin-1')

In [5]:
#print shape of users
users.shape

(943, 5)

In [6]:
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [7]:
#rating column name is given in readme file so lets make its list and load the data.
rating_columns = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('u.data', sep='\t', names=rating_columns,encoding='latin-1')

In [8]:
#print rating shape.
ratings.shape
#ratings.head()

(100000, 4)

In [9]:
#Head of the ratings
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [10]:
#item column name is given in readme file so lets make its list and load the data.
item_columns = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

items = pd.read_csv('u.item', sep='|', names=item_columns,encoding='latin-1')

In [11]:
#Head of the items
items.shape

(1682, 24)

In [12]:
#head of items.
items.head()

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


# Split ratings into train and test

In [13]:
#rating columns
rating_columns = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

#read rating train dataset
ratings_train = pd.read_csv('ua.base', sep='\t', names=rating_columns, encoding='latin-1')

#read test dataset
ratings_test = pd.read_csv('ua.test', sep='\t', names=rating_columns, encoding='latin-1')

In [14]:
#shape of rating train dataset.
ratings_train.shape

(90570, 4)

In [15]:
#shape of rating train dataset and rating test dataset
ratings_test.shape

(9430, 4)

In [16]:
#unique user_id by given user rating
n_users = ratings.user_id.unique().shape[0]
#unique movie id 
n_items = ratings.movie_id.unique().shape[0]

In [17]:
#create a item matrix to calculate the similarity between user and item
data_matrix = np.zeros((n_users, n_items))
for line in ratings.itertuples():
    #array[user_id_index,movie_id_index]=rating
    data_matrix[line[1]-1, line[2]-1] = line[3]

In [18]:
#Now calculate the pairwise distance using cosine similarity
from sklearn.metrics.pairwise import pairwise_distances

#pairwise user similarity
#cos theta = A.B/mod A mod B

user_similarity = pairwise_distances(data_matrix, metric='cosine')

item_similarity = pairwise_distances(data_matrix.T, metric='cosine')

In [19]:
#This gives us the item to item and user to user similarity in an array form. The next step is to make predictions based on these similarities. 
#Let’s define a function to do so,
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #We use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [20]:
#finally i am making similarity using user similarity and item similarity
user_prediction = predict(data_matrix, user_similarity, type='user')
item_prediction = predict(data_matrix, item_similarity, type='item')

In [21]:
user_prediction

array([[ 2.06532606,  0.73430275,  0.62992381, ...,  0.39359041,
         0.39304874,  0.3927712 ],
       [ 1.76308836,  0.38404019,  0.19617889, ..., -0.08837789,
        -0.0869183 , -0.08671183],
       [ 1.79590398,  0.32904733,  0.15882885, ..., -0.13699223,
        -0.13496852, -0.13476488],
       ...,
       [ 1.59151513,  0.27526889,  0.10219534, ..., -0.16735162,
        -0.16657451, -0.16641377],
       [ 1.81036267,  0.40479877,  0.27545013, ..., -0.00907358,
        -0.00846587, -0.00804858],
       [ 1.8384313 ,  0.47964837,  0.38496292, ...,  0.14686675,
         0.14629808,  0.14641455]])

In [22]:
item_prediction

array([[0.44627765, 0.475473  , 0.50593755, ..., 0.58815455, 0.5731069 ,
        0.56669645],
       [0.10854432, 0.13295661, 0.12558851, ..., 0.13445801, 0.13657587,
        0.13711081],
       [0.08568497, 0.09169006, 0.08764343, ..., 0.08465892, 0.08976784,
        0.09084451],
       ...,
       [0.03230047, 0.0450241 , 0.04292449, ..., 0.05302764, 0.0519099 ,
        0.05228033],
       [0.15777917, 0.17409459, 0.18900003, ..., 0.19979296, 0.19739388,
        0.20003117],
       [0.24767207, 0.24489212, 0.28263031, ..., 0.34410424, 0.33051406,
        0.33102478]])