In [5]:
import pandas as pd

#Column labels for the u.user table; they are passed explicitly through names=
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']

#Read the pipe-separated u.user file into a dataframe.
#NOTE: the path is absolute and tied to one machine's directory layout.
users = pd.read_csv(
    '/Users/youssefeldeeb/Documents/Material/Recommendation-Systems/Datasets/movielens/u.user',
    sep='|',
    names=u_cols,
    encoding='latin-1',
)
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [8]:
#Column labels for the u.item table: descriptive fields first, then the genre-named columns
i_cols = [
    'movie_id', 'title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

#Read the pipe-separated u.item file into a dataframe.
#NOTE: the path is absolute and tied to one machine's directory layout.
movies = pd.read_csv(
    '/Users/youssefeldeeb/Documents/Material/Recommendation-Systems/Datasets/movielens/u.item',
    sep='|',
    names=i_cols,
    encoding='latin-1',
)
movies.head()

Unnamed: 0,movie_id,title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [10]:
#Keep only the movie ID and title columns; every other column is discarded
movies = movies.loc[:, ['movie_id', 'title']]
movies.head()

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [13]:
#Column labels for the u.data table; they are passed explicitly through names=
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']

#Read the tab-separated u.data file into a dataframe.
#NOTE: the path is absolute and tied to one machine's directory layout.
ratings = pd.read_csv(
    '/Users/youssefeldeeb/Documents/Material/Recommendation-Systems/Datasets/movielens/u.data',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)
ratings.head()


Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [14]:
#Drop the timestamp column.
#errors='ignore' keeps this cell re-runnable: `ratings` is reassigned here, so on a
#second execution the column is already gone and a plain drop would raise KeyError.
ratings = ratings.drop(columns='timestamp', errors='ignore')
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [17]:
#Training and test data
from sklearn.model_selection import train_test_split

#x is a copy of the full ratings dataframe; y is its user_id column,
#which doubles as the stratification key for the split below
x = ratings.copy()
y = ratings['user_id']

#Hold out 25% of the rows for testing, stratified on user_id, with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)


In [18]:
#Evaluation
from sklearn.metrics import mean_squared_error

#Function that computes the root mean squared error (or RMSE)
import numpy as np
def rmse(y_true, y_pred):
    """Return the root mean squared error between two sequences of values.

    Args:
        y_true: the observed values.
        y_pred: the predicted values, aligned with y_true.

    Returns:
        The square root of mean_squared_error(y_true, y_pred).
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)


In [19]:
#Define the baseline model to always return 3.
def baseline(user_id, movie_id):
    """Predict the same rating, 3.0, for every user-movie pair.

    Args:
        user_id: ignored.
        movie_id: ignored.

    Returns:
        The constant 3.0.
    """
    constant_rating = 3.0
    return constant_rating

In [23]:
#Function to compute the RMSE score obtained on the testing set by a model
def score(cf_model, test_data=None):
    """Compute the RMSE a model obtains on a set of held-out ratings.

    Args:
        cf_model: callable taking (user_id, movie_id) and returning a
            predicted rating.
        test_data: dataframe with 'user_id', 'movie_id' and 'rating'
            columns. Defaults to the notebook-level x_test when omitted.

    Returns:
        The RMSE between the actual ratings and the model's predictions.
    """
    if test_data is None:
        test_data = x_test

    #Construct the user-movie tuples to predict for
    id_pairs = zip(test_data['user_id'], test_data['movie_id'])

    #Predict the rating for every user-movie tuple
    y_pred = np.array([cf_model(user, movie) for user, movie in id_pairs])

    #Extract the actual ratings given by the users
    y_true = np.array(test_data['rating'])

    #Pass the arguments in the order rmse declares them: (y_true, y_pred)
    return rmse(y_true, y_pred)

In [24]:
#RMSE of the constant-3.0 baseline model on the held-out test ratings
score(baseline)

1.2470926188539486

## User-based collaborative filtering

In [26]:
#Pivot the training ratings into a user x movie matrix:
#one row per user_id, one column per movie_id, cells hold the rating
r_matrix = x_train.pivot_table(
    index='user_id',
    columns='movie_id',
    values='rating',
)
r_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1669,1670,1671,1673,1674,1675,1676,1679,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,3.0,,,,,,,,,...,,,,,,,,,,


In [27]:
#User Based Collaborative Filter using Mean Ratings
def cf_user_mean(user_id, movies_id):
    """Predict a rating as the plain mean of the movie's known ratings.

    Args:
        user_id: ignored; accepted so the function can be called with a
            (user, movie) pair the way score() calls its model.
        movies_id: column label of the movie in r_matrix.

    Returns:
        The mean of the non-null ratings in r_matrix[movies_id], or 3.0
        when the movie has no column in r_matrix.
    """
    #Default to a rating of 3.0 (a float, matching the other models) in the absence of any information
    if movies_id not in r_matrix:
        return 3.0

    #Compute the mean of all the ratings given to the movie
    return r_matrix[movies_id].mean()

#Compute the RMSE of the mean-rating model on the held-out test ratings
score(cf_user_mean)

1.0234701463131335

In [37]:
#Zero-filled stand-in for the ratings matrix: every missing rating becomes 0.
#fillna returns a new dataframe, so r_matrix itself keeps its NaN values.
r_matrix_dummy = r_matrix.fillna(0)

# Import cosine_score
from sklearn.metrics.pairwise import cosine_similarity

#Compute the cosine similarity between every pair of rows of the dummy ratings
#matrix and label both axes of the result with the user ids from r_matrix
cosin_sim = pd.DataFrame(
    cosine_similarity(r_matrix_dummy, r_matrix_dummy),
    index=r_matrix.index,
    columns=r_matrix.index,
)

cosin_sim.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.118076,0.029097,0.011628,0.264677,0.312419,0.308729,0.224269,0.026017,0.286411,...,0.308475,0.055872,0.197862,0.131367,0.152449,0.084456,0.293293,0.056765,0.103536,0.326491
2,0.118076,1.0,0.099097,0.10768,0.034279,0.152789,0.086705,0.078864,0.06894,0.092399,...,0.086927,0.259636,0.289092,0.318824,0.149105,0.186347,0.168034,0.106748,0.136796,0.080358
3,0.029097,0.099097,1.0,0.252131,0.026893,0.062539,0.039767,0.089474,0.078162,0.03767,...,0.040918,0.019031,0.065417,0.055373,0.086503,0.018418,0.096993,0.109631,0.092574,0.018987
4,0.011628,0.10768,0.252131,1.0,0.0,0.045543,0.078812,0.095354,0.059498,0.053879,...,0.024226,0.050703,0.056561,0.107294,0.098892,0.0,0.1329,0.142798,0.097066,0.015176
5,0.264677,0.034279,0.026893,0.0,1.0,0.202843,0.299619,0.163724,0.038474,0.153021,...,0.262547,0.048524,0.048312,0.022202,0.09191,0.066,0.156172,0.115842,0.124297,0.267574


In [38]:
#User Based Collaborative Filter using Weighted Mean Ratings
def cf_user_wmean(user_id, movie_id):
    """Predict a rating as a similarity-weighted mean of the movie's ratings.

    Every known rating of the movie in r_matrix is weighted by the cosine
    similarity (taken from cosin_sim) between the user who gave it and
    user_id.

    Args:
        user_id: label of the target user in cosin_sim.
        movie_id: column label of the movie in r_matrix.

    Returns:
        The weighted mean rating. Falls back to the unweighted mean of the
        movie's ratings when user_id has no column in cosin_sim or when the
        relevant similarity scores sum to zero, and to 3.0 when r_matrix
        holds no rating for the movie.
    """
    #Default to a rating of 3.0 in the absence of any information
    if movie_id not in r_matrix:
        return 3.0

    #Keep only the ratings that actually exist for the movie in question
    m_ratings = r_matrix[movie_id].dropna()
    if m_ratings.empty:
        return 3.0

    #Without similarity scores for this user there is nothing to weight by
    if user_id not in cosin_sim:
        return m_ratings.mean()

    #Select the similarity scores of exactly the users who rated the movie
    sim_scores = cosin_sim[user_id].loc[m_ratings.index]

    #A zero total weight would turn the division below into NaN
    total_sim = sim_scores.sum()
    if total_sim == 0:
        return m_ratings.mean()

    #Compute the final weighted mean
    return np.dot(sim_scores, m_ratings) / total_sim

#Compute the RMSE of the similarity-weighted mean model on the held-out test ratings
score(cf_user_wmean)            

1.0174483808407588