In [29]:
# User model based Collaborative Filtering Recommendation System

import numpy as np
import pandas as pd
import datetime

In [30]:
# Creating a dictionary of the mean of the ratings of each user
def find_mean(data):
    u_mean = {}
    for u_row in range(len(data.index)):
        user_id_for_mean = data.index[u_row]
        mean = pd.Series(data.ix[user_id_for_mean]).values
        mean = mean[~np.isnan(mean)].mean()
        u_mean[user_id_for_mean] = mean
    return u_mean


In [39]:
# Calculating Cosine similarity to find out the similar users

def cosine_similarity(u1, u2):
    u1 = np.asarray(u1)
    u2 = np.asarray(u2)
    return np.sum(u1 * u2) / np.sqrt(np.sum(u1 ** 2) * np.sum(u2 ** 2))


# Creating a dictionary of users with their corresponding similar users and their cosine similarity values.

def get_similarity(data):
    similarity = {}
    for row in range(len(data.index)):
        user_id = data.index[row]
        reviews = {}
        for x in range(len(data.index)):
            uid = data.index[x]
            if uid == user_id:
                continue
            if user_id in similarity:
                if uid in similarity[user_id]:
                    reviews[uid] = similarity[user_id][uid]
                    continue
            correlation = cosine_similarity(pd.Series(data.ix[uid]).values, pd.Series(data.ix[user_id]).values)
            reviews[uid] = correlation
            if uid in similarity:
                temp = similarity[uid]
                if bool(temp):
                    temp[user_id] = reviews[uid]
                    similarity[uid] = temp
            else:
                temp = {}
                temp[user_id] = reviews[uid]
                similarity[uid] = temp

        reviews = sorted(reviews.iteritems(), key=lambda (k, v): (v, k), reverse=True)
        similarity[user_id] = reviews
    return similarity


In [32]:
# Changing the training data pivot table - (actual value - mean) and insert into a new pivot table
def alter_training_data_for_similarity(data, mean):
    for index in range(len(data.index)):
        user_id_for_mean = data.index[index]
        movies_for_mean_uid = pd.Series(data.ix[user_id_for_mean]).values
        movies_for_mean_uid = [0 if np.isnan(x) else x - mean[user_id_for_mean] for x in movies_for_mean_uid]
        data.loc[user_id_for_mean] = pd.Series(movies_for_mean_uid).values
    return data

In [33]:
# Rating Prediction for movie 'm_id' by user 'u_id'

def predict_rating(u_id, m_id):
    global training_data
    global similarity
    user_list = similarity[u_id]
    k_users = 10 # numbers of top similar users
    numerator = 0.0
    denominator = 0.0
    for index in range(k_users):
        user_id_for_correlation = user_list[index][0]
        cor = user_list[index][1]
        for m_id_tuple in training_data.columns.tolist():
            if m_id_tuple[1] == m_id:
                r = training_data.ix[user_id_for_correlation].loc[:, m_id].values[0]
                break
            else:
                r = 0
                continue

        rating = 0 if np.isnan(r) else r
        numerator = numerator + (rating * cor)
        denominator = denominator + cor
        prediction = numerator / denominator
        if np.isnan(prediction) or prediction <= 1:
            return 1
        else:
            return int(round(prediction))

In [34]:
# Recommending movies to the user:
## First predict the rating of all movies which are NaN in training data by each user and then recommend the k moies 
### whose predicted rating is greater than or equal to the 80% of the mean value.

def recommendedMovies(trainingData, movies_data_1, predicted_rating, user_rating_mean, number_of_recommendation):
    user_rating_prediction = {}
    for user in predicted_rating:
        mean = user_rating_mean[user]
        movie_rating_prediction = {}
        for m in trainingData.columns.tolist():
            movie = m[1]
            if np.isnan(trainingData.ix[user].loc[:,movie].values[0]):
                movie_prediction = predict_rating(user, movie)
                if movie_prediction >= (mean*0.8):
                    movie_filter = movies_data_1[movies_data_1['movie_id'] == movie]
                    movie_name = movie_filter.get_value(movie_filter.index[0], 'movie_name')
                    movie_rating_prediction[movie] = movie_name
                if len(movie_rating_prediction) == number_of_recommendation:
                    break
        user_rating_prediction[user] = movie_rating_prediction
    return user_rating_prediction


In [35]:
# Finding RMSE value
def find_root_mean_square(data, predicted_ratings):
    root_mean_square = []
    for u_id_r in data.index:
        for m_id_c in data.columns.tolist():
            observed = data.ix[u_id_r].loc[:, m_id_c[1]].values[0]
            if m_id_c[1] in predicted_user_rating_for_test_data[u_id_r].keys():
                prediction = predicted_ratings[u_id_r][m_id_c[1]]
                root_mean_square.append((observed - prediction) ** 2)
    n = len(root_mean_square)
    root_mean_square = [rms for rms in root_mean_square if np.isfinite(rms)]
    return np.sqrt(np.sum(root_mean_square) / n)

In [36]:
# Input Files:
## movies.dat - It contains information of movies which has header - Movie_id, Movie_name and genre. 
### This file has total 3883 records
## ratings.dat - It Contains information of ratings given by users to the movies, headers - User_id, Movie_id, rating, timestamp
### This file has 900,188 records

### Ratings file has been shuffled first and then splitted into two files:
### training-data(80% of total records) and test-data(200 records)


ratings_header = ['user_id', 'movie_id', 'rating', 'timestamp']
movies_header = ['movie_id', 'movie_name', 'genre']

In [37]:
# Reading movies.dat and ratings_train.dat file and create a pivot_table of training data.
# Also, drop the timestamp column from ratings file because it's of no use in the computation.

movies_data = pd.read_csv('movies.dat', sep='::', names=movies_header, engine='python')
ratings = pd.read_csv('ratings_train.dat', sep='::', names=ratings_header, engine='python')
ratings.drop(['timestamp'], axis=1, inplace=True)

training_data = pd.pivot_table(ratings, index=['user_id'], columns=['movie_id'], values=['rating'])


In [40]:
user_mean = find_mean(training_data)

training_mean_data = alter_training_data_for_similarity(training_data.copy(), user_mean)

similarity = get_similarity(training_mean_data)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  """


In [41]:
# Reading test data and droping the column timestamp. Creating a pivot_table of the data.
# This test data has 200 records and the format is:
## user_id, movie_id, rating, timestamp
## By using this test data we will predict the rating of each movie by user and then find the root mean square error.

test_data = pd.read_csv('ratings_test1.dat', sep="::", names=ratings_header, engine='python')
test_data.drop(['timestamp'], axis=1, inplace=True)
test_data = test_data.pivot_table(index=['user_id'], columns=['movie_id'], values=['rating'])


In [43]:
# Finding Root Mean Square Error against test data
predicted_user_rating_for_test_data = {}
for user_id_row in test_data.index:
    predicted_movie_rating = {}
    for movie_id_column_tuple in test_data.columns.tolist():
        movie_id_column = movie_id_column_tuple[1]
        if np.isnan(test_data.ix[user_id_row].loc[:, movie_id_column].values[0]):
            continue
        predicted_movie_rating[movie_id_column] = predict_rating(user_id_row, movie_id_column)
    predicted_user_rating_for_test_data[user_id_row] = predicted_movie_rating

rmse = find_root_mean_square(test_data, predicted_user_rating_for_test_data)
print 'Root Mean Square Error = ',rmse

Root Mean Square Error =  2.62488094968


In [44]:
# Recommending and printing movies for users
recommended_movies = recommendedMovies(training_data, movies_data, predicted_user_rating_for_test_data, user_mean, 10)

In [45]:
for user_recommendation in recommended_movies:
    print '(', datetime.datetime.now(), ') For User - ', user_recommendation
    print '\n' , str(recommended_movies[user_recommendation]) , '\n'


( 2017-11-12 18:23:08.625000 ) For User -  5636

{96L: 'In the Bleak Midwinter (1995)', 482L: 'Killing Zoe (1994)', 36L: 'Dead Man Walking (1995)', 300L: 'Quiz Show (1994)', 337L: "What's Eating Gilbert Grape (1993)", 306L: 'Three Colors: Red (1994)', 595L: 'Beauty and the Beast (1991)', 246L: 'Hoop Dreams (1994)', 537L: 'Sirens (1994)', 25L: 'Leaving Las Vegas (1995)'} 

( 2017-11-12 18:23:08.848000 ) For User -  3589

{32L: 'Twelve Monkeys (1995)', 2L: 'Jumanji (1995)', 43L: 'Restoration (1995)', 7L: 'Sabrina (1995)', 11L: 'American President, The (1995)', 78L: 'Crossing Guard, The (1995)', 47L: 'Seven (Se7en) (1995)', 20L: 'Money Train (1995)', 23L: 'Assassins (1995)', 39L: 'Clueless (1995)'} 

( 2017-11-12 18:23:08.850000 ) For User -  4618

{2528L: "Logan's Run (1976)", 1028L: 'Mary Poppins (1964)', 2188L: '54 (1998)', 1965L: 'Repo Man (1984)', 1584L: 'Contact (1997)', 1653L: 'Gattaca (1997)', 2133L: 'Adventures in Babysitting (1987)', 2010L: 'Metropolis (1926)', 2046L: 'Flight of

{356L: 'Forrest Gump (1994)', 1127L: 'Abyss, The (1989)', 1037L: 'Lawnmower Man, The (1992)', 750L: 'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)', 593L: 'Silence of the Lambs, The (1991)', 21L: 'Get Shorty (1995)', 344L: 'Ace Ventura: Pet Detective (1994)', 260L: 'Star Wars: Episode IV - A New Hope (1977)', 1246L: 'Dead Poets Society (1989)', 1215L: 'Army of Darkness (1993)'} 

( 2017-11-12 18:23:09.236000 ) For User -  4279

{507L: 'Perfect World, A (1993)', 6L: 'Heat (1995)', 423L: 'Blown Away (1994)', 168L: 'First Knight (1995)', 653L: 'Dragonheart (1996)', 366L: "Wes Craven's New Nightmare (1994)", 367L: 'Mask, The (1994)', 504L: 'No Escape (1994)', 111L: 'Taxi Driver (1976)', 220L: 'Castle Freak (1995)'} 

( 2017-11-12 18:23:09.238000 ) For User -  698

{36L: 'Dead Man Walking (1995)', 70L: 'From Dusk Till Dawn (1996)', 10L: 'GoldenEye (1995)', 45L: 'To Die For (1995)', 47L: 'Seven (Se7en) (1995)', 17L: 'Sense and Sensibility (1995)', 50L: 'Usual Su