# Getting data

In [1]:
import pandas as pd
data = pd.read_csv("u.data", sep="\t", names = ["userId","movieId","rating","timestamp"])

In [2]:
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [3]:
data.shape

(100000, 4)

In [4]:
data

Unnamed: 0,userId,movieId,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [5]:
data = data.drop('timestamp',axis=1)

# Splitting the data

In [7]:
data.shape

(100000, 3)

In [8]:
train_data=data.iloc[:int(data.shape[0]*0.80)]
test_data=data.iloc[int(data.shape[0]*0.80):]

In [9]:
train_data.shape

(80000, 3)

In [10]:
test_data.shape

(20000, 3)

# SVD MF

In [11]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3096308 sha256=6dae9ecaab882c14807249d7aa146f8a2dce89c07109bf02c056dc4ba32c796d
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.3 surprise-0.1


In [12]:
from surprise import SVD
import numpy as np
import surprise
from surprise import Reader, Dataset

In [13]:
# train set from train data

In [14]:

reader = Reader(rating_scale=(1,5))

# creating the traindata from the dataframe...
train_data_mf = Dataset.load_from_df(train_data[['userId', 'movieId', 'rating']], reader)

# building the trainset from traindata.., It is of dataset format from surprise library..
trainset = train_data_mf.build_full_trainset()

In [15]:
#test set from test data

In [16]:
reader = Reader(rating_scale=(1,5))

# creating the traindata from the dataframe...
test_data_mf = Dataset.load_from_df(test_data[['userId', 'movieId', 'rating']], reader)

# building the trainset from traindata.., It is of dataset format from surprise library..
testset = test_data_mf.build_full_trainset()

In [17]:
svd = SVD(n_factors=100, biased=True, random_state=15, verbose=True)
svd.fit(trainset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f8070f7f550>

In [18]:
# storing train predictions

In [19]:
#getting predictions of trainset
train_preds = svd.test(trainset.build_testset())

train_pred_mf = np.array([pred.est for pred in train_preds])


In [20]:
# storing test predictions

In [21]:
#getting predictions of trainset
test_preds = svd.test(testset.build_testset())

test_pred_mf = np.array([pred.est for pred in test_preds])


In [22]:
data.shape

(100000, 3)

In [23]:
test_pred_mf

array([3.36535102, 2.72843633, 2.51747415, ..., 4.04687003, 3.65536936,
       3.24981723])

# XGBoost

# Preparing train data frame

In [27]:
!pip install scipy
from scipy import sparse
# Creating a sparse matrix
train_sparse_matrix = sparse.csr_matrix((train_data.rating.values, (train_data.userId.values,
                                               train_data.movieId.values)))



In [28]:
# Global avg of all movies by all users

In [29]:
train_averages = dict()
# get the global average of ratings in our train set.
train_global_average = train_sparse_matrix.sum()/train_sparse_matrix.count_nonzero()
train_averages['global'] = train_global_average
train_averages

{'global': 3.5315375}

In [30]:
# geting the user averages in dictionary (key: user_id/movie_id, value: avg rating)

def get_average_ratings(sparse_matrix, of_users):

    # average ratings of user/axes
    ax = 1 if of_users else 0 # 1 - User axes,0 - Movie axes

    # ".A1" is for converting Column_Matrix to 1-D numpy array
    sum_of_ratings = sparse_matrix.sum(axis=ax).A1

    is_rated = sparse_matrix!=0

    no_of_ratings = is_rated.sum(axis=ax).A1


    u,m = sparse_matrix.shape

    average_ratings = { i : sum_of_ratings[i]/no_of_ratings[i]
                                 for i in range(u if of_users else m)
                                    if no_of_ratings[i] !=0}


    return average_ratings

In [31]:
# Average ratings given by a user

In [32]:
train_averages['user'] = get_average_ratings(train_sparse_matrix, of_users=True)
print('\nAverage rating of user 10 :',train_averages['user'][10])


Average rating of user 10 : 4.192771084337349


In [33]:
# Average ratings given for a movie

In [34]:
train_averages['movie'] =  get_average_ratings(train_sparse_matrix, of_users=False)
print('\n AVerage rating of movie 15 :',train_averages['movie'][15])


 AVerage rating of movie 15 : 3.75


In [35]:
# geting users, movies and ratings from our samples train sparse matrix
train_users, train_movies, train_ratings = sparse.find(train_sparse_matrix)

In [36]:
from datetime import datetime

In [37]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
final_data = pd.DataFrame()
count = 0
start = datetime.now()
for (user, movie, rating)  in zip(train_users, train_movies, train_ratings):

            #--------------------- Ratings of "movie" by similar users of "user" ---------------------
            # compute the similar Users of the "user"
            user_sim = cosine_similarity(train_sparse_matrix[user], train_sparse_matrix).ravel()
            top_sim_users = user_sim.argsort()[::-1][1:] # we are ignoring 'The User' from its similar users.
            # get the ratings of most similar users for this movie
            top_ratings = train_sparse_matrix[top_sim_users, movie].toarray().ravel()
            # we will make it's length "5" by adding movie averages to .
            top_sim_users_ratings = list(top_ratings[top_ratings != 0][:5])
            top_sim_users_ratings.extend([train_averages['movie'][movie]]*(5 - len(top_sim_users_ratings)))
        #     print(top_sim_users_ratings, end=" ")


            #--------------------- Ratings by "user"  to similar movies of "movie" ---------------------
            movie_sim = cosine_similarity(train_sparse_matrix[:,movie].T, train_sparse_matrix.T).ravel()
            top_sim_movies = movie_sim.argsort()[::-1][1:]
            top_ratings = train_sparse_matrix[user, top_sim_movies].toarray().ravel()
            top_sim_movies_ratings = list(top_ratings[top_ratings != 0][:5])
            top_sim_movies_ratings.extend([train_averages['user'][user]]*(5-len(top_sim_movies_ratings)))


            #-----------------prepare the row to be stores in a file-----------------#
            row = list()
            row.append(user)
            row.append(movie)
            # Now add the other features to this data...
            row.append(train_averages['global']) # first feature
            # next 5 features are similar_users "movie" ratings
            row.extend(top_sim_users_ratings)
            # next 5 features are "user" ratings for similar_movies
            row.extend(top_sim_movies_ratings)
            # Avg_user rating
            row.append(train_averages['user'][user])
            # Avg_movie rating
            row.append(train_averages['movie'][movie])

            # finalley, The actual Rating of this user-movie pair...
            row.append(rating)
            count = count + 1
            final_data = final_data.append([row])
            print(count)



            if (count)%10000 == 0:

                print("Done for {} rows----- {}".format(count, datetime.now() - start))


In [39]:
final_data.columns=['user', 'movie', 'GAvg', 'sur1', 'sur2', 'sur3', 'sur4', 'sur5',
            'smr1', 'smr2', 'smr3', 'smr4', 'smr5', 'UAvg', 'MAvg', 'rating']

In [40]:
final_data.shape

(80000, 16)

In [41]:
final_data.head()

Unnamed: 0,user,movie,GAvg,sur1,sur2,sur3,sur4,sur5,smr1,smr2,smr3,smr4,smr5,UAvg,MAvg,rating
0,1,1,3.531538,3.0,4.0,5.0,4.0,5.0,5,5,4,3,4.0,3.59761,3.870423,5
0,2,1,3.531538,1.0,4.0,3.0,5.0,2.0,5,5,4,4,4.0,3.672414,3.870423,4
0,5,1,3.531538,5.0,4.0,3.0,5.0,3.0,4,5,4,3,3.0,2.871166,3.870423,4
0,6,1,3.531538,5.0,4.0,3.0,4.0,4.0,4,2,3,1,5.0,3.641414,3.870423,4
0,10,1,3.531538,4.0,4.0,3.0,3.0,4.0,5,5,4,4,4.0,4.192771,3.870423,4


In [42]:
final_data['mf_svd']=train_pred_mf

In [43]:
final_data.head()

Unnamed: 0,user,movie,GAvg,sur1,sur2,sur3,sur4,sur5,smr1,smr2,smr3,smr4,smr5,UAvg,MAvg,rating,mf_svd
0,1,1,3.531538,3.0,4.0,5.0,4.0,5.0,5,5,4,3,4.0,3.59761,3.870423,5,3.840841
0,2,1,3.531538,1.0,4.0,3.0,5.0,2.0,5,5,4,4,4.0,3.672414,3.870423,4,3.526333
0,5,1,3.531538,5.0,4.0,3.0,5.0,3.0,4,5,4,3,3.0,2.871166,3.870423,4,3.532837
0,6,1,3.531538,5.0,4.0,3.0,4.0,4.0,4,2,3,1,5.0,3.641414,3.870423,4,3.972718
0,10,1,3.531538,4.0,4.0,3.0,3.0,4.0,5,5,4,4,4.0,4.192771,3.870423,4,4.184632


# Preparing test data

In [44]:
# Creating a sparse matrix
test_sparse_matrix = sparse.csr_matrix((test_data.rating.values, (test_data.userId.values,
                                               test_data.movieId.values)))

In [45]:
# Global avg of all movies by all users

test_averages = dict()
# get the global average of ratings in our train set.
test_global_average = test_sparse_matrix.sum()/test_sparse_matrix.count_nonzero()
test_averages['global'] = test_global_average
test_averages

{'global': 3.52315}

In [46]:

# get the user averages in dictionary (key: user_id/movie_id, value: avg rating)

def get_average_ratings(sparse_matrix, of_users):

    # average ratings of user/axes
    ax = 1 if of_users else 0 # 1 - User axes,0 - Movie axes


    sum_of_ratings = sparse_matrix.sum(axis=ax).A1

    is_rated = sparse_matrix!=0

    no_of_ratings = is_rated.sum(axis=ax).A1


    u,m = sparse_matrix.shape

    average_ratings = { i : sum_of_ratings[i]/no_of_ratings[i]
                                 for i in range(u if of_users else m)
                                    if no_of_ratings[i] !=0}


    return average_ratings




In [47]:
# Average ratings given by a user

test_averages['user'] = get_average_ratings(test_sparse_matrix, of_users=True)


# Average ratings given for a movie

test_averages['movie'] =  get_average_ratings(test_sparse_matrix, of_users=False)
print('\n AVerage rating of movie 15 :',test_averages['movie'][15])


 AVerage rating of movie 15 : 3.9183673469387754


In [48]:
# get users, movies and ratings from our samples train sparse matrix
test_users, test_movies, test_ratings = sparse.find(test_sparse_matrix)

In [None]:
final_test_data = pd.DataFrame()
count = 0
st = datetime.now()
for (user, movie, rating)  in zip(test_users, test_movies, test_ratings):

            #--------------------- Ratings of "movie" by similar users of "user" ---------------------
            # compute the similar Users of the "user"
            user_sim = cosine_similarity(test_sparse_matrix[user], test_sparse_matrix).ravel()
            top_sim_users = user_sim.argsort()[::-1][1:] # we are ignoring 'The User' from its similar users.
            # get the ratings of most similar users for this movie
            top_ratings = test_sparse_matrix[top_sim_users, movie].toarray().ravel()
            # we will make it's length "5" by adding movie averages to .
            top_sim_users_ratings = list(top_ratings[top_ratings != 0][:5])
            top_sim_users_ratings.extend([test_averages['movie'][movie]]*(5 - len(top_sim_users_ratings)))
        #     print(top_sim_users_ratings, end=" ")


            #--------------------- Ratings by "user"  to similar movies of "movie" ---------------------

            movie_sim = cosine_similarity(test_sparse_matrix[:,movie].T, test_sparse_matrix.T).ravel()
            top_sim_movies = movie_sim.argsort()[::-1][1:]
            top_ratings = test_sparse_matrix[user, top_sim_movies].toarray().ravel()

            top_sim_movies_ratings = list(top_ratings[top_ratings != 0][:5])
            top_sim_movies_ratings.extend([test_averages['user'][user]]*(5-len(top_sim_movies_ratings)))


            #-----------------prepare the row to be stores in a file-----------------#
            row = list()
            row.append(user)
            row.append(movie)
            # Now add the other features to this data...
            row.append(test_averages['global']) # first feature
            # next 5 features are similar_users "movie" ratings
            row.extend(top_sim_users_ratings)
            # next 5 features are "user" ratings for similar_movies
            row.extend(top_sim_movies_ratings)
            # Avg_user rating
            row.append(test_averages['user'][user])
            # Avg_movie rating
            row.append(test_averages['movie'][movie])

            # finalley, The actual Rating of this user-movie pair...
            row.append(rating)
            count = count + 1
            final_test_data = final_test_data.append([row])
            print(count)



            if (count)%10000 == 0:

                print("Done for {} rows----- {}".format(count, datetime.now() - start))


In [50]:
final_test_data.columns=['user', 'movie', 'GAvg', 'sur1', 'sur2', 'sur3', 'sur4', 'sur5',
            'smr1', 'smr2', 'smr3', 'smr4', 'smr5', 'UAvg', 'MAvg', 'rating']

In [51]:
final_test_data.shape

(20000, 16)

In [52]:
test_pred_mf.shape

(20000,)

In [53]:
final_test_data['mf_svd']=test_pred_mf

# creating XGBoost

In [54]:
def get_error_metrics(y_true, y_pred):
    rmse = np.sqrt(np.mean([ (y_true[i] - y_pred[i])**2 for i in range(len(y_pred)) ]))
    mape = np.mean(np.abs( (y_true - y_pred)/y_true )) * 100
    return rmse, mape

In [55]:
# prepare train data
x_train = final_data.drop(['user', 'movie','rating'], axis=1)
y_train = final_data['rating']

In [56]:
# Prepare Test data
x_test = final_test_data.drop(['user','movie','rating'], axis=1)
y_test = final_test_data['rating']

In [57]:
import xgboost as xgb

In [58]:

# initialize XGBoost model...
xgb_model = xgb.XGBRegressor(silent=False, n_jobs=13, random_state=15, n_estimators=100)
# dictionaries for storing train and test results
train_results = dict()
test_results = dict()


# fit the model
print('Training the model..')
start =datetime.now()
xgb_model.fit(x_train, y_train, eval_metric = 'rmse')
print('Done. Time taken : {}\n'.format(datetime.now()-start))
print('Done \n')

Training the model..
Parameters: { "silent" } are not used.





Done. Time taken : 0:00:17.694252

Done 



In [59]:


# from the trained model, get the predictions....
print('Evaluating the model with TRAIN data...')
start =datetime.now()
y_train_pred = xgb_model.predict(x_train)
# get the rmse and mape of train data...
rmse_train, mape_train = get_error_metrics(y_train.values, y_train_pred)


train_results = {'rmse': rmse_train,
                    'mape' : mape_train,
                    'predictions' : y_train_pred}

Evaluating the model with TRAIN data...


In [60]:
train_results

{'rmse': 0.8170338677327665,
 'mape': 24.831875723645087,
 'predictions': array([4.1178036 , 4.116036  , 3.5187724 , ..., 3.1119797 , 0.92970383,
        2.5647666 ], dtype=float32)}

In [61]:

#######################################
# get the test data predictions and compute rmse and mape
print('Evaluating Test data')
y_test_pred = xgb_model.predict(x_test)
rmse_test, mape_test = get_error_metrics(y_true=y_test.values, y_pred=y_test_pred)

test_results = {'rmse': rmse_test,
                    'mape' : mape_test,
                    'predictions':y_test_pred}

Evaluating Test data


In [62]:
test_results

{'rmse': 0.9044797209851807,
 'mape': 27.43864494925737,
 'predictions': array([3.7922049, 4.0005317, 3.2309837, ..., 2.1274107, 2.4639502,
        3.0427215], dtype=float32)}