
<h1 align="center"><font size="5">COLLABORATIVE FILTERING USER-BASED</font></h1>

In [1]:
import pandas as pd
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from surprise import Reader, Dataset
from surprise import SVD
from surprise.model_selection import GridSearchCV
from surprise import Reader, Dataset, KNNBasic, SVD
from surprise.model_selection import GridSearchCV, cross_validate
from sklearn.neighbors import NearestNeighbors
from scipy.stats import pearsonr
import time as t


## Load data 

In [2]:

original_data = pd.read_csv('data_preprocessed.csv')

In [3]:

original_data.head()

Unnamed: 0.1,Unnamed: 0,userId,sceneId,rating
0,0,1,1,2
1,1,1,2,1
2,2,1,3,1
3,3,1,4,2
4,4,1,5,2


In [4]:
#Dropping columns  Unnamed
data = original_data.drop(['Unnamed: 0'], 1)
#data=original_data


<hr>

<a id="ref3"></a>
# Collaborative Filtering

### User-Item matrix

In [5]:

movie_matrix = data.pivot(index='userId', columns='sceneId',values='rating').fillna(0).astype(int)

In [6]:
movie_matrix


sceneId,1,2,3,4,5,6,7,8,9,10,11,12
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,2,1,1,2,2,1,2,2,3,4,4,4
2,3,1,0,1,2,0,3,1,3,4,5,3
3,3,1,1,1,2,1,2,2,3,3,4,4
4,2,2,1,1,1,0,2,2,4,4,4,5
5,2,2,1,1,1,1,2,2,3,4,4,4
6,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,1,0,0,0,0,0,0
8,0,0,0,1,0,1,1,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,1,0,0
10,1,0,0,0,1,0,0,0,0,0,0,0


In [7]:
data.groupby("userId")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000024BE48DFA90>

### Select a user x

In [8]:
selected_user=2

In [9]:
top_n_users=5

In [10]:
top_n_items=5

In [11]:
now=t.time()
inputMovies=data.groupby('userId').get_group(selected_user)

### The users who has seen the same movies as user x



In [12]:
userSubset = data[data['sceneId'].isin(inputMovies['sceneId'].tolist())]

In [13]:

userSubsetGroup = userSubset.groupby(['userId'])

In [14]:
#sort group by number of movies in common
userSubsetGroup = sorted(userSubsetGroup,  key=lambda x: len(x[1]), reverse=True)

### Compute similarity users to input user using Pearson Correlation


In [15]:
pearsonCorrelationDict = {}

for name, group in userSubsetGroup:
    group = group.sort_values(by='sceneId')
    inputMovies = inputMovies.sort_values(by='sceneId')
    nRatings = len(group)
   
    temp_df = inputMovies[inputMovies['sceneId'].isin(group['sceneId'].tolist())]
    
    tempRating = temp_df['rating'].tolist()
   
    tempGroup = group['rating'].tolist()
    
    #calculate the pearson correlation 
    Syy = sum([i**2 for i in tempGroup]) - pow(sum(tempGroup),2)/float(nRatings)
    Sxx = sum([i**2 for i in tempRating]) - pow(sum(tempRating),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRating, tempGroup)) - sum(tempRating)*sum(tempGroup)/float(nRatings)
   
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0


In [16]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))


### The top x similar users to input user


In [17]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[1:top_n_users]
topUsers

Unnamed: 0,similarityIndex,userId
2,0.881358,3
0,0.860474,1
4,0.82534,5
3,0.785409,4


In [18]:
#merge rating_df with top users
topUsersRating=topUsers.merge(data, left_on='userId', right_on='userId', how='inner')


## Compute similarity score

multiply the movie rating by its weight (The similarity index)
,then sum up the new ratings and divide it by the sum of the weights.

In [19]:
#Multiplies the similarity by the user's ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
topUsersRating=topUsersRating[topUsersRating['similarityIndex']>0]

In [20]:
#Applies a sum to the topUsers after grouping it up by userId
tempTopUsersRating = topUsersRating.groupby('sceneId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
#tempTopUsersRating.head()

In [21]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()
#Now we take the weighted average
recommendation_df['prediction score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['sceneId'] = tempTopUsersRating.index
recommendation_df.head()

Unnamed: 0_level_0,prediction score,sceneId
sceneId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2.262889,1
2,1.48045,2
3,1.0,3
4,1.25666,4
5,1.51955,5


Sort and see the top x movies that the algorithm recommended

In [22]:
recommendation_df = recommendation_df.sort_values(by='prediction score', ascending=False)
recommendation_df

Unnamed: 0_level_0,prediction score,sceneId
sceneId,Unnamed: 1_level_1,Unnamed: 2_level_1
12,4.23427,12
11,4.0,11
10,3.737111,10
9,3.23427,9
1,2.262889,1
7,2.0,7
8,2.0,8
5,1.51955,5
2,1.48045,2
4,1.25666,4


In [23]:
#Movies not rated by user X
movie_not_rated={}
movie_not_rated_idexes={}
for i,row in movie_matrix.iterrows():
    combine=list(zip(row.index,row.values,row))
    idx_row=[(idx,col) for idx,val,col in combine if val==0]
    indixes=[i[1] for i in idx_row]
    row_names=[i[0] for i in idx_row]
    movie_not_rated_idexes[i]=indixes
    movie_not_rated[i]=row_names


In [24]:

final_recommendation=recommendation_df.loc[recommendation_df['sceneId'].isin(movie_not_rated.get(selected_user))].sort_values(by='prediction score', ascending=False)
print("Recommendations for user {0} using user-based :".format(selected_user))
final_recommendation.head(top_n_items)
later=t.time()


Recommendations for user 2 using user-based :


In [25]:
final_recommendation.head()

Unnamed: 0_level_0,prediction score,sceneId
sceneId,Unnamed: 1_level_1,Unnamed: 2_level_1
3,1.0,3
6,0.76573,6


In [26]:
sim_options = {
    "name": "cosine",
    "user_based": True,  # Compute  similarities between items
}
algo = KNNBasic(sim_options=sim_options)

In [27]:
reader = Reader(rating_scale=(1, 5))

data_df = Dataset.load_from_df( data[['userId', 'sceneId', 'rating']], reader = reader )

In [28]:
cross_validate(algo=algo, data=data_df, measures=['RMSE','MAE'], cv=5, verbose=True)
later=t.time()
print(later-now)


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.2140  1.4461  1.4447  1.3805  1.2599  1.3490  0.0957  
MAE (testset)     1.0343  1.2177  1.2467  1.1881  1.0544  1.1483  0.0871  
Fit time          0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    
0.3165409564971924


  sim = construction_func[name](*args)
