
<h1 align="center"><font size="5">COLLABORATIVE FILTERING ITEM-BASED</font></h1>

In [1]:
import pandas as pd
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from surprise.model_selection import GridSearchCV
from surprise import Reader, Dataset, KNNBasic, SVD, NMF,KNNWithMeans
from surprise.model_selection import GridSearchCV, cross_validate
from sklearn.model_selection import train_test_split
import time as t


## Load data 

In [2]:

prepared_data = pd.read_csv('data_preprocessed.csv')

In [3]:

prepared_data.head()

Unnamed: 0.1,Unnamed: 0,userId,sceneId,rating
0,0,1,1,1
1,1,1,2,1
2,2,1,3,3
3,3,1,4,3
4,4,1,5,3


In [4]:
#Dropping columns  Unnamed
data = prepared_data.drop(['Unnamed: 0'], 1)
#data=original_data


<hr>

<a id="ref3"></a>
# Collaborative Filtering

### User-Item matrix

In [5]:

movie_matrix = data.pivot(index='userId', columns='sceneId',values='rating').fillna(0).astype(int)

In [6]:
movie_matrix


sceneId,1,2,3,4,5,6,7,8,9,10,11,12
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1,1,3,3,3,1,4,4,3,3,2,3
2,1,1,3,3,4,0,5,4,4,3,3,2
3,1,1,3,3,3,1,4,4,3,3,2,3
4,1,1,3,3,3,1,4,5,4,4,2,3
5,1,1,3,2,3,1,4,5,3,3,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...
996,2,3,4,3,4,3,3,4,2,2,2,3
997,2,2,3,3,4,3,2,3,3,2,2,4
998,3,2,4,4,4,2,2,3,3,2,1,3
999,4,3,4,4,3,3,2,3,2,2,2,3


In [7]:
data.groupby("userId")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000020DCB2F92B0>

In [8]:
#Movies not rated by user X
movie_not_rated={}
movie_not_rated_idexes={}
for i,row in movie_matrix.iterrows():
    combine=list(zip(row.index,row.values,row))
    idx_row=[(idx,col) for idx,val,col in combine if val==0]
    indixes=[i[1] for i in idx_row]
    row_names=[i[0] for i in idx_row]
    movie_not_rated_idexes[i]=indixes
    movie_not_rated[i]=row_names

### Select user x

In [9]:
selected_user=2

In [10]:
top_n_users=5

In [11]:
top_n_items=5

### Get unknow items

In [12]:

movies_to_predict=movie_not_rated.get(selected_user)


In [13]:
my_reccomendations = []

### Load data with surprise Bib

In [14]:
reader = Reader(rating_scale=(1, 5))

data_df = Dataset.load_from_df( data[['userId', 'sceneId', 'rating']], reader = reader )

### Use KNN Algo

In [15]:

sim_options = {
    "name": "cosine",
    "user_based": False,  # Compute  similarities between items
}
algo = KNNBasic(sim_options=sim_options)

In [16]:
knn_recommendations=[]
now=t.time()

In [17]:
algo.fit(data_df.build_full_trainset())


Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x20dd055d310>

In [18]:
inner_id = algo.trainset.to_inner_iid(1)
neighbors = algo.get_neighbors(inner_id, k=10)
neighbors

[1, 5, 2, 3, 4, 11, 10, 7, 8, 9]

In [19]:
for movie_id in movies_to_predict:
    knn_recommendations.append((movie_id, algo.predict(uid=selected_user,iid=movie_id).est))
     
knn_rec=pd.DataFrame(knn_recommendations, columns=['sceneId', 'prediction score'])

In [20]:
print("Recommendations for user {0} using knn item-based :".format(selected_user))
knn_rec.sort_values('prediction score', ascending=False).head(top_n_items)


Recommendations for user 2 using knn item-based :


Unnamed: 0,sceneId,prediction score
0,6,2.671727


In [21]:
cross_validate(algo=algo, data=data_df, measures=['RMSE','MAE'], cv=5, verbose=True)
later=t.time()
print(later-now)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9630  0.9358  0.9732  0.9486  0.9424  0.9526  0.0137  
MAE (testset)     0.7471  0.7323  0.7512  0.7306  0.7372  0.7397  0.0082  
Fit time          0.02    0.02    0.02    0.02    0.01    0.01    0.00    
Test time         0.04    0.03    0.04    0.06    0.03    0.04    0.01    
0.38523316383361816


#### Using GridSearchCV to find best k parameter

In [22]:
n_neighbours = [5, 10, 15]
param_grid = {'n_neighbours' : n_neighbours}

gs = GridSearchCV(KNNBasic, measures=['RMSE'], param_grid=param_grid)
gs.fit(data_df)

print('Best Score :', gs.best_score['rmse'])
print('Best Parameters :', gs.best_params['rmse'])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi