## 1.Import Libraries

In [44]:
import pandas as pd 
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
from sklearn.metrics import mean_squared_error
np.set_printoptions(threshold=np.inf)


### 2.Neighborhood-based Collaborative Filtering

### 2.1. Import the necessary files

#### 2.1.1. User

In [45]:
u_cols =  ['user_id', 'age', 'sex', 'occupation', 'zip_code']
df_users = pd.read_csv('./data/u.user', sep='|', names=u_cols, encoding='latin-1')

In [46]:
df_users.head(5)

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [47]:
df_users.shape

(943, 5)

### 2.1.2. User_Item

In [48]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
df_ratings_train = pd.read_csv('./data/ub.base', sep='\t', names=r_cols, encoding='latin-1')
df_ratings_test = pd.read_csv('./data/ub.test', sep='\t', names=r_cols, encoding='latin-1')

In [49]:
df_ratings_train.head(5)

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [50]:
df_ratings_train.shape

(90570, 4)

### 2.1.3. Item

In [51]:
i_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

df_items = pd.read_csv('./data/u.item', sep='|', names=i_cols, encoding='latin-1')

In [52]:
df_items.head(5)

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [53]:
df_items.shape

(1682, 24)

### 2.2. Convert to matrix 

#### 2.2.1.Rate Train  Matrix and  Rate  Test Matrix

In [54]:
rate_train = df_ratings_train.values
rate_test = df_ratings_test.values 
# indices start from 0
rate_train[:, :2] -= 1
rate_test[:, :2] -= 1

#### 2.2.2. User Matrix and Item Matrix

In [55]:
items=df_items.values
users=df_users.values

### 2.3.Built  KNN-User-User Model And KNN-Item-Item Model

KNN-User-User:
- First we have to standardize ub.base to the user-item matrix by finding the average rating of each column (user) then subtract the rating score of each column (user) in the user-item matrix. average corresponding to that user's column
- Next, divide into 3 types of values ​​as follows: if it is negative, it means that person does not like it, if it is positive, then that person likes it, and for missing values, we fill in 0, which means it is unclear.
- Then calculate the Cosine similarity of each user with each other, with each user being a vector containing the value of that user's rating points in the system.

KNN-Item-Item:
- First we have to normalize ub.base to the user-item matrix by finding the average rating of each row (item) and then subtract the rating score of each row (item) in the user-item matrix. average of corresponding rows of that item
- Next, divide into 3 types of values ​​as follows: if it is negative, it means that person does not like it, if it is positive, then that person likes it, and for missing values, we fill in 0, which means it is unclear.
- Then calculate the Cosine similarity of each item with each other, with each item being a vector containing the value of that item's rating points in the system.

In [56]:

class NBCF(object):
    def __init__(self, k, n_items,n_user,uuCF = 1, dist_f = cosine_similarity, limit = 10):
        self.uuCF = uuCF
        self.f = open('./result/result.dat', 'a+')
        self.Ybar = None
        self.k = k
        self.limit = limit
        self.dist_func = dist_f
        self.users_count = n_user if uuCF else n_items
        self.items_count = n_items if uuCF else n_user
        self.rate_t=None
    def normalizeY(self,rate_train):
        self.rate_t = rate_train if self.uuCF else rate_train[:,[1, 0, 2]]
        users = self.rate_t[:, 0]
        self.Ybar = self.rate_t.copy()
        self.mu = np.zeros((self.users_count,))
        for i in range(self.users_count):
            ids = np.where(users == i)[0].astype(int)
            ratings = self.rate_t[ids, 2]
            m = np.mean(ratings)
            if np.isnan(m):
                m = 0
            self.mu[i] = m
            self.Ybar[ids, 2] = ratings - self.mu[i]
        self.Ybar = sparse.coo_matrix((self.Ybar[:, 2],
            (self.Ybar[:, 1], self.Ybar[:, 0])), (self.items_count, self.users_count))
        self.Ybar = self.Ybar.tocsr()
        
    def similarity(self):
        self.S = self.dist_func(self.Ybar.T, self.Ybar.T)
    
    def fit(self,rate_train):
        self.normalizeY(rate_train)
        self.similarity()
    
    def pred(self, u, i, normalized = 1):
        ids = np.where(self.rate_t[:, 1] == i)[0].astype(int)
        if ids.size == 0:
            return 0
        users = (self.rate_t[ids, 0]).astype(int)
        sim = self.S[u, users]
        a = np.argsort(sim)[-self.k:]
        nearest = sim[a]
        r = self.Ybar[i, users[a]]
        if normalized:
            return (r*nearest)[0]/(np.abs(nearest).sum() + 1e-8)
        return (r*nearest)[0]/(np.abs(nearest).sum() + 1e-8) + self.mu[u]
          
    def _pred(self, u, i, normalized = 1):
        if self.uuCF: return self.pred(u, i, normalized)
        return self.pred(i, u, normalized)
    def predict_test(self,rating_test):
        predictions = np.zeros(rating_test.shape[0],)
        n_tests=rating_test.shape[0]
        for n in range(n_tests):
            pred = self._pred(rating_test[n, 0], rating_test[n, 1], normalized = 0)
            predictions[n]=pred
        return predictions
    def export_result(self,name_df, Data_test,RMSE):
        if self.uuCF==1:
            model="KNN-User-User"
        else:
            model= "KNN-Item-Item"
        if self.f.close:
            self.f = open('./result/result.dat', 'a+')
        self.f.write(f'Dataset Name:{str(name_df)},Model:{model},K:{self.k},Size:{Data_test.shape[0]},RMSE:{RMSE}\n')
        self.f.close()
    def RMSE(self,Data_test,y_predict):
        result=mean_squared_error(Data_test[:,2],y_predict)
        RMSE=np.sqrt(result)
        return RMSE
    def recommend(self, u):
        if self.uuCF:
            ids = np.where(self.rate_t[:, 0] == u)[0].astype(int)
            items_rated_by_user = self.rate_t[ids, 1].tolist()
            n = self.items_count
        else:
            ids = np.where(self.rate_t[:, 1] == u)[0].astype(int)
            items_rated_by_user = self.rate_t[ids, 0].tolist()
            n = self.users_count
        a = np.zeros((n,))
        recommended_items = []
        for i in range(n):
            if i not in items_rated_by_user:
                a[i] = self._pred(u, i)
        if len(a) < self.limit:
            recommended_items = np.argsort(a)[-len(a):]
        else:
            recommended_items = np.argsort(a)[-self.limit:]
        return recommended_items
        

#### 2.3.1. KNN-User-User Model

In [57]:
rs_uu = NBCF(k=30,n_items=items.shape[0],n_user=users.shape[0],uuCF=1)
rs_uu.fit(rate_train)


#### 2.3.3. Predicting  KNN-User-User Model

In [58]:
predict_u=rs_uu.predict_test(rate_test)

### 2.3.3 Evaluating KNN-User-User Model

In [59]:
rmse_uu=rs_uu.RMSE(rate_test,predict_u)
print(f"RMSE KNN-User-User:{rmse_uu}")
rs_uu.export_result("100K",rate_test,rmse_uu)

RMSE KNN-User-User:0.9963187578181227


#### 2.3.4. KNN-Item-Item Model

In [60]:
rs_ii=NBCF(k=30,n_items=items.shape[0],n_user=users.shape[0],uuCF=0)
rs_ii.fit(rate_train)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


#### 2.3.5. Predicting  KNN-Item-Item Model

In [61]:
predict_i=rs_ii.predict_test(rate_test)
predict_i

array([3.18767553, 3.79016328, 4.44418971, 3.01509027, 3.65518388,
       4.08223865, 3.92725186, 2.91519709, 4.08073565, 3.27271101,
       3.74298116, 3.24743614, 3.00287984, 3.2939914 , 3.7726665 ,
       3.38769347, 3.10070882, 4.19992589, 4.08714509, 4.16900706,
       2.65572743, 3.19639787, 4.15550088, 2.61023516, 2.43920249,
       3.10822908, 3.05085537, 3.14024292, 2.86570646, 2.37839355,
       4.11185534, 4.42962869, 3.92765434, 3.84732501, 4.05696584,
       3.88534912, 3.70031047, 3.35452463, 4.15462364, 4.00451451,
       3.34459644, 2.97722355, 2.85014799, 3.86578277, 2.70171772,
       2.66185947, 2.1620583 , 3.35712839, 2.36218422, 3.32921664,
       3.76339421, 3.98461086, 3.51978882, 4.0305963 , 2.86970338,
       4.19769275, 3.66153787, 4.03838959, 3.41154491, 3.32611339,
       4.0889349 , 4.26065517, 4.10722383, 3.43975867, 3.10987331,
       3.90868673, 3.29578413, 3.25573193, 3.9804404 , 3.75740599,
       4.18987924, 4.1661957 , 4.11331056, 4.28214817, 4.19573

#### 2.3.6 Evaluating KNN-Item-Item Model

In [62]:
rmse_ii=rs_ii.RMSE(rate_test,predict_i)
print(f"RMSE KNN-Item-Item:{rmse_ii}")
rs_ii.export_result("100K",rate_test,rmse_ii)

RMSE KNN-Item-Item:0.9867912132705395
