## 1.Import Libraries

In [49]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import ipynb.fs.full.Neighborhood_based as nb
np.set_printoptions(threshold=np.inf)


## 2. Content-based Filtering

### 2.1. Import the necessary files

#### 2.1.1 User

In [50]:
# df_user
u_cols =  ['user_id', 'age', 'sex', 'occupation', 'zip_code']
df_users = pd.read_csv('./data/u.user', sep='|', names=u_cols, encoding='latin-1')

In [51]:
df_users.head(5)

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [52]:
df_users.shape

(943, 5)

### 2.1.2 User_Item

In [53]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
df_ratings_train = pd.read_csv('./data/ua.base', sep='\t', names=r_cols, encoding='latin-1')
df_ratings_test = pd.read_csv('./data/ua.test', sep='\t', names=r_cols, encoding='latin-1')

In [54]:
df_ratings_train.head(5)

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [55]:
df_ratings_train.shape

(90570, 4)

### 2.1.3 Item

In [56]:
i_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

df_items = pd.read_csv('./data/u.item', sep='|', names=i_cols, encoding='latin-1')

In [57]:
df_items.head(5)

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [58]:
df_items.shape

(1682, 24)

### 2.2 Convert to matrix 

#### 2.2.1.Rate Train  Matrix and  Rate  Test Matrix

In [59]:

rate_train = df_ratings_train.values
rate_test = df_ratings_test.values 
print(rate_test.shape)

(9430, 4)


### 2.2.2. Movie Genre Matrix

In [60]:
X= df_items.values
X_Item_Profile = X[:, 6:] ### chỉ lấy các cột thể loại

### 2.3 Build Filter Contend Based

#### 2.3.1. Build Item Profiles

In [61]:
transformer = TfidfTransformer(smooth_idf=True, norm ='l2')
tfidf=transformer.fit_transform(X_Item_Profile.tolist()).toarray()
 

In [62]:
print(tfidf[0:2,:])

[[0.         0.         0.74066017 0.57387209 0.34941857 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.        ]
 [0.53676706 0.65097024 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.53676706 0.         0.        ]]


#### 2.3.2 Learn Models For Each User

- Find the Item ID and its rating

In [63]:
def get_items_rated_by_user(rate_matrix, user_id):
    y = rate_matrix[:,0]
    ids = np.where(y == user_id +1)[0]
    item_ids = rate_matrix[ids, 1] - 1
    scores = rate_matrix[ids, 2]
    return (item_ids, scores)

**Use ridge regression model to find out weights and deviations**:
   - Input: TF-IDF matrix of Items that have been rated by that user.
   - Label: Rating score of each user.
   - Ridge regression model is used to learn the weights and biases.
   
**Building Linear Regression to Predict Ratings**:
   - Predicted Rating Matrix = tfidf matrix x Weight matrix + bias matrix.
   - This step utilizes the parameters (weights and bias) learned from the Ridge regression model to build a linear regression model for predicting ratings.



In [64]:
class ContentBasedFiltering:
    def __init__(self, tfidf, n_users, n_items, lamda):
        self.lamda = lamda
        self.tfidf = tfidf
        self.f = open('./result/result.dat', 'a+')
        self.n_users = n_users
        self.n_items = n_items
    def fit(self,rate_train):
        d =self.tfidf.shape[1] 
        W = np.zeros((d, self.n_users))
        b = np.zeros((1, self.n_users))
        for n in range(self.n_users):    
            ids, scores = get_items_rated_by_user(rate_train, n)
            clf = Ridge(alpha= self.lamda, fit_intercept  = True)
            Xhat = self.tfidf[ids, :]
            clf.fit(Xhat, scores) 
            W[:, n] = clf.coef_
            b[0, n] = clf.intercept_
        self.Yhat = abs(self.tfidf.dot(W) + b)
    def predict_one_sample(self,user_id,rate_test):
        movie_ids, scores = get_items_rated_by_user(rate_test, user_id)
        id_item=(movie_ids+1).tolist()
        actual_ratting= scores.tolist()
        predict_ratting=self.Yhat[movie_ids, user_id]
        return id_item,actual_ratting,predict_ratting
    def predict(self, rate_test):
        predict_all = []
        for n in range(self.n_users):
            ids, score = get_items_rated_by_user(rate_test, n)
            scores_pred = self.Yhat[ids, n]
            predict_all.append(scores_pred)
        return np.array(predict_all)
    def export_result(self,name_df,actual,RMSE):
        self.f.write(f'Dataset Name:{str(name_df)},Model:{"ridge regression"},Size:{actual.shape[0]},RMSE:{RMSE}\n')
        self.f.close()
    def RMSE(self,actual,predict):
        actual_all = []
        for n in range(self.n_users):
            ids, scores_truth = get_items_rated_by_user(actual, n)
            actual_all.append(scores_truth)
        actual_all=np.array(actual_all)
        RMSE=np.sqrt(mean_squared_error(actual_all,predict))
        return RMSE
        

In [65]:
n_users = df_users.shape[0]
n_items =df_items.shape[0]

#### 2.3.3.The ContendBased Algorithm

In [66]:
cb =ContentBasedFiltering(tfidf, n_users,n_items, lamda=7)
cb.fit(rate_train)

#### 2.3.4. Predicting a sample

In [67]:

movies_id,actual_ratting,predict_ratting = cb.predict_one_sample(0,rate_test)# user1
print('Rated movies ids :', movies_id)
print('True ratings     :', actual_ratting)
print('Predicted ratings:', predict_ratting)


Rated movies ids : [20, 33, 61, 117, 155, 160, 171, 189, 202, 265]
True ratings     : [4, 4, 4, 3, 2, 4, 5, 3, 5, 4]
Predicted ratings: [4.05258587 3.49334641 4.0876576  2.99192374 3.58526105 4.0876576
 4.1537379  3.71704288 3.75937954 3.29926366]


#### 2.4.5. Predicting All 

In [68]:
predict_rating=cb.predict(rate_test)
print(predict_rating.shape)

(943, 10)


#### 2.3.5 Evaluating 

In [69]:
rmse=cb.RMSE(rate_test,predict_rating)
print(f"RMSE:{rmse}")
cb.export_result("100K",rate_test,rmse)

RMSE:1.0245214253474164


#### 2.3.6 Recommend

In [70]:
def recommend(user_id,n_items,predict_rate,rate_test, top):
        a = np.zeros((n_items,))
        recommended_items = []
        items_rated_by_user, score = get_items_rated_by_user(rate_test, user_id)
        for i in range(n_items):
            if i not in items_rated_by_user:
                a[i] = predict_rate[i, user_id]
        if len(a) < top:
            recommended_items = np.argsort(a)[-len(a):]
        else:
            recommended_items = np.argsort(a)[-top:]
        return recommended_items+1
    
recommend_movies = recommend(0, cb.n_items, cb.Yhat, rate_test, 10)
filtered_movies = df_items[df_items['movie id'].isin(recommend_movies)]
for index, row in filtered_movies.iterrows():
    print(f"Index: {index}, Title: {row['movie title']}")

Index: 6, Title: Twelve Monkeys (1995)
Index: 178, Title: Clockwork Orange, A (1971)
Index: 257, Title: Contact (1997)
Index: 269, Title: Gattaca (1997)
Index: 428, Title: Day the Earth Stood Still, The (1951)
Index: 433, Title: Forbidden Planet (1956)
Index: 551, Title: Species (1995)
Index: 759, Title: Screamers (1995)
Index: 1005, Title: Until the End of the World (Bis ans Ende der Welt) (1991)
Index: 1153, Title: Alphaville (1965)


### 1M

In [71]:
"""u_cols =  ['user_id', 'sex', 'age', 'occupation', 'zip_code']
users = pd.read_csv('ml-1m/users.dat', sep='::', names=u_cols,
 encoding='latin-1')
n_users = users.shape[0]

r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('mvl/1M_train_03.dat', sep=':', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('mvl/1M_test_03.dat', sep=':', names=r_cols, encoding='latin-1')

rate_train = ratings_base.as_matrix()
rate_test = ratings_test.as_matrix()

i_cols = ['movie id', 'title' ,'year', 'gen']
n_items = 3951

items = pd.read_csv('ml-1m/movies.dat', sep='::', names=i_cols, encoding='latin-1')
X = items.as_matrix()
X_train_count = np.full(shape = (n_items, 19), fill_value = 0)

genresList = [
  "Action",
  "Adventure",
  "Animation",
  "Children",
  "Comedy",
  "Crime",
  "Documentary",
  "Drama",
  "Fantasy",
  "Film-Noir",
  "Horror",
  "Musical",
  "Mystery",
  "Romance",
  "Sci-Fi",
  "Thriller",
  "War",
  "Western",
  "(no genres listed)"
]

def setGenresMatrix(genres):
    movieGenresMatrix = []
    movieGenresList = genres.split('|')
    for x in genresList:
        if (x in movieGenresList):
            movieGenresMatrix.append(1)
        else:
            movieGenresMatrix.append(0)
    return movieGenresMatrix
for i in range(n_items):
    X_train_count[i] = setGenresMatrix(X[i+1, 3])
    """

'u_cols =  [\'user_id\', \'sex\', \'age\', \'occupation\', \'zip_code\']\nusers = pd.read_csv(\'ml-1m/users.dat\', sep=\'::\', names=u_cols,\n encoding=\'latin-1\')\nn_users = users.shape[0]\n\nr_cols = [\'user_id\', \'movie_id\', \'rating\', \'unix_timestamp\']\n\nratings_base = pd.read_csv(\'mvl/1M_train_03.dat\', sep=\':\', names=r_cols, encoding=\'latin-1\')\nratings_test = pd.read_csv(\'mvl/1M_test_03.dat\', sep=\':\', names=r_cols, encoding=\'latin-1\')\n\nrate_train = ratings_base.as_matrix()\nrate_test = ratings_test.as_matrix()\n\ni_cols = [\'movie id\', \'title\' ,\'year\', \'gen\']\nn_items = 3951\n\nitems = pd.read_csv(\'ml-1m/movies.dat\', sep=\'::\', names=i_cols, encoding=\'latin-1\')\nX = items.as_matrix()\nX_train_count = np.full(shape = (n_items, 19), fill_value = 0)\n\ngenresList = [\n  "Action",\n  "Adventure",\n  "Animation",\n  "Children",\n  "Comedy",\n  "Crime",\n  "Documentary",\n  "Drama",\n  "Fantasy",\n  "Film-Noir",\n  "Horror",\n  "Musical",\n  "Mystery",\

In [72]:
"""cb2 = Contentbased(rate_train, X_train_count, n_users= n_users, n_items = n_items, lamda=98)
cb2.fit()
cb2.RMSE(Data_test=rate_test)
"""

'cb2 = Contentbased(rate_train, X_train_count, n_users= n_users, n_items = n_items, lamda=98)\ncb2.fit()\ncb2.RMSE(Data_test=rate_test)\n'