### Import Libraries

In [29]:
import pandas as pd
import numpy as np

### Holding Data

In [30]:
user_cols = ['id', 'age', 'sex', 'occupation', 'zip_code']
user_pd = pd.read_csv('../Dataset/ml-100k/u.user', sep='|', names=user_cols, encoding='latin-1')
user_pd.shape

(943, 5)

In [31]:
rate_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
rating_base_pd = pd.read_csv('../Dataset/ml-100k/ua.base', sep='\t', names=rate_cols, encoding='latin-1')
rating_test_pd = pd.read_csv('../Dataset/ml-100k/ua.test', sep='\t', names=rate_cols, encoding='latin-1')
rating_base_pd.shape, rating_test_pd.shape

((90570, 4), (9430, 4))

In [None]:
index_cols = ['movie id', 'movie title', 'released date', 'video release date', 'IMDb URL', 'unknown',
              'Action', 'Adventure', 'Amination', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama',
              'Fantasty', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War',
              'Western']
items_pd = pd.read_csv('../Dataset/ml-100k/u.item', sep='|', names=index_cols, encoding='latin-1')
items_pd.shape

(1682, 24)

In [33]:
X0 = items_pd.values
X_train_counts = X0[:, -19:]
X_train_counts

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 1, 1, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=object)

### TF-IDF ALGORITHMS (TERM FREQUENCY - INVERSE DOCUMENT FREQUENCY)

[References]

- [Understanding TF-IDF (Term Frequency-Inverse Document Frequency)](https://www.geeksforgeeks.org/machine-learning/understanding-tf-idf-term-frequency-inverse-document-frequency/)


In [None]:
def parse_tf_idf():
    pass

In [None]:
#tfidf
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer(smooth_idf=True, norm='l2')
tfidf = transformer.fit_transform(X_train_counts.tolist()).toarray()
tfidf

(array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.53676706, 0.65097024, ..., 0.53676706, 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]]),
 (1682, 19))

### Get items which were reated by one user

In [75]:
def get_items_rated_by_user(rating_matrix, user_id):
    #refer ua.base
    users = rating_matrix[:, 0]
    index_of_rated_items_by_user_id = np.where(users == user_id)
    result = rating_matrix[index_of_rated_items_by_user_id, 1:3][0]
    #return item ids and rated scores
    return (result[:, 0] - 1, result[:, 1])

### Modalization Users

In [74]:
n_users = user_pd.shape[0]
d = tfidf.shape[1] #data dimension
W = np.zeros((d, n_users))
b = np.zeros((1, n_users))

In [80]:
from sklearn.linear_model import Ridge
from sklearn import linear_model

for user_index in range (n_users):
    user_id = user_index + 1
    
    items_ids, rated_scores = get_items_rated_by_user(rating_base_pd.values, user_id)

    clf = Ridge(alpha=0.01, fit_intercept=True)
    Xhat = tfidf[items_ids, :]

    clf.fit(Xhat, rated_scores)

    W[:, user_index] = clf.coef_
    b[0, user_index] = clf.intercept_

### Predicted Scores

In [81]:
Yhat = np.dot(tfidf, W) + b

In [84]:
user_id = 10
items_ids, rated_scores = get_items_rated_by_user(rating_test_pd.values, user_id)
Yhat[user_id, items_ids]
print('Rated movies ids :', items_ids )
print('True ratings     :', rated_scores)
print('Predicted ratings:', Yhat[items_ids, user_id])

Rated movies ids : [  6  15  99 174 284 460 485 487 503 610]
True ratings     : [4 4 5 3 5 3 4 5 5 5]
Predicted ratings: [3.79490523 3.42107302 3.37955573 3.29809875 3.94973869 3.94973869
 3.42107302 2.60504254 3.51887045 3.12940841]


### Evaluate Models

In [88]:
from math import *

def evaluate(Yhat, rates, W, b):
    se = 0
    cnt = 0
    for user_index in range(n_users):
        user_id = user_index + 1
        ids, scores_truth = get_items_rated_by_user(rates, user_id)
        scores_pred = Yhat[ids, user_index]
        e = scores_truth - scores_pred 
        se += (e*e).sum(axis = 0)
        cnt += e.size 
    return sqrt(se/cnt)

print('RMSE for training:', evaluate(Yhat, rating_base_pd.values, W, b))
print('RMSE for test    :', evaluate(Yhat, rating_test_pd.values, W, b))

RMSE for training: 0.9089804562826721
RMSE for test    : 1.2703282700393037
