In [0]:
!pip install fuzzywuzzy
!git clone https://github.com/subha12k/data.git

Collecting fuzzywuzzy
  Downloading https://files.pythonhosted.org/packages/d8/f1/5a267addb30ab7eaa1beab2b9323073815da4551076554ecc890a3595ec9/fuzzywuzzy-0.17.0-py2.py3-none-any.whl
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.17.0
Cloning into 'data'...
remote: Enumerating objects: 7, done.[K
remote: Total 7 (delta 0), reused 0 (delta 0), pack-reused 7[K
Unpacking objects: 100% (7/7), done.


In [0]:
# BLOCK 1 : Import Modules
import os
import time 
t1 = time.time()

# Data Science imports
import numpy as np
import pandas as pd

# Visualization Imports
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Utility Imports
from fuzzywuzzy import fuzz




In [0]:
LARGE_DATASET = False

In [0]:
data_path = os.path.join('/content', 'data')

In [0]:
# BLOCK 2 : Read In Data And Sample A Few Movies

if LARGE_DATASET is False:
    
    movies_filename = os.path.join(data_path, 'movies.csv')
    ratings_filename = os.path.join(data_path, 'ratings.csv')
    
    movies_df = pd.read_csv(movies_filename,
                        usecols=['movieId', 'title'],
                        dtype={'movieId': 'int32', 'title': 'str'})

    ratings_df = pd.read_csv(ratings_filename,
                         usecols=['userId', 'movieId', 'rating'],
                         dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})
    
else:   
    movies_filename = './ml-10M100K/movies.dat'
    ratings_filename = './ml-10M100K/ratings.dat'

    movies = pd.read_csv(movies_filename,header=None,  sep = '::')
    movies.columns = ["movieId", "title", "genre"]

    ratings = pd.read_csv(ratings_filename,header=None,  sep = '::')
    ratings.columns = ["userId", "movieId", "rating","dummy"]


    movies = movies[["movieId","title"]]
    ratings_df = ratings[["userId", "movieId", "rating"]]



In [0]:
# BLOCK 3: Write The Rating Matrix Builder And The ALSRecommender Class

def get_rating_matrix(X):
    """Function to generate a ratings matrix and mappings for
    the user and item ids to the row and column indices

    Parameters
    ----------
    X : pandas.DataFrame, shape=(n_ratings,>=3)
        First 3 columns must be in order of user, item, rating.

    Returns
    -------
    rating_matrix : 2d numpy array, shape=(n_users, n_items)
    user_map : pandas Series, shape=(n_users,)
        Mapping from the original user id to an integer in the range [0,n_users)
    item_map : pandas Series, shape=(n_items,)
        Mapping from the original item id to an integer in the range [0,n_items)
    """
    user_col, item_col, rating_col = X.columns[:3]
    rating = X[rating_col]
    user_map = pd.Series(
        index=np.unique(X[user_col]),
        data=np.arange(X[user_col].nunique()),
        name='user_map',
    )
    item_map = pd.Series(
        index=np.unique(X[item_col]),
        data=np.arange(X[item_col].nunique()),
        name='columns_map',
    )
    user_inds = X[user_col].map(user_map)
    item_inds = X[item_col].map(item_map)
    rating_matrix = (
        pd.pivot_table(
            data=X,
            values=rating_col,
            index=user_inds,
            columns=item_inds,
        )
        .fillna(0)
        .values
    )
    return rating_matrix, user_map, item_map

class ALSRecommender():
    """Recommender based on Alternating Least Squares algorithm.
    
    Parameters
    ----------
    k : int, default=5
        Number of latent features
    lmbda : float, default=0.1
        Regularization parameter
    max_epochs : int, default=15
        Max number of iterations to run
    baseline_algo : object
        Object with fit(X) and 
    """
    def __init__(self, k=5, lmbda=0.1, max_epochs=15, baseline_algo=None, error_metric='mae',
                 verbose=True):
        # Force integer in case it comes in as float
        self.k = int(np.round(k))
        self.lmbda = lmbda
        self.max_epochs = max_epochs
        self.baseline_algo = baseline_algo
        self.error_metric = error_metric
        self.verbose = verbose

        self.U = None
        self.I = None
        self.initialized = False

    def _calc_train_error(self, U, I, R, R_selector=None, error_metric='mae'):
        if R_selector is None:
            R_selector = (R > 0)
        R_hat = np.dot(U.T, I)
        if error_metric == 'mae':
            error = np.sum(R_selector * np.abs(R_hat - R)) / np.sum(R_selector)
        else:
            raise ValueError("{} is an unsupported error metric".format(metric))
        return error

    def _fit_init(self, X):
        if not isinstance(X, pd.DataFrame):
            raise ValueError("X must be a DataFrame")
        X = X.copy()
        user_col, item_col, rating_col = X.columns[:3]
        if self.baseline_algo is None:
            self.train_mean = X[rating_col].mean()
        else:
            self.baseline_algo.fit(X)
        self.R, self.user_map, self.item_map = get_rating_matrix(X)
        n_users, n_items = self.R.shape
        self.U = 3 * np.random.rand(self.k, n_users)
        self.I = 3 * np.random.rand(self.k, n_items)
        self.I[0, :] = self.R[self.R != 0].mean(axis=0) # Avg. rating for each movie
        self.E = np.eye(self.k) # (k x k)-dimensional idendity matrix
        self.epoch = 0
        self.train_errors = []
        self.initialized = True

    def fit(self, X, n_epochs=None):
        """Fit model to training data X. If at least one iteration has already been run,
        then the model will continue from its most recent state.

        Parameters
        ----------
        X : pandas DataFrame, shape=(n_ratings, >=3)
            First 3 columns must correspond to user, item, and rating in that order
        n_epochs : int, optional
            Number of iterations to run. If not provided, will run for self.max_epochs

        Returns
        -------
        self
            This allows chaining like `ALSRecommender().fit(X_train).predict(X_test)`
        """
        # Allow continuation from previous state if n_epochs is given. Otherwise start from scratch.
        if n_epochs is None:
            self.initialized = False
        if not self.initialized:
            self._fit_init(X)

        epoch_0 = self.epoch
        if n_epochs is None:
            n_epochs = self.max_epochs - epoch_0

        n_users, n_items = self.R.shape

        # Run n_epochs iterations
        for i_epoch in range(n_epochs):
            if self.epoch >= self.max_epochs:
                print("max_epochs = {}".format(self.max_epochs))
                break
            # Fix I and estimate U
            for i, Ri in enumerate(self.R):
                nui = np.count_nonzero(Ri) # Number of items user i has rated
                if (nui == 0): nui = 1 # Be aware of zero counts!
                # Get array of nonzero indices in row Ii
                Ri_nonzero_selector = np.nonzero(Ri)[0]
                # Select subset of I associated with movies reviewed by user i
                I_Ri = self.I[:, Ri_nonzero_selector]
                # Select subset of row R_i associated with movies reviewed by user i
                Ri_nonzero = self.R[i, Ri_nonzero_selector]
                Ai = np.dot(I_Ri, I_Ri.T) + self.lmbda * nui * self.E
                Vi = np.dot(I_Ri, Ri_nonzero.T)
                self.U[:, i] = np.linalg.solve(Ai, Vi)
            # Fix U and estimate I
            for j, Rj in enumerate(self.R.T):
                nmj = np.count_nonzero(Rj) # Number of users that rated item j
                if (nmj == 0): nmj = 1 # Be aware of zero counts!
                # Get array of nonzero indices in row Ij
                Rj_nonzero_selector = np.nonzero(Rj)[0]
                # Select subset of P associated with users who reviewed movie j
                U_Rj = self.U[:, Rj_nonzero_selector]
                # Select subset of column R_j associated with users who reviewed movie j
                Rj_nonzero = self.R[Rj_nonzero_selector, j]
                Aj = np.dot(U_Rj, U_Rj.T) + self.lmbda * nmj * self.E
                Vj = np.dot(U_Rj, Rj_nonzero)
                self.I[:, j] = np.linalg.solve(Aj, Vj)
            error = self._calc_train_error(self.U, self.I, self.R)
            self.train_errors.append(error)
            if self.verbose:
                print("[Epoch {}/{}] train error: {}".format(self.epoch, self.max_epochs, error))
            self.epoch += 1
        return self

    def predict(self, X):
        """Generate predictions for user/item pairs
        
        Parameters
        ----------
        X : pandas dataframe, shape = (n_pairs, 2)
            User, item dataframe
            
        Returns
        -------
        rating_pred : 1d numpy array, shape = (n_pairs,)
            Array of rating predictions for each user/item pair
        """
        if not isinstance(X, pd.DataFrame):
            raise ValueError("X must be a DataFrame")
        X = X.copy()
        user_col, item_col = X.columns[:2]
        
        if self.baseline_algo is None:
            X['rating_baseline'] = self.train_mean
        else:
            X['rating_baseline'] = self.baseline_algo.predict(X)
        X['rating'] = 0
        known_user_and_item_mask = (
            X[user_col].isin(self.user_map.index) & X[item_col].isin(self.item_map.index)
        )
        X_known, X_unknown = X[known_user_and_item_mask], X[~known_user_and_item_mask]
        user_inds = X_known[user_col].map(self.user_map)
        item_inds = X_known[item_col].map(self.item_map)
        rating_pred = np.array([
            np.sum(self.U[:, u_ind] * self.I[:, i_ind])
            for u_ind, i_ind in zip(user_inds, item_inds)
        ])
        X.loc[known_user_and_item_mask, 'rating'] = rating_pred
     
        min_rating = np.min(self.R[np.nonzero(self.R)])
        max_rating = np.max(self.R)
        X.loc[X['rating'] < min_rating, 'rating'] = min_rating
        X.loc[X['rating'] > max_rating, 'rating'] = max_rating
        return X['rating'].values

In [0]:
# BLOCK 4: Define The Make Recommender Function

new_mat = ratings_df.pivot(index='movieId', columns='userId', values='rating').fillna(0)

movie_to_idx = { movie: i for i,movie in 
    enumerate(list(movies_df.set_index('movieId').loc[new_mat.index].title))}

def fuzzy_matching(mapper, fav_movie, verbose=True):
    """
    return the closest match via fuzzy ratio. 
    
    Parameters
    ----------    
       mapper:  dict, map movie title name to index of the movie in data
    fav_movie:  str, movie name
      verbose:  bool, print log if True

       Return:  index of the closest match
    """
    match_tuple = []
    # get match
    for title, idx in mapper.items():
        ratio = fuzz.ratio(title.lower(), fav_movie.lower())
        if ratio >= 60:
            match_tuple.append((title, idx, ratio))
    # sort
    match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1]
    if not match_tuple:
        print('Oops! No match is found')
        return
    if verbose:
        print('Found possible matches in our database: {0}\n'.format([x[0] for x in match_tuple]))
    return match_tuple[0][1]

def get_movieId(movies_df, fav_movie_list):
    """
    return movieId(s) of user's favorite movies

    """
    movieId_list = []
    
    for movie in fav_movie_list:
        movieId = fuzzy_matching(movie_to_idx, movie)
        movieId_list.append(movieId)
        
    return(movieId_list)
        
def add_new_user_to_data(train_df, movieId_list):
    
    new_id = ratings_df["userId"].max()+1
            
    max_rating = ratings_df["rating"].max()
    
    user_rows = [[new_id, movieId, max_rating] for movieId in movieId_list]
    
    return(train_df.append((pd.DataFrame(user_rows, columns=['user','item','rating']))))
    
def recommend(model, train_df, movie_list, movies_df, pretrained=False, k=3):
    
    train_df = train_df.iloc[:, :3].copy()
    train_df.columns = ['user', 'item', 'rating']
    
    movieId_list = get_movieId(movies_df, movie_list)
    train_df = add_new_user_to_data(train_df, movieId_list)
  
    user = train_df["user"].max()
    
    if not pretrained:
        model.fit(train_df)
        
    train_df['pred'] = model.predict(train_df)
    train_df = train_df.sort_values('pred', ascending=False)
    recos = train_df[['item','pred']].drop_duplicates(subset=None, keep='first', inplace=False)
    movies, preds = recos[['item', 'pred']].values[:10, :].T
    
    movies, preds = list(movies), list(preds)
    
    index = 0
    for item in movies:
        if item in movie_list:
            movies.remove(item)
            preds.remove(preds[movies.index(item)])
            movies.append(recos[['item']].values[10+index, :].T)
            preds.append(recos[['pred']].values[10+index, :].T)
            index+=1

    
    return movies, preds

In [0]:
# BLOCK 5: Create The Dataset Splits

from sklearn.model_selection import StratifiedKFold

n_splits = 2
skf = StratifiedKFold(n_splits=n_splits, random_state=0)
splits = [
    (train_inds, test_inds)
    for train_inds, test_inds in skf.split(ratings_df, ratings_df['userId'])
]

train_inds, test_inds = splits[0]
train_df, test_df = ratings_df.iloc[train_inds], ratings_df.iloc[test_inds]
     



In [0]:
# BLOCK 6: Train The Model 
model = ALSRecommender(k=20, lmbda=0.1, max_epochs=15, baseline_algo=None, verbose=False)
movie_list = ["Matrix The"] 
movies, pred = recommend(model, train_df, movie_list, movies_df)

Found possible matches in our database: ['Matrix, The (1999)', 'Animatrix, The (2003)']



In [0]:
# BLOCK 7: Let's Recommend Movies

for i,idx in enumerate(movies):
        print('{0}: {1}'.format(i+1, (movies_df[movies_df.movieId==idx].title).values))

1: ['Forrest Gump (1994)']
2: ['Finding Nemo (2003)']
3: ['Dark Knight, The (2008)']
4: ['Matrix, The (1999)']
5: ['Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)']
6: ['Fight Club (1999)']
7: ['Shawshank Redemption, The (1994)']
8: ['Love Actually (2003)']
9: ["Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)"]
10: ["Schindler's List (1993)"]


In [0]:
print(f'\n Total time taken for the Recommender System to run is {time.time()-t1}')


 Total time taken for the Recommender System to run is 124.08367276191711
