In [10]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [15]:
# Define a class of content-based recommendation system for movies
class CBR:
    def __init__(self, filepath):
        self.filepath = filepath
        self.ratings_df, self.moviesWithGenres_df = self.preprocess_data(self.filepath)
        self.train_data, self.test_data = self.split_data()
        self.recommendations = self.recommend()

    # load ratings and movies from filepath
    def preprocess_data(self, filepath):
        movies_df = pd.read_csv(filepath + 'movies.csv')


        movies_df['year'] = movies_df.title.str.extract('(\(\d\d\d\d\))',expand=False)
        #Removing the parentheses
        movies_df['year'] = movies_df.year.str.extract('(\d\d\d\d)',expand=False)
        #Removing the years from the 'title' column
        movies_df['title'] = movies_df.title.str.replace('(\(\d\d\d\d\))', '')
        #Applying the strip function to get rid of any ending whitespace characters that may have appeared
        movies_df['title'] = movies_df['title'].apply(lambda x: x.strip())
        movies_df['genres'] = movies_df.genres.str.split('|')

        # Copying the movie dataframe into a new one since we won't need to use the genre information in our first case.
        moviesWithGenres_df = movies_df.copy()

        # For every row in the dataframe, iterate through the list of genres and place a 1 into the corresponding column
        for index, row in movies_df.iterrows():
            for genre in row['genres']:
                moviesWithGenres_df.at[index, genre] = 1
        # Filling in the NaN values with 0 to show that a movie doesn't have that column's genre
        moviesWithGenres_df = moviesWithGenres_df.fillna(0)


        ratings_df = pd.read_csv(filepath + 'ratings.csv')
        ratings_df = ratings_df.drop('timestamp', 1)

        return ratings_df, moviesWithGenres_df


    # split ratings into train and test datasets
    def split_data(self, test_size=0.2):
        from sklearn.model_selection import train_test_split
        train_data, test_data = train_test_split(self.ratings_df, test_size=test_size)
        return train_data, test_data


    # calculate the users likeness to each genre


    # making recommendations based on users' given ratings in test datasets
    def recommend(self):
        userIds = self.train_data['userId'].unique()
        recommendations = {}
        for i in userIds:
            inputData = self.train_data[self.train_data['userId'] == i]
            userMovies = self.moviesWithGenres_df[self.moviesWithGenres_df['movieId'].isin(inputData['movieId'].tolist())]
            #Resetting the index to avoid future issues
            userMovies = userMovies.reset_index(drop=True)
            #Dropping unnecessary issues due to save memory and to avoid issues
            userGenreTable = userMovies.drop('movieId', 1).drop('title', 1).drop('genres', 1).drop('year', 1)
            #Dot produt to get weights
            userProfile = userGenreTable.transpose().dot(inputData.reset_index(drop=True)['rating'])
            #Now let's get the genres of every movie in our original dataframe
            genreTable = self.moviesWithGenres_df.set_index(self.moviesWithGenres_df['movieId'])
            #And drop the unnecessary information
            genreTable = genreTable.drop('movieId', 1).drop('title', 1).drop('genres', 1).drop('year', 1)
            #Multiply the genres by the weights and then take the weighted average
            recommendationTable_df = (((genreTable*userProfile).sum(axis=1))/(userProfile.sum())) * 5
            recommendations[i] = recommendationTable_df.to_dict()
        return recommendations

    # calculate the precision
    def precision(self, nitems):
        hit = 0
        all = 0
        for i in self.train_data['userId'].unique():
            for j, score in sorted(self.recommendations[i].items(), key=lambda x:x[1], reverse=True)[:nitems]:
                if j in self.test_data[self.test_data['userId'] == i]['movieId'].tolist():
                    hit += 1
                all += 1
        return hit/all

In [16]:
cbr = CBR('ml-latest-small/')

In [17]:
cbr.precision(nitems=10)

0.008852459016393442