In [2]:
import pandas as pd

In [3]:
path1 = r"C:\Users\beall\ColabDocs\Bootcamp\Project Recommender\movies.csv"
movies_df = pd.read_csv(path1)

path2 = r"C:\Users\beall\ColabDocs\Bootcamp\Project Recommender\ratings.csv"
ratings_df = pd.read_csv(path2)

## Exploring data sets

In [3]:
movies_df.shape

(9742, 3)

In [4]:
ratings_df.shape

(100836, 4)

In [5]:
movies_df.describe()

Unnamed: 0,movieId
count,9742.0
mean,42200.353623
std,52160.494854
min,1.0
25%,3248.25
50%,7300.0
75%,76232.0
max,193609.0


In [6]:
ratings_df.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [55]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [56]:
movies_df.nunique()

movieId    9742
title      9737
genres      951
dtype: int64

In [57]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [58]:
ratings_df.nunique()

userId         610
movieId       9724
rating          10
timestamp    85043
dtype: int64

In [59]:
ratings_df.groupby('userId')['rating'].count()

userId
1       232
2        29
3        39
4       216
5        44
       ... 
606    1115
607     187
608     831
609      37
610    1302
Name: rating, Length: 610, dtype: int64

In [60]:
ratings_df.groupby('movieId')['rating'].count()

movieId
1         215
2         110
3          52
4           7
5          49
         ... 
193581      1
193583      1
193585      1
193587      1
193609      1
Name: rating, Length: 9724, dtype: int64

In [61]:
ratings_df.groupby('movieId')['rating'].mean()

movieId
1         3.920930
2         3.431818
3         3.259615
4         2.357143
5         3.071429
            ...   
193581    4.000000
193583    3.500000
193585    3.500000
193587    3.500000
193609    4.000000
Name: rating, Length: 9724, dtype: float64

# Creating a Recommender based on popularity

### Version with normalisation

We have a df with userId, movieId and rating. We need a new df with the ratings for each movie. For that we extract the necessary info from the original dataset for calculating the rating.

In [14]:
#This should be the format of the new_df to calculate the final rating:
#new_df = pd.DataFrame (columns =
         #              ['movieId',
        #                'rating_mean',
         #               'rating_count'
           #            ]
              #          )

In [70]:
def popularity_rat1(number_rating_m, rating_m):
    #Creating the new df for our calculation
    new_df = ratings_df.groupby('movieId').agg({'rating':['mean', 'count']})
    # We change the name of the new columns
    new_df.columns = ['rating_mean', 'rating_count']
    
    # We create additional columns with normalised data
    new_df['rating_mean_normalised'] = (new_df['rating_mean'] - new_df['rating_mean'].min()) / (new_df['rating_mean'].max() - new_df['rating_mean'].min())
    new_df['rating_count_normalised'] = (new_df['rating_count'] - new_df['rating_count'].min()) / (new_df['rating_count'].max() - new_df['rating_count'].min())
    
    #We create different rates for the variables that are going to be the base of the recommendation 
    weight_rating_count = 0.3
    weight_average_rating = 0.7
    
    #We create the function to calculate the final rating
    new_df['popularity_score'] = (
        weight_rating_count * new_df['rating_mean'] +
        weight_average_rating * new_df['rating_count']
    )
        
     #Reorganising the table
    new_df = new_df.sort_values('popularity_score',ascending=False)
    new_df = new_df.reset_index()
    new_df = new_df.merge(movies_df[['title','genres','movieId']],how='left',on='movieId').drop_duplicates()
        
    return new_df[['title','genres','popularity_score']].head(5)

result = popularity_rat1('rating_count', 'rating_mean')
result   

Unnamed: 0,title,genres,popularity_score
0,Forrest Gump (1994),Comedy|Drama|Romance|War,231.54924
1,"Shawshank Redemption, The (1994)",Crime|Drama,223.228707
2,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,216.159121
3,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,196.548387
4,"Matrix, The (1999)",Action|Sci-Fi|Thriller,195.857734


#### Notes on popularity_rat1:

In [15]:
# We have to make the change the name of the new columns
      #new_df.columns = ['rating_mean', 'rating_count']
#because the df would look like this:    
new_df = ratings_df.groupby('movieId').agg({'rating':['mean', 'count']})
new_df.head()

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,mean,count
movieId,Unnamed: 1_level_2,Unnamed: 2_level_2
1,3.92093,215
2,3.431818,110
3,3.259615,52
4,2.357143,7
5,3.071429,49


### Version without normalisation and a different formula for the weights

In [33]:
# Based on these columns we create a score for recommending. Formula:
# Weighted Rating (WR) = (v/(v+m)R)+(m/(v+m)C)   
# where,  
  
# v is the number of votes for the movie    
# m is the minimum votes required to be listed in the chart   
# R is the average rating of the movie   
# C is the mean vote across the whole   

In [34]:
# v = rating_count --> normalised
# m = rating_count >= q75
# R = rating_mean --> normalised
# C = ???

In [15]:
def popularity_rat2(number_rating_m, rating_m):
    new_df = ratings_df.groupby('movieId').agg({'rating':['mean', 'count']})
    new_df.columns = ['rating_mean', 'rating_count']
    
    #Creating the variables for Weighted Rating (WR) = (v/(v+m)R)+(m/(v+m)C) 
    m = new_df['rating_count'].quantile(0.75)
    C = new_df['rating_mean'].mean()
    
    v = new_df[number_rating_m]
    R = new_df[rating_m]
    
    #Creating new columns for the WR
    new_df['v/(v+m)R'] = (v / (v + m)* R)
    new_df['(m/(v+m)C)'] = (m / (v + m)* C)
    
    new_df['popularity_score'] = new_df['v/(v+m)R'] + new_df['(m/(v+m)C)']
    
    #Reorganising the table
    new_df = new_df.sort_values('popularity_score',ascending=False)
    new_df = new_df.reset_index()
    new_df = new_df.merge(movies_df[['title','genres','movieId']],how='left',on='movieId').drop_duplicates()
    
    return new_df[['title', 'movieId','genres','popularity_score']].head(5)

result1 = popularity_rat2('rating_count', 'rating_mean')
result1   

Unnamed: 0,title,movieId,genres,popularity_score
0,"Shawshank Redemption, The (1994)",318,Crime|Drama,4.396816
1,"Godfather, The (1972)",858,Crime|Drama,4.243095
2,Fight Club (1999),2959,Action|Crime|Drama|Thriller,4.232872
3,Star Wars: Episode IV - A New Hope (1977),260,Action|Adventure|Sci-Fi,4.197546
4,"Usual Suspects, The (1995)",50,Crime|Mystery|Thriller,4.196535
