In [1]:
import pandas as pd
import numpy as np
import re
import sklearn.metrics.pairwise as pw
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances

### Loading data

In [3]:
# Movie metadata and user ratings, read from the two CSV files
movies, ratings = (pd.read_csv(csv_name) for csv_name in ('movies.csv', 'ratings.csv'))

# Spreadsheet of movies rated by my friends, indexed by its 'Movies' column
movies_rated = pd.read_excel("bbianchi.xlsx", index_col='Movies')


## Checking for duplicates

In [4]:
# First output: the five most frequent movieIds (a top count of 1 means no id repeats).
# Second output: for the ten most frequent titles, True where the title occurs more than once.
display(
    movies['movieId'].value_counts().sort_values(ascending=False).head(),
    movies['title'].value_counts().sort_values(ascending=False).head(10).gt(1),
)

83969     1
101577    1
26629     1
45062     1
79879     1
Name: movieId, dtype: int64

Emma (1996)                                 True
War of the Worlds (2005)                    True
Eros (2004)                                 True
Saturn 3 (1980)                             True
Confessions of a Dangerous Mind (2002)      True
The Girls (1961)                           False
Last Shift (2014)                          False
Robin Williams: Live on Broadway (2002)    False
Innocent Man, An (1989)                    False
Honey, We Shrunk Ourselves (1997)          False
Name: title, dtype: bool

## Investigating duplicated titles

In [5]:
# Rows whose title occurs more than once (two or more movieIds share the title)
duplicate_movies = movies.loc[movies.duplicated(subset='title', keep=False), ['movieId', 'title']]
duplic_ids = duplicate_movies['movieId'].values
# Number of ratings received by each duplicated movieId
review_count = (ratings.loc[ratings['movieId'].isin(duplic_ids), 'movieId']
                .value_counts()
                .rename_axis('movieId')
                .reset_index(name='count'))
# Left join: a duplicated id that was never rated stays in the table with count 0,
# so it can still be selected for removal below.
duplicated_df = pd.merge(duplicate_movies, review_count, on='movieId', how='left')
duplicated_df['count'] = duplicated_df['count'].fillna(0).astype(int)
display(duplicated_df)
## Getting duplicates with low review count
# sort_values returns a new frame, so the result has to be assigned for the
# ordering (most-reviewed id first within each title) to take effect.
duplicated_df = duplicated_df.sort_values(by=['title', 'count'], ascending=[True, False])
# Keep the first (most-reviewed) id of every title; every other id is marked for removal.
duplicated_ids = duplicated_df.loc[duplicated_df.duplicated(subset='title', keep='first'), 'movieId']

Unnamed: 0,movieId,title,count
0,838,Emma (1996),30
1,2851,Saturn 3 (1980),4
2,6003,Confessions of a Dangerous Mind (2002),15
3,26958,Emma (1996),1
4,32600,Eros (2004),1
5,34048,War of the Worlds (2005),50
6,64997,War of the Worlds (2005),2
7,144606,Confessions of a Dangerous Mind (2002),1
8,147002,Eros (2004),1
9,168358,Saturn 3 (1980),1


In [6]:
# Drop the low-review duplicate ids from the movie table...
movies = movies.loc[lambda frame: ~frame['movieId'].isin(duplicated_ids)]
# ...and drop the ratings that were given to those ids
ratings = ratings.loc[lambda frame: ~frame['movieId'].isin(duplicated_ids)]

## Creating new columns

In [7]:
#creating list with unique genres
# Each 'genres' value is a '|'-separated string; split it once and reuse the tokens.
genre_tokens = movies['genres'].str.split('|')
# Sorted so the dummy columns are created in the same order on every run.
genres = sorted({genre for tokens in genre_tokens for genre in tokens})
# The '(no genres listed)' placeholder is not a genre; tolerate its absence.
if '(no genres listed)' in genres:
    genres.remove('(no genres listed)')

#Creating dummy columns for each genre
# Work on an explicit copy: `movies` was produced by filtering, and adding
# columns to such a slice can trigger SettingWithCopyWarning.
movies = movies.copy()
for genre in genres:
    # Exact token match (not substring match) against the split genre list
    movies[genre] = genre_tokens.map(lambda tokens: 1 if genre in tokens else 0)

In [8]:
#Creating column with film year
# The first "(YYYY)" group found in the title is taken as the year (raw-string
# pattern, evaluated once per title); titles without such a group get year 0.
movies['year'] = (pd.to_numeric(movies['title'].str.extract(r'\(([0-9]{4})\)', expand=False))
                  .fillna(0)
                  .astype('int64'))
# Film Decade
# One 0/1 flag per decade, from decade_1930 up to decade_2010.
# NOTE(review): a year of 2020 or later sets none of the decade_* flags.
for decade in range(1930,2020,10):
    movies['decade_'+str(decade)] = np.where((movies['year'] >= decade) & (movies['year'] < decade+10), 1, 0)

# Year 0 means "no year found in the title"
movies['decade_none'] = np.where(movies['year'] == 0, 1, 0)
# Any real year before 1930
movies['decade_other'] = np.where((movies['year'] != 0) & (movies['year'] < 1930), 1, 0)

# Dropping columns/rows

In [9]:
#Dropping columns that are no longer needed
# errors='ignore' keeps the cell re-runnable: a second run no longer raises
# KeyError once the column is gone. Re-binding instead of inplace=True avoids
# mutating a filtered slice in place.
movies = movies.drop(columns='genres', errors='ignore')
ratings = ratings.drop(columns='timestamp', errors='ignore')
# Drop every row of the friends' sheet that contains a missing value
movies_rated = movies_rated.dropna(axis=0)

## Joining Data Frames

In [10]:
# One row per rating, enriched with the columns of the rated movie
df = ratings.merge(movies, on='movieId')
print(df.shape)
# df.head(2)

(100830, 35)


In [11]:
## Checking a sample of how people rate movies
# print("Number of users who provided ratings:", len(df['userId'].unique()))
# print(df.groupby(["userId"])["rating"].agg([max,min,np.mean,np.median,len])[0:20])

## Item-based collaborative recommender

In [12]:
def item_based_recom(input_dataframe,input_film_name):
    """
    Rank every movie by the cosine similarity of its rating vector to one film.

    Parameters
    ----------
    input_dataframe : DataFrame with at least 'title', 'userId' and 'rating' columns.
    input_film_name : title of the reference film; must be present in 'title'.

    Returns
    -------
    DataFrame with columns ['title', 'cosine_sim'], sorted by descending
    similarity (the reference film is compared with itself as well, so it is
    part of the ranking).

    Raises
    ------
    KeyError if ``input_film_name`` is not a title of ``input_dataframe``.
    """
    # title x user matrix of ratings; a missing rating is treated as 0
    pivot_item_based = pd.pivot_table(input_dataframe,
                                      index='title',
                                      columns=['userId'], values='rating')
    sparse_pivot = sparse.csr_matrix(pivot_item_based.fillna(0))
    # Use the parameter (not the notebook-level `film_name`) to locate the film
    film_position = pivot_item_based.index.get_loc(input_film_name)
    # Only the similarities to the reference film are needed, so compare all
    # rows against that single row instead of building the full n x n matrix.
    similarities = pw.cosine_similarity(sparse_pivot, sparse_pivot[film_position]).ravel()
    ## Item Rating Based Cosine Similarity
    cosine_df = (pd.Series(similarities, index=pivot_item_based.index, name='cosine_sim')
                 .sort_values(ascending=False)
                 .rename_axis('title')
                 .reset_index())
    return cosine_df

## Item and Genre-based recommender 

In [13]:
def item_and_genre_based_recom(cosine_df,movies_df,categories):
    """
    Add a genre-similarity score to an item-based similarity ranking.

    Parameters
    ----------
    cosine_df  : DataFrame with 'title' and 'cosine_sim' columns; its first row
                 is used as the reference film.
    movies_df  : DataFrame with a 'title' column and one column per entry of
                 ``categories``.
    categories : column names compared between the reference row and each row.

    Returns
    -------
    DataFrame with columns ['title', 'cosine_sim', 'genre_similarity'].
    """
    ## Item Rating and Genre Based Cosine Similarity
    # Merge against the frame that was passed in (not the notebook-level `movies`);
    # the merge keeps the row order of `cosine_df`, so row label 0 is its first row.
    top_cos_genre = pd.merge(cosine_df, movies_df, on='title')
    # Creating column with genre cosine similarity (every row vs. row 0)
    top_cos_genre['genre_similarity'] = [pairwise_row_diff(top_cos_genre,0,row,categories)
                                          for row in top_cos_genre.index.values]
    return top_cos_genre[['title','cosine_sim','genre_similarity']]

def pairwise_row_diff(dataframe,row1, row2,column_names):
    """
    Cosine similarity between two rows of ``dataframe``, rounded to 5 decimals.

    Only the columns listed in ``column_names`` are compared; ``row1`` and
    ``row2`` are row labels (looked up with ``.loc``).
    """
    def feature_vector(label):
        # Values of the selected columns for one row, in column_names order
        return [dataframe.loc[label, column] for column in column_names]

    # cosine_similarity expects 2-D input, hence the single-row matrices
    similarity = pw.cosine_similarity([feature_vector(row1)], [feature_vector(row2)])
    return round(similarity[0][0], 5)

## User based Recommender


In [14]:
def user_based_recom(input_dataframe,input_user_id):
    """
    Rank all users by the cosine similarity of their ratings to one user.

    NOTE: a later cell defines another ``user_based_recom`` with a different
    signature; once that cell has run, this version is shadowed.

    Parameters
    ----------
    input_dataframe : DataFrame with 'title', 'userId' and 'rating' columns.
    input_user_id   : id of the reference user.

    Returns
    -------
    DataFrame with columns ['userId', 'cosine_sim'], sorted by descending
    similarity.
    """
    # user x title matrix of ratings; a missing rating is treated as 0
    ratings_by_user = input_dataframe.pivot_table(index='title', columns=['userId'], values='rating').T
    user_labels = ratings_by_user.index.values
    similarity = pw.cosine_similarity(sparse.csr_matrix(ratings_by_user.fillna(0)))
    similarity_df = pd.DataFrame(similarity, columns=user_labels, index=user_labels)
    ## User Rating Based Cosine Similarity
    usr_cosine_df = similarity_df[input_user_id].sort_values(ascending=False).reset_index()
    usr_cosine_df.columns = ['userId','cosine_sim']
    return usr_cosine_df

In [15]:
def insert_ratings(usr_rtg_dict,pivot_user_based):
    """
    Insert the film ratings of new users into the user x film rating pivot.

    Parameters
    ----------
    usr_rtg_dict     : dict of the form {user id: {film title: rating}}.
    pivot_user_based : DataFrame indexed by user id with one column per film
                       title; it is modified in place (rows/columns are added
                       when the user or the film is not present yet).

    Returns
    -------
    The same ``pivot_user_based`` object, after the ratings were written.
    """
    # Loop through the ids of the people who rated (taken from the argument,
    # not from the notebook-level `films_rated`)
    for person_id, person_ratings in usr_rtg_dict.items():
        # Loop through the movies that this person rated
        for movie, score in person_ratings.items():
            # Insert the rating in the main DF
            pivot_user_based.loc[person_id, movie] = score
    return pivot_user_based

def user_based_recom(input_dataframe,input_films_rated,user_id,n_similar=4,user_names=None):
    """
    Function to create user based recommendations.

    Parameters
    ----------
    input_dataframe   : DataFrame with 'userId', 'title' and 'rating' columns.
    input_films_rated : dict {user id: {film title: rating}} with extra ratings
                        that are inserted into the pivot via ``insert_ratings``.
    user_id           : id of the user to build recommendations for.
    n_similar         : number of most similar users to compare with (default 4).
    user_names        : optional {user id: display name} mapping used to rename
                        columns of the result; defaults to
                        {611: 'Bernardo Bianchi'}.

    Returns
    -------
    DataFrame indexed by film title with one column per compared user
    (``user_id`` first) plus 'Mean_score', the mean rating of the similar users
    (an unrated film counts as 0). Only films with a 0 / missing rating from
    ``user_id`` are returned, sorted by descending 'Mean_score'.
    """
    if user_names is None:
        user_names = {611: 'Bernardo Bianchi'}
    # Build the user x film pivot from the arguments (not from notebook globals)
    pivot_user_based = pd.pivot_table(input_dataframe, index='userId', columns=['title'], values='rating')
    pivot_user_based = insert_ratings(input_films_rated,pivot_user_based)
    user_sparse_pivot = sparse.csr_matrix(pivot_user_based.fillna(0))
    user_recommender = pw.cosine_similarity(user_sparse_pivot)
    user_recommender_df = pd.DataFrame(user_recommender, columns=pivot_user_based.index.values,index = pivot_user_based.index.values)
    ## Similarity of every other user to `user_id`, most similar first.
    ## The target user is dropped explicitly rather than assumed to be at position 0.
    usr_cosine = user_recommender_df[user_id].sort_values(ascending=False).drop(labels=user_id)
    ## Most similar users
    similar_usr = list(usr_cosine.index[:n_similar])
    ## Comparing reviews with similar users
    similar_usr_df = pivot_user_based.T[[user_id] + similar_usr].fillna(0)
    similar_usr_df['mean_rev'] = similar_usr_df[similar_usr].mean(numeric_only=True,axis=1)
    similar_usr_df = similar_usr_df.sort_values('mean_rev', ascending=False)
    ## Keep only the films that `user_id` has not rated
    not_rated = similar_usr_df[similar_usr_df[user_id]==0]
    return not_rated.rename({**user_names, 'mean_rev':'Mean_score'},axis=1)

# Final Recommender:

### Settings:

In [16]:
# Genre columns that are compared when computing the genre similarity
categories = [
    'Film-Noir', 'Adventure', 'Children', 'IMAX', 'Crime',
    'Documentary', 'Fantasy', 'Musical', 'Romance', 'Mystery',
    'Thriller', 'Animation', 'Action', 'Comedy', 'War',
    'Drama', 'Western', 'Sci-Fi', 'Horror',
]

# id -> name of the people who supplied ratings
people_who_rated = {
    611: "Bernardo Bianchi",
}
# Nested dict {column label: {index label: value}} built from the ratings sheet
# (presumably {user id: {film title: rating}}; verify against the spreadsheet)
films_rated = movies_rated.to_dict(orient='dict')

# Reference film and user for the final recommender
# film_name = 'Inception (2010)'
film_name = 'Iron Man 2 (2010)'
user_id = 611

## Function:

In [17]:
def generate_recomendations(df,film_name,input_films_rated,top_results=5,cat=categories):
    """
    Print and display the three recommendation lists for one film / user.

    Parameters
    ----------
    df                : ratings joined with movies ('userId', 'title', 'rating', ...).
    film_name         : title of the reference film.
    input_films_rated : dict {user id: {film title: rating}} passed on to
                        ``user_based_recom``.
    top_results       : number of rows shown in each list (default 5).
    cat               : genre columns used for the genre similarity.

    Also reads the notebook-level ``movies`` and ``user_id``. Returns None.
    """
    print("Movie Recommender by B.Kurka:")
    print("User name: " + "Favorite Movie:", film_name+'\n\n')
    print("Films you might enjooy based that you watched", film_name)
    ## Item Rating Based Cosine Similarity (computed once and reused below)
    cos_sim = item_based_recom(df,film_name)
    # Row 0 is the top of the ranking (the film itself), so start at row 1
    display(cos_sim[1:top_results+1])

    print("***********************************************************************************************\n")
    print("Films you might enjooy with similar genre then", film_name)
    # Skip the film itself plus the `top_results` rows already shown above,
    # then rank what is left by genre similarity (using the `cat` argument).
    display(item_and_genre_based_recom(cos_sim,movies,cat)
            .sort_values('cosine_sim',ascending=False)[top_results+1:]
            .sort_values('genre_similarity',ascending=False)[:top_results])

    print("***********************************************************************************************\n")
    print("Flims reccomended for you:")
    # Computed once; the number of rows shown follows `top_results`
    display(user_based_recom(df,input_films_rated,user_id)[:top_results])

    return None
    
# Run the recommender for the configured film, showing five rows per list
generate_recomendations(df, film_name, films_rated, top_results=5)

Movie Recommender by B.Kurka:
User name: Favorite Movie: Iron Man 2 (2010)


Films you might enjooy based that you watched Iron Man 2 (2010)


Unnamed: 0,title,cosine_sim
1,X-Men: First Class (2011),0.699524
2,"Avengers, The (2012)",0.695324
3,Iron Man 3 (2013),0.677566
4,Iron Man (2008),0.641986
5,Guardians of the Galaxy (2014),0.638427


***********************************************************************************************

Films you might enjooy with similar genre then Iron Man 2 (2010)


Unnamed: 0,title,cosine_sim,genre_similarity
1763,G.I. Joe: Retaliation (2013),0.175867,1.0
86,"Matrix Revolutions, The (2003)",0.434889,1.0
74,"Matrix Reloaded, The (2003)",0.446645,1.0
67,Spider-Man 3 (2007),0.453103,1.0
667,G.I. Joe: The Rise of Cobra (2009),0.256834,0.89443


***********************************************************************************************

Flims reccomended for you:


userId,Bernardo Bianchi,298,362,495,295,Mean_score
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Shawshank Redemption, The (1994)",0.0,3.5,4.5,5.0,5.0,4.5
Kill Bill: Vol. 1 (2003),0.0,4.0,4.5,5.0,4.5,4.5
Kill Bill: Vol. 2 (2004),0.0,4.0,4.5,5.0,4.5,4.5
"Usual Suspects, The (1995)",0.0,3.5,3.5,4.0,5.0,4.0
"Matrix, The (1999)",0.0,4.0,4.5,5.0,0.0,3.375
