In [88]:
import pandas as pd
from surprise import Reader, Dataset, SVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Collaborative filtering

The ratings dataset is read and an SVD algorithm is fitted to it. Predictions can then be obtained using: svd.predict()

In [89]:
# Load the user/movie ratings used to train the collaborative-filtering model.
ratings = pd.read_csv('~/Downloads/ratings_small.csv')
# Reader() defaults to a 1-5 rating scale and Surprise clips predictions to the
# declared scale, so take the bounds from the data instead of assuming them.
reader = Reader(rating_scale=(ratings['rating'].min(), ratings['rating'].max()))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
# SVD initialises its factors randomly; a fixed seed keeps the estimated
# ratings reproducible across "Restart & Run All".
svd = SVD(random_state=42)
# The whole dataset is used for training (no hold-out split).
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fe89a6c91c0>

# Content based filtering

The movie metadata is loaded and NaN values are replaced with an empty string.

In [4]:
# Read the movie metadata and replace every missing value with an empty string
# in a single chained expression.
movies = pd.read_csv('~/Downloads/tmdb_5000_movies.csv').fillna('')

A simple data-cleaning function is defined: text-based attributes are converted to lowercase.

In [5]:
def clean_data(x):
    """Return ``x`` converted to lowercase.

    Parameters
    ----------
    x : object
        Value taken from a text column.

    Returns
    -------
    object
        ``x.lower()`` when ``x`` is a string; any other value is returned
        unchanged so that a stray non-text entry does not raise.
    """
    # The original called ``str.lower()`` with no argument, which raises
    # TypeError and never looked at ``x``.
    if isinstance(x, str):
        return x.lower()
    return x

The data cleaning is applied to some selected features that will be taken into account in the content-based model.

In [None]:
# Columns kept for the content-based model. 'id' is carried along as an
# identifier; only the text columns are cleaned.
features = ['id', 'title', 'overview']
# .copy() makes movies_reduced an independent frame, so the column assignments
# below do not write into a slice of `movies` (SettingWithCopyWarning).
movies_reduced = movies[features].copy()
features.remove('id')
for feature in features:
    movies_reduced[feature] = movies_reduced[feature].apply(clean_data)

A function for creating a soup of the features used for the model is defined.

In [7]:
def create_soup(row):
    """Build the text "soup" for one movie.

    ``row`` is indexed by the keys 'title' and 'overview'; the two values are
    concatenated with a single space between them and the result is returned.
    """
    title = row['title']
    overview = row['overview']
    return title + ' ' + overview

The soup is incorporated as a new column in the dataframe.

In [None]:
# Apply create_soup to every row (axis=1) and store the result in a new 'soup' column.
movies_reduced['soup'] = movies_reduced.apply(create_soup, axis=1)

In order to compare the text attributes, a TF-IDF vectorizer is used, discarding English stop words.
The similarity between movies is then calculated as the cosine similarity between the vectorized attributes.

In [90]:
# Vectorise the soup with TF-IDF, ignoring English stop words.
tfidf = TfidfVectorizer(stop_words='english')
count_matrix = tfidf.fit_transform(movies_reduced['soup'])
# Pairwise cosine similarity between every pair of movies.
cosine_sim = cosine_similarity(count_matrix, count_matrix)
# drop=True keeps the old index from being inserted as an extra 'index' column,
# which also makes this cell safe to re-run (a second plain reset_index() would
# fail because the 'index' column already exists).
movies_reduced = movies_reduced.reset_index(drop=True)

A mapping between title and index is defined for convenience

In [None]:
# Map each title to the row label of that movie in movies_reduced.
indices = pd.Series(movies_reduced.index, index=movies_reduced['title'])
# Several movies can share a title; keep only the first occurrence so that
# indices[title] always yields a single scalar instead of a Series.
indices = indices[~indices.index.duplicated(keep='first')]

A function for obtaining the 10 movies most similar to a given title is defined. They are the movies with the highest cosine similarity to it, according to the similarity matrix computed above.

In [None]:
def get_content_based_recommendations(title, similarity=None, top_n=10):
    """Return the movies whose soup is most similar to that of ``title``.

    Parameters
    ----------
    title : str
        Movie title; it is lower-cased before being looked up in ``indices``.
    similarity : array-like, optional
        Pairwise similarity matrix indexed by row position. Defaults to the
        module-level ``cosine_sim``.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies_reduced``, most similar first. The
        frame is a copy, so callers may add columns to it freely.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    if similarity is None:
        similarity = cosine_sim
    title = title.lower()
    if title not in indices.index:
        raise KeyError(f"Title not found in the movie catalogue: {title!r}")
    idx = indices[title]
    # When several movies share the title the lookup yields a Series; use the
    # first match rather than failing further down.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)
    # Exclude the queried movie explicitly instead of assuming it sorts first:
    # another movie with an identical soup also has similarity 1.0.
    sim_scores = [(i, score) for i, score in enumerate(similarity[idx]) if i != idx]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    movie_indices = [i for i, _ in sim_scores[:top_n]]
    return movies_reduced.iloc[movie_indices].copy()

# Hybrid recommendation system

A function combining both models is defined. The idea is to get the top 10 similar movies from the content-based model and return them sorted by the rating predicted with collaborative filtering.

In [85]:
def get_hybrid_recommendations(title, user_id=1):
    """Rank the content-based recommendations for ``title`` by predicted rating.

    Parameters
    ----------
    title : str
        Movie title passed to ``get_content_based_recommendations``.
    user_id : int, default 1
        User id for whom ratings are estimated with the fitted ``svd`` model.

    Returns
    -------
    pandas.DataFrame
        The content-based recommendations with an extra 'Estimated Rating'
        column, sorted by that column in descending order.
    """
    # The helper is called with the title only: its original signature accepts
    # a single argument, so passing the similarity matrix as well raised
    # TypeError. The copy lets us add a column without touching shared data.
    top_10_recommendations = get_content_based_recommendations(title).copy()
    # NOTE(review): 'id' comes from the movie metadata file while the SVD model
    # was trained on 'movieId' from the ratings file -- confirm that both use
    # the same identifier space, otherwise a mapping is needed here.
    top_10_recommendations['Estimated Rating'] = [
        svd.predict(user_id, movie_id).est
        for movie_id in top_10_recommendations['id']
    ]
    return top_10_recommendations.sort_values('Estimated Rating', ascending=False)
    

With this function you can test recommendations

In [None]:
# Example: display the hybrid recommendations for the title 'john carter'.
get_hybrid_recommendations('john carter')