# Recommendation Engines

+ Implicit Feedback
+ Explicit Feedback
+ Each user should have given multiple ratings, and each item/service should have received multiple ratings from many different customers

#### Non-personalized recommendations based on number of times interacted and average ratings

In [None]:
# Count how many times each book appears in the 'book' column
# (value_counts sorts the most frequent book first)
book_counts = book_df['book'].value_counts()

# The index of the counts Series holds the book names, ordered by frequency
print(book_counts.index)

In [None]:
# Number of ratings per title; keep the titles that appear more than 50 times
movie_popularity = user_ratings_df["title"].value_counts()
popular_movies = movie_popularity[movie_popularity.gt(50)].index

# Restrict the original ratings to those frequently rated titles
is_popular = user_ratings_df["title"].isin(popular_movies)
popular_movies_rankings = user_ratings_df.loc[is_popular]

# Mean rating per popular title (kept as a one-column DataFrame indexed by title)
popular_movies_average_rankings = popular_movies_rankings.groupby("title")[["rating"]].mean()

# Show the five best-rated popular titles
print(popular_movies_average_rankings.sort_values(by="rating", ascending=False).head())

#### Non-personalized recommendations based on commonly occurring items

In [None]:
# permutations(iterable, r) returns an iterator over all ordered length-r permutations of the items
# list() materializes that iterator into a usable list
# pd.DataFrame() converts the list into a DataFrame

from itertools import permutations

def create_pairs(x):
    """Return every ordered pair of items in ``x`` as a two-column DataFrame.

    Parameters
    ----------
    x : object exposing ``.values`` (e.g. a pandas Series)
        The items to pair up.

    Returns
    -------
    pandas.DataFrame
        Columns ``book_a`` and ``book_b``, one row per ordered pair. Both
        (a, b) and (b, a) are present; the frame is empty when ``x`` has
        fewer than two items.
    """
    ordered_pairs = [pair for pair in permutations(x.values, 2)]
    return pd.DataFrame(ordered_pairs, columns=['book_a', 'book_b'])

In [None]:
# Build the book pairs separately for each user's books, then discard the
# index produced by groupby/apply so the result is one flat table of pairs
book_pairs = (
    book_df.groupby('userid')['book_title']
    .apply(create_pairs)
    .reset_index(drop=True)
)

In [None]:
# Count how often each ordered (book_a, book_b) pair occurs.
# groupby is a method: the key list must be passed in a call, groupby([...]);
# subscripting it (groupby[...]) raises a TypeError.
pair_counts = book_pairs.groupby(['book_a', 'book_b']).size()

# Turn the counts Series into a DataFrame with a 'size' column and a flat index
pair_counts_df = pair_counts.to_frame(name='size').reset_index()

# Most frequent pairs first
pair_counts_sorted = pair_counts_df.sort_values('size', ascending=False)

# Keep only the pairs whose first book is 'Lord of the Rings'
lord_of_rings = pair_counts_sorted[pair_counts_sorted['book_a'] == 'Lord of the Rings']

In [None]:
import matplotlib.pyplot as plt
# Bar chart of the Lord of the Rings pair counts, one bar per paired title (book_b)
lord_of_rings.plot(kind='bar', x='book_b')
plt.show()

#### Content Filtering

+ As the recommendations are based on the item attributes rather than user feedback, recommendations can be made on never-before-purchased products
+ The desired outcome is a row per movie with each column indicating whether an attribute applies to the item

In [None]:
# Select only the rows with values in the name column equal to Toy Story
# (computed once; the original repeated this identical assignment)
toy_story_genres = movie_genre_df[movie_genre_df['name'] == 'Toy Story']

# Inspect the subset
print(toy_story_genres)

# Create cross-tabulated DataFrame from name and genre_list columns:
# one row per name, one column per genre_list value
movie_cross_table = pd.crosstab(movie_genre_df['name'], movie_genre_df['genre_list'])

# Select only the rows with Toy Story as the index
toy_story_genres_ct = movie_cross_table[movie_cross_table.index == 'Toy Story']
print(toy_story_genres_ct)

#### Jaccard Similarity
+ The number of attributes that two items have in common divided by the total number of their combined attributes: $J(A, B) = \frac{|A \cap B|}{|A \cup B|}$
+ This value will be between 0 and 1. The larger the intersection, the higher the score

In [None]:
from sklearn.metrics import jaccard_score
# Look up each book's row by its index label.
# (Fixes the `book_genre.df` typo: the frame is named book_genre_df.)
hobbit_row = book_genre_df.loc['The Hobbit']
GOT_row = book_genre_df.loc['A Game of Thrones']

# Jaccard similarity between the two rows
print(jaccard_score(hobbit_row, GOT_row))

In [None]:
from scipy.spatial.distance import pdist, squareform
# Pairwise Jaccard distances between the rows of book_genre_df (condensed 1D array)
jaccard_distances = pdist(book_genre_df.values, metric='jaccard')

# Expand the condensed distances into a square matrix
square_jaccard_distances = squareform(jaccard_distances)

# We want similarity rather than distance, so subtract the distances from 1
jaccard_similarity_array = 1 - square_jaccard_distances

# Label rows and columns with the index of the frame passed to pdist, so the
# labels are guaranteed to line up with the rows that were compared.
# NOTE: despite its name, distance_df holds similarities (1 - distance).
distance_df = pd.DataFrame(jaccard_similarity_array,
                           index=book_genre_df.index,
                           columns=book_genre_df.index)

# Similarity between one pair of books, then everything ranked against The Hobbit
print(distance_df.loc['A Game of Thrones', 'The Hobbit'])
print(distance_df['The Hobbit'].sort_values(ascending=False))

In [None]:
# Import numpy and the distance metric
import numpy as np
from sklearn.metrics import jaccard_score

# Extract just the rows containing GoldenEye and Toy Story
goldeneye_values = movie_cross_table.loc['GoldenEye'].values
toy_story_values = movie_cross_table.loc['Toy Story'].values

# Find the similarity between GoldenEye and Toy Story
print(jaccard_score(goldeneye_values, toy_story_values))

# Repeat for GoldenEye and Skyfall
skyfall_values = movie_cross_table.loc['Skyfall'].values
print(jaccard_score(goldeneye_values, skyfall_values))

In [None]:
# Import functions from scipy
from scipy.spatial.distance import pdist, squareform

# Calculate all pairwise distances
# Condensed vector of Jaccard distances between every pair of cross-table rows
jaccard_distances = pdist(movie_cross_table.values, metric='jaccard')

# Square matrix of distances, flipped into similarities (1 - distance)
square_distances = squareform(jaccard_distances)
jaccard_similarity_array = 1 - square_distances

# Label both axes with the cross-table's index
jaccard_similarity_df = pd.DataFrame(
    jaccard_similarity_array,
    index=movie_cross_table.index,
    columns=movie_cross_table.index,
)

# Print the top 5 rows of the DataFrame
print(jaccard_similarity_df.head())

##### Creating features from Text for content Recommendation using tf-idf and cosine similarity

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# min_df=2   -> keep only terms that occur in at least two documents
# max_df=0.7 -> drop terms that occur in more than 70% of the documents
tfidfvec = TfidfVectorizer(min_df=2, max_df=0.7)
vectorized_data = tfidfvec.fit_transform(book_summary_df['Descriptions'])

# Feature (term) names that were generated.
# get_feature_names_out() needs scikit-learn >= 1.0; the older
# get_feature_names() was removed in 1.2.
print(tfidfvec.get_feature_names_out())

# Dense view: one row per document, one column per feature
print(vectorized_data.toarray())

In [None]:
# Convert the sparse TF-IDF matrix to a DataFrame with one column per term.
# get_feature_names_out() needs scikit-learn >= 1.0; the older
# get_feature_names() was removed in 1.2.
tfidf_df = pd.DataFrame(vectorized_data.toarray(), columns=tfidfvec.get_feature_names_out())

# Label each row with its book title
tfidf_df.index = book_summary_df['Book']

Cosine similarity

$$\cos(\theta) = \frac{A \cdot B}{\lVert A \rVert \, \lVert B \rVert}$$

Intuitively, this is the cosine of the angle between the two documents in high-dimensional feature space.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Finding similarity between all items.
# Uses tfidf_df (the book TF-IDF frame built from vectorized_data) so that both
# calls in this cell work on the same data.
cosine_similarity_array = cosine_similarity(tfidf_df)

# Finding similarity between two items: each row is reshaped to (1, n_features)
# so that it is passed as a 2-D array
cosine_similarity(tfidf_df.loc['The Hobbit'].values.reshape(1, -1),
                  tfidf_df.loc['Macbeth'].values.reshape(1, -1))

In [None]:
# Import cosine_similarity measure
from sklearn.metrics.pairwise import cosine_similarity

# Create the array of cosine similarity values
# Pairwise cosine similarity between all rows of the TF-IDF frame
cosine_similarity_array = cosine_similarity(tfidf_summary_df)

# Wrap the array in a DataFrame labelled on both axes by the TF-IDF index
cosine_similarity_df = pd.DataFrame(
    cosine_similarity_array,
    index=tfidf_summary_df.index,
    columns=tfidf_summary_df.index,
)

# Print the top 5 rows of the DataFrame
print(cosine_similarity_df.head())

In [None]:
# Wrap the similarity array in a DataFrame labelled by the TF-IDF index
cosine_similarity_df = pd.DataFrame(
    cosine_similarity_array,
    index=tfidf_summary_df.index,
    columns=tfidf_summary_df.index,
)

# Similarity of every item to the movie 'Rio'
cosine_similarity_series = cosine_similarity_df.loc['Rio']

# Rank from most to least similar
ordered_similarities = cosine_similarity_series.sort_values(ascending=False)

# Print the results
print(ordered_similarities)

#### Building user profiles based on the books read by the user

In [None]:
list_of_books_read = ['The Hobbit', 'Foundation', 'Nudge']

# reindex selects the rows for the listed books from the TF-IDF frame
user_books = tfidf_summary_df.reindex(list_of_books_read)

# User profile = mean of each feature over the books the user has read
# (fixes the undefined `user_movies`; the frame built above is user_books)
user_prof = user_books.mean()

# Reshape the profile into a single-row 2-D array
user_prof.values.reshape(1, -1)

In [None]:
# Finding recommendations for a user: start from every item the user has NOT
# read yet, i.e. drop the same list the user profile was built from
non_user_movies = tfidf_summary_df.drop(list_of_books_read, axis=0)

# Cosine similarity between the single-row user profile and each remaining item
user_prof_similarities = cosine_similarity(user_prof.values.reshape(1, -1), non_user_movies)

# One similarity score per remaining item. The index must be that of
# non_user_movies: the full tfidf_summary_df index still contains the dropped
# rows and would not match the number of scores.
user_prof_similarities_df = pd.DataFrame(user_prof_similarities.T,
                                         index=non_user_movies.index,
                                         columns=["similarity_score"])

#### Collaborative Filtering

In [None]:
# Collaborative filtering works from user ratings: reshape the long ratings
# table into a matrix with one row per User and one column per Book,
# holding the Rating values
user_ratings_pivot = user_ratings.pivot(index='User', columns='Book', values='Rating')

In [None]:
# Normalize before filling gaps: subtract each user's mean rating from their
# ratings so that a filled-in 0 stands for "average for this user"

# Mean rating per row (user)
avg_ratings = user_ratings_table.mean(axis="columns")

# Subtract each row's mean from that row, centering every user around 0
user_ratings_table_centered = user_ratings_table.sub(avg_ratings, axis="index")

# Replace the missing ratings with 0
user_ratings_table_normed = user_ratings_table_centered.fillna(0)

#### Item based filtering
+ Similar to user-based filtering, we can also make recommendations based on item similarity
+ If we transpose the data used for user-based, then we get Item based data
+ User-based recommendations compare amongst users, and item-based recommendations compare different items

In [None]:
# Transpose the User x Book pivot so that each row is a book and each column a
# user. (`user_ratings.pivot.T` took .T of the bound pivot method and failed.)
book_ratings_pivot = user_ratings_pivot.T

In [None]:
# Cosine similarity between two specific books; each row is reshaped to
# (1, n_users) so that it is passed as a 2-D array
cosine_similarity(book_ratings_pivot.loc['Lord of the Rings', :].values.reshape(1, -1),
                  book_ratings_pivot.loc['The Hobbit', :].values.reshape(1, -1))

# Cosine similarity between every pair of books
similarities = cosine_similarity(book_ratings_pivot)

# Label the square similarity matrix with the book index on both axes
# (pd.DataFrame — the original `pd.Dataframe` does not exist)
cosine_similarity_df = pd.DataFrame(similarities,
                                    index=book_ratings_pivot.index,
                                    columns=book_ratings_pivot.index)

#### user-user similarity (Predicting the rating by a user)

In [None]:
# Using K-Nearest Neighbors

# --- Approach 1: average the ratings of the most similar users --------------
# Isolate the similarity scores for user_001 and sort, most similar first
user_similarity_series = user_similarities.loc['user_001']
ordered_similarities = user_similarity_series.sort_values(ascending=False)

# Find the top 10 most similar users. Position 0 is skipped — presumably it is
# user_001 itself (self-similarity ranks first); verify against the data.
# .iloc makes the positional slice explicit.
nearest_neighbors = ordered_similarities.iloc[1:11].index

# Extract the ratings of the neighbors
neighbor_ratings = user_ratings_table.reindex(nearest_neighbors)

# Calculate the mean rating given by the user's nearest neighbors
print(neighbor_ratings['Apollo 13 (1995)'].mean())

# --- Approach 2: build the inputs for a KNN regressor ------------------------
# target_user_x - Centered ratings that user_001 has given to the movies they have seen.
# other_users_x - Centered ratings for all other users and the movies they have rated excluding the movie Apollo 13.
# other_users_y - Raw ratings that all other users have given the movie Apollo 13.

# Drop the column you are trying to predict. This is done once and without
# inplace=True: the original repeated the in-place drop, and the second drop
# raised KeyError because the column was already gone. errors="ignore" keeps
# the cell safe to re-run.
users_to_ratings = users_to_ratings.drop(columns="Apollo 13 (1995)", errors="ignore")

# Get the data for the user you are predicting for (double brackets keep a DataFrame)
target_user_x = users_to_ratings.loc[["user_001"]]

# Get the target data from user_ratings_table
other_users_y = user_ratings_table["Apollo 13 (1995)"]

# Get the data for only those that have seen the movie.
# This mask must be computed BEFORE the missing targets are dropped below.
other_users_x = users_to_ratings[other_users_y.notnull()]

# Remove those that have not seen the movie from the target (new Series rather
# than an in-place change on a column taken from user_ratings_table)
other_users_y = other_users_y.dropna()

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Instantiate the user KNN model
# KNN regressor over the 10 nearest neighbours, using cosine distance
user_knn = KNeighborsRegressor(n_neighbors=10, metric='cosine')

# Fit on the other users' data, then predict for the target user
# (fit returns the fitted estimator, so the calls can be chained)
user_user_pred = user_knn.fit(other_users_x, other_users_y).predict(target_user_x)

print(user_user_pred)

In [None]:
Item-based or User-based

Item based
+ Item-based recommendations are more consistent over time
+ Easier to explain
+ It can be pre-calculated

Cons:
+ Recommendations can be very obvious

User-based
+ Can be a lot more interesting suggestions

Cons:
+ Generally beaten by item-based recommendations using standard metrics

Item based for e-commerce stores (conservative)
User based for books, movies (subjective)

In [None]:
Comparing item-based and user-based models. You have now looked at two different KNN approaches. The first was item-item KNN, where you use the average of the $k$ most similar movies that a user has rated to suggest a rating for a movie they haven't watched. The other approach was user-user KNN, where you use the average of the ratings that the $k$ most similar users gave the movie to suggest what rating the target user would give the movie.

Now, you will compare the two and calculate what rating user_002 would give to Forrest Gump.

The code for the user_rating_predictor model (that predicts based on what similar users gave the movie), and the movie_rating_predictor (that predicts based off of what ratings this user gave to similar movies) has been started for you.

KNeighborsRegressor has been imported for you.

In [None]:
# User-user model: KNN regressor with default settings
user_knn = KNeighborsRegressor()

# Fit on the other users' data and predict for the target user
# (fit returns the fitted estimator, so the calls can be chained)
user_user_pred = user_knn.fit(other_users_x, other_users_y).predict(target_user_x)
print("The user-user model predicts {}".format(user_user_pred))

# Item-item model: a second, independent KNN regressor for the movie data
movie_knn = KNeighborsRegressor()

# Fit on the other movies' data and predict for the target movie
item_item_pred = movie_knn.fit(other_movies_x, other_movies_y).predict(target_movie_x)
print("The item-item model predicts {}".format(item_item_pred))

#### Sparsity Matrix

In [None]:
# KNN will not perform well with sparse data

# Count the EMPTY (missing) cells across the whole frame
sparsity_count = user_ratings_df.isna().to_numpy().sum()

# Count all cells (rows x columns)
full_count = user_ratings_df.size

# Sparsity = share of cells that are missing
sparsity = sparsity_count / full_count
print(sparsity)

# Count the occupied (non-missing) cells per column
occupied_count = user_ratings_df.notna().sum()
print(occupied_count)

In [None]:
# Number of non-missing cells in each column
occupied_count = user_ratings_df.notna().sum()

# Order the per-column counts from low to high
sorted_occupied_count = occupied_count.sort_values(ascending=True)

# Histogram of the per-column counts
sorted_occupied_count.hist()
plt.show()

#### Matrix Factorization
+ Factors can be found if there is at least one value per row and column
+ We can use these factors to get a filled DataFrame
+ The depth (number of rows) of one factor will equal the number of users, and the width (number of columns) of the other factor will equal the number of items (in the case of user-item collaborative filtering)
+ We can decide the number of latent features (How to decide this?)
+ There will be some amount of information loss because of this process

In [None]:
import numpy as np

# Multiply the user and item matrices
# Multiply the user and item factor matrices to rebuild the full ratings matrix
# NOTE(review): despite the _df suffix, np.dot may return a plain NumPy array
# rather than a DataFrame, depending on the input types — confirm.
predictions_df = np.dot(user_matrix, item_matrix)
# Inspect the recreated matrix
print(predictions_df)

# Inspect the original DataFrame and compare
print(original_df)

#### SVD

In [None]:
# Get the average rating for each user (one value per row)
avg_ratings = user_ratings_df.mean(axis=1)

# Center each user's ratings around 0.
# axis=0 aligns avg_ratings with the ROW labels, so each row has its own mean
# subtracted. The original axis=1 aligned the per-row means against the column
# labels, which does not center the rows.
user_ratings_centered = user_ratings_df.sub(avg_ratings, axis=0)

# Fill in all missing values with 0s
user_ratings_centered = user_ratings_centered.fillna(0)

# Print the mean of each row (user) of the centered data
print(user_ratings_centered.mean(axis=1))

Decompose the `user_ratings_centered` data generated in the previous cell into 3 factors: U, sigma, and Vt.
U is a matrix with a row for each user
Vt has a column for each movie
sigma is an array of weights that you will need to convert to a diagonal matrix

In [None]:
# Import the required libraries 
from scipy.sparse.linalg import svds
import numpy as np

# Decompose the matrix
# Decompose the matrix.
# svds takes an ndarray, sparse matrix or LinearOperator of floating dtype, so
# the DataFrame is converted to a float array explicitly.
U, sigma, Vt = svds(user_ratings_centered.to_numpy(dtype=float))

# svds returns the singular values as a 1-D array; convert to a diagonal matrix
sigma = np.diag(sigma)
print(sigma)

In [None]:
# Matrix product of U and sigma
U_sigma = U @ sigma

# Matrix product of that result and Vt
U_sigma_Vt = U_sigma @ Vt

# Add the row means from avg_ratings back on, as a column vector so that each
# row receives its own mean
row_means = avg_ratings.to_numpy()[:, np.newaxis]
uncentered_ratings = U_sigma_Vt + row_means

# Wrap the result in a DataFrame with the original row and column labels
calc_pred_ratings_df = pd.DataFrame(
    uncentered_ratings,
    index=user_ratings_df.index,
    columns=user_ratings_df.columns,
)

# Print both the recalculated matrix and the original
print(calc_pred_ratings_df)
print(original_df)

#### Evaluating the Recommendations

In [None]:
# Extract the ground truth to compare your predictions against
# (the same 20 x 100 slice is taken from each frame so the cells line up)
actual_values = act_ratings_df.iloc[:20, :100].values
avg_values = avg_pred_ratings_df.iloc[:20, :100].values
predicted_values = calc_pred_ratings_df.iloc[:20, :100].values

# Create a mask of actual_values to only look at the non-missing values in the ground truth
mask = ~np.isnan(actual_values)

# Print the RMSE of both predictions and compare.
# np.sqrt(MSE) replaces mean_squared_error(..., squared=False): the `squared`
# argument was deprecated and later removed from scikit-learn, and the square
# root of the MSE gives the same value on these 1-D masked arrays.
print(np.sqrt(mean_squared_error(actual_values[mask], avg_values[mask])))
print(np.sqrt(mean_squared_error(actual_values[mask], predicted_values[mask])))