# Collaborative filtering

### With test dataset

In [148]:
from IPython.core.display import HTML
from movie_display import movie_display
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [149]:
movies = pd.read_csv('./dataset/test_dataset.csv')

In [150]:
movies.head()

Unnamed: 0,userId,movieId,rating
0,1,1,1
1,1,3,2
2,1,6,1
3,2,3,4
4,2,4,2


In [249]:
user_item_matrix = movies.pivot_table(index='userId', columns='movieId', values='rating')

user_item_matrix.head(20)

movieId,1,2,3,4,5,6
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1.0,,2.0,,,1.0
2,,,4.0,2.0,,
3,3.0,5.0,,4.0,4.0,3.0
4,,4.0,1.0,,3.0,
5,,,2.0,5.0,4.0,4.0
6,5.0,,,,2.0,
7,,4.0,3.0,,,
8,,,,4.0,,2.0
9,5.0,,4.0,,,
10,,2.0,3.0,,,


# Similarity Metrics

In [250]:
def calculate_similarity(df, similarity='none'):
    """
    Prepare the datafram for the cosine similarity
    Parameters
    ----------
    df : Pandas Dataframe

    similarity : string
    Returns
    -------
     dataframe : Pandas dataframe
    """
    similarity_matrix = None
    if similarity == 'none':
        similarity_matrix = df.fillna(0)
    if similarity == 'pearson':
        # centered zero matrix
        similarity_matrix = df.subtract(df.mean(axis=1), axis=0).fillna(0)   
    if similarity == 'adjusted':
        # Adjusted cosine similarity
        similarity_matrix = (df - df.mean()).fillna(0)
    
    similarity = cosine_similarity(similarity_matrix)
    # fill on diagonal with 0 
    # https://github.com/sharmin2697/Movie-Recommender-System/blob/main/code/Functions.ipynb
    np.fill_diagonal(similarity, 0)
    # set the columns and index of the initial dataframe otherwise it would messup the indexes later
    return pd.DataFrame(similarity,index=df.index, columns=df.index)
    # return pd.DataFrame(similarity, index = similarity_matrix.index, columns = similarity_matrix.index)

In [251]:
# cosine similarity
cosine_similarity_matrix = calculate_similarity(user_item_matrix)
# calc pearson similarity
pearson_similarity_matrix = calculate_similarity(user_item_matrix, 'pearson')
# calculate adjusted cosine similarity
adjusted_cosine_similarity_matrix = calculate_similarity(user_item_matrix, 'adjusted')

In [252]:
cosine_similarity_matrix

userId,1,2,3,4,5,6,7,8,9,10,11,12
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0.0,0.730297,0.282843,0.160128,0.418167,0.379049,0.489898,0.182574,0.82885,0.679366,0.904534,0.0
2,0.730297,0.0,0.206559,0.175412,0.515339,0.0,0.536656,0.4,0.558744,0.744208,0.660578,0.383482
3,0.282843,0.206559,0.0,0.724657,0.709652,0.493172,0.46188,0.568038,0.270501,0.320256,0.639602,0.693103
4,0.160128,0.175412,0.724657,0.0,0.351541,0.218507,0.745241,0.0,0.122513,0.598321,0.362103,0.403604
5,0.418167,0.515339,0.709652,0.351541,0.0,0.190207,0.153644,0.801638,0.159968,0.213066,0.693451,0.548954
6,0.379049,0.0,0.493172,0.218507,0.190207,0.0,0.0,0.0,0.725018,0.0,0.54858,0.0
7,0.489898,0.536656,0.46188,0.745241,0.153644,0.0,0.0,0.0,0.374817,0.94299,0.467748,0.411597
8,0.182574,0.4,0.568038,0.0,0.801638,0.0,0.0,0.0,0.0,0.0,0.440386,0.766965
9,0.82885,0.558744,0.270501,0.122513,0.159968,0.725018,0.374817,0.0,0.0,0.519778,0.768946,0.0
10,0.679366,0.744208,0.320256,0.598321,0.213066,0.0,0.94299,0.0,0.519778,0.0,0.58037,0.285391


# User-User recommendations

In [None]:
def get_similar_users(user_id, similarity_strategy):
    """"Create a dict with the most similar users and ranks them based on similarity """
    # get the similarity matrix based similarity strategy
    similarity_matrix = pd.DataFrame(cosine_similarity(prepare_matrix_for_cosine(user_item_matrix, similarity_strategy)))
    # drop the user itself
    similar_users = similarity_matrix.loc[user_id]
    # sorting the value descending in order to get the most similar users first
    return similar_users.sort_values(ascending=False).head(3)

In [None]:
get_similar_users(0, 'none')

0     1.000000
10    0.904534
8     0.828850
Name: 0, dtype: float64

In [253]:
def get_recommendation_list(user_id, number_of_recommendations, user_item_matrix, similarity_metric):
    # get the similar users with the selected similarity strategy as pandas dataframe
    similar_users = get_similar_users(user_id, similarity_metric)

    print(similar_users)
    # get pandas dataframe with each users and their movies
    rated_movies_by_user = user_item_matrix.iloc[user_id].dropna()
    
    initial_recommendation = pd.Series()
    # print(len(similar_users))
    for user_id, similarity in similar_users.items():
        # print(user_id, similarity)
        similar_user = user_item_matrix.loc[user_id].dropna()
         
        # for movie_id, rating in similar_user.items():
        #     if movie_id not in rated_movies_by_user:
        #         # print('checking', user_id, similarity, movie_id, rating)
        #         initial_recommendation= pd.concat([initial_recommendation, pd.Series([rating * similarity], index=[movie_id])])
    # sort values descending
    recommendations = initial_recommendation.sort_values(ascending=False).head(number_of_recommendations)
    # return the movies id only
    print(initial_recommendation.head())
    movies = []
    for movie_id, _ in recommendations.items():
        movies.append(movie_id)
    
    return movies

In [166]:
# get_recommendation_list(0, 3, user_item_matrix, 'none')

In [277]:
# function to generate user-based recommendations
def get_user_reccommendations(user_id, number_of_reccommendations, similarity_matrix):
    # get similar users from the similarity matrix and sort them descending and select the first 2 rows from the dataframe
    similar_user_data = similarity_matrix.loc[user_id].sort_values(ascending=False).head(2)
    # drop the unrated movies from the user so that we can check based on this in the for loop
    rated_movies_by_user = user_item_matrix.loc[user_id].dropna()
    print(similar_user_data)
    # print('rated', rated_movies_by_user.to_list())
    recommendations = []
    for user_id, similarity in similar_user_data.items():
        # select the user from the user_item matrix and drop the null values
        similar_user_movies = user_item_matrix.loc[user_id].dropna()
        # print('current user',similar_user_movies.tolist())
        for movie_id, rating in similar_user_movies.items():
            if movie_id not in rated_movies_by_user:
                # print('user_id', user_id, 'movie', movie_id, 'similarity',similarity, 'rating', rating)
                recommendations.append((movie_id, rating * similarity))
  
    # sort list of tuples based on similarity and return it 
    # https://stackoverflow.com/a/3121985
    recommendations.sort(key=lambda tup: tup[1], reverse=True)
    return recommendations[:number_of_reccommendations]

In [278]:
get_user_reccommendations(1, 2, cosine_similarity_matrix)

userId
11    0.904534
9     0.828850
Name: 1, dtype: float64


[(4, 1.8090680674665818), (5, 1.8090680674665818)]

# Item-Item Recommendations

Give N (configurable) recommendations for a given user U (configurable) based on the
movies the user U rated with at least 3.5 stars. Explain your implementation and the
strategy that you use for selecting the final recommendations.
Which means that we have to give the user N number of recommendations for a certain user(selectable) from the movies that he has rated with at least 3.5 stars.

In the UI we would have to 
1. N number of recommendations
2. U which user id
3. Select certain movie which is rated >= 3.5 stars

How to do the recommendations: 
2. find similar items
3. Candidate selection (items you might recommend)
4. Score recommendation candidates
5. Filter candidates (top_n) recommendation